From 1b05ea8ffe7e2ac36d77c5ff712805f6fb476d1e Mon Sep 17 00:00:00 2001 From: Thom Johansen Date: Sun, 11 Mar 2007 23:33:58 +0000 Subject: ARM assembler for resampling. Should provide some gains, though not huge ones. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12732 a1c6a512-1295-4272-9138-f99709370657 --- apps/dsp_arm.S | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- apps/dsp_asm.h | 5 +-- 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/apps/dsp_arm.S b/apps/dsp_arm.S index 27669203f1..c3e5c7cd05 100644 --- a/apps/dsp_arm.S +++ b/apps/dsp_arm.S @@ -17,14 +17,14 @@ * ****************************************************************************/ -/* +/**************************************************************************** * void apply_crossfeed(int count, int32_t* src[]) */ .section .text .global apply_crossfeed apply_crossfeed: @ unfortunately, we ended up in a bit of a register squeeze here, and need - @ to keep both the count and the delay line index on the stack :/ + @ to keep the count on the stack :/ stmdb sp!, { r4-r11, lr } @ stack modified regs ldmia r1, { r2-r3 } @ r2 = src[0], r3 = src[1] @@ -74,7 +74,131 @@ apply_crossfeed: @ save data back to struct ldr r12, =crossfeed_data + 4*4 stmia r12, { r8-r11 } @ save filter history - str r0, [r12, #30*4] @ save delay line index + str r0, [r12, #30*4] @ save delay line index add sp, sp, #8 @ remove temp variables from stack ldmia sp!, { r4-r11, pc } +.cfend: + .size apply_crossfeed,.cfend-apply_crossfeed + +/**************************************************************************** + * int dsp_downsample(int count, struct dsp_data *data, + * int32_t *src[], int32_t *dst[]) + */ + .section .text + .global dsp_downsample +dsp_downsample: + stmdb sp!, { r4-r11, lr } @ stack modified regs + ldmib r1, { r5-r6 } @ r5 = num_channels, r6 = resample_data.delta + sub r5, r5, #1 @ pre-decrement num_channels for use + add r4, r1, #12 @ r4 = &resample_data.phase + mov r12, #0xff + orr 
r12, r12, #0xff00 @ r12 = 0xffff +.dschannel_loop: + ldr r1, [r4] @ r1 = resample_data.phase + ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1] + ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1] + add r9, r4, #4 @ r9 = &last_sample[0] + ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1] + sub r11, r0, #1 + ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ... + str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample + movs r9, r1, lsr #16 @ r9 = pos = phase >> 16 + ldreq r11, [r7] @ if pos = 0, load src[0] and jump into loop + beq .dsuse_last_start + cmp r9, r0 @ if pos >= count, we're already done + bge .dsloop_skip + + @ Register usage in loop: + @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel, + @ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos] +.dsloop: + add r9, r7, r9, lsl #2 @ r9 = &s[pos] + ldmda r9, { r10, r11 } @ r10 = s[pos - 1], r11 = s[pos] +.dsuse_last_start: + sub r11, r11, r10 @ r11 = diff = s[pos] - s[pos - 1] + @ keep frac in lower bits to take advantage of multiplier early termination + and r9, r1, r12 @ frac = phase & 0xffff + smull r9, r14, r11, r9 + add r10, r10, r14, lsl #16 + add r10, r10, r9, lsr #16 @ r10 = out = s[pos - 1] + frac*diff + str r10, [r8], #4 @ *d++ = out + add r1, r1, r6 @ phase += delta + mov r9, r1, lsr #16 @ pos = phase >> 16 + cmp r9, r0 @ pos < count? + blt .dsloop @ yup, do more samples +.dsloop_skip: + subs r5, r5, #1 + bpl .dschannel_loop @ if (--ch) >= 0, do another channel + sub r1, r1, r0, lsl #16 @ wrap phase back to start + str r1, [r4] @ store back + ldr r1, [r3] @ r1 = &dst[0] + sub r8, r8, r1 @ dst - &dst[0] + mov r0, r8, lsr #2 @ convert bytes->samples + ldmia sp!, { r4-r11, pc } @ ... 
and we're out +.dsend: + .size dsp_downsample,.dsend-dsp_downsample + +/**************************************************************************** + * int dsp_upsample(int count, struct dsp_data *data, + * int32_t *src[], int32_t *dst[]) + */ + .section .text + .global dsp_upsample +dsp_upsample: + stmdb sp!, { r4-r11, lr } @ stack modified regs + ldmib r1, { r5-r6 } @ r5 = num_channels, r6 = resample_data.delta + sub r5, r5, #1 @ pre-decrement num_channels for use + add r4, r1, #12 @ r4 = &resample_data.phase + stmdb sp!, { r0, r4 } @ stack count and &resample_data.phase +.uschannel_loop: + ldr r12, [r4] @ r12 = resample_data.phase + mov r1, r12, ror #16 @ swap halfword positions, we'll use carry + @ to detect pos increments + ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1] + ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1] + add r9, r4, #4 @ r9 = &last_sample[0] + ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1] + sub r11, r0, #1 + ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ... 
+ str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample + add r9, r7, r0, lsl #2 @ r9 = src_end = &src[count] + movs r14, r12, lsr #16 @ pos = resample_data.phase >> 16 + beq .usstart_0 @ pos = 0 + cmp r14, r0 @ if pos >= count, we're already done + bge .usloop_skip + add r7, r7, r14, lsl #2 @ r7 = &s[pos] + ldr r10, [r7, #-4] @ r10 = s[pos - 1] + b .usstart_0 + + @ Register usage in loop: + @ r0 = diff (count is kept on the stack), r1 = phase ror 16, r4 = frac (&resample_data.phase is kept on the stack), r5 = cur_channel, + @ r6 = delta, r7 = s, r8 = d, r9 = src_end, r10 = s[pos - 1], r11 = s[pos] +.usloop_1: + mov r10, r11 @ r10 = previous sample +.usstart_0: + ldr r11, [r7], #4 @ r11 = next sample + sub r0, r11, r10 @ r0 = s[pos] - s[pos - 1] +.usloop_0: + mov r4, r1, lsr #16 @ r4 = frac = phase >> 16 + smull r12, r14, r4, r0 + add r14, r10, r14, lsl #16 + add r14, r14, r12, lsr #16 @ r14 = out = s[pos - 1] + frac*diff + str r14, [r8], #4 @ *d++ = out + adds r1, r1, r6, lsl #16 @ phase += delta << 16 + bcc .usloop_0 @ if carry is set, pos is incremented + cmp r7, r9 @ if s < src_end, do another sample + blo .usloop_1 +.usloop_skip: + subs r5, r5, #1 + ldmia sp, { r0, r4 } @ reload count and &resample_data.phase + bpl .uschannel_loop @ if (--ch) >= 0, do another channel + mov r1, r1, ror #16 @ wrap phase back to start of next frame + str r1, [r4] @ store back + ldr r1, [r3] @ r1 = &dst[0] + sub r8, r8, r1 @ dst - &dst[0] + mov r0, r8, lsr #2 @ convert bytes->samples + add sp, sp, #8 @ adjust stack for temp variables + ldmia sp!, { r4-r11, pc } @ ... 
and we're out +.usend: + .size dsp_upsample,.usend-dsp_upsample diff --git a/apps/dsp_asm.h b/apps/dsp_asm.h index ee90f5763e..f8df337b37 100644 --- a/apps/dsp_asm.h +++ b/apps/dsp_asm.h @@ -27,13 +27,12 @@ #if defined(CPU_COLDFIRE) || defined(CPU_ARM) #define DSP_HAVE_ASM_CROSSFEED void apply_crossfeed(int count, int32_t *buf[]); -#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */ - -#if defined (CPU_COLDFIRE) #define DSP_HAVE_ASM_RESAMPLING int dsp_downsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]); int dsp_upsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]); +#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */ +#if defined (CPU_COLDFIRE) #define DSP_HAVE_ASM_SOUND_CHAN_MONO void channels_process_sound_chan_mono(int count, int32_t *buf[]); #define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM -- cgit v1.2.3