From cd9fc7a2b95204f0169e20409583278a13fe1ded Mon Sep 17 00:00:00 2001 From: Thom Johansen Date: Wed, 24 Oct 2007 22:39:08 +0000 Subject: Coldfire assembler version of qmf_synth(). Wideband and ultra-wideband Speex files should see a great speedup. Also add faster and symmetric clipping in iir_mem16(). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libspeex/filters.c | 3 + apps/codecs/libspeex/filters_cf.S | 182 ++++++++++++++++++++++++++++++++++---- 2 files changed, 168 insertions(+), 17 deletions(-) (limited to 'apps/codecs') diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c index 02f93a27b1..e64f087a5d 100644 --- a/apps/codecs/libspeex/filters.c +++ b/apps/codecs/libspeex/filters.c @@ -47,6 +47,7 @@ #include "filters_arm4.h" #elif defined (COLDFIRE_ASM) #define OVERRIDE_IIR_MEM16 +#define OVERRIDE_QMF_SYNTH #elif defined (BFIN_ASM) #include "filters_bfin.h" #endif @@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1 } } +#ifndef OVERRIDE_QMF_SYNTH /* Re-synthesised a signal from the QMF low-band and high-band signals */ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) /* assumptions: @@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_ for (i = 0; i < M2; i++) mem2[2*i+1] = xx2[i]; } +#endif #ifdef FIXED_POINT #if 0 diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S index 579af11581..dd650844c8 100644 --- a/apps/codecs/libspeex/filters_cf.S +++ b/apps/codecs/libspeex/filters_cf.S @@ -31,7 +31,6 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ - .text /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */ .global iir_mem16 @@ -59,14 +58,18 @@ iir_mem16: move.w (%a3)+, %d0 ext.l %d0 add.l %d1, %d0 | Add with x[i] - move.l #32768, %d1 - add.l %d1, %d0 | Bias result to [0..65535] - cmp.l #65535, %d0 | Clip to [0..65535] range - jle 1f - spl.b %d0 - ext.w %d0 + move.l #32767, %d1 + move.l #65534, %a6 + add.l %d1, %d0 | Bias result to [-1..65534] + cmp.l %a6, %d0 | Now do clip to [0..65534] range + jls 2f + jpl 1f + clr.l %d0 | Clip low + .word 0x51fa | trapf.w, shadow next insn 1: - sub.l %d1, %d0 | Bias clipped result back to [-32768..32767] + move.l %a6, %d0 | Clip high +2: + sub.l %d1, %d0 | Bias clipped result back to [-32767..32767] neg.l %d0 | msac.w is bugged in gas, do this for now move.w %d0, (%a5)+ | Write result to y[i] move.l (%a4)+, %a6 | Fetch den[0] and den[1] @@ -111,14 +114,18 @@ iir_mem16: move.w (%a3)+, %d0 ext.l %d0 add.l %d1, %d0 | Add with x[i] - move.l #32768, %d1 - add.l %d1, %d0 | Bias result to [0..65535] - cmp.l #65535, %d0 | Clip to [0..65535] range - jle 1f - spl.b %d0 - ext.w %d0 + move.l #32767, %d1 + move.l #65534, %a6 + add.l %d1, %d0 | Bias result to [-1..65534] + cmp.l %a6, %d0 | Now do clip to [0..65534] range + jls 2f + jpl 1f + clr.l %d0 | Clip low + .word 0x51fa | trapf.w, shadow next insn 1: - sub.l %d1, %d0 | Bias clipped result back to [-32768..32767] + move.l %a6, %d0 | Clip high +2: + sub.l %d1, %d0 | Bias clipped result back to [-32767..32767] neg.l %d0 | msac.w is bugged in gas, do this for now move.w %d0, (%a5)+ | Write result to y[i] move.l (%a4)+, %a6 | Fetch den[0] and den[1] @@ -159,7 +166,148 @@ iir_mem16: movem.l %d1-%d7/%a0-%a2, (%a6) | Save back mem[] .exit: - movem.l (%sp), %d2-%d7/%a2-%a6 - lea.l (44, %sp), %sp + movem.l (%sp), %d2-%d7/%a2-%a6 + lea.l (44, %sp), %sp + rts + +/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t 
*y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) -- ColdFire assembler version of qmf_synth(); the C version in filters.c is compiled out when OVERRIDE_QMF_SYNTH is defined. Reads N/2 samples from each of x1 and x2 and M coefficients from a, and writes N samples to y, four per outer-loop pass. Only the odd-indexed entries of mem1 and mem2 (M/2 of each) are touched: they are copied into the tail of the scratch buffers on entry and rewritten from the head of those buffers on exit. The loops count N/2 and M/2 down by two until they reach zero, so both halves must be even and nonzero. The scratch buffers xx1 and xx2 (N/2 plus M/2 shorts each) are carved from the CPU stack; the stack argument is never loaded. The coefficient prefetch in the last inner-loop pass reads one longword beyond a[M-1] and the value is discarded. MACSR is set to 0x80 while the routine runs and is written back as 0 on return, not as the caller's previous value. */ + .global qmf_synth +qmf_synth: + lea.l (-44, %sp), %sp + movem.l %d2-%d7/%a2-%a6, (%sp) + movem.l (44+4, %sp), %a0-%a3 | a0 = x1, a1 = x2, a2 = a, a3 = y + movem.l (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1,a5 = mem2 + move.l #0x80, %macsr | Enable saturation + + | Comments make more sense when compared to the reference C version + move.l %a2, %d6 | Backup a + lsr.l #1, %d0 | N2 = N >> 1 + lsr.l #1, %d1 | M2 = M >> 1 + move.l %d1, %d7 | Backup M2 + clr.l %d2 + sub.l %d0, %d2 + sub.l %d1, %d2 | d2 = -(N2 + M2) + lea.l (%sp, %d2.l*2), %a2 | Alloc two buffers of N2 + M2 shorts + lea.l (%a2, %d2.l*2), %a6 | a2 = xx1, a6 = xx2 + move.l %sp, %d3 + move.l %a6, %sp | Update sp + move.l %d3, -(%sp) | Stack old %sp + + | Backwards copy x1 and x2 arrays to xx1 and xx2 + | TODO: these copying loops probably have more potential for optimization + lea.l (%a0, %d0.l*2), %a0 | x1 += N2 + lea.l (%a1, %d0.l*2), %a1 | x2 += N2 + move.l %d0, %d2 | Loop counter is N2 +0: + move.w -(%a0), (%a2)+ + move.w -(%a1), (%a6)+ + subq.l #1, %d2 + jne 0b + + | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2 + move.l %d1, %d2 | Loop counter is M2 + addq.l #4, %a4 | a4 = &mem1[1] + addq.l #4, %a5 | a5 = &mem2[1] + move.l %a4, %d3 | Backup mem1 and mem2 + move.l %a5, %d4 +0: + move.l (%a4), %d5 + move.w %d5, (%a2)+ + move.l (%a5), %d5 + move.w %d5, (%a6)+ + addq.l #8, %a4 + addq.l #8, %a5 + subq.l #1, %d2 + jne 0b + move.l %d3, %a4 | a4 = &mem1[1] + move.l %d4, %a5 | a5 = &mem2[1] + + clr.l %d2 + sub.l %d1, %d2 | d2 = -M2 + lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2] + lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2] + move.l %d6, %a2 | a2 = a + + | Main loop, register usage: + | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup + | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1] + | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2 +0: | Outerloop + move.l #32768, %d2 | 
Rounding constant + move.l %d2, %acc0 + move.l %d2, %acc1 + move.l %d2, %acc2 + move.l %d2, %acc3 + move.w (%a0)+, %d2 | d2 = x10 + move.w (%a1)+, %d4 | d4 = x20 + move.l (%a2)+, %d6 | d6 = [a0, a1] +1: | Innerloop + move.w (%a0)+, %d3 | d3 = x11 + move.w (%a1)+, %d5 | d5 = x21 + mac.w %d6u, %d3l, #1, %acc0 | acc0 += a0*x11 + msac.w %d6u, %d5l, #1, %acc0 | acc0 -= a0*x21 + mac.w %d6l, %d3l, #1, %acc1 | acc1 += a1*x11 + mac.w %d6l, %d5l, #1, %acc1 | acc1 += a1*x21 + mac.w %d6u, %d2l, #1, %acc2 | acc2 += a0*x10 + msac.w %d6u, %d4l, #1, %acc2 | acc2 -= a0*x20 + mac.w %d6l, %d2l, #1, %acc3 | acc3 += a1*x10 + mac.w %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20 + + move.w (%a0)+, %d2 | d2 = x10 + move.w (%a1)+, %d4 | d4 = x20 + mac.w %d6u, %d2l, #1, %acc0 | acc0 += a0*x10 + msac.w %d6u, %d4l, #1, %acc0 | acc0 -= a0*x20 + mac.w %d6l, %d2l, #1, %acc1 | acc1 += a1*x10 + mac.w %d6l, %d4l, #1, %acc1 | acc1 += a1*x20 + mac.w %d6u, %d3l, #1, %acc2 | acc2 += a0*x11 + msac.w %d6u, %d5l, #1, %acc2 | acc2 -= a0*x21 + mac.w %d6l, %d3l, #1, %acc3 | acc3 += a1*x11 + mac.w %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21 + subq.l #2, %d1 + jne 1b + + sub.l %d7, %d1 | d1 = -M2 + lea.l (-4, %a2, %d1.l*4), %a2 | a2 = &a[0] + lea.l (-6, %a0, %d1.l*2), %a0 | a0 = &xx1[N2 - 2 - i] + lea.l (-6, %a1, %d1.l*2), %a1 | a1 = &xx2[N2 - 2 - i] + neg.l %d1 | d1 = M2 + movclr.l %acc0, %d2 + movclr.l %acc1, %d3 + movclr.l %acc2, %d4 + movclr.l %acc3, %d5 + swap.w %d2 | Shift 16 right + swap.w %d3 + swap.w %d4 + swap.w %d5 + | Thanks to the extra shift in the mac chain, we get clipping for free. + | The clipping will be [-32768..32767], not Speex standard [-32767..32767], + | but since qmf_synth() is called so late in the signal chain, it should + | work fine. 
+ move.w %d2, (%a3)+ | Write results to y[] + move.w %d3, (%a3)+ + move.w %d4, (%a3)+ + move.w %d5, (%a3)+ + subq.l #2, %d0 + jne 0b + + | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries + addq.l #4, %a0 | a0 = &xx1[0] + addq.l #4, %a1 | a1 = &xx2[0] +0: + move.w (%a0)+, %d2 + move.w (%a1)+, %d3 + ext.l %d2 + ext.l %d3 + move.l %d2, (%a4) + move.l %d3, (%a5) + addq.l #8, %a4 + addq.l #8, %a5 + subq.l #1, %d1 + jne 0b + + move.l #0, %macsr + move.l (%sp), %sp + movem.l (%sp), %d2-%d7/%a2-%a6 + lea.l (44, %sp), %sp rts -- cgit v1.2.3