From c7a8663c7bd6ccb9568fc31b6bad4f5a38aebb8e Mon Sep 17 00:00:00 2001 From: Thom Johansen Date: Wed, 7 Nov 2007 00:50:37 +0000 Subject: Assembler versions of signal_mul. Decent speedup for Coldfire and small speedup for ARM. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15502 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libspeex/filters.c | 4 ++++ apps/codecs/libspeex/filters_arm4.S | 27 +++++++++++++++++++++++- apps/codecs/libspeex/filters_cf.S | 42 +++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) (limited to 'apps') diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c index 13adc9c009..a8a2e0bc71 100644 --- a/apps/codecs/libspeex/filters.c +++ b/apps/codecs/libspeex/filters.c @@ -47,9 +47,11 @@ #include "filters_arm4.h" #define OVERRIDE_IIR_MEM16 #define OVERRIDE_QMF_SYNTH +#define OVERRIDE_SIGNAL_MUL #elif defined (COLDFIRE_ASM) #define OVERRIDE_IIR_MEM16 #define OVERRIDE_QMF_SYNTH +#define OVERRIDE_SIGNAL_MUL #elif defined (BFIN_ASM) #include "filters_bfin.h" #endif @@ -114,6 +116,7 @@ void highpass(const spx_word16_t *x, spx_word16_t *y, int len, int filtID, spx_m #ifdef FIXED_POINT +#ifndef OVERRIDE_SIGNAL_MUL /* FIXME: These functions are ugly and probably introduce too much error */ void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) { @@ -123,6 +126,7 @@ void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) y[i] = SHL32(MULT16_32_Q14(EXTRACT16(SHR32(x[i],7)),scale),7); } } +#endif #ifndef SPEEX_DISABLE_ENCODER void signal_div(const spx_word16_t *x, spx_word16_t *y, spx_word32_t scale, int len) diff --git a/apps/codecs/libspeex/filters_arm4.S b/apps/codecs/libspeex/filters_arm4.S index 7924e7030f..e0b33c848f 100644 --- a/apps/codecs/libspeex/filters_arm4.S +++ b/apps/codecs/libspeex/filters_arm4.S @@ -199,7 +199,7 @@ qmf_synth: bne 0b sub r0, r8, r5 @ r0 = &xx1[N2] - sub r1, r9, r5 @ r1 = %xx2[N2] + sub r1, r9, r5 @ r1 = &xx2[N2] str 
r4, [sp, #-4] @ Stack N mov r4, r5 str r4, [sp, #-8] @ Stack M @@ -300,3 +300,28 @@ qmf_synth: bne 0b ldmia sp!, { r4-r11, pc } @ Exit + +/* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) -- ARM version. Register use in the body: r0 is the input pointer (post-incremented by ldmia), r1 the output pointer (post-incremented by stmia), r2 the multiplier and r3 the remaining sample count, which matches the prototype's argument order; r12 is scratch for the high product word. Each pass loads four 32-bit words, forms the signed 64-bit product of each with r2 (smull) and keeps (hi << 18) | (lo >> 14), i.e. the product shifted right by 14 and truncated to 32 bits, then stores the four results. NOTE(review): the loop tests at the bottom and exits only when r3 reaches exactly zero after subtracting 4, so len apparently has to be a positive multiple of 4 -- confirm against callers. NOTE(review): r4 is pushed and popped but not otherwise used in this routine. */ + .global signal_mul +signal_mul: + stmdb sp!, { r4-r8, lr } +0: + ldmia r0!, { r5-r8 } @ Load four input samples + smull r5, r12, r2, r5 + mov r12, r12, lsl #18 @ Recombine upper and lower parts + orr r5, r12, r5, lsr #14 + smull r6, r12, r2, r6 + mov r12, r12, lsl #18 + orr r6, r12, r6, lsr #14 + smull r7, r12, r2, r7 + mov r12, r12, lsl #18 + orr r7, r12, r7, lsr #14 + smull r8, r12, r2, r8 + mov r12, r12, lsl #18 + orr r8, r12, r8, lsr #14 + stmia r1!, { r5-r8 } @ Store four output samples + subs r3, r3, #4 @ Are we done? + bne 0b + + ldmia sp!, { r4-r8, pc } @ Exit + diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S index 861d6c18f9..a48af85095 100644 --- a/apps/codecs/libspeex/filters_cf.S +++ b/apps/codecs/libspeex/filters_cf.S @@ -312,3 +312,45 @@ qmf_synth: lea.l (44, %sp), %sp rts + +/* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) -- Coldfire version. Reserves 20 bytes of stack to save d2-d6, then reads the arguments from the caller's stack: x and y into a0/a1, scale and len into d0/d1. MACSR is written with 0x20 on entry and with 0 on exit; the previous MACSR value is not saved or restored. scale is shifted left by 3 once; each pass loads four 32-bit words, shifts each left by 9, multiplies by the pre-scaled scale into acc0-acc3, reads and clears the accumulators with movclr, shifts each result left by 5 and stores the four words. NOTE(review): assuming a fractional-mode product is (a * b) >> 31, the net result is (x[i] * scale) >> 14, the same scaling as the ARM routine's smull/lsl #18/lsr #14 sequence -- TODO confirm against the EMAC documentation. NOTE(review): the loop exits only when d1 reaches exactly zero after subtracting 4, so len apparently has to be a positive multiple of 4 -- confirm against callers. */ + .global signal_mul +signal_mul: + lea.l (-20, %sp), %sp + movem.l %d2-%d6, (%sp) + movem.l (20+4, %sp), %a0-%a1 | a0 = x, a1 = y + movem.l (20+12, %sp), %d0-%d1 | d0 = scale, d1 = len + moveq.l #0x20, %d6 + move.l %d6, %macsr | Set MAC unit to fractional mode + asl.l #3, %d0 | Pre-scale 'scale' + moveq.l #9, %d6 +0: + movem.l (%a0), %d2-%d5 | Fetch input + asl.l %d6, %d2 | Shift each value 9 to the left + asl.l %d6, %d3 + asl.l %d6, %d4 + asl.l %d6, %d5 + mac.l %d2, %d0, %acc0 | Do multiplies + mac.l %d3, %d0, %acc1 + mac.l %d4, %d0, %acc2 + mac.l %d5, %d0, %acc3 + lea.l (16, %a0), %a0 + movclr.l %acc0, %d2 + movclr.l %acc1, %d3 + movclr.l %acc2, %d4 + movclr.l %acc3, %d5 + asl.l #5, %d2 | Adjust to proper format + asl.l #5, %d3 + asl.l #5, %d4 + asl.l #5, %d5 + movem.l %d2-%d5, (%a1) | Save output + lea.l (16, %a1), %a1 + subq.l #4, %d1 + 
jne 0b + + clr.l %d0 + move.l %d0, %macsr | Set MAC unit back to integer mode + movem.l (%sp), %d2-%d6 + lea.l (20, %sp), %sp + rts + -- cgit v1.2.3