From cd9fc7a2b95204f0169e20409583278a13fe1ded Mon Sep 17 00:00:00 2001
From: Thom Johansen <thomj@rockbox.org>
Date: Wed, 24 Oct 2007 22:39:08 +0000
Subject: Coldfire assembler version of qmf_synth(). Wideband and
 ultra-wideband Speex files should see a great speedup. Also add faster and
 symmetric clipping in iir_mem16().

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libspeex/filters.c    |   3 +
 apps/codecs/libspeex/filters_cf.S | 182 ++++++++++++++++++++++++++++++++++----
 2 files changed, 168 insertions(+), 17 deletions(-)

(limited to 'apps/codecs')

diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c
index 02f93a27b1..e64f087a5d 100644
--- a/apps/codecs/libspeex/filters.c
+++ b/apps/codecs/libspeex/filters.c
@@ -47,6 +47,7 @@
 #include "filters_arm4.h"
 #elif defined (COLDFIRE_ASM)
 #define OVERRIDE_IIR_MEM16
+#define OVERRIDE_QMF_SYNTH
 #elif defined (BFIN_ASM)
 #include "filters_bfin.h"
 #endif
@@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
    }
 }
 
+#ifndef OVERRIDE_QMF_SYNTH
 /* Re-synthesised a signal from the QMF low-band and high-band signals */
 void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
    /* assumptions:
@@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
    for (i = 0; i < M2; i++)
       mem2[2*i+1] = xx2[i];
 }
+#endif
 
 #ifdef FIXED_POINT
 #if 0
diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S
index 579af11581..dd650844c8 100644
--- a/apps/codecs/libspeex/filters_cf.S
+++ b/apps/codecs/libspeex/filters_cf.S
@@ -31,7 +31,6 @@
    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-
     .text
 /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
     .global iir_mem16
@@ -59,14 +58,18 @@ iir_mem16:
     move.w   (%a3)+, %d0
     ext.l    %d0
     add.l    %d1, %d0           | Add with x[i]
-    move.l   #32768, %d1
-    add.l    %d1, %d0           | Bias result to [0..65535]
-    cmp.l    #65535, %d0        | Clip to [0..65535] range
-    jle      1f
-    spl.b    %d0                
-    ext.w    %d0
+    move.l   #32767, %d1
+    move.l   #65534, %a6
+    add.l    %d1, %d0           | Bias result to [-1..65534]
+    cmp.l    %a6, %d0           | Now do clip to [0..65534] range
+    jls      2f
+    jpl      1f
+    clr.l    %d0                | Clip low
+    .word    0x51fa             | trapf.w, shadow next insn
 1:
-    sub.l    %d1, %d0           | Bias clipped result back to [-32768..32767]
+    move.l   %a6, %d0           | Clip high
+2:
+    sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
     neg.l    %d0                | msac.w is bugged in gas, do this for now
     move.w   %d0, (%a5)+        | Write result to y[i]
     move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
@@ -111,14 +114,18 @@ iir_mem16:
     move.w   (%a3)+, %d0
     ext.l    %d0
     add.l    %d1, %d0           | Add with x[i]
-    move.l   #32768, %d1
-    add.l    %d1, %d0           | Bias result to [0..65535]
-    cmp.l    #65535, %d0        | Clip to [0..65535] range
-    jle      1f
-    spl.b    %d0                
-    ext.w    %d0
+    move.l   #32767, %d1
+    move.l   #65534, %a6
+    add.l    %d1, %d0           | Bias result to [-1..65534]
+    cmp.l    %a6, %d0           | Now do clip to [0..65534] range
+    jls      2f
+    jpl      1f
+    clr.l    %d0                | Clip low
+    .word    0x51fa             | trapf.w, shadow next insn
 1:
-    sub.l    %d1, %d0           | Bias clipped result back to [-32768..32767]
+    move.l   %a6, %d0           | Clip high
+2:
+    sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
     neg.l    %d0                | msac.w is bugged in gas, do this for now
     move.w   %d0, (%a5)+        | Write result to y[i]
     move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
@@ -159,7 +166,148 @@ iir_mem16:
     movem.l  %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
 
 .exit:
-    movem.l (%sp), %d2-%d7/%a2-%a6
-    lea.l (44, %sp), %sp
+    movem.l  (%sp), %d2-%d7/%a2-%a6
+    lea.l    (44, %sp), %sp
+    rts
+
+/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
+    .global qmf_synth
+qmf_synth:
+    lea.l    (-44, %sp), %sp               | Room for the 11 registers saved below
+    movem.l  %d2-%d7/%a2-%a6, (%sp)
+    movem.l  (44+4, %sp), %a0-%a3          | a0 = x1, a1 = x2, a2 = a, a3 = y
+    movem.l  (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1, a5 = mem2
+    move.l   #0x80, %macsr                 | Enable saturation
+
+    | Comments make more sense when compared to the reference C version
+    move.l   %a2, %d6                   | Save a in d6, a2 is reused for xx1
+    lsr.l    #1, %d0                    | N2 = N >> 1
+    lsr.l    #1, %d1                    | M2 = M >> 1
+    move.l   %d1, %d7                   | Keep M2 in d7, d1 counts the inner loop
+    clr.l    %d2
+    sub.l    %d0, %d2
+    sub.l    %d1, %d2                   | d2 = -(N2 + M2)
+    lea.l    (%sp, %d2.l*2), %a2        | Alloc two stack buffers of N2 + M2 shorts
+    lea.l    (%a2, %d2.l*2), %a6        | a2 = xx1, a6 = xx2
+    move.l   %sp, %d3                   | d3 = sp before the allocation
+    move.l   %a6, %sp                   | sp = xx2, the lower of the two buffers
+    move.l   %d3, -(%sp)                | Stack old %sp, reloaded on exit
+
+    | Backwards copy x1 and x2 arrays to xx1 and xx2
+    | TODO: these copying loops probably have more potential for optimization
+    lea.l    (%a0, %d0.l*2), %a0        | x1 += N2
+    lea.l    (%a1, %d0.l*2), %a1        | x2 += N2
+    move.l   %d0, %d2                   | Loop counter is N2
+0:
+    move.w   -(%a0), (%a2)+             | xx1[i] = x1[N2 - 1 - i]
+    move.w   -(%a1), (%a6)+             | xx2[i] = x2[N2 - 1 - i]
+    subq.l   #1, %d2
+    jne      0b
+
+    | Copy the odd-indexed entries of mem1 and mem2 to the last M2 slots of xx1 and xx2
+    move.l   %d1, %d2                   | Loop counter is M2
+    addq.l   #4, %a4                    | a4 = &mem1[1]
+    addq.l   #4, %a5                    | a5 = &mem2[1]
+    move.l   %a4, %d3                   | Save &mem1[1] and &mem2[1] in d3 and d4
+    move.l   %a5, %d4
+0:
+    move.l   (%a4), %d5                 | d5 = mem1[2*i + 1]
+    move.w   %d5, (%a2)+                | xx1[N2 + i] = low word of d5
+    move.l   (%a5), %d5                 | d5 = mem2[2*i + 1]
+    move.w   %d5, (%a6)+                | xx2[N2 + i] = low word of d5
+    addq.l   #8, %a4                    | Step two entries to the next odd index
+    addq.l   #8, %a5
+    subq.l   #1, %d2
+    jne      0b
+    move.l   %d3, %a4                   | a4 = &mem1[1]
+    move.l   %d4, %a5                   | a5 = &mem2[1]
+
+    clr.l    %d2
+    sub.l    %d1, %d2                   | d2 = -M2
+    lea.l    (-4, %a2, %d2.l*2), %a0    | a0 = &xx1[N2 - 2]
+    lea.l    (-4, %a6, %d2.l*2), %a1    | a1 = &xx2[N2 - 2]
+    move.l   %d6, %a2                   | a2 = a
+
+    | Main loop, register usage:
+    | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
+    | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = coef pair [a0, a1]
+    | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = &mem1[1], a5 = &mem2[1]
+0:  | Outerloop, produces four y[] samples per pass
+    move.l   #32768, %d2                        | Rounding constant
+    move.l   %d2, %acc0
+    move.l   %d2, %acc1
+    move.l   %d2, %acc2
+    move.l   %d2, %acc3
+    move.w   (%a0)+, %d2                        | d2 = x10
+    move.w   (%a1)+, %d4                        | d4 = x20
+    move.l   (%a2)+, %d6                        | d6 = [a0, a1]
+1:  | Innerloop, consumes two [a0, a1] coef pairs per pass
+    move.w   (%a0)+, %d3                        | d3 = x11
+    move.w   (%a1)+, %d5                        | d5 = x21
+    mac.w    %d6u, %d3l, #1, %acc0              | acc0 += a0*x11
+    msac.w   %d6u, %d5l, #1, %acc0              | acc0 -= a0*x21
+    mac.w    %d6l, %d3l, #1, %acc1              | acc1 += a1*x11
+    mac.w    %d6l, %d5l, #1, %acc1              | acc1 += a1*x21
+    mac.w    %d6u, %d2l, #1, %acc2              | acc2 += a0*x10
+    msac.w   %d6u, %d4l, #1, %acc2              | acc2 -= a0*x20
+    mac.w    %d6l, %d2l, #1, %acc3              | acc3 += a1*x10
+    mac.w    %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20, d6 = next [a0, a1]
+
+    move.w   (%a0)+, %d2                        | d2 = x10
+    move.w   (%a1)+, %d4                        | d4 = x20
+    mac.w    %d6u, %d2l, #1, %acc0              | acc0 += a0*x10
+    msac.w   %d6u, %d4l, #1, %acc0              | acc0 -= a0*x20
+    mac.w    %d6l, %d2l, #1, %acc1              | acc1 += a1*x10
+    mac.w    %d6l, %d4l, #1, %acc1              | acc1 += a1*x20
+    mac.w    %d6u, %d3l, #1, %acc2              | acc2 += a0*x11
+    msac.w   %d6u, %d5l, #1, %acc2              | acc2 -= a0*x21
+    mac.w    %d6l, %d3l, #1, %acc3              | acc3 += a1*x11
+    mac.w    %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21, d6 = next [a0, a1]
+    subq.l   #2, %d1                            | Two coef pairs consumed
+    jne      1b
+ 
+    sub.l    %d7, %d1                           | d1 is 0 here, so d1 = -M2
+    lea.l    (-4, %a2, %d1.l*4), %a2            | a2 = &a[0]
+    lea.l    (-6, %a0, %d1.l*2), %a0            | a0 = &xx1[N2 - 2 - i]
+    lea.l    (-6, %a1, %d1.l*2), %a1            | a1 = &xx2[N2 - 2 - i]
+    neg.l    %d1                                | d1 = M2
+    movclr.l %acc0, %d2                         | Read out and clear the accumulators
+    movclr.l %acc1, %d3
+    movclr.l %acc2, %d4
+    movclr.l %acc3, %d5
+    swap.w   %d2                                | Shift 16 right, result in low word
+    swap.w   %d3
+    swap.w   %d4
+    swap.w   %d5
+    | Thanks to the extra shift in the mac chain, we get clipping for free.
+    | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
+    | but since qmf_synth() is called so late in the signal chain, it should
+    | work fine.
+    move.w   %d2, (%a3)+        | Write results to y[]
+    move.w   %d3, (%a3)+
+    move.w   %d4, (%a3)+
+    move.w   %d5, (%a3)+
+    subq.l   #2, %d0            | Each pass steps xx1 and xx2 back two entries
+    jne      0b
+
+    | Copy first M2 entries of xx1 and xx2 back to the odd-indexed mem1 and mem2 entries
+    addq.l   #4, %a0            | a0 = &xx1[0]
+    addq.l   #4, %a1            | a1 = &xx2[0]
+0:
+    move.w   (%a0)+, %d2        | d2 = xx1[i]
+    move.w   (%a1)+, %d3        | d3 = xx2[i]
+    ext.l    %d2                | Sign extend both to 32 bits
+    ext.l    %d3
+    move.l   %d2, (%a4)         | mem1[2*i + 1] = xx1[i]
+    move.l   %d3, (%a5)         | mem2[2*i + 1] = xx2[i]
+    addq.l   #8, %a4
+    addq.l   #8, %a5
+    subq.l   #1, %d1            | d1 = M2 on entry to this loop
+    jne      0b
+
+    move.l   #0, %macsr         | Clear MACSR again
+    move.l   (%sp), %sp         | Reload the sp stacked after buffer allocation
+    movem.l  (%sp), %d2-%d7/%a2-%a6
+    lea.l    (44, %sp), %sp
     rts
 
-- 
cgit v1.2.3