diff options
author | Thom Johansen <thomj@rockbox.org> | 2007-11-01 21:11:26 +0000 |
---|---|---|
committer | Thom Johansen <thomj@rockbox.org> | 2007-11-01 21:11:26 +0000 |
commit | 6d88717f6949587908ec08affa07d06239c3bae1 (patch) | |
tree | 55f93d99a48d0e571cee7eb8f024ca54cb58784d /apps/codecs/libspeex | |
parent | 9e23e9d43e6345bd3f8caa2176c1084251160a28 (diff) | |
download | rockbox-6d88717f6949587908ec08affa07d06239c3bae1.tar.gz rockbox-6d88717f6949587908ec08affa07d06239c3bae1.zip |
ARM assembler versions of iir_mem16() and qmf_synth(), yielding a very nice speedup. Touch some comments in filters_cf.S
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15393 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/libspeex')
-rw-r--r-- | apps/codecs/libspeex/SOURCES | 2 | ||||
-rw-r--r-- | apps/codecs/libspeex/filters.c | 2 | ||||
-rw-r--r-- | apps/codecs/libspeex/filters_arm4.S | 302 | ||||
-rw-r--r-- | apps/codecs/libspeex/filters_cf.S | 28 |
4 files changed, 321 insertions, 13 deletions
diff --git a/apps/codecs/libspeex/SOURCES b/apps/codecs/libspeex/SOURCES index f5a6786fa1..e1f038160b 100644 --- a/apps/codecs/libspeex/SOURCES +++ b/apps/codecs/libspeex/SOURCES | |||
@@ -34,4 +34,6 @@ window.c | |||
34 | #ifdef CPU_COLDFIRE | 34 | #ifdef CPU_COLDFIRE |
35 | filters_cf.S | 35 | filters_cf.S |
36 | ltp_cf.S | 36 | ltp_cf.S |
37 | #elif defined(CPU_ARM) | ||
38 | filters_arm4.S | ||
37 | #endif | 39 | #endif |
diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c index 0e76e27e84..36b110af30 100644 --- a/apps/codecs/libspeex/filters.c +++ b/apps/codecs/libspeex/filters.c | |||
@@ -45,6 +45,8 @@ | |||
45 | #include "filters_sse.h" | 45 | #include "filters_sse.h" |
46 | #elif defined (ARM4_ASM) || defined(ARM5E_ASM) | 46 | #elif defined (ARM4_ASM) || defined(ARM5E_ASM) |
47 | #include "filters_arm4.h" | 47 | #include "filters_arm4.h" |
48 | #define OVERRIDE_IIR_MEM16 | ||
49 | #define OVERRIDE_QMF_SYNTH | ||
48 | #elif defined (COLDFIRE_ASM) | 50 | #elif defined (COLDFIRE_ASM) |
49 | #define OVERRIDE_IIR_MEM16 | 51 | #define OVERRIDE_IIR_MEM16 |
50 | #define OVERRIDE_QMF_SYNTH | 52 | #define OVERRIDE_QMF_SYNTH |
diff --git a/apps/codecs/libspeex/filters_arm4.S b/apps/codecs/libspeex/filters_arm4.S new file mode 100644 index 0000000000..7924e7030f --- /dev/null +++ b/apps/codecs/libspeex/filters_arm4.S | |||
@@ -0,0 +1,302 @@ | |||
1 | /* Copyright (C) 2007 Thom Johansen */ | ||
2 | /** | ||
3 | @file filters_arm4.S | ||
4 | @brief Various analysis/synthesis filters (ARMv4 version) | ||
5 | */ | ||
6 | /* | ||
7 | Redistribution and use in source and binary forms, with or without | ||
8 | modification, are permitted provided that the following conditions | ||
9 | are met: | ||
10 | |||
11 | - Redistributions of source code must retain the above copyright | ||
12 | notice, this list of conditions and the following disclaimer. | ||
13 | |||
14 | - Redistributions in binary form must reproduce the above copyright | ||
15 | notice, this list of conditions and the following disclaimer in the | ||
16 | documentation and/or other materials provided with the distribution. | ||
17 | |||
18 | - Neither the name of the Xiph.org Foundation nor the names of its | ||
19 | contributors may be used to endorse or promote products derived from | ||
20 | this software without specific prior written permission. | ||
21 | |||
22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
23 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
25 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR | ||
26 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
27 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
28 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
29 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
30 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
31 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
33 | */ | ||
34 | |||
35 | .text | ||
36 | /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */ | ||
37 | .global iir_mem16 | ||
38 | iir_mem16: | ||
39 | stmdb sp!, { r4-r11, lr } | ||
40 | ldr r5, [sp, #36] @ r0 = x, r1 = den, r2 = y, r3 = N | ||
41 | ldr r4, [sp, #40] @ r4 = mem, r5 = ord | ||
42 | cmp r5, #10 | ||
43 | beq .order_10 | ||
44 | cmp r5, #8 | ||
45 | beq .order_8 | ||
46 | ldmia sp!, { r4-r11, pc } @ Non-supported order, return | ||
47 | |||
48 | @ TODO: try using direct form 1 filtering | ||
49 | .order_8: | ||
50 | ldmia r4, { r5-r12 } @ r5-r12 = mem[0..7] | ||
51 | 0: | ||
52 | add r5, r5, #4096 @ Rounding constant | ||
53 | ldrsh r14, [r0], #2 | ||
54 | add r14, r14, r5, asr #13 @ (mem[0] + 4096) >> 13 + x[i] | ||
55 | mov r5, #0x7f00 | ||
56 | orr r5, r5, #0xff @ r5 = 32767 | ||
57 | cmp r14, r5 | ||
58 | movgt r14, r5 @ Clip positive | ||
59 | cmn r14, r5 | ||
60 | rsblt r14, r5, #0 @ Clip negative | ||
61 | strh r14, [r2], #2 @ Write result to y[i] | ||
62 | |||
63 | ldrsh r4, [r1] | ||
64 | mul r5, r4, r14 | ||
65 | sub r5, r6, r5 @ mem[0] = mem[1] - den[0]*y[i] | ||
66 | ldrsh r4, [r1, #2] | ||
67 | mul r6, r4, r14 | ||
68 | sub r6, r7, r6 @ mem[1] = mem[2] - den[1]*y[i] | ||
69 | ldrsh r4, [r1, #4] | ||
70 | mul r7, r4, r14 | ||
71 | sub r7, r8, r7 @ mem[2] = mem[3] - den[2]*y[i] | ||
72 | ldrsh r4, [r1, #6] | ||
73 | mul r8, r4, r14 | ||
74 | sub r8, r9, r8 @ mem[3] = mem[4] - den[3]*y[i] | ||
75 | ldrsh r4, [r1, #8] | ||
76 | mul r9, r4, r14 | ||
77 | sub r9, r10, r9 @ mem[4] = mem[5] - den[4]*y[i] | ||
78 | ldrsh r4, [r1, #10] | ||
79 | mul r10, r4, r14 | ||
80 | sub r10, r11, r10 @ mem[5] = mem[6] - den[5]*y[i] | ||
81 | ldrsh r4, [r1, #12] | ||
82 | mul r11, r4, r14 | ||
83 | sub r11, r12, r11 @ mem[6] = mem[7] - den[6]*y[i] | ||
84 | ldrsh r4, [r1, #14] | ||
85 | mul r12, r4, r14 | ||
86 | rsb r12, r12, #0 @ mem[7] = -den[7]*y[i] | ||
87 | subs r3, r3, #1 | ||
88 | bne 0b | ||
89 | ldr r4, [sp, #40] @ r4 = mem | ||
90 | stmia r4, { r5-r12 } @ Save back mem[] | ||
91 | ldmia sp!, { r4-r11, pc } @ Exit | ||
92 | |||
93 | .order_10: | ||
94 | ldmia r4, { r5-r9 } @ r5-r9 = mem[0..4] | ||
95 | add r5, r5, #4096 @ Rounding constant | ||
96 | ldrsh r14, [r0], #2 | ||
97 | add r14, r14, r5, asr #13 @ (mem[0] + 4096) >> 13 + x[i] | ||
98 | mov r5, #0x7f00 | ||
99 | orr r5, r5, #0xff @ r5 = 32767 | ||
100 | cmp r14, r5 | ||
101 | movgt r14, r5 @ Clip positive | ||
102 | cmn r14, r5 | ||
103 | rsblt r14, r5, #0 @ Clip negative | ||
104 | strh r14, [r2], #2 @ Write result to y[i] | ||
105 | |||
106 | ldmia r1!, { r10-r12 } @ r10-r12 = den[0..5] | ||
107 | mov r5, r10, lsl #16 | ||
108 | mov r5, r5, asr #16 | ||
109 | mul r5, r14, r5 | ||
110 | sub r5, r6, r5 @ mem[0] = mem[1] - den[0]*y[i] | ||
111 | mov r10, r10, asr #16 | ||
112 | mul r6, r14, r10 | ||
113 | sub r6, r7, r6 @ mem[1] = mem[2] - den[1]*y[i] | ||
114 | mov r10, r11, lsl #16 | ||
115 | mov r10, r10, asr #16 | ||
116 | mul r7, r14, r10 | ||
117 | sub r7, r8, r7 @ mem[2] = mem[3] - den[2]*y[i] | ||
118 | mov r10, r11, asr #16 | ||
119 | mul r8, r14, r10 | ||
120 | sub r8, r9, r8 @ mem[3] = mem[4] - den[3]*y[i] | ||
121 | stmia r4!, { r5-r8 } @ Write back mem[0..3], r4 = &mem[4] | ||
122 | mov r10, r12, lsl #16 | ||
123 | mov r10, r10, asr #16 | ||
124 | mul r5, r14, r10 | ||
125 | |||
126 | ldmib r4, { r6-r10 } @ r6-r10 = mem[5..9] | ||
127 | sub r5, r6, r5 @ mem[4] = mem[5] - den[4]*y[i] | ||
128 | mov r12, r12, asr #16 | ||
129 | mul r6, r14, r12 | ||
130 | sub r6, r7, r6 @ mem[5] = mem[6] - den[5]*y[i] | ||
131 | ldmia r1!, { r11-r12 } @ r11-r12 = den[6..9] | ||
132 | mov r7, r11, lsl #16 | ||
133 | mov r7, r7, asr #16 | ||
134 | mul r7, r14, r7 | ||
135 | sub r7, r8, r7 @ mem[6] = mem[7] - den[6]*y[i] | ||
136 | mov r11, r11, asr #16 | ||
137 | mul r8, r14, r11 | ||
138 | sub r8, r9, r8 @ mem[7] = mem[8] - den[7]*y[i] | ||
139 | mov r11, r12, lsl #16 | ||
140 | mov r11, r11, asr #16 | ||
141 | mul r9, r14, r11 | ||
142 | sub r9, r10, r9 @ mem[8] = mem[9] - den[8]*y[i] | ||
143 | mov r12, r12, asr #16 | ||
144 | mul r10, r14, r12 | ||
145 | rsb r10, r10, #0 @ mem[9] = -den[9]*y[i] | ||
146 | stmia r4!, { r5-r10 } @ Write back mem[4..9] | ||
147 | sub r4, r4, #10*4 | ||
148 | sub r1, r1, #10*2 | ||
149 | subs r3, r3, #1 | ||
150 | bne .order_10 | ||
151 | ldmia sp!, { r4-r11, pc } @ Exit | ||
152 | |||
153 | |||
154 | /* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */ | ||
155 | .global qmf_synth | ||
156 | qmf_synth: | ||
157 | stmdb sp!, { r4-r11, lr } | ||
158 | add r7, sp, #36 @ r0 = x1, r1 = x2, r2 = a, r3 = y | ||
159 | ldmia r7, { r4-r7 } @ r4 = N, r5 = M, r6 = mem1, r7 = mem2 | ||
160 | |||
161 | add r8, r4, r5 | ||
162 | sub r9, sp, r8 @ r9 = sp - (N + M >> 1) = xx2 | ||
163 | sub r8, r9, r8 @ r8 = r9 - (N + M >> 1) = xx1 | ||
164 | str sp, [r8, #-4] @ Stack old sp | ||
165 | sub sp, r8, #4 @ Update sp | ||
166 | |||
167 | add r0, r0, r4 @ x1 += N >> 1 | ||
168 | add r1, r1, r4 @ x2 += N >> 1 | ||
169 | mov r14, r4 @ Loop counter is N | ||
170 | 0: | ||
171 | @ Backwards copy x1 and x2 arrays to xx1 and xx2, assume N2 is power of two | ||
172 | @ N should always be a multiple of four, so this should be OK | ||
173 | ldmdb r0!, { r10-r11 } | ||
174 | mov r12, r10, ror #16 | ||
175 | mov r11, r11, ror #16 | ||
176 | stmia r8!, { r11-r12 } | ||
177 | ldmdb r1!, { r10-r11 } | ||
178 | mov r12, r10, ror #16 | ||
179 | mov r11, r11, ror #16 | ||
180 | stmia r9!, { r11-r12 } | ||
181 | subs r14, r14, #8 | ||
182 | bne 0b | ||
183 | |||
184 | @ Copy alternate members of mem1 and mem2 to last part of xx1 and xx2 | ||
185 | mov r14, r5 @ Loop counter is M | ||
186 | add r6, r6, #2 | ||
187 | add r7, r7, #2 | ||
188 | stmdb sp!, { r6-r7 } @ Stack &mem1[1], &mem2[1] | ||
189 | 0: | ||
190 | ldrh r10, [r6], #4 | ||
191 | ldrh r11, [r6], #4 | ||
192 | ldrh r12, [r7], #4 | ||
193 | orr r10, r10, r11, lsl #16 | ||
194 | ldrh r11, [r7], #4 | ||
195 | orr r11, r12, r11, lsl #16 | ||
196 | str r10, [r8], #4 | ||
197 | str r11, [r9], #4 | ||
198 | subs r14, r14, #4 | ||
199 | bne 0b | ||
200 | |||
201 | sub r0, r8, r5 @ r0 = &xx1[N2] | ||
202 | sub r1, r9, r5 @ r1 = &xx2[N2] | ||
203 | str r4, [sp, #-4] @ Stack N | ||
204 | mov r4, r5 | ||
205 | str r4, [sp, #-8] @ Stack M | ||
206 | @ sp doesn't point to the end of the stack frame from here on, but we're not | ||
207 | @ calling anything so it shouldn't matter | ||
208 | @ Main loop, register usage: | ||
209 | @ r0 = xx1, r1 = xx2, r2 = a, r3 = y, r4 = M, r5 = x10, r6 = x11, r7 = x20 | ||
210 | @ r8 = x21, r9 = [a1, a0], r10 = acc0, r11 = acc1, r12 = acc2, r14 = acc3 | ||
211 | 0: @ Outerloop | ||
212 | mov r10, #16384 @ Init accumulators to rounding const | ||
213 | mov r11, #16384 | ||
214 | mov r12, #16384 | ||
215 | mov r14, #16384 | ||
216 | |||
217 | ldrsh r5, [r0, #-4]! @ r5 = x10, r0 = &xx1[N2 - 2] | ||
218 | ldrsh r7, [r1, #-4]! @ r7 = x20, r1 = &xx2[N2 - 2] | ||
219 | 1: @ Innerloop | ||
220 | ldrsh r9, [r2], #2 @ r9 = a0 | ||
221 | ldrsh r6, [r0, #2]! @ r6 = x11 | ||
222 | ldrsh r8, [r1, #2]! @ r8 = x21 | ||
223 | sub r5, r5, r7 @ r5 = x10 - x20 | ||
224 | add r7, r5, r7, asl #1 @ r7 = x10 + x20 | ||
225 | mla r12, r9, r5, r12 @ acc2 += a0*(x10 - x20) | ||
226 | sub r5, r6, r8 @ r5 = x11 - x21 | ||
227 | mla r10, r9, r5, r10 @ acc0 += a0*(x11 - x21) | ||
228 | ldrsh r9, [r2], #2 @ r9 = a1 | ||
229 | add r5, r6, r8 @ r5 = x11 + x21 | ||
230 | mla r14, r9, r7, r14 @ acc3 += a1*(x10 + x20) | ||
231 | mla r11, r9, r5, r11 @ acc1 += a1*(x11 + x21) | ||
232 | |||
233 | ldrsh r9, [r2], #2 @ r9 = a1 | ||
234 | ldrsh r5, [r0, #2]! @ r5 = x10 | ||
235 | ldrsh r7, [r1, #2]! @ r7 = x20 | ||
236 | sub r6, r6, r8 @ r6 = x11 - x21 | ||
237 | add r8, r6, r8, asl #1 @ r8 = x11 + x21 | ||
238 | mla r12, r9, r6, r12 @ acc2 += a0*(x11 - x21) | ||
239 | sub r6, r5, r7 @ r6 = x10 - x20 | ||
240 | mla r10, r9, r6, r10 @ acc0 += a0*(x10 - x20) | ||
241 | ldrsh r9, [r2], #2 @ r9 = a1 | ||
242 | add r6, r5, r7 @ r6 = x10 + x20 | ||
243 | mla r14, r9, r8, r14 @ acc3 += a1*(x11 + x21) | ||
244 | mla r11, r9, r6, r11 @ acc1 += a1*(x10 + x20) | ||
245 | subs r4, r4, #4 | ||
246 | bne 1b | ||
247 | |||
248 | ldr r4, [sp, #-8] @ r4 = M | ||
249 | sub r2, r2, r4, lsl #1 @ r2 = &a[0] | ||
250 | sub r0, r0, r4 @ r0 = &xx1[N2 - 2 - i] | ||
251 | sub r1, r1, r4 @ r1 = &xx2[N2 - 2 - i] | ||
252 | |||
253 | mov r10, r10, asr #15 @ Shift outputs down | ||
254 | mov r11, r11, asr #15 | ||
255 | mov r12, r12, asr #15 | ||
256 | mov r14, r14, asr #15 | ||
257 | |||
258 | @ TODO: this can be optimized further | ||
259 | mov r9, #0x7f00 @ Clip all four outputs | ||
260 | orr r9, r9, #0xff @ r9 = 32767 | ||
261 | cmp r10, r9 | ||
262 | movgt r10, r9 | ||
263 | cmn r10, r9 | ||
264 | rsblt r10, r9, #0 | ||
265 | cmp r11, r9 | ||
266 | movgt r11, r9 | ||
267 | cmn r11, r9 | ||
268 | rsblt r11, r9, #0 | ||
269 | cmp r12, r9 | ||
270 | movgt r12, r9 | ||
271 | cmn r12, r9 | ||
272 | rsblt r12, r9, #0 | ||
273 | cmp r14, r9 | ||
274 | movgt r14, r9 | ||
275 | cmn r14, r9 | ||
276 | rsblt r14, r9, #0 | ||
277 | |||
278 | strh r10, [r3], #2 @ Write outputs | ||
279 | strh r11, [r3], #2 | ||
280 | strh r12, [r3], #2 | ||
281 | strh r14, [r3], #2 | ||
282 | ldr r10, [sp, #-4] @ Load N | ||
283 | subs r10, r10, #4 @ Are we done? | ||
284 | strne r10, [sp, #-4] | ||
285 | bne 0b | ||
286 | |||
287 | @ Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries | ||
288 | @ r0 and r1 are &xx1[0] and &xx2[0] at this point | ||
289 | ldmia sp, { r5-r6, sp } @ Fetch &mem1[1], &mem2[1], restore sp | ||
290 | 0: | ||
291 | ldr r7, [r0], #4 | ||
292 | ldr r8, [r1], #4 | ||
293 | strh r7, [r5], #4 | ||
294 | strh r8, [r6], #4 | ||
295 | mov r7, r7, lsr #16 | ||
296 | mov r8, r8, lsr #16 | ||
297 | strh r7, [r5], #4 | ||
298 | strh r8, [r6], #4 | ||
299 | subs r4, r4, #4 | ||
300 | bne 0b | ||
301 | ldmia sp!, { r4-r11, pc } @ Exit | ||
302 | |||
diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S index b0367025e1..861d6c18f9 100644 --- a/apps/codecs/libspeex/filters_cf.S +++ b/apps/codecs/libspeex/filters_cf.S | |||
@@ -48,6 +48,7 @@ iir_mem16: | |||
48 | jeq .order_10 | 48 | jeq .order_10 |
49 | jra .exit | 49 | jra .exit |
50 | 50 | ||
51 | | TODO: try using direct form 1 filtering | ||
51 | | d0 = y[i], d1-d7, a0 = mem[0] .. mem[7] | 52 | | d0 = y[i], d1-d7, a0 = mem[0] .. mem[7] |
52 | | a3 = x, a4 = den, a5 = y, a6 = temp | 53 | | a3 = x, a4 = den, a5 = y, a6 = temp |
53 | .order_8: | 54 | .order_8: |
@@ -171,6 +172,7 @@ iir_mem16: | |||
171 | lea.l (44, %sp), %sp | 172 | lea.l (44, %sp), %sp |
172 | rts | 173 | rts |
173 | 174 | ||
175 | |||
174 | /* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */ | 176 | /* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */ |
175 | .global qmf_synth | 177 | .global qmf_synth |
176 | qmf_synth: | 178 | qmf_synth: |
@@ -210,10 +212,10 @@ qmf_synth: | |||
210 | jne 0b | 212 | jne 0b |
211 | 213 | ||
212 | | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2 | 214 | | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2 |
213 | move.l %d1, %d2 | Loop counter is M2 | 215 | move.l %d1, %d2 | Loop counter is M2 |
214 | addq.l #2, %a4 | a4 = &mem1[1] | 216 | addq.l #2, %a4 | a4 = &mem1[1] |
215 | addq.l #2, %a5 | a5 = &mem2[1] | 217 | addq.l #2, %a5 | a5 = &mem2[1] |
216 | move.l %a4, %d3 | Backup mem1 and mem2 | 218 | move.l %a4, %d3 | Backup mem1 and mem2 |
217 | move.l %a5, %d4 | 219 | move.l %a5, %d4 |
218 | 0: | 220 | 0: |
219 | move.w (%a4), (%a2)+ | 221 | move.w (%a4), (%a2)+ |
@@ -222,14 +224,14 @@ qmf_synth: | |||
222 | addq.l #4, %a5 | 224 | addq.l #4, %a5 |
223 | subq.l #1, %d2 | 225 | subq.l #1, %d2 |
224 | jne 0b | 226 | jne 0b |
225 | move.l %d3, %a4 | a4 = &mem1[1] | 227 | move.l %d3, %a4 | a4 = &mem1[1] |
226 | move.l %d4, %a5 | a5 = &mem2[1] | 228 | move.l %d4, %a5 | a5 = &mem2[1] |
227 | 229 | ||
228 | clr.l %d2 | 230 | clr.l %d2 |
229 | sub.l %d1, %d2 | d2 = -M2 | 231 | sub.l %d1, %d2 | d2 = -M2 |
230 | lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2] | 232 | lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2] |
231 | lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2] | 233 | lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2] |
232 | move.l %d6, %a2 | a2 = a | 234 | move.l %d6, %a2 | a2 = a |
233 | 235 | ||
234 | | Main loop, register usage: | 236 | | Main loop, register usage: |
235 | | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup | 237 | | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup |
@@ -286,7 +288,7 @@ qmf_synth: | |||
286 | | The clipping will be [-32768..32767], not Speex standard [-32767..32767], | 288 | | The clipping will be [-32768..32767], not Speex standard [-32767..32767], |
287 | | but since qmf_synth() is called so late in the signal chain, it should | 289 | | but since qmf_synth() is called so late in the signal chain, it should |
288 | | work fine. | 290 | | work fine. |
289 | move.w %d2, (%a3)+ | Write results to y[] | 291 | move.w %d2, (%a3)+ | Write results to y[] |
290 | move.w %d3, (%a3)+ | 292 | move.w %d3, (%a3)+ |
291 | move.w %d4, (%a3)+ | 293 | move.w %d4, (%a3)+ |
292 | move.w %d5, (%a3)+ | 294 | move.w %d5, (%a3)+ |
@@ -294,8 +296,8 @@ qmf_synth: | |||
294 | jne 0b | 296 | jne 0b |
295 | 297 | ||
296 | | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries | 298 | | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries |
297 | addq.l #4, %a0 | a0 = &xx1[0] | 299 | addq.l #4, %a0 | a0 = &xx1[0] |
298 | addq.l #4, %a1 | a1 = &xx2[0] | 300 | addq.l #4, %a1 | a1 = &xx2[0] |
299 | 0: | 301 | 0: |
300 | move.w (%a0)+, (%a4) | 302 | move.w (%a0)+, (%a4) |
301 | move.w (%a1)+, (%a5) | 303 | move.w (%a1)+, (%a5) |