summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--apps/codecs/libspeex/SOURCES2
-rw-r--r--apps/codecs/libspeex/filters.c2
-rw-r--r--apps/codecs/libspeex/filters_arm4.S302
-rw-r--r--apps/codecs/libspeex/filters_cf.S28
4 files changed, 321 insertions, 13 deletions
diff --git a/apps/codecs/libspeex/SOURCES b/apps/codecs/libspeex/SOURCES
index f5a6786fa1..e1f038160b 100644
--- a/apps/codecs/libspeex/SOURCES
+++ b/apps/codecs/libspeex/SOURCES
@@ -34,4 +34,6 @@ window.c
34#ifdef CPU_COLDFIRE 34#ifdef CPU_COLDFIRE
35filters_cf.S 35filters_cf.S
36ltp_cf.S 36ltp_cf.S
37#elif defined(CPU_ARM)
38filters_arm4.S
37#endif 39#endif
diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c
index 0e76e27e84..36b110af30 100644
--- a/apps/codecs/libspeex/filters.c
+++ b/apps/codecs/libspeex/filters.c
@@ -45,6 +45,8 @@
45#include "filters_sse.h" 45#include "filters_sse.h"
46#elif defined (ARM4_ASM) || defined(ARM5E_ASM) 46#elif defined (ARM4_ASM) || defined(ARM5E_ASM)
47#include "filters_arm4.h" 47#include "filters_arm4.h"
48#define OVERRIDE_IIR_MEM16
49#define OVERRIDE_QMF_SYNTH
48#elif defined (COLDFIRE_ASM) 50#elif defined (COLDFIRE_ASM)
49#define OVERRIDE_IIR_MEM16 51#define OVERRIDE_IIR_MEM16
50#define OVERRIDE_QMF_SYNTH 52#define OVERRIDE_QMF_SYNTH
diff --git a/apps/codecs/libspeex/filters_arm4.S b/apps/codecs/libspeex/filters_arm4.S
new file mode 100644
index 0000000000..7924e7030f
--- /dev/null
+++ b/apps/codecs/libspeex/filters_arm4.S
@@ -0,0 +1,302 @@
1/* Copyright (C) 2007 Thom Johansen */
2/**
3 @file filters_arm4.S
4 @brief Various analysis/synthesis filters (ARMv4 version)
5*/
6/*
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 - Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 - Neither the name of the Xiph.org Foundation nor the names of its
19 contributors may be used to endorse or promote products derived from
20 this software without specific prior written permission.
21
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
26 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33*/
34
35 .text
36/* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)

   All-pole filter step, hand-unrolled for ord == 8 and ord == 10.
   For each of the N samples:
     y[i]       = clip(x[i] + ((mem[0] + 4096) >> 13)) to [-32767, 32767]
     mem[k]     = mem[k+1] - den[k]*y[i]   for k = 0 .. ord-2
     mem[ord-1] = -den[ord-1]*y[i]
   Any other ord returns at once without touching y[] or mem[].
   The stack argument is never read. N is decremented before it is tested,
   so N == 0 is not handled by either loop. */
37 .global iir_mem16
38iir_mem16:
39 stmdb sp!, { r4-r11, lr } @ 9 registers = 36 bytes, so stacked args start at sp + 36
40 ldr r5, [sp, #36] @ r0 = x, r1 = den, r2 = y, r3 = N
41 ldr r4, [sp, #40] @ r4 = mem, r5 = ord
42 cmp r5, #10
43 beq .order_10
44 cmp r5, #8
45 beq .order_8
46 ldmia sp!, { r4-r11, pc } @ Non-supported order, return
47
48 @ TODO: try using direct form 1 filtering
49.order_8:
50 ldmia r4, { r5-r12 } @ r5-r12 = mem[0..7], kept in registers for the whole loop
510:
52 add r5, r5, #4096 @ Rounding constant
53 ldrsh r14, [r0], #2 @ r14 = x[i], advance x
54 add r14, r14, r5, asr #13 @ (mem[0] + 4096) >> 13 + x[i]
55 mov r5, #0x7f00
56 orr r5, r5, #0xff @ r5 = 32767
57 cmp r14, r5
58 movgt r14, r5 @ Clip positive
59 cmn r14, r5 @ Compare against -32767
60 rsblt r14, r5, #0 @ Clip negative
61 strh r14, [r2], #2 @ Write result to y[i]
62
63 ldrsh r4, [r1] @ r4 is reused for den[k] here, mem pointer is reloaded after the loop
64 mul r5, r4, r14
65 sub r5, r6, r5 @ mem[0] = mem[1] - den[0]*y[i]
66 ldrsh r4, [r1, #2]
67 mul r6, r4, r14
68 sub r6, r7, r6 @ mem[1] = mem[2] - den[1]*y[i]
69 ldrsh r4, [r1, #4]
70 mul r7, r4, r14
71 sub r7, r8, r7 @ mem[2] = mem[3] - den[2]*y[i]
72 ldrsh r4, [r1, #6]
73 mul r8, r4, r14
74 sub r8, r9, r8 @ mem[3] = mem[4] - den[3]*y[i]
75 ldrsh r4, [r1, #8]
76 mul r9, r4, r14
77 sub r9, r10, r9 @ mem[4] = mem[5] - den[4]*y[i]
78 ldrsh r4, [r1, #10]
79 mul r10, r4, r14
80 sub r10, r11, r10 @ mem[5] = mem[6] - den[5]*y[i]
81 ldrsh r4, [r1, #12]
82 mul r11, r4, r14
83 sub r11, r12, r11 @ mem[6] = mem[7] - den[6]*y[i]
84 ldrsh r4, [r1, #14]
85 mul r12, r4, r14
86 rsb r12, r12, #0 @ mem[7] = -den[7]*y[i]
87 subs r3, r3, #1 @ One sample done
88 bne 0b
89 ldr r4, [sp, #40] @ r4 = mem
90 stmia r4, { r5-r12 } @ Save back mem[]
91 ldmia sp!, { r4-r11, pc } @ Exit
92
93.order_10:
94 ldmia r4, { r5-r9 } @ r5-r9 = mem[0..4]
95 add r5, r5, #4096 @ Rounding constant
96 ldrsh r14, [r0], #2 @ r14 = x[i], advance x
97 add r14, r14, r5, asr #13 @ (mem[0] + 4096) >> 13 + x[i]
98 mov r5, #0x7f00
99 orr r5, r5, #0xff @ r5 = 32767
100 cmp r14, r5
101 movgt r14, r5 @ Clip positive
102 cmn r14, r5 @ Compare against -32767
103 rsblt r14, r5, #0 @ Clip negative
104 strh r14, [r2], #2 @ Write result to y[i]
105
106 ldmia r1!, { r10-r12 } @ r10-r12 = den[0..5], two 16-bit coefficients per word; NOTE(review): word loads presumably need den 4-byte aligned, confirm
107 mov r5, r10, lsl #16
108 mov r5, r5, asr #16 @ r5 = sign-extended low half of r10, used as den[0]
109 mul r5, r14, r5
110 sub r5, r6, r5 @ mem[0] = mem[1] - den[0]*y[i]
111 mov r10, r10, asr #16 @ r10 = sign-extended high half, used as den[1]
112 mul r6, r14, r10
113 sub r6, r7, r6 @ mem[1] = mem[2] - den[1]*y[i]
114 mov r10, r11, lsl #16
115 mov r10, r10, asr #16
116 mul r7, r14, r10
117 sub r7, r8, r7 @ mem[2] = mem[3] - den[2]*y[i]
118 mov r10, r11, asr #16
119 mul r8, r14, r10
120 sub r8, r9, r8 @ mem[3] = mem[4] - den[3]*y[i]
121 stmia r4!, { r5-r8 } @ Write back mem[0..3], r4 = &mem[4]
122 mov r10, r12, lsl #16
123 mov r10, r10, asr #16
124 mul r5, r14, r10 @ r5 = den[4]*y[i]
125
126 ldmib r4, { r6-r10 } @ r6-r10 = mem[5..9]
127 sub r5, r6, r5 @ mem[4] = mem[5] - den[4]*y[i]
128 mov r12, r12, asr #16
129 mul r6, r14, r12
130 sub r6, r7, r6 @ mem[5] = mem[6] - den[5]*y[i]
131 ldmia r1!, { r11-r12 } @ r11-r12 = den[6..9]
132 mov r7, r11, lsl #16
133 mov r7, r7, asr #16
134 mul r7, r14, r7
135 sub r7, r8, r7 @ mem[6] = mem[7] - den[6]*y[i]
136 mov r11, r11, asr #16
137 mul r8, r14, r11
138 sub r8, r9, r8 @ mem[7] = mem[8] - den[7]*y[i]
139 mov r11, r12, lsl #16
140 mov r11, r11, asr #16
141 mul r9, r14, r11
142 sub r9, r10, r9 @ mem[8] = mem[9] - den[8]*y[i]
143 mov r12, r12, asr #16
144 mul r10, r14, r12
145 rsb r10, r10, #0 @ mem[9] = -den[9]*y[i]
146 stmia r4!, { r5-r10 } @ Write back mem[4..9]
147 sub r4, r4, #10*4 @ Rewind r4 to &mem[0]
148 sub r1, r1, #10*2 @ Rewind r1 to &den[0]
149 subs r3, r3, #1 @ One sample done
150 bne .order_10 @ mem[] is reloaded from memory at the top of each pass
151 ldmia sp!, { r4-r11, pc } @ Exit
152
153
154/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)

   Phases, as implemented below:
     1. Carve two buffers xx1/xx2 of N + M bytes each out of the stack and
        save the old sp just below them.
     2. Fill the first N bytes of xx1/xx2 with x1/x2 in reversed halfword order.
     3. Append every other halfword of mem1/mem2, starting at byte offset 2.
     4. Main loop: four outputs per pass; each accumulator starts at 16384,
        is shifted right by 15 and clipped to [-32767, 32767].
     5. Copy the start of xx1/xx2 back to the same mem1/mem2 slots.
   The stack argument is never read.
   NOTE(review): mem1/mem2 are walked as 16-bit entries at a 4-byte stride;
   confirm this matches the spx_word32_t type in the prototype above.
   NOTE(review): the ldmdb word loads presumably need x1/x2 to be 4-byte
   aligned; confirm against callers. */
155 .global qmf_synth
156qmf_synth:
157 stmdb sp!, { r4-r11, lr } @ 9 registers = 36 bytes, so stacked args start at sp + 36
158 add r7, sp, #36 @ r0 = x1, r1 = x2, r2 = a, r3 = y
159 ldmia r7, { r4-r7 } @ r4 = N, r5 = M, r6 = mem1, r7 = mem2
160
161 add r8, r4, r5 @ r8 = N + M = size in bytes of each of xx1 and xx2
162 sub r9, sp, r8 @ r9 = sp - (N + M) bytes, i.e. (N + M)/2 halfwords = xx2
163 sub r8, r9, r8 @ r8 = r9 - (N + M) bytes = xx1
164 str sp, [r8, #-4] @ Stack old sp
165 sub sp, r8, #4 @ Update sp
166
167 add r0, r0, r4 @ x1 += N >> 1
168 add r1, r1, r4 @ x2 += N >> 1
169 mov r14, r4 @ Loop counter is N
1700:
171 @ Backwards copy x1 and x2 arrays to xx1 and xx2, assume N2 is power of two
172 @ N should always be a multiple of four, so this should be OK
173 ldmdb r0!, { r10-r11 } @ Fetch four halfwords below r0
174 mov r12, r10, ror #16 @ Swap the two halfwords inside each word
175 mov r11, r11, ror #16
176 stmia r8!, { r11-r12 } @ Store the two words in swapped order
177 ldmdb r1!, { r10-r11 }
178 mov r12, r10, ror #16
179 mov r11, r11, ror #16
180 stmia r9!, { r11-r12 }
181 subs r14, r14, #8 @ 8 bytes per array per pass; bne only exits if N is a multiple of 8
182 bne 0b
183
184 @ Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
185 mov r14, r5 @ Loop counter is M
186 add r6, r6, #2 @ Start at byte offset 2 of mem1
187 add r7, r7, #2 @ Start at byte offset 2 of mem2
188 stmdb sp!, { r6-r7 } @ Stack &mem1[1], &mem2[1]
1890:
190 ldrh r10, [r6], #4
191 ldrh r11, [r6], #4
192 ldrh r12, [r7], #4
193 orr r10, r10, r11, lsl #16 @ Pack two mem1 halfwords into one word
194 ldrh r11, [r7], #4
195 orr r11, r12, r11, lsl #16 @ Pack two mem2 halfwords into one word
196 str r10, [r8], #4
197 str r11, [r9], #4
198 subs r14, r14, #4 @ 4 bytes appended per array per pass
199 bne 0b
200
201 sub r0, r8, r5 @ r0 = &xx1[N2]
202 sub r1, r9, r5 @ r1 = &xx2[N2]
203 str r4, [sp, #-4] @ Stack N
204 mov r4, r5
205 str r4, [sp, #-8] @ Stack M
206 @ sp doesn't point to the end of the stack frame from here on, but we're not
207 @ calling anything so it shouldn't matter
     /* NOTE(review): anything else that pushes onto this stack while the
        loop runs would overwrite the two words stored below sp; confirm
        that cannot happen on the targets this is built for. */
208 @ Main loop, register usage:
209 @ r0 = xx1, r1 = xx2, r2 = a, r3 = y, r4 = M, r5 = x10, r6 = x11, r7 = x20
210 @ r8 = x21, r9 = current coefficient, r10 = acc0, r11 = acc1, r12 = acc2, r14 = acc3
2110: @ Outerloop
212 mov r10, #16384 @ Init accumulators to rounding const
213 mov r11, #16384
214 mov r12, #16384
215 mov r14, #16384
216
217 ldrsh r5, [r0, #-4]! @ r5 = x10, r0 = &xx1[N2 - 2]
218 ldrsh r7, [r1, #-4]! @ r7 = x20, r1 = &xx2[N2 - 2]
2191: @ Innerloop
220 ldrsh r9, [r2], #2 @ r9 = a0
221 ldrsh r6, [r0, #2]! @ r6 = x11
222 ldrsh r8, [r1, #2]! @ r8 = x21
223 sub r5, r5, r7 @ r5 = x10 - x20
224 add r7, r5, r7, asl #1 @ r7 = x10 + x20
225 mla r12, r9, r5, r12 @ acc2 += a0*(x10 - x20)
226 sub r5, r6, r8 @ r5 = x11 - x21
227 mla r10, r9, r5, r10 @ acc0 += a0*(x11 - x21)
228 ldrsh r9, [r2], #2 @ r9 = a1
229 add r5, r6, r8 @ r5 = x11 + x21
230 mla r14, r9, r7, r14 @ acc3 += a1*(x10 + x20)
231 mla r11, r9, r5, r11 @ acc1 += a1*(x11 + x21)
232
233 ldrsh r9, [r2], #2 @ r9 = a0 (next coefficient pair)
234 ldrsh r5, [r0, #2]! @ r5 = x10
235 ldrsh r7, [r1, #2]! @ r7 = x20
236 sub r6, r6, r8 @ r6 = x11 - x21
237 add r8, r6, r8, asl #1 @ r8 = x11 + x21
238 mla r12, r9, r6, r12 @ acc2 += a0*(x11 - x21)
239 sub r6, r5, r7 @ r6 = x10 - x20
240 mla r10, r9, r6, r10 @ acc0 += a0*(x10 - x20)
241 ldrsh r9, [r2], #2 @ r9 = a1
242 add r6, r5, r7 @ r6 = x10 + x20
243 mla r14, r9, r8, r14 @ acc3 += a1*(x11 + x21)
244 mla r11, r9, r6, r11 @ acc1 += a1*(x10 + x20)
245 subs r4, r4, #4 @ Four coefficients consumed per pass
246 bne 1b
247
248 ldr r4, [sp, #-8] @ r4 = M
249 sub r2, r2, r4, lsl #1 @ r2 = &a[0]
250 sub r0, r0, r4 @ r0 = &xx1[N2 - 2 - i]
251 sub r1, r1, r4 @ r1 = &xx2[N2 - 2 - i]
252
253 mov r10, r10, asr #15 @ Shift outputs down
254 mov r11, r11, asr #15
255 mov r12, r12, asr #15
256 mov r14, r14, asr #15
257
258 @ TODO: this can be optimized further
259 mov r9, #0x7f00 @ Clip all four outputs
260 orr r9, r9, #0xff @ r9 = 32767
261 cmp r10, r9
262 movgt r10, r9
263 cmn r10, r9 @ Compare against -32767
264 rsblt r10, r9, #0
265 cmp r11, r9
266 movgt r11, r9
267 cmn r11, r9
268 rsblt r11, r9, #0
269 cmp r12, r9
270 movgt r12, r9
271 cmn r12, r9
272 rsblt r12, r9, #0
273 cmp r14, r9
274 movgt r14, r9
275 cmn r14, r9
276 rsblt r14, r9, #0
277
278 strh r10, [r3], #2 @ Write outputs
279 strh r11, [r3], #2
280 strh r12, [r3], #2
281 strh r14, [r3], #2
282 ldr r10, [sp, #-4] @ Load N
283 subs r10, r10, #4 @ Are we done?
284 strne r10, [sp, #-4] @ Not done, store the updated count
285 bne 0b
286
287 @ Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
288 @ r0 and r1 are &xx1[0] and &xx2[0] at this point
289 ldmia sp, { r5-r6, sp } @ Fetch &mem1[1], &mem2[1], restore sp
2900:
291 ldr r7, [r0], #4 @ Two halfwords of xx1
292 ldr r8, [r1], #4 @ Two halfwords of xx2
293 strh r7, [r5], #4 @ Low halfword first
294 strh r8, [r6], #4
295 mov r7, r7, lsr #16
296 mov r8, r8, lsr #16
297 strh r7, [r5], #4 @ Then high halfword
298 strh r8, [r6], #4
299 subs r4, r4, #4 @ r4 = M on entry, 4 bytes of xx1/xx2 consumed per pass
300 bne 0b
301 ldmia sp!, { r4-r11, pc } @ Exit
302
diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S
index b0367025e1..861d6c18f9 100644
--- a/apps/codecs/libspeex/filters_cf.S
+++ b/apps/codecs/libspeex/filters_cf.S
@@ -48,6 +48,7 @@ iir_mem16:
48 jeq .order_10 48 jeq .order_10
49 jra .exit 49 jra .exit
50 50
51 | TODO: try using direct form 1 filtering
51 | d0 = y[i], d1-d7, a0 = mem[0] .. mem[7] 52 | d0 = y[i], d1-d7, a0 = mem[0] .. mem[7]
52 | a3 = x, a4 = den, a5 = y, a6 = temp 53 | a3 = x, a4 = den, a5 = y, a6 = temp
53.order_8: 54.order_8:
@@ -171,6 +172,7 @@ iir_mem16:
171 lea.l (44, %sp), %sp 172 lea.l (44, %sp), %sp
172 rts 173 rts
173 174
175
174/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */ 176/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
175 .global qmf_synth 177 .global qmf_synth
176qmf_synth: 178qmf_synth:
@@ -210,10 +212,10 @@ qmf_synth:
210 jne 0b 212 jne 0b
211 213
212 | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2 214 | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
213 move.l %d1, %d2 | Loop counter is M2 215 move.l %d1, %d2 | Loop counter is M2
214 addq.l #2, %a4 | a4 = &mem1[1] 216 addq.l #2, %a4 | a4 = &mem1[1]
215 addq.l #2, %a5 | a5 = &mem2[1] 217 addq.l #2, %a5 | a5 = &mem2[1]
216 move.l %a4, %d3 | Backup mem1 and mem2 218 move.l %a4, %d3 | Backup mem1 and mem2
217 move.l %a5, %d4 219 move.l %a5, %d4
2180: 2200:
219 move.w (%a4), (%a2)+ 221 move.w (%a4), (%a2)+
@@ -222,14 +224,14 @@ qmf_synth:
222 addq.l #4, %a5 224 addq.l #4, %a5
223 subq.l #1, %d2 225 subq.l #1, %d2
224 jne 0b 226 jne 0b
225 move.l %d3, %a4 | a4 = &mem1[1] 227 move.l %d3, %a4 | a4 = &mem1[1]
226 move.l %d4, %a5 | a5 = &mem2[1] 228 move.l %d4, %a5 | a5 = &mem2[1]
227 229
228 clr.l %d2 230 clr.l %d2
229 sub.l %d1, %d2 | d2 = -M2 231 sub.l %d1, %d2 | d2 = -M2
230 lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2] 232 lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
231 lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2] 233 lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
232 move.l %d6, %a2 | a2 = a 234 move.l %d6, %a2 | a2 = a
233 235
234 | Main loop, register usage: 236 | Main loop, register usage:
235 | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup 237 | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
@@ -286,7 +288,7 @@ qmf_synth:
286 | The clipping will be [-32768..32767], not Speex standard [-32767..32767], 288 | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
287 | but since qmf_synth() is called so late in the signal chain, it should 289 | but since qmf_synth() is called so late in the signal chain, it should
288 | work fine. 290 | work fine.
289 move.w %d2, (%a3)+ | Write results to y[] 291 move.w %d2, (%a3)+ | Write results to y[]
290 move.w %d3, (%a3)+ 292 move.w %d3, (%a3)+
291 move.w %d4, (%a3)+ 293 move.w %d4, (%a3)+
292 move.w %d5, (%a3)+ 294 move.w %d5, (%a3)+
@@ -294,8 +296,8 @@ qmf_synth:
294 jne 0b 296 jne 0b
295 297
296 | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries 298 | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
297 addq.l #4, %a0 | a0 = &xx1[0] 299 addq.l #4, %a0 | a0 = &xx1[0]
298 addq.l #4, %a1 | a1 = &xx2[0] 300 addq.l #4, %a1 | a1 = &xx2[0]
2990: 3010:
300 move.w (%a0)+, (%a4) 302 move.w (%a0)+, (%a4)
301 move.w (%a1)+, (%a5) 303 move.w (%a1)+, (%a5)