diff options
Diffstat (limited to 'lib/rbcodec/codecs/libmusepack/synth_filter_arm.S')
-rw-r--r-- | lib/rbcodec/codecs/libmusepack/synth_filter_arm.S | 693 |
1 files changed, 693 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S b/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S new file mode 100644 index 0000000000..9bd4e04626 --- /dev/null +++ b/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S | |||
@@ -0,0 +1,693 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2008 by Andree Buschmann | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version 2 | ||
15 | * of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
18 | * KIND, either express or implied. | ||
19 | * | ||
20 | ****************************************************************************/ | ||
21 | |||
22 | #include "config.h" | ||
23 | |||
24 | #if defined(CPU_S5L870X) | ||
25 | /* Moving this asm routine to iram is (a little) faster on S5L870x. */ | ||
26 | .section .icode, "ax", %progbits | ||
27 | #else | ||
28 | .section .text, "ax", %progbits | ||
29 | #endif | ||
30 | |||
31 | /**************************************************************************** | ||
32 | * void mpc_decoder_windowing_D(...) | ||
33 | * | ||
34 | * 2nd step within synthesis filter. Does the dewindowing. | ||
35 | * 64=32x32 multiplies | ||
36 | * Uses un-shifted D[]-values. D[] will always be the second operand of | ||
37 | * smull/smlal to achieve higher speed as D[] has lower amplitude than V[]. | ||
38 | ****************************************************************************/ | ||
39 | .align 2 | ||
40 | .global mpc_decoder_windowing_D | ||
41 | .type mpc_decoder_windowing_D, %function | ||
42 | #if 0 | ||
43 | mpc_decoder_windowing_D: | ||
44 | /* r0 = Data[] (output, one word stored per loop iteration) */ | ||
45 | /* r1 = V[] (input, advanced by one word per loop iteration) */ | ||
46 | /* r2 = D[] (window coefficients, 16 words consumed per loop iteration) */ | ||
47 | /* lr = loop counter (32 iterations) */ | ||
48 | /************************************************************************ | ||
49 | * Reference implementation (compiled out by the enclosing #if 0): each of the 32 output words is a 64-bit sum of 16 products V[] * D[], of which bits 16..47 are stored. | ||
50 | ***********************************************************************/ | ||
51 | stmfd sp!, {r4-r8, lr} /* save r4-r8 and the return address */ | ||
52 | |||
53 | mov lr, #32 /* 32 output words */ | ||
54 | .loop32: | ||
55 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
56 | ldr r7, [r1] /* 0 */ | ||
57 | smull r8, r12, r7, r3 /* r12:r8 = V * D[00], starts the 64-bit sum (r8 = lo, r12 = hi) */ | ||
58 | ldr r7, [r1, #96*4] /* 1 */ | ||
59 | smlal r8, r12, r7, r4 | ||
60 | ldr r7, [r1, #128*4] /* 2 */ | ||
61 | smlal r8, r12, r7, r5 | ||
62 | ldr r7, [r1, #224*4] /* 3 */ | ||
63 | smlal r8, r12, r7, r6 | ||
64 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
65 | ldr r7, [r1, #256*4] /* 4 */ | ||
66 | smlal r8, r12, r7, r3 | ||
67 | ldr r7, [r1, #352*4] /* 5 */ | ||
68 | smlal r8, r12, r7, r4 | ||
69 | ldr r7, [r1, #384*4] /* 6 */ | ||
70 | smlal r8, r12, r7, r5 | ||
71 | ldr r7, [r1, #480*4] /* 7 */ | ||
72 | smlal r8, r12, r7, r6 | ||
73 | ldmia r2!, { r3-r6 } /* load D[08..11] */ | ||
74 | ldr r7, [r1, #512*4] /* 8 */ | ||
75 | smlal r8, r12, r7, r3 | ||
76 | ldr r7, [r1, #608*4] /* 9 */ | ||
77 | smlal r8, r12, r7, r4 | ||
78 | ldr r7, [r1, #640*4] /* 10 */ | ||
79 | smlal r8, r12, r7, r5 | ||
80 | ldr r7, [r1, #736*4] /* 11 */ | ||
81 | smlal r8, r12, r7, r6 | ||
82 | ldmia r2!, { r3-r6 } /* load D[12..15] */ | ||
83 | ldr r7, [r1, #768*4] /* 12 */ | ||
84 | smlal r8, r12, r7, r3 | ||
85 | ldr r7, [r1, #864*4] /* 13 */ | ||
86 | smlal r8, r12, r7, r4 | ||
87 | ldr r7, [r1, #896*4] /* 14 */ | ||
88 | smlal r8, r12, r7, r5 | ||
89 | ldr r7, [r1, #992*4] /* 15 */ | ||
90 | smlal r8, r12, r7, r6 | ||
91 | mov r8, r8, lsr #16 | ||
92 | orr r8, r8, r12, lsl #16 /* (lo>>16) | (hi<<16), i.e. bits 16..47 of the sum */ | ||
93 | str r8, [r0], #4 /* store Data, Data++ */ | ||
94 | add r1, r1, #4 /* V++ */ | ||
95 | |||
96 | subs lr, lr, #1 /* one output word done */ | ||
97 | bgt .loop32 /* repeat while counter > 0 */ | ||
98 | |||
99 | ldmpc regs=r4-r8 /* ldmpc macro is defined outside this file; presumably restores r4-r8 and returns -- confirm */ | ||
100 | #elif defined(CPU_ARM7TDMI) /* arm7 only */ | ||
101 | mpc_decoder_windowing_D: | ||
102 | /* r0 = Data[] (output) */ | ||
103 | /* r1 = V[] (input) */ | ||
104 | /* r2 = D[] (window coefficients) */ | ||
105 | /* lr = loop counter, also the byte offset between the two output words stored per iteration */ | ||
106 | /************************************************************************ | ||
107 | * Further speed up through making use of symmetries within D[]-window. | ||
108 | * The row V[00] can be extracted as it has symmetries within this single | ||
109 | * row. 8 smull/smlal and 8 ldr's can be saved at the cost of 2 add's. | ||
110 | * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be | ||
111 | * saved at the cost of 15 x 4 + 1 add's. | ||
112 | * The row V[16] can be extracted as it has symmetries within this single | ||
113 | * row. 8 smull/smlal and 8 ldr's can be saved. | ||
114 | * Used for arm7 only. For arm9 and above see implementation below. | ||
115 | ***********************************************************************/ | ||
116 | stmfd sp!, {r4-r11, lr} /* save r4-r11 and the return address */ | ||
117 | |||
118 | /****************************************** | ||
119 | * row 0 with internal symmetry | ||
120 | *****************************************/ | ||
121 | add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ | ||
122 | ldmia r2!, { r3-r6 } /* load D[01..04] */ | ||
123 | ldr r7 , [r1, #96*4] /* 1 */ | ||
124 | ldr r10, [r1, #992*4] /* 15 */ | ||
125 | rsb r10, r10, r7 /* V[01] - V[15] */ | ||
126 | smull r8, r9, r10, r3 /* r9:r8 starts the 64-bit sum (r8 = lo, r9 = hi) */ | ||
127 | ldr r7 , [r1, #128*4] /* 2 */ | ||
128 | ldr r10, [r1, #896*4] /* 14 */ | ||
129 | add r10, r10, r7 /* V[02] + V[14] */ | ||
130 | smlal r8, r9, r10, r4 | ||
131 | ldr r7 , [r1, #224*4] /* 3 */ | ||
132 | ldr r10, [r1, #864*4] /* 13 */ | ||
133 | rsb r10, r10, r7 /* V[03] - V[13] */ | ||
134 | smlal r8, r9, r10, r5 | ||
135 | ldr r7 , [r1, #256*4] /* 4 */ | ||
136 | ldr r10, [r1, #768*4] /* 12 */ | ||
137 | add r10, r10, r7 /* V[04] + V[12] */ | ||
138 | smlal r8, r9, r10, r6 | ||
139 | ldmia r2!, { r3-r6 } /* load D[05..08] */ | ||
140 | ldr r7 , [r1, #352*4] /* 5 */ | ||
141 | ldr r10, [r1, #736*4] /* 11 */ | ||
142 | rsb r10, r10, r7 /* V[05] - V[11] */ | ||
143 | smlal r8, r9, r10, r3 | ||
144 | ldr r7 , [r1, #384*4] /* 6 */ | ||
145 | ldr r10, [r1, #640*4] /* 10 */ | ||
146 | add r10, r10, r7 /* V[06] + V[10] */ | ||
147 | smlal r8, r9, r10, r4 | ||
148 | ldr r7 , [r1, #480*4] /* 7 */ | ||
149 | ldr r10, [r1, #608*4] /* 9 */ | ||
150 | rsb r10, r10, r7 /* V[07] - V[09] */ | ||
151 | smlal r8, r9, r10, r5 | ||
152 | ldr r10, [r1, #512*4] /* 8 */ | ||
153 | smlal r8, r9, r10, r6 | ||
154 | mov r8, r8, lsr #16 | ||
155 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16), i.e. bits 16..47 of the sum */ | ||
156 | str r8, [r0], #4 /* store Data[00], Data++ */ | ||
157 | add r1, r1, #4 /* V+=1, r1 = V[01] */ | ||
158 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ | ||
159 | |||
160 | /****************************************** | ||
161 | * rows 01..15 are symmetric to rows 31..17 | ||
162 | * r8 = lo, r9 = hi of 01..15 | ||
163 | * r1 = V[01..15] | ||
164 | * r10 = lo, r11 = hi of 31..17 | ||
165 | * r12 = V[31..17] | ||
166 | *****************************************/ | ||
167 | mov lr, #15*8 /* 15 iterations (step 8); lr is also the byte offset from Data[n] to Data[32-n] */ | ||
168 | add r12, r1, #30*4 /* r12 = V[31] */ | ||
169 | .loop15: | ||
170 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
171 | ldr r7, [r12, #768*4] /* 12 */ | ||
172 | smull r10, r11, r7, r6 /* mirrored row uses the same D[] in reverse order */ | ||
173 | ldr r7, [r12, #864*4] /* 13 */ | ||
174 | smlal r10, r11, r7, r5 | ||
175 | ldr r7, [r12, #896*4] /* 14 */ | ||
176 | smlal r10, r11, r7, r4 | ||
177 | ldr r7, [r12, #992*4] /* 15 */ | ||
178 | smlal r10, r11, r7, r3 | ||
179 | ldr r7, [r1] /* 0 */ | ||
180 | smull r8, r9, r7, r3 | ||
181 | ldr r7, [r1, #96*4] /* 1 */ | ||
182 | smlal r8, r9, r7, r4 | ||
183 | ldr r7, [r1, #128*4] /* 2 */ | ||
184 | smlal r8, r9, r7, r5 | ||
185 | ldr r7, [r1, #224*4] /* 3 */ | ||
186 | smlal r8, r9, r7, r6 | ||
187 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
188 | ldr r7, [r1, #256*4] /* 4 */ | ||
189 | smlal r8, r9, r7, r3 | ||
190 | ldr r7, [r1, #352*4] /* 5 */ | ||
191 | smlal r8, r9, r7, r4 | ||
192 | ldr r7, [r1, #384*4] /* 6 */ | ||
193 | smlal r8, r9, r7, r5 | ||
194 | ldr r7, [r1, #480*4] /* 7 */ | ||
195 | smlal r8, r9, r7, r6 | ||
196 | ldr r7, [r12, #512*4] /* 8 */ | ||
197 | smlal r10, r11, r7, r6 | ||
198 | ldr r7, [r12, #608*4] /* 9 */ | ||
199 | smlal r10, r11, r7, r5 | ||
200 | ldr r7, [r12, #640*4] /* 10 */ | ||
201 | smlal r10, r11, r7, r4 | ||
202 | ldr r7, [r12, #736*4] /* 11 */ | ||
203 | smlal r10, r11, r7, r3 | ||
204 | ldmia r2!, { r3-r6 } /* load D[08..11] */ | ||
205 | ldr r7, [r12, #256*4] /* 4 */ | ||
206 | smlal r10, r11, r7, r6 | ||
207 | ldr r7, [r12, #352*4] /* 5 */ | ||
208 | smlal r10, r11, r7, r5 | ||
209 | ldr r7, [r12, #384*4] /* 6 */ | ||
210 | smlal r10, r11, r7, r4 | ||
211 | ldr r7, [r12, #480*4] /* 7 */ | ||
212 | smlal r10, r11, r7, r3 | ||
213 | ldr r7, [r1, #512*4] /* 8 */ | ||
214 | smlal r8, r9, r7, r3 | ||
215 | ldr r7, [r1, #608*4] /* 9 */ | ||
216 | smlal r8, r9, r7, r4 | ||
217 | ldr r7, [r1, #640*4] /* 10 */ | ||
218 | smlal r8, r9, r7, r5 | ||
219 | ldr r7, [r1, #736*4] /* 11 */ | ||
220 | smlal r8, r9, r7, r6 | ||
221 | ldmia r2!, { r3-r6 } /* load D[12..15] */ | ||
222 | ldr r7, [r1, #768*4] /* 12 */ | ||
223 | smlal r8, r9, r7, r3 | ||
224 | ldr r7, [r1, #864*4] /* 13 */ | ||
225 | smlal r8, r9, r7, r4 | ||
226 | ldr r7, [r1, #896*4] /* 14 */ | ||
227 | smlal r8, r9, r7, r5 | ||
228 | ldr r7, [r1, #992*4] /* 15 */ | ||
229 | smlal r8, r9, r7, r6 | ||
230 | ldr r7, [r12] /* 0 */ | ||
231 | smlal r10, r11, r7, r6 | ||
232 | ldr r7, [r12, #96*4] /* 1 */ | ||
233 | smlal r10, r11, r7, r5 | ||
234 | ldr r7, [r12, #128*4] /* 2 */ | ||
235 | smlal r10, r11, r7, r4 | ||
236 | ldr r7, [r12, #224*4] /* 3 */ | ||
237 | smlal r10, r11, r7, r3 | ||
238 | /* Data[01..15]: take bits 16..47 of r9:r8 */ | ||
239 | mov r8, r8, lsr #16 | ||
240 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
241 | /* Data[31..17]: take bits 16..47 of r11:r10, then negate */ | ||
242 | mov r10, r10, lsr #16 | ||
243 | orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
244 | rsb r10, r10, #0 /* r10 = -r10 */ | ||
245 | str r10, [r0, lr] /* store Data[31..17] */ | ||
246 | str r8, [r0], #4 /* store Data[01..15], Data++ */ | ||
247 | /* correct addresses for next loop */ | ||
248 | sub r12, r12, #4 /* r12 = V-- */ | ||
249 | add r1, r1, #4 /* r1 = V++ */ | ||
250 | /* next loop */ | ||
251 | subs lr, lr, #8 /* offset shrinks by 2 words as the two output pointers converge */ | ||
252 | bgt .loop15 | ||
253 | |||
254 | /****************************************** | ||
255 | * row 16 (V[16]) with internal symmetry | ||
256 | *****************************************/ | ||
257 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
258 | ldr r7 , [r1] /* 0 */ | ||
259 | ldr r10, [r1, #992*4] /* 15 */ | ||
260 | rsb r10, r10, r7 /* V[00] - V[15] */ | ||
261 | smull r8, r9, r10, r3 | ||
262 | ldr r7 , [r1, #96*4] /* 1 */ | ||
263 | ldr r10, [r1, #896*4] /* 14 */ | ||
264 | rsb r10, r10, r7 /* V[01] - V[14] */ | ||
265 | smlal r8, r9, r10, r4 | ||
266 | ldr r7 , [r1, #128*4] /* 2 */ | ||
267 | ldr r10, [r1, #864*4] /* 13 */ | ||
268 | rsb r10, r10, r7 /* V[02] - V[13] */ | ||
269 | smlal r8, r9, r10, r5 | ||
270 | ldr r7 , [r1, #224*4] /* 3 */ | ||
271 | ldr r10, [r1, #768*4] /* 12 */ | ||
272 | rsb r10, r10, r7 /* V[03] - V[12] */ | ||
273 | smlal r8, r9, r10, r6 | ||
274 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
275 | ldr r7 , [r1, #256*4] /* 4 */ | ||
276 | ldr r10, [r1, #736*4] /* 11 */ | ||
277 | rsb r10, r10, r7 /* V[04] - V[11] */ | ||
278 | smlal r8, r9, r10, r3 | ||
279 | ldr r7 , [r1, #352*4] /* 5 */ | ||
280 | ldr r10, [r1, #640*4] /* 10 */ | ||
281 | rsb r10, r10, r7 /* V[05] - V[10] */ | ||
282 | smlal r8, r9, r10, r4 | ||
283 | ldr r7 , [r1, #384*4] /* 6 */ | ||
284 | ldr r10, [r1, #608*4] /* 9 */ | ||
285 | rsb r10, r10, r7 /* V[06] - V[09] */ | ||
286 | smlal r8, r9, r10, r5 | ||
287 | ldr r7 , [r1, #480*4] /* 7 */ | ||
288 | ldr r10, [r1, #512*4] /* 8 */ | ||
289 | rsb r10, r10, r7 /* V[07] - V[08] */ | ||
290 | smlal r8, r9, r10, r6 | ||
291 | mov r8, r8, lsr #16 | ||
292 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
293 | str r8, [r0], #4 /* store Data[16] */ | ||
294 | |||
295 | ldmpc regs=r4-r11 /* ldmpc macro is defined outside this file; presumably restores r4-r11 and returns -- confirm */ | ||
296 | #elif ARM_ARCH < 6 /* arm9 and above */ | ||
297 | mpc_decoder_windowing_D: | ||
298 | /* r0 = Data[] (output) */ | ||
299 | /* r1 = V[] (input) */ | ||
300 | /* r2 = D[] (window coefficients) */ | ||
301 | /* lr = loop counter, also the byte offset between the two output words stored per iteration */ | ||
302 | /************************************************************************ | ||
303 | * Further speed up through making use of symmetries within D[]-window. | ||
304 | * The row V[00] can be extracted as it has symmetries within this single | ||
305 | * row. 8 smull/smlal and 8 ldr's can be saved at the cost of 2 add's. | ||
306 | * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be | ||
307 | * saved at the cost of 15 x 4 + 1 add's. | ||
308 | * The row V[16] can be extracted as it has symmetries within this single | ||
309 | * row. 8 smull/smlal and 8 ldr's can be saved. | ||
310 | * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds | ||
311 | * up decoding even though several ldm-calls are replaced with ldr to free | ||
312 | * 2 registers. | ||
313 | ***********************************************************************/ | ||
314 | stmfd sp!, {r4-r11, lr} /* save r4-r11 and the return address */ | ||
315 | |||
316 | /****************************************** | ||
317 | * row 0 with internal symmetry | ||
318 | *****************************************/ | ||
319 | add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ | ||
320 | ldmia r2!, { r3-r6 } /* load D[01..04] */ | ||
321 | ldr r7 , [r1, #96*4] /* 1 */ | ||
322 | ldr r10, [r1, #992*4] /* 15 */ | ||
323 | ldr r11, [r1, #128*4] /* 2 */ | ||
324 | ldr r12, [r1, #896*4] /* 14 */ | ||
325 | rsb r10, r10, r7 /* V[01] - V[15] */ | ||
326 | smull r8, r9, r10, r3 /* r9:r8 starts the 64-bit sum (r8 = lo, r9 = hi) */ | ||
327 | ldr r7 , [r1, #224*4] /* 3 */ | ||
328 | ldr r10, [r1, #864*4] /* 13 */ | ||
329 | add r12, r12, r11 /* V[02] + V[14] */ | ||
330 | smlal r8, r9, r12, r4 | ||
331 | ldr r11, [r1, #256*4] /* 4 */ | ||
332 | ldr r12, [r1, #768*4] /* 12 */ | ||
333 | rsb r10, r10, r7 /* V[03] - V[13] */ | ||
334 | smlal r8, r9, r10, r5 | ||
335 | ldr r7 , [r1, #352*4] /* 5 */ | ||
336 | ldr r10, [r1, #736*4] /* 11 */ | ||
337 | add r12, r12, r11 /* V[04] + V[12] */ | ||
338 | smlal r8, r9, r12, r6 | ||
339 | ldmia r2!, { r3-r6 } /* load D[05..08] */ | ||
340 | ldr r11, [r1, #384*4] /* 6 */ | ||
341 | ldr r12, [r1, #640*4] /* 10 */ | ||
342 | rsb r10, r10, r7 /* V[05] - V[11] */ | ||
343 | smlal r8, r9, r10, r3 | ||
344 | ldr r7 , [r1, #480*4] /* 7 */ | ||
345 | ldr r10, [r1, #608*4] /* 9 */ | ||
346 | add r12, r12, r11 /* V[06] + V[10] */ | ||
347 | smlal r8, r9, r12, r4 | ||
348 | ldr r11, [r1, #512*4] /* 8 */ | ||
349 | rsb r10, r10, r7 /* V[07] - V[09] */ | ||
350 | smlal r8, r9, r10, r5 | ||
351 | smlal r8, r9, r11, r6 | ||
352 | mov r8, r8, lsr #16 | ||
353 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16), i.e. bits 16..47 of the sum */ | ||
354 | str r8, [r0], #4 /* store Data[00], Data++ */ | ||
355 | add r1, r1, #4 /* V+=1, r1 = V[01] */ | ||
356 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ | ||
357 | |||
358 | /****************************************** | ||
359 | * rows 01..15 are symmetric to rows 31..17 | ||
360 | * r8 = lo, r9 = hi of 01..15 | ||
361 | * r1 = V[01..15] | ||
362 | * r10 = lo, r11 = hi of 31..17 | ||
363 | * r12 = V[31..17] | ||
364 | *****************************************/ | ||
365 | mov lr, #15*8 /* 15 iterations (step 8); lr is also the byte offset from Data[n] to Data[32-n] */ | ||
366 | add r12, r1, #30*4 /* r12 = V[31] */ | ||
367 | .loop15: | ||
368 | ldmia r2!, { r3-r4 } /* load D[00..01] */ | ||
369 | ldr r7, [r12, #896*4] /* 14 */ | ||
370 | ldr r5, [r12, #992*4] /* 15 */ | ||
371 | smull r10, r11, r7, r4 /* mirrored row uses the same D[] in reverse order */ | ||
372 | ldr r7, [r1] /* 0 */ | ||
373 | smlal r10, r11, r5, r3 | ||
374 | ldr r5, [r1, #96*4] /* 1 */ | ||
375 | smull r8, r9, r7, r3 | ||
376 | ldr r7, [r12, #768*4] /* 12 */ | ||
377 | smlal r8, r9, r5, r4 | ||
378 | ldmia r2!, { r3-r4 } /* load D[02..03] */ | ||
379 | ldr r5, [r12, #864*4] /* 13 */ | ||
380 | smlal r10, r11, r7, r4 | ||
381 | ldr r7, [r1, #128*4] /* 2 */ | ||
382 | smlal r10, r11, r5, r3 | ||
383 | ldr r5, [r1, #224*4] /* 3 */ | ||
384 | smlal r8, r9, r7, r3 | ||
385 | ldr r7, [r1, #256*4] /* 4 */ | ||
386 | smlal r8, r9, r5, r4 | ||
387 | ldmia r2!, { r3-r4 } /* load D[04..05] */ | ||
388 | ldr r5, [r1, #352*4] /* 5 */ | ||
389 | smlal r8, r9, r7, r3 | ||
390 | ldr r7, [r12, #640*4] /* 10 */ | ||
391 | smlal r8, r9, r5, r4 | ||
392 | ldr r5, [r12, #736*4] /* 11 */ | ||
393 | smlal r10, r11, r7, r4 | ||
394 | ldr r7, [r1, #384*4] /* 6 */ | ||
395 | smlal r10, r11, r5, r3 | ||
396 | ldmia r2!, { r3-r4 } /* load D[06..07] */ | ||
397 | ldr r5, [r1, #480*4] /* 7 */ | ||
398 | smlal r8, r9, r7, r3 | ||
399 | ldr r7, [r12, #512*4] /* 8 */ | ||
400 | smlal r8, r9, r5, r4 | ||
401 | ldr r5, [r12, #608*4] /* 9 */ | ||
402 | smlal r10, r11, r7, r4 | ||
403 | ldr r7, [r12, #384*4] /* 6 */ | ||
404 | smlal r10, r11, r5, r3 | ||
405 | ldmia r2!, { r3-r4 } /* load D[08..09] */ | ||
406 | ldr r5, [r12, #480*4] /* 7 */ | ||
407 | smlal r10, r11, r7, r4 | ||
408 | ldr r7, [r1, #512*4] /* 8 */ | ||
409 | smlal r10, r11, r5, r3 | ||
410 | ldr r5, [r1, #608*4] /* 9 */ | ||
411 | smlal r8, r9, r7, r3 | ||
412 | ldr r7, [r1, #640*4] /* 10 */ | ||
413 | smlal r8, r9, r5, r4 | ||
414 | ldmia r2!, { r3-r4 } /* load D[10..11] */ | ||
415 | ldr r5, [r1, #736*4] /* 11 */ | ||
416 | smlal r8, r9, r7, r3 | ||
417 | ldr r7, [r12, #256*4] /* 4 */ | ||
418 | smlal r8, r9, r5, r4 | ||
419 | ldr r5, [r12, #352*4] /* 5 */ | ||
420 | smlal r10, r11, r7, r4 | ||
421 | ldr r7, [r1, #768*4] /* 12 */ | ||
422 | smlal r10, r11, r5, r3 | ||
423 | ldmia r2!, { r3-r4 } /* load D[12..13] */ | ||
424 | ldr r5, [r1, #864*4] /* 13 */ | ||
425 | smlal r8, r9, r7, r3 | ||
426 | ldr r7, [r12, #128*4] /* 2 */ | ||
427 | smlal r8, r9, r5, r4 | ||
428 | ldr r5, [r12, #224*4] /* 3 */ | ||
429 | smlal r10, r11, r7, r4 | ||
430 | ldr r7, [r12] /* 0 */ | ||
431 | smlal r10, r11, r5, r3 | ||
432 | ldmia r2!, { r3-r4 } /* load D[14..15] */ | ||
433 | ldr r5, [r12, #96*4] /* 1 */ | ||
434 | smlal r10, r11, r7, r4 | ||
435 | ldr r7, [r1, #896*4] /* 14 */ | ||
436 | smlal r10, r11, r5, r3 | ||
437 | ldr r5, [r1, #992*4] /* 15 */ | ||
438 | smlal r8, r9, r7, r3 | ||
439 | smlal r8, r9, r5, r4 | ||
440 | /* Data[01..15]: take bits 16..47 of r9:r8 */ | ||
441 | mov r8, r8, lsr #16 | ||
442 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
443 | /* Data[31..17]: take bits 16..47 of r11:r10, then negate */ | ||
444 | mov r10, r10, lsr #16 | ||
445 | orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
446 | rsb r10, r10, #0 /* r10 = -r10 */ | ||
447 | str r10, [r0, lr] /* store Data[31..17] */ | ||
448 | str r8, [r0], #4 /* store Data[01..15], Data++ */ | ||
449 | /* correct addresses for next loop */ | ||
450 | sub r12, r12, #4 /* r12 = V-- */ | ||
451 | add r1, r1, #4 /* r1 = V++ */ | ||
452 | /* next loop */ | ||
453 | subs lr, lr, #8 /* offset shrinks by 2 words as the two output pointers converge */ | ||
454 | bgt .loop15 | ||
455 | |||
456 | /****************************************** | ||
457 | * row 16 (V[16]) with internal symmetry | ||
458 | *****************************************/ | ||
459 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
460 | ldr r7 , [r1] /* 0 */ | ||
461 | ldr r10, [r1, #992*4] /* 15 */ | ||
462 | ldr r11, [r1, #96*4] /* 1 */ | ||
463 | ldr r12, [r1, #896*4] /* 14 */ | ||
464 | rsb r10, r10, r7 /* V[00] - V[15] */ | ||
465 | smull r8, r9, r10, r3 | ||
466 | ldr r7 , [r1, #128*4] /* 2 */ | ||
467 | ldr r10, [r1, #864*4] /* 13 */ | ||
468 | rsb r12, r12, r11 /* V[01] - V[14] */ | ||
469 | smlal r8, r9, r12, r4 | ||
470 | ldr r11, [r1, #224*4] /* 3 */ | ||
471 | ldr r12, [r1, #768*4] /* 12 */ | ||
472 | rsb r10, r10, r7 /* V[02] - V[13] */ | ||
473 | smlal r8, r9, r10, r5 | ||
474 | ldr r7 , [r1, #256*4] /* 4 */ | ||
475 | ldr r10, [r1, #736*4] /* 11 */ | ||
476 | rsb r12, r12, r11 /* V[03] - V[12] */ | ||
477 | smlal r8, r9, r12, r6 | ||
478 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
479 | ldr r11, [r1, #352*4] /* 5 */ | ||
480 | ldr r12, [r1, #640*4] /* 10 */ | ||
481 | rsb r10, r10, r7 /* V[04] - V[11] */ | ||
482 | smlal r8, r9, r10, r3 | ||
483 | ldr r7 , [r1, #384*4] /* 6 */ | ||
484 | ldr r10, [r1, #608*4] /* 9 */ | ||
485 | rsb r12, r12, r11 /* V[05] - V[10] */ | ||
486 | smlal r8, r9, r12, r4 | ||
487 | ldr r11, [r1, #480*4] /* 7 */ | ||
488 | ldr r12, [r1, #512*4] /* 8 */ | ||
489 | rsb r10, r10, r7 /* V[06] - V[09] */ | ||
490 | smlal r8, r9, r10, r5 | ||
491 | rsb r12, r12, r11 /* V[07] - V[08] */ | ||
492 | smlal r8, r9, r12, r6 | ||
493 | mov r8, r8, lsr #16 | ||
494 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
495 | str r8, [r0], #4 /* store Data[16] */ | ||
496 | |||
497 | ldmpc regs=r4-r11 /* ldmpc macro is defined outside this file; presumably restores r4-r11 and returns -- confirm */ | ||
498 | #else | ||
499 | mpc_decoder_windowing_D: | ||
500 | /* r0 = Data[] (output) */ | ||
501 | /* r1 = V[] (input) */ | ||
502 | /* r2 = D[] (window coefficients) */ | ||
503 | /* lr = loop counter, also the byte offset between the two output words stored per iteration */ | ||
504 | /************************************************************************ | ||
505 | * Further speed up through making use of symmetries within D[]-window. | ||
506 | * The row V[00] can be extracted as it has symmetries within this single | ||
507 | * row. 8 multiplies and 8 ldr's can be saved at the cost of 2 add's. | ||
508 | * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be | ||
509 | * saved at the cost of 15 x 4 + 1 add's. | ||
510 | * The row V[16] can be extracted as it has symmetries within this single | ||
511 | * row. 8 multiplies and 8 ldr's can be saved. | ||
512 | * On armv6 use smmulr/smmlar which are faster than smull/smlal and only | ||
513 | * accumulate the top 32 bits of the result so that frees up 2 | ||
514 | * registers so we can ldm larger blocks. | ||
515 | ***********************************************************************/ | ||
516 | stmfd sp!, {r4-r11, lr} /* save r4-r11 and the return address */ | ||
517 | |||
518 | /****************************************** | ||
519 | * row 0 with internal symmetry | ||
520 | *****************************************/ | ||
521 | add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ | ||
522 | ldmia r2!, { r3-r6 } /* load D[01..04] */ | ||
523 | ldr r7 , [r1, #96*4] /* 1 */ | ||
524 | ldr r10, [r1, #992*4] /* 15 */ | ||
525 | ldr r11, [r1, #128*4] /* 2 */ | ||
526 | rsb r10, r10, r7 /* V[01] - V[15] */ | ||
527 | ldr r12, [r1, #896*4] /* 14 */ | ||
528 | smmulr r9, r10, r3 /* r9 = rounded top 32 bits of the product; starts the sum */ | ||
529 | ldr r7 , [r1, #224*4] /* 3 */ | ||
530 | add r12, r12, r11 /* V[02] + V[14] */ | ||
531 | ldr r10, [r1, #864*4] /* 13 */ | ||
532 | smmlar r9, r12, r4, r9 | ||
533 | ldr r11, [r1, #256*4] /* 4 */ | ||
534 | rsb r10, r10, r7 /* V[03] - V[13] */ | ||
535 | ldr r12, [r1, #768*4] /* 12 */ | ||
536 | smmlar r9, r10, r5, r9 | ||
537 | ldr r7 , [r1, #352*4] /* 5 */ | ||
538 | add r12, r12, r11 /* V[04] + V[12] */ | ||
539 | ldr r10, [r1, #736*4] /* 11 */ | ||
540 | smmlar r9, r12, r6, r9 | ||
541 | ldmia r2!, { r3-r6 } /* load D[05..08] */ | ||
542 | ldr r11, [r1, #384*4] /* 6 */ | ||
543 | rsb r10, r10, r7 /* V[05] - V[11] */ | ||
544 | ldr r12, [r1, #640*4] /* 10 */ | ||
545 | smmlar r9, r10, r3, r9 | ||
546 | ldr r7 , [r1, #480*4] /* 7 */ | ||
547 | add r12, r12, r11 /* V[06] + V[10] */ | ||
548 | ldr r10, [r1, #608*4] /* 9 */ | ||
549 | smmlar r9, r12, r4, r9 | ||
550 | rsb r10, r10, r7 /* V[07] - V[09] */ | ||
551 | ldr r11, [r1, #512*4] /* 8 */ | ||
552 | smmlar r9, r10, r5, r9 | ||
553 | add r1, r1, #4 /* V+=1, r1 = V[01] */ | ||
554 | smmlar r9, r11, r6, r9 | ||
555 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ | ||
556 | mov r9, r9, lsl #2 /* scale the sum by 4 (presumably matches the D[] scaling used on armv6 -- confirm) */ | ||
557 | str r9, [r0], #4 /* store Data[00], Data++ */ | ||
558 | |||
559 | /****************************************** | ||
560 | * rows 01..15 are symmetric to rows 31..17 | ||
561 | * r9 = acc of 01..15 | ||
562 | * r1 = V[01..15] | ||
563 | * r11 = acc of 31..17 | ||
564 | * r12 = V[31..17] | ||
565 | *****************************************/ | ||
566 | mov lr, #15*8 /* 15 iterations (step 8); lr is also the byte offset from Data[n] to Data[32-n] */ | ||
567 | add r12, r1, #30*4 /* r12 = V[31] */ | ||
568 | .loop15: | ||
569 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
570 | ldr r7, [r12, #896*4] /* 14 */ | ||
571 | ldr r8, [r12, #992*4] /* 15 */ | ||
572 | smmulr r11, r7, r4 /* mirrored row uses the same D[] in reverse order */ | ||
573 | ldr r7, [r1] /* 0 */ | ||
574 | smmlar r11, r8, r3, r11 | ||
575 | ldr r8, [r1, #96*4] /* 1 */ | ||
576 | smmulr r9, r7, r3 | ||
577 | ldr r7, [r12, #768*4] /* 12 */ | ||
578 | smmlar r9, r8, r4, r9 | ||
579 | ldr r8, [r12, #864*4] /* 13 */ | ||
580 | smmlar r11, r7, r6, r11 | ||
581 | ldr r7, [r1, #128*4] /* 2 */ | ||
582 | smmlar r11, r8, r5, r11 | ||
583 | ldr r8, [r1, #224*4] /* 3 */ | ||
584 | smmlar r9, r7, r5, r9 | ||
585 | ldr r7, [r1, #256*4] /* 4 */ | ||
586 | smmlar r9, r8, r6, r9 | ||
587 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
588 | ldr r8, [r1, #352*4] /* 5 */ | ||
589 | smmlar r9, r7, r3, r9 | ||
590 | ldr r7, [r12, #640*4] /* 10 */ | ||
591 | smmlar r9, r8, r4, r9 | ||
592 | ldr r8, [r12, #736*4] /* 11 */ | ||
593 | smmlar r11, r7, r4, r11 | ||
594 | ldr r7, [r1, #384*4] /* 6 */ | ||
595 | smmlar r11, r8, r3, r11 | ||
596 | ldr r8, [r1, #480*4] /* 7 */ | ||
597 | smmlar r9, r7, r5, r9 | ||
598 | ldr r7, [r12, #512*4] /* 8 */ | ||
599 | smmlar r9, r8, r6, r9 | ||
600 | ldr r8, [r12, #608*4] /* 9 */ | ||
601 | smmlar r11, r7, r6, r11 | ||
602 | ldr r7, [r12, #384*4] /* 6 */ | ||
603 | smmlar r11, r8, r5, r11 | ||
604 | ldmia r2!, { r3-r6 } /* load D[08..11] */ | ||
605 | ldr r8, [r12, #480*4] /* 7 */ | ||
606 | smmlar r11, r7, r4, r11 | ||
607 | ldr r7, [r1, #512*4] /* 8 */ | ||
608 | smmlar r11, r8, r3, r11 | ||
609 | ldr r8, [r1, #608*4] /* 9 */ | ||
610 | smmlar r9, r7, r3, r9 | ||
611 | ldr r7, [r1, #640*4] /* 10 */ | ||
612 | smmlar r9, r8, r4, r9 | ||
613 | ldr r8, [r1, #736*4] /* 11 */ | ||
614 | smmlar r9, r7, r5, r9 | ||
615 | ldr r7, [r12, #256*4] /* 4 */ | ||
616 | smmlar r9, r8, r6, r9 | ||
617 | ldr r8, [r12, #352*4] /* 5 */ | ||
618 | smmlar r11, r7, r6, r11 | ||
619 | ldr r7, [r1, #768*4] /* 12 */ | ||
620 | smmlar r11, r8, r5, r11 | ||
621 | ldmia r2!, { r3-r6 } /* load D[12..15] */ | ||
622 | ldr r8, [r1, #864*4] /* 13 */ | ||
623 | smmlar r9, r7, r3, r9 | ||
624 | ldr r7, [r12, #128*4] /* 2 */ | ||
625 | smmlar r9, r8, r4, r9 | ||
626 | ldr r8, [r12, #224*4] /* 3 */ | ||
627 | smmlar r11, r7, r4, r11 | ||
628 | ldr r7, [r12] /* 0 */ | ||
629 | smmlar r11, r8, r3, r11 | ||
630 | ldr r8, [r12, #96*4] /* 1 */ | ||
631 | smmlar r11, r7, r6, r11 | ||
632 | ldr r7, [r1, #896*4] /* 14 */ | ||
633 | smmlar r11, r8, r5, r11 | ||
634 | ldr r8, [r1, #992*4] /* 15 */ | ||
635 | smmlar r9, r7, r5, r9 | ||
636 | sub r12, r12, #4 /* r12 = V-- correct addresses for next loop */ | ||
637 | smmlar r9, r8, r6, r9 | ||
638 | add r1, r1, #4 /* r1 = V++ correct addresses for next loop */ | ||
639 | rsb r11, r11, #0 /* r11 = -r11 */ | ||
640 | /* Data[01..15]: scale the sum by 4 */ | ||
641 | mov r9, r9, lsl #2 | ||
642 | /* Data[31..17]: scale the (already negated) sum by 4 */ | ||
643 | mov r11, r11, lsl #2 | ||
644 | str r11, [r0, lr] /* store Data[31..17] */ | ||
645 | str r9, [r0], #4 /* store Data[01..15], Data++ */ | ||
646 | /* next loop */ | ||
647 | subs lr, lr, #8 /* offset shrinks by 2 words as the two output pointers converge */ | ||
648 | bgt .loop15 | ||
649 | |||
650 | /****************************************** | ||
651 | * row 16 (V[16]) with internal symmetry | ||
652 | *****************************************/ | ||
653 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
654 | ldr r7 , [r1] /* 0 */ | ||
655 | ldr r10, [r1, #992*4] /* 15 */ | ||
656 | ldr r11, [r1, #96*4] /* 1 */ | ||
657 | rsb r10, r10, r7 /* V[00] - V[15] */ | ||
658 | ldr r12, [r1, #896*4] /* 14 */ | ||
659 | smmulr r9, r10, r3 | ||
660 | ldr r7 , [r1, #128*4] /* 2 */ | ||
661 | rsb r12, r12, r11 /* V[01] - V[14] */ | ||
662 | ldr r10, [r1, #864*4] /* 13 */ | ||
663 | smmlar r9, r12, r4, r9 | ||
664 | ldr r11, [r1, #224*4] /* 3 */ | ||
665 | rsb r10, r10, r7 /* V[02] - V[13] */ | ||
666 | ldr r12, [r1, #768*4] /* 12 */ | ||
667 | smmlar r9, r10, r5, r9 | ||
668 | ldr r7 , [r1, #256*4] /* 4 */ | ||
669 | rsb r12, r12, r11 /* V[03] - V[12] */ | ||
670 | ldr r10, [r1, #736*4] /* 11 */ | ||
671 | smmlar r9, r12, r6, r9 | ||
672 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
673 | ldr r11, [r1, #352*4] /* 5 */ | ||
674 | rsb r10, r10, r7 /* V[04] - V[11] */ | ||
675 | ldr r12, [r1, #640*4] /* 10 */ | ||
676 | smmlar r9, r10, r3, r9 | ||
677 | ldr r7 , [r1, #384*4] /* 6 */ | ||
678 | rsb r12, r12, r11 /* V[05] - V[10] */ | ||
679 | ldr r10, [r1, #608*4] /* 9 */ | ||
680 | smmlar r9, r12, r4, r9 | ||
681 | ldr r11, [r1, #480*4] /* 7 */ | ||
682 | rsb r10, r10, r7 /* V[06] - V[09] */ | ||
683 | ldr r12, [r1, #512*4] /* 8 */ | ||
684 | smmlar r9, r10, r5, r9 | ||
685 | rsb r12, r12, r11 /* V[07] - V[08] */ | ||
686 | smmlar r9, r12, r6, r9 | ||
687 | mov r9, r9, lsl #2 /* scale the sum by 4, as for the other rows */ | ||
688 | str r9, [r0], #4 /* store Data[16] */ | ||
689 | |||
690 | ldmpc regs=r4-r11 /* ldmpc macro is defined outside this file; presumably restores r4-r11 and returns -- confirm */ | ||
691 | #endif | ||
692 | .mpc_dewindowing_end: | ||
693 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D | ||