summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libmusepack/synth_filter_arm.S')
-rw-r--r--  lib/rbcodec/codecs/libmusepack/synth_filter_arm.S  | 693
1 file changed, 693 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S b/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S
new file mode 100644
index 0000000000..9bd4e04626
--- /dev/null
+++ b/lib/rbcodec/codecs/libmusepack/synth_filter_arm.S
@@ -0,0 +1,693 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2008 by Andree Buschmann
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22#include "config.h"
23
24#if defined(CPU_S5L870X)
25 /* Moving this asm routine to iram is (little) faster on S5L870x. */
26 .section .icode, "ax", %progbits
27#else
28 .section .text, "ax", %progbits
29#endif
30
31/****************************************************************************
32 * void mpc_decoder_windowing_D(...)
33 *
34 * 2nd step within synthesis filter. Does the dewindowing.
35 * 64=32x32 multiplies
36 * Uses un-shifted D[]-values. D[] will always be the second operand of
37 * smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
38 ****************************************************************************/
39 .align 2
40 .global mpc_decoder_windowing_D
41 .type mpc_decoder_windowing_D, %function
42#if 0
43mpc_decoder_windowing_D:
44 /* r0 = Data[] */
45 /* r1 = V[] */
46 /* r2 = D[] */
47 /* lr = counter */
48 /************************************************************************
49 * Reference implementation.
50 ***********************************************************************/
51 stmfd sp!, {r4-r8, lr}
52
53 mov lr, #32
54.loop32:
55 ldmia r2!, { r3-r6 } /* load D[00..03] */
56 ldr r7, [r1] /* 0 */
57 smull r8, r12, r7, r3
58 ldr r7, [r1, #96*4] /* 1 */
59 smlal r8, r12, r7, r4
60 ldr r7, [r1, #128*4] /* 2 */
61 smlal r8, r12, r7, r5
62 ldr r7, [r1, #224*4] /* 3 */
63 smlal r8, r12, r7, r6
64 ldmia r2!, { r3-r6 } /* load D[04..07] */
65 ldr r7, [r1, #256*4] /* 4 */
66 smlal r8, r12, r7, r3
67 ldr r7, [r1, #352*4] /* 5 */
68 smlal r8, r12, r7, r4
69 ldr r7, [r1, #384*4] /* 6 */
70 smlal r8, r12, r7, r5
71 ldr r7, [r1, #480*4] /* 7 */
72 smlal r8, r12, r7, r6
73 ldmia r2!, { r3-r6 } /* load D[08..11] */
74 ldr r7, [r1, #512*4] /* 8 */
75 smlal r8, r12, r7, r3
76 ldr r7, [r1, #608*4] /* 9 */
77 smlal r8, r12, r7, r4
78 ldr r7, [r1, #640*4] /* 10 */
79 smlal r8, r12, r7, r5
80 ldr r7, [r1, #736*4] /* 11 */
81 smlal r8, r12, r7, r6
82 ldmia r2!, { r3-r6 } /* load D[12..15] */
83 ldr r7, [r1, #768*4] /* 12 */
84 smlal r8, r12, r7, r3
85 ldr r7, [r1, #864*4] /* 13 */
86 smlal r8, r12, r7, r4
87 ldr r7, [r1, #896*4] /* 14 */
88 smlal r8, r12, r7, r5
89 ldr r7, [r1, #992*4] /* 15 */
90 smlal r8, r12, r7, r6
91 mov r8, r8, lsr #16
92 orr r8, r8, r12, lsl #16 /* (lo>>16) || (hi<<16) */
93 str r8, [r0], #4 /* store Data */
94 add r1, r1, #4 /* V++ */
95
96 subs lr, lr, #1
97 bgt .loop32
98
99 ldmpc regs=r4-r8
100#elif defined(CPU_ARM7TDMI) /* arm7 only */
101mpc_decoder_windowing_D:
102 /* r0 = Data[] */
103 /* r1 = V[] */
104 /* r2 = D[] */
105 /* lr = counter */
106 /************************************************************************
107 * Further speed up through making use of symmetries within D[]-window.
108 * The row V[00] can be extracted as it has symmetries within this single
109 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
110 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
111 * saved at the cost of 15 x 4 + 1 add's.
112 * The row V[16] can be extracted as it has symmetries within this single
113 * row. 8 smull/mlal and 8 ldr's can be saved.
114 * Used for arm7 only. For arm9 and above see implementation below.
115 ***********************************************************************/
116 stmfd sp!, {r4-r11, lr}
117
118 /******************************************
119 * row 0 with internal symmetry
120 *****************************************/
121 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
122 ldmia r2!, { r3-r6 } /* load D[01..04] */
123 ldr r7 , [r1, #96*4] /* 1 */
124 ldr r10, [r1, #992*4] /* 15 */
125 rsb r10, r10, r7 /* V[01] - V[15] */
126 smull r8, r9, r10, r3
127 ldr r7 , [r1, #128*4] /* 2 */
128 ldr r10, [r1, #896*4] /* 14 */
129 add r10, r10, r7 /* V[02] + V[14] */
130 smlal r8, r9, r10, r4
131 ldr r7 , [r1, #224*4] /* 3 */
132 ldr r10, [r1, #864*4] /* 13 */
133 rsb r10, r10, r7 /* V[03] - V[13] */
134 smlal r8, r9, r10, r5
135 ldr r7 , [r1, #256*4] /* 4 */
136 ldr r10, [r1, #768*4] /* 12 */
137 add r10, r10, r7 /* V[04] + V[12] */
138 smlal r8, r9, r10, r6
139 ldmia r2!, { r3-r6 } /* load D[05..08] */
140 ldr r7 , [r1, #352*4] /* 5 */
141 ldr r10, [r1, #736*4] /* 11 */
142 rsb r10, r10, r7 /* V[05] - V[11] */
143 smlal r8, r9, r10, r3
144 ldr r7 , [r1, #384*4] /* 6 */
145 ldr r10, [r1, #640*4] /* 10 */
146 add r10, r10, r7 /* V[06] + V[10] */
147 smlal r8, r9, r10, r4
148 ldr r7 , [r1, #480*4] /* 7 */
149 ldr r10, [r1, #608*4] /* 9 */
150 rsb r10, r10, r7 /* V[07] - V[09] */
151 smlal r8, r9, r10, r5
152 ldr r10, [r1, #512*4] /* 8 */
153 smlal r8, r9, r10, r6
154 mov r8, r8, lsr #16
155 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
156 str r8, [r0], #4 /* store Data */
157 add r1, r1, #4 /* V+=1, r1 = V[01] */
158 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
159
160 /******************************************
161 * rows 01..15 are symmetric to rows 31..17
162 * r8 = lo, r9 = hi of 01..15
163 * r1 = V[01..15]
164 * r10 = lo, r11 = hi of 31..17
165 * r12 = V[31..16]
166 *****************************************/
167 mov lr, #15*8
168 add r12, r1, #30*4 /* r12 = V[31] */
169.loop15:
170 ldmia r2!, { r3-r6 } /* load D[00..03] */
171 ldr r7, [r12, #768*4] /* 12 */
172 smull r10, r11, r7, r6
173 ldr r7, [r12, #864*4] /* 13 */
174 smlal r10, r11, r7, r5
175 ldr r7, [r12, #896*4] /* 14 */
176 smlal r10, r11, r7, r4
177 ldr r7, [r12, #992*4] /* 15 */
178 smlal r10, r11, r7, r3
179 ldr r7, [r1] /* 0 */
180 smull r8, r9, r7, r3
181 ldr r7, [r1, #96*4] /* 1 */
182 smlal r8, r9, r7, r4
183 ldr r7, [r1, #128*4] /* 2 */
184 smlal r8, r9, r7, r5
185 ldr r7, [r1, #224*4] /* 3 */
186 smlal r8, r9, r7, r6
187 ldmia r2!, { r3-r6 } /* load D[04..07] */
188 ldr r7, [r1, #256*4] /* 4 */
189 smlal r8, r9, r7, r3
190 ldr r7, [r1, #352*4] /* 5 */
191 smlal r8, r9, r7, r4
192 ldr r7, [r1, #384*4] /* 6 */
193 smlal r8, r9, r7, r5
194 ldr r7, [r1, #480*4] /* 7 */
195 smlal r8, r9, r7, r6
196 ldr r7, [r12, #512*4] /* 8 */
197 smlal r10, r11, r7, r6
198 ldr r7, [r12, #608*4] /* 9 */
199 smlal r10, r11, r7, r5
200 ldr r7, [r12, #640*4] /* 10 */
201 smlal r10, r11, r7, r4
202 ldr r7, [r12, #736*4] /* 11 */
203 smlal r10, r11, r7, r3
204 ldmia r2!, { r3-r6 } /* load D[08..11] */
205 ldr r7, [r12, #256*4] /* 4 */
206 smlal r10, r11, r7, r6
207 ldr r7, [r12, #352*4] /* 5 */
208 smlal r10, r11, r7, r5
209 ldr r7, [r12, #384*4] /* 6 */
210 smlal r10, r11, r7, r4
211 ldr r7, [r12, #480*4] /* 7 */
212 smlal r10, r11, r7, r3
213 ldr r7, [r1, #512*4] /* 8 */
214 smlal r8, r9, r7, r3
215 ldr r7, [r1, #608*4] /* 9 */
216 smlal r8, r9, r7, r4
217 ldr r7, [r1, #640*4] /* 10 */
218 smlal r8, r9, r7, r5
219 ldr r7, [r1, #736*4] /* 11 */
220 smlal r8, r9, r7, r6
221 ldmia r2!, { r3-r6 } /* load D[12..15] */
222 ldr r7, [r1, #768*4] /* 12 */
223 smlal r8, r9, r7, r3
224 ldr r7, [r1, #864*4] /* 13 */
225 smlal r8, r9, r7, r4
226 ldr r7, [r1, #896*4] /* 14 */
227 smlal r8, r9, r7, r5
228 ldr r7, [r1, #992*4] /* 15 */
229 smlal r8, r9, r7, r6
230 ldr r7, [r12] /* 0 */
231 smlal r10, r11, r7, r6
232 ldr r7, [r12, #96*4] /* 1 */
233 smlal r10, r11, r7, r5
234 ldr r7, [r12, #128*4] /* 2 */
235 smlal r10, r11, r7, r4
236 ldr r7, [r12, #224*4] /* 3 */
237 smlal r10, r11, r7, r3
238 /* store Data[01..15] */
239 mov r8, r8, lsr #16
240 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
241 /* store Data[31..17] */
242 mov r10, r10, lsr #16
243 orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */
244 rsb r10, r10, #0 /* r10 = -r10 */
245 str r10, [r0, lr] /* store Data */
246 str r8, [r0], #4 /* store Data */
247 /* correct adresses for next loop */
248 sub r12, r12, #4 /* r12 = V-- */
249 add r1, r1, #4 /* r1 = V++ */
250 /* next loop */
251 subs lr, lr, #8
252 bgt .loop15
253
254 /******************************************
255 * V[16] with internal symmetry
256 *****************************************/
257 ldmia r2!, { r3-r6 } /* load D[00..03] */
258 ldr r7 , [r1] /* 0 */
259 ldr r10, [r1, #992*4] /* 15 */
260 rsb r10, r10, r7 /* V[00] - V[15] */
261 smull r8, r9, r10, r3
262 ldr r7 , [r1, #96*4] /* 1 */
263 ldr r10, [r1, #896*4] /* 14 */
264 rsb r10, r10, r7 /* V[01] - V[14] */
265 smlal r8, r9, r10, r4
266 ldr r7 , [r1, #128*4] /* 2 */
267 ldr r10, [r1, #864*4] /* 13 */
268 rsb r10, r10, r7 /* V[02] - V[13] */
269 smlal r8, r9, r10, r5
270 ldr r7 , [r1, #224*4] /* 3 */
271 ldr r10, [r1, #768*4] /* 12 */
272 rsb r10, r10, r7 /* V[03] - V[12] */
273 smlal r8, r9, r10, r6
274 ldmia r2!, { r3-r6 } /* load D[04..07] */
275 ldr r7 , [r1, #256*4] /* 4 */
276 ldr r10, [r1, #736*4] /* 11 */
277 rsb r10, r10, r7 /* V[04] - V[11] */
278 smlal r8, r9, r10, r3
279 ldr r7 , [r1, #352*4] /* 5 */
280 ldr r10, [r1, #640*4] /* 10 */
281 rsb r10, r10, r7 /* V[05] - V[10] */
282 smlal r8, r9, r10, r4
283 ldr r7 , [r1, #384*4] /* 6 */
284 ldr r10, [r1, #608*4] /* 9 */
285 rsb r10, r10, r7 /* V[06] - V[09] */
286 smlal r8, r9, r10, r5
287 ldr r7 , [r1, #480*4] /* 7 */
288 ldr r10, [r1, #512*4] /* 8 */
289 rsb r10, r10, r7 /* V[07] - V[08] */
290 smlal r8, r9, r10, r6
291 mov r8, r8, lsr #16
292 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
293 str r8, [r0], #4 /* store Data */
294
295 ldmpc regs=r4-r11
296#elif ARM_ARCH < 6 /* arm9 and above */
297 mpc_decoder_windowing_D:
298 /* r0 = Data[] */
299 /* r1 = V[] */
300 /* r2 = D[] */
301 /* lr = counter */
302 /************************************************************************
303 * Further speed up through making use of symmetries within D[]-window.
304 * The row V[00] can be extracted as it has symmetries within this single
305 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
306 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
307 * saved at the cost of 15 x 4 + 1 add's.
308 * The row V[16] can be extracted as it has symmetries within this single
309 * row. 8 smull/mlal and 8 ldr's can be saved.
310 * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds
311 * up decoding even though several ldm-calls are replaced with ldr to free
312 * 2 registers.
313 ***********************************************************************/
314 stmfd sp!, {r4-r11, lr}
315
316 /******************************************
317 * row 0 with internal symmetry
318 *****************************************/
319 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
320 ldmia r2!, { r3-r6 } /* load D[01..04] */
321 ldr r7 , [r1, #96*4] /* 1 */
322 ldr r10, [r1, #992*4] /* 15 */
323 ldr r11, [r1, #128*4] /* 2 */
324 ldr r12, [r1, #896*4] /* 14 */
325 rsb r10, r10, r7 /* V[01] - V[15] */
326 smull r8, r9, r10, r3
327 ldr r7 , [r1, #224*4] /* 3 */
328 ldr r10, [r1, #864*4] /* 13 */
329 add r12, r12, r11 /* V[02] + V[14] */
330 smlal r8, r9, r12, r4
331 ldr r11, [r1, #256*4] /* 4 */
332 ldr r12, [r1, #768*4] /* 12 */
333 rsb r10, r10, r7 /* V[03] - V[13] */
334 smlal r8, r9, r10, r5
335 ldr r7 , [r1, #352*4] /* 5 */
336 ldr r10, [r1, #736*4] /* 11 */
337 add r12, r12, r11 /* V[04] + V[12] */
338 smlal r8, r9, r12, r6
339 ldmia r2!, { r3-r6 } /* load D[05..08] */
340 ldr r11, [r1, #384*4] /* 6 */
341 ldr r12, [r1, #640*4] /* 10 */
342 rsb r10, r10, r7 /* V[05] - V[11] */
343 smlal r8, r9, r10, r3
344 ldr r7 , [r1, #480*4] /* 7 */
345 ldr r10, [r1, #608*4] /* 9 */
346 add r12, r12, r11 /* V[06] + V[10] */
347 smlal r8, r9, r12, r4
348 ldr r11, [r1, #512*4] /* 8 */
349 rsb r10, r10, r7 /* V[07] - V[09] */
350 smlal r8, r9, r10, r5
351 smlal r8, r9, r11, r6
352 mov r8, r8, lsr #16
353 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
354 str r8, [r0], #4 /* store Data */
355 add r1, r1, #4 /* V+=1, r1 = V[01] */
356 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
357
358 /******************************************
359 * rows 01..15 are symmetric to rows 31..17
360 * r8 = lo, r9 = hi of 01..15
361 * r1 = V[01..15]
362 * r10 = lo, r11 = hi of 31..17
363 * r12 = V[31..16]
364 *****************************************/
365 mov lr, #15*8
366 add r12, r1, #30*4 /* r12 = V[31] */
367.loop15:
368 ldmia r2!, { r3-r4 } /* load D[00..01] */
369 ldr r7, [r12, #896*4] /* 14 */
370 ldr r5, [r12, #992*4] /* 15 */
371 smull r10, r11, r7, r4
372 ldr r7, [r1] /* 0 */
373 smlal r10, r11, r5, r3
374 ldr r5, [r1, #96*4] /* 1 */
375 smull r8, r9, r7, r3
376 ldr r7, [r12, #768*4] /* 12 */
377 smlal r8, r9, r5, r4
378 ldmia r2!, { r3-r4 } /* load D[02..03] */
379 ldr r5, [r12, #864*4] /* 13 */
380 smlal r10, r11, r7, r4
381 ldr r7, [r1, #128*4] /* 2 */
382 smlal r10, r11, r5, r3
383 ldr r5, [r1, #224*4] /* 3 */
384 smlal r8, r9, r7, r3
385 ldr r7, [r1, #256*4] /* 4 */
386 smlal r8, r9, r5, r4
387 ldmia r2!, { r3-r4 } /* load D[04..04] */
388 ldr r5, [r1, #352*4] /* 5 */
389 smlal r8, r9, r7, r3
390 ldr r7, [r12, #640*4] /* 10 */
391 smlal r8, r9, r5, r4
392 ldr r5, [r12, #736*4] /* 11 */
393 smlal r10, r11, r7, r4
394 ldr r7, [r1, #384*4] /* 6 */
395 smlal r10, r11, r5, r3
396 ldmia r2!, { r3-r4 } /* load D[06..07] */
397 ldr r5, [r1, #480*4] /* 7 */
398 smlal r8, r9, r7, r3
399 ldr r7, [r12, #512*4] /* 8 */
400 smlal r8, r9, r5, r4
401 ldr r5, [r12, #608*4] /* 9 */
402 smlal r10, r11, r7, r4
403 ldr r7, [r12, #384*4] /* 6 */
404 smlal r10, r11, r5, r3
405 ldmia r2!, { r3-r4 } /* load D[08..09] */
406 ldr r5, [r12, #480*4] /* 7 */
407 smlal r10, r11, r7, r4
408 ldr r7, [r1, #512*4] /* 8 */
409 smlal r10, r11, r5, r3
410 ldr r5, [r1, #608*4] /* 9 */
411 smlal r8, r9, r7, r3
412 ldr r7, [r1, #640*4] /* 10 */
413 smlal r8, r9, r5, r4
414 ldmia r2!, { r3-r4 } /* load D[10..11] */
415 ldr r5, [r1, #736*4] /* 11 */
416 smlal r8, r9, r7, r3
417 ldr r7, [r12, #256*4] /* 4 */
418 smlal r8, r9, r5, r4
419 ldr r5, [r12, #352*4] /* 5 */
420 smlal r10, r11, r7, r4
421 ldr r7, [r1, #768*4] /* 12 */
422 smlal r10, r11, r5, r3
423 ldmia r2!, { r3-r4 } /* load D[12..13] */
424 ldr r5, [r1, #864*4] /* 13 */
425 smlal r8, r9, r7, r3
426 ldr r7, [r12, #128*4] /* 2 */
427 smlal r8, r9, r5, r4
428 ldr r5, [r12, #224*4] /* 3 */
429 smlal r10, r11, r7, r4
430 ldr r7, [r12] /* 0 */
431 smlal r10, r11, r5, r3
432 ldmia r2!, { r3-r4 } /* load D[14..15] */
433 ldr r5, [r12, #96*4] /* 1 */
434 smlal r10, r11, r7, r4
435 ldr r7, [r1, #896*4] /* 14 */
436 smlal r10, r11, r5, r3
437 ldr r5, [r1, #992*4] /* 15 */
438 smlal r8, r9, r7, r3
439 smlal r8, r9, r5, r4
440 /* store Data[01..15] */
441 mov r8, r8, lsr #16
442 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
443 /* store Data[31..17] */
444 mov r10, r10, lsr #16
445 orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */
446 rsb r10, r10, #0 /* r10 = -r10 */
447 str r10, [r0, lr] /* store Data */
448 str r8, [r0], #4 /* store Data */
449 /* correct adresses for next loop */
450 sub r12, r12, #4 /* r12 = V-- */
451 add r1, r1, #4 /* r1 = V++ */
452 /* next loop */
453 subs lr, lr, #8
454 bgt .loop15
455
456 /******************************************
457 * V[16] with internal symmetry
458 *****************************************/
459 ldmia r2!, { r3-r6 } /* load D[00..03] */
460 ldr r7 , [r1] /* 0 */
461 ldr r10, [r1, #992*4] /* 15 */
462 ldr r11, [r1, #96*4] /* 1 */
463 ldr r12, [r1, #896*4] /* 14 */
464 rsb r10, r10, r7 /* V[00] - V[15] */
465 smull r8, r9, r10, r3
466 ldr r7 , [r1, #128*4] /* 2 */
467 ldr r10, [r1, #864*4] /* 13 */
468 rsb r12, r12, r11 /* V[01] - V[14] */
469 smlal r8, r9, r12, r4
470 ldr r11, [r1, #224*4] /* 3 */
471 ldr r12, [r1, #768*4] /* 12 */
472 rsb r10, r10, r7 /* V[02] - V[13] */
473 smlal r8, r9, r10, r5
474 ldr r7 , [r1, #256*4] /* 4 */
475 ldr r10, [r1, #736*4] /* 11 */
476 rsb r12, r12, r11 /* V[03] - V[12] */
477 smlal r8, r9, r12, r6
478 ldmia r2!, { r3-r6 } /* load D[04..07] */
479 ldr r11, [r1, #352*4] /* 5 */
480 ldr r12, [r1, #640*4] /* 10 */
481 rsb r10, r10, r7 /* V[04] - V[11] */
482 smlal r8, r9, r10, r3
483 ldr r7 , [r1, #384*4] /* 6 */
484 ldr r10, [r1, #608*4] /* 9 */
485 rsb r12, r12, r11 /* V[05] - V[10] */
486 smlal r8, r9, r12, r4
487 ldr r11, [r1, #480*4] /* 7 */
488 ldr r12, [r1, #512*4] /* 8 */
489 rsb r10, r10, r7 /* V[06] - V[09] */
490 smlal r8, r9, r10, r5
491 rsb r12, r12, r11 /* V[07] - V[08] */
492 smlal r8, r9, r12, r6
493 mov r8, r8, lsr #16
494 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
495 str r8, [r0], #4 /* store Data */
496
497 ldmpc regs=r4-r11
498#else
499 mpc_decoder_windowing_D:
500 /* r0 = Data[] */
501 /* r1 = V[] */
502 /* r2 = D[] */
503 /* lr = counter */
504 /************************************************************************
505 * Further speed up through making use of symmetries within D[]-window.
506 * The row V[00] can be extracted as it has symmetries within this single
507 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
508 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
509 * saved at the cost of 15 x 4 + 1 add's.
510 * The row V[16] can be extracted as it has symmetries within this single
511 * row. 8 smull/mlal and 8 ldr's can be saved.
512 * On armv6 use smmulr/smlalr which are faster than smull/smlal and only
513 * accumulate the top 32 bits of the result so that frees up 2
514 * registers so we can ldm larger blocks.
515 ***********************************************************************/
516 stmfd sp!, {r4-r11, lr}
517
518 /******************************************
519 * row 0 with internal symmetry
520 *****************************************/
521 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
522 ldmia r2!, { r3-r6 } /* load D[01..04] */
523 ldr r7 , [r1, #96*4] /* 1 */
524 ldr r10, [r1, #992*4] /* 15 */
525 ldr r11, [r1, #128*4] /* 2 */
526 rsb r10, r10, r7 /* V[01] - V[15] */
527 ldr r12, [r1, #896*4] /* 14 */
528 smmulr r9, r10, r3
529 ldr r7 , [r1, #224*4] /* 3 */
530 add r12, r12, r11 /* V[02] + V[14] */
531 ldr r10, [r1, #864*4] /* 13 */
532 smmlar r9, r12, r4, r9
533 ldr r11, [r1, #256*4] /* 4 */
534 rsb r10, r10, r7 /* V[03] - V[13] */
535 ldr r12, [r1, #768*4] /* 12 */
536 smmlar r9, r10, r5, r9
537 ldr r7 , [r1, #352*4] /* 5 */
538 add r12, r12, r11 /* V[04] + V[12] */
539 ldr r10, [r1, #736*4] /* 11 */
540 smmlar r9, r12, r6, r9
541 ldmia r2!, { r3-r6 } /* load D[05..08] */
542 ldr r11, [r1, #384*4] /* 6 */
543 rsb r10, r10, r7 /* V[05] - V[11] */
544 ldr r12, [r1, #640*4] /* 10 */
545 smmlar r9, r10, r3, r9
546 ldr r7 , [r1, #480*4] /* 7 */
547 add r12, r12, r11 /* V[06] + V[10] */
548 ldr r10, [r1, #608*4] /* 9 */
549 smmlar r9, r12, r4, r9
550 rsb r10, r10, r7 /* V[07] - V[09] */
551 ldr r11, [r1, #512*4] /* 8 */
552 smmlar r9, r10, r5, r9
553 add r1, r1, #4 /* V+=1, r1 = V[01] */
554 smmlar r9, r11, r6, r9
555 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
556 mov r9, r9, lsl #2
557 str r9, [r0], #4 /* store Data */
558
559 /******************************************
560 * rows 01..15 are symmetric to rows 31..17
561 * r9 = acc of 01..15
562 * r1 = V[01..15]
563 * r11 = acc of 31..17
564 * r12 = V[31..16]
565 *****************************************/
566 mov lr, #15*8
567 add r12, r1, #30*4 /* r12 = V[31] */
568.loop15:
569 ldmia r2!, { r3-r6 } /* load D[00..03] */
570 ldr r7, [r12, #896*4] /* 14 */
571 ldr r8, [r12, #992*4] /* 15 */
572 smmulr r11, r7, r4
573 ldr r7, [r1] /* 0 */
574 smmlar r11, r8, r3, r11
575 ldr r8, [r1, #96*4] /* 1 */
576 smmulr r9, r7, r3
577 ldr r7, [r12, #768*4] /* 12 */
578 smmlar r9, r8, r4, r9
579 ldr r8, [r12, #864*4] /* 13 */
580 smmlar r11, r7, r6, r11
581 ldr r7, [r1, #128*4] /* 2 */
582 smmlar r11, r8, r5, r11
583 ldr r8, [r1, #224*4] /* 3 */
584 smmlar r9, r7, r5, r9
585 ldr r7, [r1, #256*4] /* 4 */
586 smmlar r9, r8, r6, r9
587 ldmia r2!, { r3-r6 } /* load D[04..07] */
588 ldr r8, [r1, #352*4] /* 5 */
589 smmlar r9, r7, r3, r9
590 ldr r7, [r12, #640*4] /* 10 */
591 smmlar r9, r8, r4, r9
592 ldr r8, [r12, #736*4] /* 11 */
593 smmlar r11, r7, r4, r11
594 ldr r7, [r1, #384*4] /* 6 */
595 smmlar r11, r8, r3, r11
596 ldr r8, [r1, #480*4] /* 7 */
597 smmlar r9, r7, r5, r9
598 ldr r7, [r12, #512*4] /* 8 */
599 smmlar r9, r8, r6, r9
600 ldr r8, [r12, #608*4] /* 9 */
601 smmlar r11, r7, r6, r11
602 ldr r7, [r12, #384*4] /* 6 */
603 smmlar r11, r8, r5, r11
604 ldmia r2!, { r3-r6 } /* load D[08..11] */
605 ldr r8, [r12, #480*4] /* 7 */
606 smmlar r11, r7, r4, r11
607 ldr r7, [r1, #512*4] /* 8 */
608 smmlar r11, r8, r3, r11
609 ldr r8, [r1, #608*4] /* 9 */
610 smmlar r9, r7, r3, r9
611 ldr r7, [r1, #640*4] /* 10 */
612 smmlar r9, r8, r4, r9
613 ldr r8, [r1, #736*4] /* 11 */
614 smmlar r9, r7, r5, r9
615 ldr r7, [r12, #256*4] /* 4 */
616 smmlar r9, r8, r6, r9
617 ldr r8, [r12, #352*4] /* 5 */
618 smmlar r11, r7, r6, r11
619 ldr r7, [r1, #768*4] /* 12 */
620 smmlar r11, r8, r5, r11
621 ldmia r2!, { r3-r6 } /* load D[12..15] */
622 ldr r8, [r1, #864*4] /* 13 */
623 smmlar r9, r7, r3, r9
624 ldr r7, [r12, #128*4] /* 2 */
625 smmlar r9, r8, r4, r9
626 ldr r8, [r12, #224*4] /* 3 */
627 smmlar r11, r7, r4, r11
628 ldr r7, [r12] /* 0 */
629 smmlar r11, r8, r3, r11
630 ldr r8, [r12, #96*4] /* 1 */
631 smmlar r11, r7, r6, r11
632 ldr r7, [r1, #896*4] /* 14 */
633 smmlar r11, r8, r5, r11
634 ldr r8, [r1, #992*4] /* 15 */
635 smmlar r9, r7, r5, r9
636 sub r12, r12, #4 /* r12 = V-- correct adresses for next loop */
637 smmlar r9, r8, r6, r9
638 add r1, r1, #4 /* r1 = V++ correct adresses for next loop */
639 rsb r11, r11, #0 /* r11 = -r11 */
640 /* store Data[01..15] */
641 mov r9, r9, lsl #2
642 /* store Data[31..17] */
643 mov r11, r11, lsl #2
644 str r11, [r0, lr] /* store Data */
645 str r9, [r0], #4 /* store Data */
646 /* next loop */
647 subs lr, lr, #8
648 bgt .loop15
649
650 /******************************************
651 * V[16] with internal symmetry
652 *****************************************/
653 ldmia r2!, { r3-r6 } /* load D[00..03] */
654 ldr r7 , [r1] /* 0 */
655 ldr r10, [r1, #992*4] /* 15 */
656 ldr r11, [r1, #96*4] /* 1 */
657 rsb r10, r10, r7 /* V[00] - V[15] */
658 ldr r12, [r1, #896*4] /* 14 */
659 smmulr r9, r10, r3
660 ldr r7 , [r1, #128*4] /* 2 */
661 rsb r12, r12, r11 /* V[01] - V[14] */
662 ldr r10, [r1, #864*4] /* 13 */
663 smmlar r9, r12, r4, r9
664 ldr r11, [r1, #224*4] /* 3 */
665 rsb r10, r10, r7 /* V[02] - V[13] */
666 ldr r12, [r1, #768*4] /* 12 */
667 smmlar r9, r10, r5, r9
668 ldr r7 , [r1, #256*4] /* 4 */
669 rsb r12, r12, r11 /* V[03] - V[12] */
670 ldr r10, [r1, #736*4] /* 11 */
671 smmlar r9, r12, r6, r9
672 ldmia r2!, { r3-r6 } /* load D[04..07] */
673 ldr r11, [r1, #352*4] /* 5 */
674 rsb r10, r10, r7 /* V[04] - V[11] */
675 ldr r12, [r1, #640*4] /* 10 */
676 smmlar r9, r10, r3, r9
677 ldr r7 , [r1, #384*4] /* 6 */
678 rsb r12, r12, r11 /* V[05] - V[10] */
679 ldr r10, [r1, #608*4] /* 9 */
680 smmlar r9, r12, r4, r9
681 ldr r11, [r1, #480*4] /* 7 */
682 rsb r10, r10, r7 /* V[06] - V[09] */
683 ldr r12, [r1, #512*4] /* 8 */
684 smmlar r9, r10, r5, r9
685 rsb r12, r12, r11 /* V[07] - V[08] */
686 smmlar r9, r12, r6, r9
687 mov r9, r9, lsl #2
688 str r9, [r0], #4 /* store Data */
689
690 ldmpc regs=r4-r11
691#endif
692.mpc_dewindowing_end:
693 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D