path: root/apps/codecs/lib/mdct.c
Diffstat (limited to 'apps/codecs/lib/mdct.c')
-rw-r--r--  apps/codecs/lib/mdct.c  644
1 file changed, 0 insertions, 644 deletions
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c
deleted file mode 100644
index 777aec4a55..0000000000
--- a/apps/codecs/lib/mdct.c
+++ /dev/null
@@ -1,644 +0,0 @@
1/*
2 * Fixed Point IMDCT
3 * Copyright (c) 2002 The FFmpeg Project.
4 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include "codeclib.h"
22#include "mdct.h"
23#include "codeclib_misc.h"
24#include "mdct_lookup.h"
25
26#ifndef ICODE_ATTR_TREMOR_MDCT
27#define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR
28#endif
29
30/**
31 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
32 * thus excluding the parts that can be derived by symmetry.
33 * @param output N/2 samples
34 * @param input N/2 samples
35 *
36 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
37 * not overlap or intersect at all)
38 */
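/* Typical use (illustrative sketch only, not code from this file): for a
 * 2048-point MDCT, nbits is 11 and each buffer holds N/2 = 1024 samples:
 *
 *     fixed32 in[1024], out[1024];
 *     ff_imdct_half(11, out, in);   // in/out must not overlap, see NOTE above
 */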
39void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
40void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
41{
42 int n8, n4, n2, n, j;
43 const fixed32 *in1, *in2;
44 (void)j;
45 n = 1 << nbits;
46
47 n2 = n >> 1;
48 n4 = n >> 2;
49 n8 = n >> 3;
50
51 FFTComplex *z = (FFTComplex *)output;
52
53 /* pre rotation */
54 in1 = input;
55 in2 = input + n2 - 1;
56
57    /* revtab comes from the fft; the revtab table is sized for an N=4096 fft, i.e. 2^12.
58       Our fft is of size N/4, i.e. nbits-2 bits, so the shift needs to be 12-(nbits-2) = 14-nbits. */
59    const int revtab_shift = (14 - nbits);
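    /* Worked example (for illustration): nbits = 11 means N = 2048 and an
       N/4 = 512-point fft (nbits-2 = 9 bits), so revtab_shift = 12 - 9 = 3,
       i.e. each 12-bit revtab entry is shifted down to a 9-bit index. */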
60
61 /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
62 /* (note that when using the current split radix, the bitreverse ordering is
63 complex, meaning that this reordering cannot easily be done in-place) */
64    /* Using the following pdf, you can see that it is possible to replace
65       the 'classic' pre/post rotation with an alternative one that enables
66       us to use fewer distinct twiddle factors.
67 http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf
68
69 For prerotation, the factors are just sin,cos(2PI*i/N)
70 For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
71
72 Therefore, prerotation can immediately reuse the same twiddles as fft
73       (for postrotation it's still a bit more involved: we reuse the fft trig tables
74 where we can, or a special table for N=2048, or interpolate between
75 trig tables for N>2048)
76 */
77 const int32_t *T = sincos_lookup0;
78 const int step = 2<<(12-nbits);
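    /* For illustration: the sincos table appears to be laid out at N = 4096
       resolution (an assumption consistent with step = 2<<(12-nbits)), so step
       simply strides across it: nbits = 12 -> step = 2 (consecutive cos/sin
       pairs), nbits = 11 -> step = 4, nbits = 10 -> step = 8, and so on. */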
79 const uint16_t * p_revtab=revtab;
80 {
81 const uint16_t * const p_revtab_end = p_revtab + n8;
82#ifdef CPU_COLDFIRE
83 asm volatile ("move.l (%[in2]), %%d0\n\t"
84 "move.l (%[in1]), %%d1\n\t"
85 "bra.s 1f\n\t"
86 "0:\n\t"
87 "movem.l (%[T]), %%d2-%%d3\n\t"
88
89 "addq.l #8, %[in1]\n\t"
90 "subq.l #8, %[in2]\n\t"
91
92 "lea (%[step]*4, %[T]), %[T]\n\t"
93
94 "mac.l %%d0, %%d3, (%[T]), %%d4, %%acc0;"
95 "msac.l %%d1, %%d2, (4, %[T]), %%d5, %%acc0;"
96 "mac.l %%d1, %%d3, (%[in1]), %%d1, %%acc1;"
97 "mac.l %%d0, %%d2, (%[in2]), %%d0, %%acc1;"
98
99 "addq.l #8, %[in1]\n\t"
100 "subq.l #8, %[in2]\n\t"
101
102 "mac.l %%d0, %%d5, %%acc2;"
103 "msac.l %%d1, %%d4, (%[p_revtab])+, %%d2, %%acc2;"
104 "mac.l %%d1, %%d5, (%[in1]), %%d1, %%acc3;"
105 "mac.l %%d0, %%d4, (%[in2]), %%d0, %%acc3;"
106
107 "clr.l %%d3\n\t"
108 "move.w %%d2, %%d3\n\t"
109 "eor.l %%d3, %%d2\n\t"
110 "swap %%d2\n\t"
111 "lsr.l %[revtab_shift], %%d2\n\t"
112
113 "movclr.l %%acc0, %%d4;"
114 "movclr.l %%acc1, %%d5;"
115 "lsl.l #3, %%d2\n\t"
116 "lea (%%d2, %[z]), %%a1\n\t"
117 "movem.l %%d4-%%d5, (%%a1)\n\t"
118
119 "lsr.l %[revtab_shift], %%d3\n\t"
120
121 "movclr.l %%acc2, %%d4;"
122 "movclr.l %%acc3, %%d5;"
123 "lsl.l #3, %%d3\n\t"
124 "lea (%%d3, %[z]), %%a1\n\t"
125 "movem.l %%d4-%%d5, (%%a1)\n\t"
126
127 "lea (%[step]*4, %[T]), %[T]\n\t"
128
129 "1:\n\t"
130 "cmp.l %[p_revtab_end], %[p_revtab]\n\t"
131 "bcs.s 0b\n\t"
132 : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T),
133 [p_revtab] "+a" (p_revtab)
134 : [z] "a" (z), [step] "d" (step), [revtab_shift] "d" (revtab_shift),
135 [p_revtab_end] "r" (p_revtab_end)
136 : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory");
137#else
138 while(LIKELY(p_revtab < p_revtab_end))
139 {
140 j = (*p_revtab)>>revtab_shift;
141 XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
142 T += step;
143 in1 += 2;
144 in2 -= 2;
145 p_revtab++;
146 j = (*p_revtab)>>revtab_shift;
147 XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
148 T += step;
149 in1 += 2;
150 in2 -= 2;
151 p_revtab++;
152 }
153#endif
154 }
155 {
156 const uint16_t * const p_revtab_end = p_revtab + n8;
157#ifdef CPU_COLDFIRE
158 asm volatile ("move.l (%[in2]), %%d0\n\t"
159 "move.l (%[in1]), %%d1\n\t"
160 "bra.s 1f\n\t"
161 "0:\n\t"
162 "movem.l (%[T]), %%d2-%%d3\n\t"
163
164 "addq.l #8, %[in1]\n\t"
165 "subq.l #8, %[in2]\n\t"
166
167 "lea (%[step]*4, %[T]), %[T]\n\t"
168
169 "mac.l %%d0, %%d2, (%[T]), %%d4, %%acc0;"
170 "msac.l %%d1, %%d3, (4, %[T]), %%d5, %%acc0;"
171 "mac.l %%d1, %%d2, (%[in1]), %%d1, %%acc1;"
172 "mac.l %%d0, %%d3, (%[in2]), %%d0, %%acc1;"
173
174 "addq.l #8, %[in1]\n\t"
175 "subq.l #8, %[in2]\n\t"
176
177 "mac.l %%d0, %%d4, %%acc2;"
178 "msac.l %%d1, %%d5, (%[p_revtab])+, %%d2, %%acc2;"
179 "mac.l %%d1, %%d4, (%[in1]), %%d1, %%acc3;"
180 "mac.l %%d0, %%d5, (%[in2]), %%d0, %%acc3;"
181
182 "clr.l %%d3\n\t"
183 "move.w %%d2, %%d3\n\t"
184 "eor.l %%d3, %%d2\n\t"
185 "swap %%d2\n\t"
186 "lsr.l %[revtab_shift], %%d2\n\t"
187
188 "movclr.l %%acc0, %%d4;"
189 "movclr.l %%acc1, %%d5;"
190 "lsl.l #3, %%d2\n\t"
191 "lea (%%d2, %[z]), %%a1\n\t"
192 "movem.l %%d4-%%d5, (%%a1)\n\t"
193
194 "lsr.l %[revtab_shift], %%d3\n\t"
195
196 "movclr.l %%acc2, %%d4;"
197 "movclr.l %%acc3, %%d5;"
198 "lsl.l #3, %%d3\n\t"
199 "lea (%%d3, %[z]), %%a1\n\t"
200 "movem.l %%d4-%%d5, (%%a1)\n\t"
201
202 "lea (%[step]*4, %[T]), %[T]\n\t"
203
204 "1:\n\t"
205 "cmp.l %[p_revtab_end], %[p_revtab]\n\t"
206 "bcs.s 0b\n\t"
207 : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T),
208 [p_revtab] "+a" (p_revtab)
209 : [z] "a" (z), [step] "d" (-step), [revtab_shift] "d" (revtab_shift),
210 [p_revtab_end] "r" (p_revtab_end)
211 : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory");
212#else
213 while(LIKELY(p_revtab < p_revtab_end))
214 {
215 j = (*p_revtab)>>revtab_shift;
216 XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
217 T -= step;
218 in1 += 2;
219 in2 -= 2;
220 p_revtab++;
221 j = (*p_revtab)>>revtab_shift;
222 XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
223 T -= step;
224 in1 += 2;
225 in2 -= 2;
226 p_revtab++;
227 }
228#endif
229 }
230
231
232 /* ... and so fft runs in OUTPUT buffer */
233 ff_fft_calc_c(nbits-2, z);
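    /* For illustration: the fft works on n4 complex values, i.e. an
       (nbits-2)-bit transform -- e.g. nbits = 11 gives a 512-point complex
       fft, run in place on the OUTPUT buffer. */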
234
235    /* post rotation + reordering; the result now stays within the OUTPUT buffer */
236 switch( nbits )
237 {
238 default:
239 {
240 fixed32 * z1 = (fixed32 *)(&z[0]);
241 int magic_step = step>>2;
242 int newstep;
243 if(n<=1024)
244 {
245 T = sincos_lookup0 + magic_step;
246 newstep = step>>1;
247 }
248 else
249 {
250 T = sincos_lookup1;
251 newstep = 2;
252 }
253
254#ifdef CPU_COLDFIRE
255 fixed32 * z2 = (fixed32 *)(&z[n4]);
256 int c = n4;
257 if (newstep == 2)
258 {
259 asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
260 "addq.l #8, %[z1]\n\t"
261 "movem.l (%[T]), %%d2-%%d3\n\t"
262 "addq.l #8, %[T]\n\t"
263 "bra.s 1f\n\t"
264 "0:\n\t"
265 "msac.l %%d1, %%d2, (%[T])+, %%a3, %%acc0\n\t"
266 "mac.l %%d0, %%d3, (%[T])+, %%a4, %%acc0\n\t"
267
268 "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t"
269 "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t"
270
271 "msac.l %%d1, %%a4, (%[T])+, %%d2, %%acc2\n\t"
272 "mac.l %%d0, %%a3, (%[T])+, %%d3, %%acc2\n\t"
273 "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t"
274 "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t"
275
276 "movclr.l %%acc0, %%a3\n\t"
277 "movclr.l %%acc3, %%a4\n\t"
278 "movem.l %%a3-%%a4, (-16, %[z1])\n\t"
279
280 "movclr.l %%acc1, %%a4\n\t"
281 "movclr.l %%acc2, %%a3\n\t"
282 "movem.l %%a3-%%a4, (%[z2])\n\t"
283
284 "subq.l #2, %[n]\n\t"
285 "1:\n\t"
286 "bhi.s 0b\n\t"
287 : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c)
288 :
289 : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory");
290 }
291 else
292 {
293 asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
294 "addq.l #8, %[z1]\n\t"
295 "movem.l (%[T]), %%d2-%%d3\n\t"
296 "lea (%[newstep]*4, %[T]), %[T]\n\t"
297 "bra.s 1f\n\t"
298 "0:\n\t"
299 "msac.l %%d1, %%d2, (%[T]), %%a3, %%acc0\n\t"
300 "mac.l %%d0, %%d3, (4, %[T]), %%a4, %%acc0\n\t"
301 "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t"
302 "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t"
303
304 "lea (%[newstep]*4, %[T]), %[T]\n\t"
305 "msac.l %%d1, %%a4, (%[T]), %%d2, %%acc2\n\t"
306 "mac.l %%d0, %%a3, (4, %[T]), %%d3, %%acc2\n\t"
307 "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t"
308 "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t"
309
310 "lea (%[newstep]*4, %[T]), %[T]\n\t"
311
312 "movclr.l %%acc0, %%a3\n\t"
313 "movclr.l %%acc3, %%a4\n\t"
314 "movem.l %%a3-%%a4, (-16, %[z1])\n\t"
315
316 "movclr.l %%acc1, %%a4\n\t"
317 "movclr.l %%acc2, %%a3\n\t"
318 "movem.l %%a3-%%a4, (%[z2])\n\t"
319
320 "subq.l #2, %[n]\n\t"
321 "1:\n\t"
322 "bhi.s 0b\n\t"
323 : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c)
324 : [newstep] "d" (newstep)
325 : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory");
326 }
327#else
328 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
329 while(z1<z2)
330 {
331 fixed32 r0,i0,r1,i1;
332 XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
333 XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
334 z1[0] = -r0;
335 z1[1] = -i0;
336 z2[0] = -r1;
337 z2[1] = -i1;
338 z1+=2;
339 z2-=2;
340 }
341#endif
342 break;
343 }
344
345 case 12: /* n=4096 */
346 {
347 /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
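            /* Sketch of the idea (assuming, as the 50:50 blend suggests, that
               sincos_lookup1 holds angles half a step past sincos_lookup0):
               t0 = (T[0]>>1) + (V[0]>>1) below is the fixed-point average of
               the two tables, approximating the intermediate twiddle angles
               this transform size needs. */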
348 const int32_t * V = sincos_lookup1;
349 T = sincos_lookup0;
350 int32_t t0,t1,v0,v1;
351 fixed32 * z1 = (fixed32 *)(&z[0]);
352 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
353
354 t0 = T[0]>>1; t1=T[1]>>1;
355
356 while(z1<z2)
357 {
358 fixed32 r0,i0,r1,i1;
359 t0 += (v0 = (V[0]>>1));
360 t1 += (v1 = (V[1]>>1));
361 XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
362 T+=2;
363 v0 += (t0 = (T[0]>>1));
364 v1 += (t1 = (T[1]>>1));
365 XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
366 z1[0] = -r0;
367 z1[1] = -i0;
368 z2[0] = -r1;
369 z2[1] = -i1;
370 z1+=2;
371 z2-=2;
372 V+=2;
373 }
374
375 break;
376 }
377
378 case 13: /* n = 8192 */
379 {
380            /* weighted linear interpolation between sincos_lookup0 and sincos_lookup1
381 specifically: 25:75 for first twiddle and 75:25 for second twiddle */
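            /* Note (illustrative): t0 += (v0 - t0) >> 1 below is, up to
               rounding, t0 = (t0 + v0) / 2; together with the half-step offset
               between the two tables (same assumption as in the n=4096 case)
               this produces the finer-grained twiddle angles N = 8192 needs. */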
382 const int32_t * V = sincos_lookup1;
383 T = sincos_lookup0;
384 int32_t t0,t1,v0,v1,q0,q1;
385 fixed32 * z1 = (fixed32 *)(&z[0]);
386 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
387
388 t0 = T[0]; t1=T[1];
389
390 while(z1<z2)
391 {
392 fixed32 r0,i0,r1,i1;
393 v0 = V[0]; v1 = V[1];
394 t0 += (q0 = (v0-t0)>>1);
395 t1 += (q1 = (v1-t1)>>1);
396 XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
397 t0 = v0-q0;
398 t1 = v1-q1;
399 XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
400 z1[0] = -r0;
401 z1[1] = -i0;
402 z2[0] = -r1;
403 z2[1] = -i1;
404 z1+=2;
405 z2-=2;
406 T+=2;
407
408 t0 = T[0]; t1 = T[1];
409 v0 += (q0 = (t0-v0)>>1);
410 v1 += (q1 = (t1-v1)>>1);
411 XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
412 v0 = t0-q0;
413 v1 = t1-q1;
414 XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
415 z1[0] = -r0;
416 z1[1] = -i0;
417 z2[0] = -r1;
418 z2[1] = -i1;
419 z1+=2;
420 z2-=2;
421 V+=2;
422 }
423
424 break;
425 }
426 }
427}
428
429/**
430 * Compute inverse MDCT of size N = 2^nbits
431 * @param output N samples
432 * @param input N/2 samples
433 * "In-place" processing can be achieved provided that:
434 * [0 .. N/2-1 | N/2 .. N-1 ]
435 * <----input---->
436 * <-----------output----------->
437 *
438 * The result of ff_imdct_half is to put the 'half' imdct here
439 *
440 * N/2 N-1
441 * <--half imdct-->
442 *
443 * We want it here for the full imdct:
444 * N/4 3N/4-1
445 * <-------------->
446 *
447 * In addition we need to apply two symmetries to get the full imdct:
448 *
449 * <AAAAAA> <DDDDDD>
450 * <BBBBBB><CCCCCC>
451 *
452 * D is a reflection of C
453 * A is a reflection of B (but with sign flipped)
454 *
455 * We process the symmetries at the same time as we 'move' the half imdct
456 * from [N/2,N-1] to [N/4,3N/4-1]
457 *
458 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
459 * This would require being able to use revtab 'inplace' (since the input
460 * and output of imdct_half would then overlap somewhat)
461 */
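/* In index form (a sketch of the intent, for 0 <= i < N/4): once BBBB has been
 * moved into place, the two symmetries described above amount to
 *
 *     output[i]         = -output[N/2 - 1 - i];    A is B reflected, sign-flipped
 *     output[N - 1 - i] =  output[N/2 + i];        D is C reflected
 *
 * The loops below realise this with block copies and in-place swaps.
 */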
462void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
463#ifndef CPU_ARM
464void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
465{
466 const int n = (1<<nbits);
467 const int n2 = (n>>1);
468 const int n4 = (n>>2);
469
470    /* tell imdct_half to put its output in [N/2..N-1], i.e. at output+n2 */
471 ff_imdct_half(nbits,output+n2,input);
472
473 fixed32 * in_r, * in_r2, * out_r, * out_r2;
474
475 /* Copy BBBB to AAAA, reflected and sign-flipped.
476 Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
477 out_r = output;
478 out_r2 = output+n2-8;
479 in_r = output+n2+n4-8;
480 while(out_r<out_r2)
481 {
482#if defined CPU_COLDFIRE
483 asm volatile(
484 "movem.l (%[in_r]), %%d0-%%d7\n\t"
485 "movem.l %%d0-%%d7, (%[out_r2])\n\t"
486 "neg.l %%d7\n\t"
487 "move.l %%d7, (%[out_r])+\n\t"
488 "neg.l %%d6\n\t"
489 "move.l %%d6, (%[out_r])+\n\t"
490 "neg.l %%d5\n\t"
491 "move.l %%d5, (%[out_r])+\n\t"
492 "neg.l %%d4\n\t"
493 "move.l %%d4, (%[out_r])+\n\t"
494 "neg.l %%d3\n\t"
495 "move.l %%d3, (%[out_r])+\n\t"
496 "neg.l %%d2\n\t"
497 "move.l %%d2, (%[out_r])+\n\t"
498 "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
499 "neg.l %%d1\n\t"
500 "move.l %%d1, (%[out_r])+\n\t"
501 "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
502 "neg.l %%d0\n\t"
503 "move.l %%d0, (%[out_r])+\n\t"
504 : [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
505 :
506 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
507#else
508 out_r[0] = -(out_r2[7] = in_r[7]);
509 out_r[1] = -(out_r2[6] = in_r[6]);
510 out_r[2] = -(out_r2[5] = in_r[5]);
511 out_r[3] = -(out_r2[4] = in_r[4]);
512 out_r[4] = -(out_r2[3] = in_r[3]);
513 out_r[5] = -(out_r2[2] = in_r[2]);
514 out_r[6] = -(out_r2[1] = in_r[1]);
515 out_r[7] = -(out_r2[0] = in_r[0]);
516 in_r -= 8;
517 out_r += 8;
518 out_r2 -= 8;
519#endif
520 }
521 in_r = output + n2+n4;
522 in_r2 = output + n-4;
523 out_r = output + n2;
524 out_r2 = output + n2 + n4 - 4;
525 while(in_r<in_r2)
526 {
527#if defined CPU_COLDFIRE
528 asm volatile(
529 "movem.l (%[in_r]), %%d0-%%d3\n\t"
530 "movem.l %%d0-%%d3, (%[out_r])\n\t"
531 "movem.l (%[in_r2]), %%d4-%%d7\n\t"
532 "movem.l %%d4-%%d7, (%[out_r2])\n\t"
533 "move.l %%d0, %%a3\n\t"
534 "move.l %%d3, %%d0\n\t"
535 "move.l %%d1, %%d3\n\t"
536 "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
537 "move.l %%d7, %%d1\n\t"
538 "move.l %%d6, %%d2\n\t"
539 "move.l %%d5, %%d3\n\t"
540 "movem.l %%d1-%%d4, (%[in_r])\n\t"
541 "lea.l (4*4, %[in_r]), %[in_r]\n\t"
542 "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
543 "lea.l (4*4, %[out_r]), %[out_r]\n\t"
544 "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
545 : [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
546 [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
547 :
548 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
549#else
550 register fixed32 t0,t1,t2,t3;
551 register fixed32 s0,s1,s2,s3;
552
553    /* Copy and reflect CCCC to DDDD. Because CCCC is already where
554       we actually want to put DDDD, this is a bit complicated.
555 * So simultaneously do the following things:
556 * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
557 * 2. reflect range from [n2+n4 .. n-1] inplace
558 *
559 * [ | ]
560 * ^a -> <- ^b ^c -> <- ^d
561 *
562 * #1: copy from ^c to ^a
563 * #2: copy from ^d to ^b
564 * #3: swap ^c and ^d in place
565 */
566 /* #1 pt1 : load 4 words from ^c. */
567 t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
568 /* #1 pt2 : write to ^a */
569 out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
570 /* #2 pt1 : load 4 words from ^d */
571 s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
572 /* #2 pt2 : write to ^b */
573 out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
574 /* #3 pt1 : write words from #2 to ^c */
575 in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
576 /* #3 pt2 : write words from #1 to ^d */
577 in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;
578
579 in_r += 4;
580 in_r2 -= 4;
581 out_r += 4;
582 out_r2 -= 4;
583#endif
584 }
585}
586#else
587/* Follows the same structure as the canonical version above */
588void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
589{
590 const int n = (1<<nbits);
591 const int n2 = (n>>1);
592 const int n4 = (n>>2);
593
594 ff_imdct_half(nbits,output+n2,input);
595
596 fixed32 * in_r, * in_r2, * out_r, * out_r2;
597
598 out_r = output;
599 out_r2 = output+n2;
600 in_r = output+n2+n4;
601 while(out_r<out_r2)
602 {
603 asm volatile(
604 "ldmdb %[in_r]!, {r0-r7}\n\t"
605 "stmdb %[out_r2]!, {r0-r7}\n\t"
606 "rsb r8,r0,#0\n\t"
607 "rsb r0,r7,#0\n\t"
608 "rsb r7,r1,#0\n\t"
609 "rsb r1,r6,#0\n\t"
610 "rsb r6,r2,#0\n\t"
611 "rsb r2,r5,#0\n\t"
612 "rsb r5,r3,#0\n\t"
613 "rsb r3,r4,#0\n\t"
614 "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
615 : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
616 :
617 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
618 }
619 in_r = output + n2+n4;
620 in_r2 = output + n;
621 out_r = output + n2;
622 out_r2 = output + n2 + n4;
623 while(in_r<in_r2)
624 {
625 asm volatile(
626 "ldmia %[in_r], {r0-r3}\n\t"
627 "stmia %[out_r]!, {r0-r3}\n\t"
628 "ldmdb %[in_r2], {r5-r8}\n\t"
629 "stmdb %[out_r2]!, {r5-r8}\n\t"
630 "mov r4,r0\n\t"
631 "mov r0,r3\n\t"
632 "mov r3,r1\n\t"
633 "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
634 "mov r4,r8\n\t"
635 "mov r8,r5\n\t"
636 "mov r5,r7\n\t"
637 "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
638 :
639 [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
640 :
641 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
642 }
643}
644#endif