diff options
author | Sean Bartell <wingedtachikoma@gmail.com> | 2011-06-25 21:32:25 -0400 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2012-04-25 22:13:20 +0200 |
commit | f40bfc9267b13b54e6379dfe7539447662879d24 (patch) | |
tree | 9b20069d5e62809ff434061ad730096836f916f2 /apps/codecs/lib/mdct.c | |
parent | a0009907de7a0107d49040d8a180f140e2eff299 (diff) | |
download | rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.tar.gz rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.zip |
Add codecs to librbcodec.
Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97
Reviewed-on: http://gerrit.rockbox.org/137
Reviewed-by: Nils Wallménius <nils@rockbox.org>
Tested-by: Nils Wallménius <nils@rockbox.org>
Diffstat (limited to 'apps/codecs/lib/mdct.c')
-rw-r--r-- | apps/codecs/lib/mdct.c | 644 |
1 files changed, 0 insertions, 644 deletions
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c deleted file mode 100644 index 777aec4a55..0000000000 --- a/apps/codecs/lib/mdct.c +++ /dev/null | |||
@@ -1,644 +0,0 @@ | |||
1 | /* | ||
2 | * Fixed Point IMDCT | ||
3 | * Copyright (c) 2002 The FFmpeg Project. | ||
4 | * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli | ||
5 | * | ||
6 | * This library is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This library is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with this library; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include "codeclib.h" | ||
22 | #include "mdct.h" | ||
23 | #include "codeclib_misc.h" | ||
24 | #include "mdct_lookup.h" | ||
25 | |||
26 | #ifndef ICODE_ATTR_TREMOR_MDCT | ||
27 | #define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR | ||
28 | #endif | ||
29 | |||
30 | /** | ||
31 | * Compute the middle half of the inverse MDCT of size N = 2^nbits | ||
32 | * thus excluding the parts that can be derived by symmetry | ||
33 | * @param output N/2 samples | ||
34 | * @param input N/2 samples | ||
35 | * | ||
36 | * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must | ||
37 | * not overlap or intersect at all) | ||
38 | */ | ||
39 | void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT; | ||
40 | void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) | ||
41 | { | ||
42 | int n8, n4, n2, n, j; | ||
43 | const fixed32 *in1, *in2; | ||
44 | (void)j; | ||
45 | n = 1 << nbits; | ||
46 | |||
47 | n2 = n >> 1; | ||
48 | n4 = n >> 2; | ||
49 | n8 = n >> 3; | ||
50 | |||
51 | FFTComplex *z = (FFTComplex *)output; | ||
52 | |||
53 | /* pre rotation */ | ||
54 | in1 = input; | ||
55 | in2 = input + n2 - 1; | ||
56 | |||
57 | /* revtab comes from the fft; revtab table is sized for N=4096 size fft = 2^12. | ||
58 | The fft is size N/4 so s->nbits-2, so our shift needs to be (12-(nbits-2)) */ | ||
59 | const int revtab_shift = (14- nbits); | ||
60 | |||
61 | /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */ | ||
62 | /* (note that when using the current split radix, the bitreverse ordering is | ||
63 | complex, meaning that this reordering cannot easily be done in-place) */ | ||
64 | /* Using the following pdf, you can see that it is possible to rearrange | ||
65 | the 'classic' pre/post rotate with an alternative one that enables | ||
66 | us to use fewer distinct twiddle factors. | ||
67 | http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf | ||
68 | |||
69 | For prerotation, the factors are just sin,cos(2PI*i/N) | ||
70 | For postrotation, the factors are sin,cos(2PI*(i+1/4)/N) | ||
71 | |||
72 | Therefore, prerotation can immediately reuse the same twiddles as fft | ||
73 | (for postrotation it's still a bit complex, we reuse the fft trig tables | ||
74 | where we can, or a special table for N=2048, or interpolate between | ||
75 | trig tables for N>2048) | ||
76 | */ | ||
77 | const int32_t *T = sincos_lookup0; | ||
78 | const int step = 2<<(12-nbits); | ||
79 | const uint16_t * p_revtab=revtab; | ||
80 | { | ||
81 | const uint16_t * const p_revtab_end = p_revtab + n8; | ||
82 | #ifdef CPU_COLDFIRE | ||
83 | asm volatile ("move.l (%[in2]), %%d0\n\t" | ||
84 | "move.l (%[in1]), %%d1\n\t" | ||
85 | "bra.s 1f\n\t" | ||
86 | "0:\n\t" | ||
87 | "movem.l (%[T]), %%d2-%%d3\n\t" | ||
88 | |||
89 | "addq.l #8, %[in1]\n\t" | ||
90 | "subq.l #8, %[in2]\n\t" | ||
91 | |||
92 | "lea (%[step]*4, %[T]), %[T]\n\t" | ||
93 | |||
94 | "mac.l %%d0, %%d3, (%[T]), %%d4, %%acc0;" | ||
95 | "msac.l %%d1, %%d2, (4, %[T]), %%d5, %%acc0;" | ||
96 | "mac.l %%d1, %%d3, (%[in1]), %%d1, %%acc1;" | ||
97 | "mac.l %%d0, %%d2, (%[in2]), %%d0, %%acc1;" | ||
98 | |||
99 | "addq.l #8, %[in1]\n\t" | ||
100 | "subq.l #8, %[in2]\n\t" | ||
101 | |||
102 | "mac.l %%d0, %%d5, %%acc2;" | ||
103 | "msac.l %%d1, %%d4, (%[p_revtab])+, %%d2, %%acc2;" | ||
104 | "mac.l %%d1, %%d5, (%[in1]), %%d1, %%acc3;" | ||
105 | "mac.l %%d0, %%d4, (%[in2]), %%d0, %%acc3;" | ||
106 | |||
107 | "clr.l %%d3\n\t" | ||
108 | "move.w %%d2, %%d3\n\t" | ||
109 | "eor.l %%d3, %%d2\n\t" | ||
110 | "swap %%d2\n\t" | ||
111 | "lsr.l %[revtab_shift], %%d2\n\t" | ||
112 | |||
113 | "movclr.l %%acc0, %%d4;" | ||
114 | "movclr.l %%acc1, %%d5;" | ||
115 | "lsl.l #3, %%d2\n\t" | ||
116 | "lea (%%d2, %[z]), %%a1\n\t" | ||
117 | "movem.l %%d4-%%d5, (%%a1)\n\t" | ||
118 | |||
119 | "lsr.l %[revtab_shift], %%d3\n\t" | ||
120 | |||
121 | "movclr.l %%acc2, %%d4;" | ||
122 | "movclr.l %%acc3, %%d5;" | ||
123 | "lsl.l #3, %%d3\n\t" | ||
124 | "lea (%%d3, %[z]), %%a1\n\t" | ||
125 | "movem.l %%d4-%%d5, (%%a1)\n\t" | ||
126 | |||
127 | "lea (%[step]*4, %[T]), %[T]\n\t" | ||
128 | |||
129 | "1:\n\t" | ||
130 | "cmp.l %[p_revtab_end], %[p_revtab]\n\t" | ||
131 | "bcs.s 0b\n\t" | ||
132 | : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T), | ||
133 | [p_revtab] "+a" (p_revtab) | ||
134 | : [z] "a" (z), [step] "d" (step), [revtab_shift] "d" (revtab_shift), | ||
135 | [p_revtab_end] "r" (p_revtab_end) | ||
136 | : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory"); | ||
137 | #else | ||
138 | while(LIKELY(p_revtab < p_revtab_end)) | ||
139 | { | ||
140 | j = (*p_revtab)>>revtab_shift; | ||
141 | XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im ); | ||
142 | T += step; | ||
143 | in1 += 2; | ||
144 | in2 -= 2; | ||
145 | p_revtab++; | ||
146 | j = (*p_revtab)>>revtab_shift; | ||
147 | XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im ); | ||
148 | T += step; | ||
149 | in1 += 2; | ||
150 | in2 -= 2; | ||
151 | p_revtab++; | ||
152 | } | ||
153 | #endif | ||
154 | } | ||
155 | { | ||
156 | const uint16_t * const p_revtab_end = p_revtab + n8; | ||
157 | #ifdef CPU_COLDFIRE | ||
158 | asm volatile ("move.l (%[in2]), %%d0\n\t" | ||
159 | "move.l (%[in1]), %%d1\n\t" | ||
160 | "bra.s 1f\n\t" | ||
161 | "0:\n\t" | ||
162 | "movem.l (%[T]), %%d2-%%d3\n\t" | ||
163 | |||
164 | "addq.l #8, %[in1]\n\t" | ||
165 | "subq.l #8, %[in2]\n\t" | ||
166 | |||
167 | "lea (%[step]*4, %[T]), %[T]\n\t" | ||
168 | |||
169 | "mac.l %%d0, %%d2, (%[T]), %%d4, %%acc0;" | ||
170 | "msac.l %%d1, %%d3, (4, %[T]), %%d5, %%acc0;" | ||
171 | "mac.l %%d1, %%d2, (%[in1]), %%d1, %%acc1;" | ||
172 | "mac.l %%d0, %%d3, (%[in2]), %%d0, %%acc1;" | ||
173 | |||
174 | "addq.l #8, %[in1]\n\t" | ||
175 | "subq.l #8, %[in2]\n\t" | ||
176 | |||
177 | "mac.l %%d0, %%d4, %%acc2;" | ||
178 | "msac.l %%d1, %%d5, (%[p_revtab])+, %%d2, %%acc2;" | ||
179 | "mac.l %%d1, %%d4, (%[in1]), %%d1, %%acc3;" | ||
180 | "mac.l %%d0, %%d5, (%[in2]), %%d0, %%acc3;" | ||
181 | |||
182 | "clr.l %%d3\n\t" | ||
183 | "move.w %%d2, %%d3\n\t" | ||
184 | "eor.l %%d3, %%d2\n\t" | ||
185 | "swap %%d2\n\t" | ||
186 | "lsr.l %[revtab_shift], %%d2\n\t" | ||
187 | |||
188 | "movclr.l %%acc0, %%d4;" | ||
189 | "movclr.l %%acc1, %%d5;" | ||
190 | "lsl.l #3, %%d2\n\t" | ||
191 | "lea (%%d2, %[z]), %%a1\n\t" | ||
192 | "movem.l %%d4-%%d5, (%%a1)\n\t" | ||
193 | |||
194 | "lsr.l %[revtab_shift], %%d3\n\t" | ||
195 | |||
196 | "movclr.l %%acc2, %%d4;" | ||
197 | "movclr.l %%acc3, %%d5;" | ||
198 | "lsl.l #3, %%d3\n\t" | ||
199 | "lea (%%d3, %[z]), %%a1\n\t" | ||
200 | "movem.l %%d4-%%d5, (%%a1)\n\t" | ||
201 | |||
202 | "lea (%[step]*4, %[T]), %[T]\n\t" | ||
203 | |||
204 | "1:\n\t" | ||
205 | "cmp.l %[p_revtab_end], %[p_revtab]\n\t" | ||
206 | "bcs.s 0b\n\t" | ||
207 | : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T), | ||
208 | [p_revtab] "+a" (p_revtab) | ||
209 | : [z] "a" (z), [step] "d" (-step), [revtab_shift] "d" (revtab_shift), | ||
210 | [p_revtab_end] "r" (p_revtab_end) | ||
211 | : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory"); | ||
212 | #else | ||
213 | while(LIKELY(p_revtab < p_revtab_end)) | ||
214 | { | ||
215 | j = (*p_revtab)>>revtab_shift; | ||
216 | XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im); | ||
217 | T -= step; | ||
218 | in1 += 2; | ||
219 | in2 -= 2; | ||
220 | p_revtab++; | ||
221 | j = (*p_revtab)>>revtab_shift; | ||
222 | XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im); | ||
223 | T -= step; | ||
224 | in1 += 2; | ||
225 | in2 -= 2; | ||
226 | p_revtab++; | ||
227 | } | ||
228 | #endif | ||
229 | } | ||
230 | |||
231 | |||
232 | /* ... and so fft runs in OUTPUT buffer */ | ||
233 | ff_fft_calc_c(nbits-2, z); | ||
234 | |||
235 | /* post rotation + reordering. now keeps the result within the OUTPUT buffer */ | ||
236 | switch( nbits ) | ||
237 | { | ||
238 | default: | ||
239 | { | ||
240 | fixed32 * z1 = (fixed32 *)(&z[0]); | ||
241 | int magic_step = step>>2; | ||
242 | int newstep; | ||
243 | if(n<=1024) | ||
244 | { | ||
245 | T = sincos_lookup0 + magic_step; | ||
246 | newstep = step>>1; | ||
247 | } | ||
248 | else | ||
249 | { | ||
250 | T = sincos_lookup1; | ||
251 | newstep = 2; | ||
252 | } | ||
253 | |||
254 | #ifdef CPU_COLDFIRE | ||
255 | fixed32 * z2 = (fixed32 *)(&z[n4]); | ||
256 | int c = n4; | ||
257 | if (newstep == 2) | ||
258 | { | ||
259 | asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t" | ||
260 | "addq.l #8, %[z1]\n\t" | ||
261 | "movem.l (%[T]), %%d2-%%d3\n\t" | ||
262 | "addq.l #8, %[T]\n\t" | ||
263 | "bra.s 1f\n\t" | ||
264 | "0:\n\t" | ||
265 | "msac.l %%d1, %%d2, (%[T])+, %%a3, %%acc0\n\t" | ||
266 | "mac.l %%d0, %%d3, (%[T])+, %%a4, %%acc0\n\t" | ||
267 | |||
268 | "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t" | ||
269 | "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t" | ||
270 | |||
271 | "msac.l %%d1, %%a4, (%[T])+, %%d2, %%acc2\n\t" | ||
272 | "mac.l %%d0, %%a3, (%[T])+, %%d3, %%acc2\n\t" | ||
273 | "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t" | ||
274 | "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t" | ||
275 | |||
276 | "movclr.l %%acc0, %%a3\n\t" | ||
277 | "movclr.l %%acc3, %%a4\n\t" | ||
278 | "movem.l %%a3-%%a4, (-16, %[z1])\n\t" | ||
279 | |||
280 | "movclr.l %%acc1, %%a4\n\t" | ||
281 | "movclr.l %%acc2, %%a3\n\t" | ||
282 | "movem.l %%a3-%%a4, (%[z2])\n\t" | ||
283 | |||
284 | "subq.l #2, %[n]\n\t" | ||
285 | "1:\n\t" | ||
286 | "bhi.s 0b\n\t" | ||
287 | : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c) | ||
288 | : | ||
289 | : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory"); | ||
290 | } | ||
291 | else | ||
292 | { | ||
293 | asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t" | ||
294 | "addq.l #8, %[z1]\n\t" | ||
295 | "movem.l (%[T]), %%d2-%%d3\n\t" | ||
296 | "lea (%[newstep]*4, %[T]), %[T]\n\t" | ||
297 | "bra.s 1f\n\t" | ||
298 | "0:\n\t" | ||
299 | "msac.l %%d1, %%d2, (%[T]), %%a3, %%acc0\n\t" | ||
300 | "mac.l %%d0, %%d3, (4, %[T]), %%a4, %%acc0\n\t" | ||
301 | "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t" | ||
302 | "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t" | ||
303 | |||
304 | "lea (%[newstep]*4, %[T]), %[T]\n\t" | ||
305 | "msac.l %%d1, %%a4, (%[T]), %%d2, %%acc2\n\t" | ||
306 | "mac.l %%d0, %%a3, (4, %[T]), %%d3, %%acc2\n\t" | ||
307 | "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t" | ||
308 | "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t" | ||
309 | |||
310 | "lea (%[newstep]*4, %[T]), %[T]\n\t" | ||
311 | |||
312 | "movclr.l %%acc0, %%a3\n\t" | ||
313 | "movclr.l %%acc3, %%a4\n\t" | ||
314 | "movem.l %%a3-%%a4, (-16, %[z1])\n\t" | ||
315 | |||
316 | "movclr.l %%acc1, %%a4\n\t" | ||
317 | "movclr.l %%acc2, %%a3\n\t" | ||
318 | "movem.l %%a3-%%a4, (%[z2])\n\t" | ||
319 | |||
320 | "subq.l #2, %[n]\n\t" | ||
321 | "1:\n\t" | ||
322 | "bhi.s 0b\n\t" | ||
323 | : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c) | ||
324 | : [newstep] "d" (newstep) | ||
325 | : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory"); | ||
326 | } | ||
327 | #else | ||
328 | fixed32 * z2 = (fixed32 *)(&z[n4-1]); | ||
329 | while(z1<z2) | ||
330 | { | ||
331 | fixed32 r0,i0,r1,i1; | ||
332 | XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep; | ||
333 | XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep; | ||
334 | z1[0] = -r0; | ||
335 | z1[1] = -i0; | ||
336 | z2[0] = -r1; | ||
337 | z2[1] = -i1; | ||
338 | z1+=2; | ||
339 | z2-=2; | ||
340 | } | ||
341 | #endif | ||
342 | break; | ||
343 | } | ||
344 | |||
345 | case 12: /* n=4096 */ | ||
346 | { | ||
347 | /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */ | ||
348 | const int32_t * V = sincos_lookup1; | ||
349 | T = sincos_lookup0; | ||
350 | int32_t t0,t1,v0,v1; | ||
351 | fixed32 * z1 = (fixed32 *)(&z[0]); | ||
352 | fixed32 * z2 = (fixed32 *)(&z[n4-1]); | ||
353 | |||
354 | t0 = T[0]>>1; t1=T[1]>>1; | ||
355 | |||
356 | while(z1<z2) | ||
357 | { | ||
358 | fixed32 r0,i0,r1,i1; | ||
359 | t0 += (v0 = (V[0]>>1)); | ||
360 | t1 += (v1 = (V[1]>>1)); | ||
361 | XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 ); | ||
362 | T+=2; | ||
363 | v0 += (t0 = (T[0]>>1)); | ||
364 | v1 += (t1 = (T[1]>>1)); | ||
365 | XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 ); | ||
366 | z1[0] = -r0; | ||
367 | z1[1] = -i0; | ||
368 | z2[0] = -r1; | ||
369 | z2[1] = -i1; | ||
370 | z1+=2; | ||
371 | z2-=2; | ||
372 | V+=2; | ||
373 | } | ||
374 | |||
375 | break; | ||
376 | } | ||
377 | |||
378 | case 13: /* n = 8192 */ | ||
379 | { | ||
380 | /* weight linear interpolation between sincos_lookup0 and sincos_lookup1 | ||
381 | specifically: 25:75 for first twiddle and 75:25 for second twiddle */ | ||
382 | const int32_t * V = sincos_lookup1; | ||
383 | T = sincos_lookup0; | ||
384 | int32_t t0,t1,v0,v1,q0,q1; | ||
385 | fixed32 * z1 = (fixed32 *)(&z[0]); | ||
386 | fixed32 * z2 = (fixed32 *)(&z[n4-1]); | ||
387 | |||
388 | t0 = T[0]; t1=T[1]; | ||
389 | |||
390 | while(z1<z2) | ||
391 | { | ||
392 | fixed32 r0,i0,r1,i1; | ||
393 | v0 = V[0]; v1 = V[1]; | ||
394 | t0 += (q0 = (v0-t0)>>1); | ||
395 | t1 += (q1 = (v1-t1)>>1); | ||
396 | XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 ); | ||
397 | t0 = v0-q0; | ||
398 | t1 = v1-q1; | ||
399 | XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 ); | ||
400 | z1[0] = -r0; | ||
401 | z1[1] = -i0; | ||
402 | z2[0] = -r1; | ||
403 | z2[1] = -i1; | ||
404 | z1+=2; | ||
405 | z2-=2; | ||
406 | T+=2; | ||
407 | |||
408 | t0 = T[0]; t1 = T[1]; | ||
409 | v0 += (q0 = (t0-v0)>>1); | ||
410 | v1 += (q1 = (t1-v1)>>1); | ||
411 | XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 ); | ||
412 | v0 = t0-q0; | ||
413 | v1 = t1-q1; | ||
414 | XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 ); | ||
415 | z1[0] = -r0; | ||
416 | z1[1] = -i0; | ||
417 | z2[0] = -r1; | ||
418 | z2[1] = -i1; | ||
419 | z1+=2; | ||
420 | z2-=2; | ||
421 | V+=2; | ||
422 | } | ||
423 | |||
424 | break; | ||
425 | } | ||
426 | } | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * Compute inverse MDCT of size N = 2^nbits | ||
431 | * @param output N samples | ||
432 | * @param input N/2 samples | ||
433 | * "In-place" processing can be achieved provided that: | ||
434 | * [0 .. N/2-1 | N/2 .. N-1 ] | ||
435 | * <----input----> | ||
436 | * <-----------output-----------> | ||
437 | * | ||
438 | * The result of ff_imdct_half is to put the 'half' imdct here | ||
439 | * | ||
440 | * N/2 N-1 | ||
441 | * <--half imdct--> | ||
442 | * | ||
443 | * We want it here for the full imdct: | ||
444 | * N/4 3N/4-1 | ||
445 | * <--------------> | ||
446 | * | ||
447 | * In addition we need to apply two symmetries to get the full imdct: | ||
448 | * | ||
449 | * <AAAAAA> <DDDDDD> | ||
450 | * <BBBBBB><CCCCCC> | ||
451 | * | ||
452 | * D is a reflection of C | ||
453 | * A is a reflection of B (but with sign flipped) | ||
454 | * | ||
455 | * We process the symmetries at the same time as we 'move' the half imdct | ||
456 | * from [N/2,N-1] to [N/4,3N/4-1] | ||
457 | * | ||
458 | * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1] | ||
459 | * This would require being able to use revtab 'inplace' (since the input | ||
460 | * and output of imdct_half would then overlap somewhat) | ||
461 | */ | ||
462 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT; | ||
463 | #ifndef CPU_ARM | ||
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    /* tell imdct_half to put the output in [N/2..3N/4-1] i.e. output+n2 */
    ff_imdct_half(nbits,output+n2,input);

    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    /* Copy BBBB to AAAA, reflected and sign-flipped.
       Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]).
       Processes 8 samples per iteration: out_r ascends from the start of the
       buffer, out_r2 descends towards N/4, in_r descends through BBBB. */
    out_r = output;
    out_r2 = output+n2-8;
    in_r  = output+n2+n4-8;
    while(out_r<out_r2)
    {
#if defined CPU_COLDFIRE
        /* ColdFire version of the 8-sample copy/reflect step in the #else
           branch: one movem load, one movem store for the straight copy,
           then the 8 values negated and stored in reverse order. */
        asm volatile(
            "movem.l (%[in_r]), %%d0-%%d7\n\t"
            "movem.l %%d0-%%d7, (%[out_r2])\n\t"
            "neg.l %%d7\n\t"
            "move.l %%d7, (%[out_r])+\n\t"
            "neg.l %%d6\n\t"
            "move.l %%d6, (%[out_r])+\n\t"
            "neg.l %%d5\n\t"
            "move.l %%d5, (%[out_r])+\n\t"
            "neg.l %%d4\n\t"
            "move.l %%d4, (%[out_r])+\n\t"
            "neg.l %%d3\n\t"
            "move.l %%d3, (%[out_r])+\n\t"
            "neg.l %%d2\n\t"
            "move.l %%d2, (%[out_r])+\n\t"
            "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
            "neg.l %%d1\n\t"
            "move.l %%d1, (%[out_r])+\n\t"
            "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
            "neg.l %%d0\n\t"
            "move.l %%d0, (%[out_r])+\n\t"
            : [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
#else
        /* straight copy to out_r2, reversed + negated copy to out_r */
        out_r[0] = -(out_r2[7] = in_r[7]);
        out_r[1] = -(out_r2[6] = in_r[6]);
        out_r[2] = -(out_r2[5] = in_r[5]);
        out_r[3] = -(out_r2[4] = in_r[4]);
        out_r[4] = -(out_r2[3] = in_r[3]);
        out_r[5] = -(out_r2[2] = in_r[2]);
        out_r[6] = -(out_r2[1] = in_r[1]);
        out_r[7] = -(out_r2[0] = in_r[0]);
        in_r -= 8;
        out_r += 8;
        out_r2 -= 8;
#endif
    }

    /* Second pass: copy and reflect CCCC into DDDD, 4 samples from each end
       per iteration (detailed commentary on the ^a/^b/^c/^d scheme below). */
    in_r = output + n2+n4;
    in_r2 = output + n-4;
    out_r = output + n2;
    out_r2 = output + n2 + n4 - 4;
    while(in_r<in_r2)
    {
#if defined CPU_COLDFIRE
        /* ColdFire version of the #1/#2/#3 copy-and-swap steps documented
           in the #else branch; the register shuffles perform the 4-word
           reversals before the movem writebacks. */
        asm volatile(
            "movem.l (%[in_r]), %%d0-%%d3\n\t"
            "movem.l %%d0-%%d3, (%[out_r])\n\t"
            "movem.l (%[in_r2]), %%d4-%%d7\n\t"
            "movem.l %%d4-%%d7, (%[out_r2])\n\t"
            "move.l %%d0, %%a3\n\t"
            "move.l %%d3, %%d0\n\t"
            "move.l %%d1, %%d3\n\t"
            "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
            "move.l %%d7, %%d1\n\t"
            "move.l %%d6, %%d2\n\t"
            "move.l %%d5, %%d3\n\t"
            "movem.l %%d1-%%d4, (%[in_r])\n\t"
            "lea.l (4*4, %[in_r]), %[in_r]\n\t"
            "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
            "lea.l (4*4, %[out_r]), %[out_r]\n\t"
            "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
            : [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
              [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
#else
        register fixed32 t0,t1,t2,t3;
        register fixed32 s0,s1,s2,s3;

        /* Copy and reflect CCCC to DDDD. Because CCCC is already where
           we actually want to put DDDD this is a bit complicated.
         * So simultaneously do the following things:
         * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
         * 2. reflect range from [n2+n4 .. n-1] inplace
         *
         *  [                      |                        ]
         *   ^a ->            <- ^b ^c ->               <- ^d
         *
         *  #1: copy from ^c to ^a
         *  #2: copy from ^d to ^b
         *  #3: swap ^c and ^d in place
         */
        /* #1 pt1 : load 4 words from ^c. */
        t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
        /* #1 pt2 : write to ^a */
        out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
        /* #2 pt1 : load 4 words from ^d */
        s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
        /* #2 pt2 : write to ^b */
        out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
        /* #3 pt1 : write words from #2 to ^c */
        in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
        /* #3 pt2 : write words from #1 to ^d */
        in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;

        in_r += 4;
        in_r2 -= 4;
        out_r += 4;
        out_r2 -= 4;
#endif
    }
}
586 | #else | ||
/* Follows the same structure as the canonical version above */
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    /* half imdct lands in [N/2 .. N-1]; the loops below unfold it to the
       full [0 .. N-1] range as described in the block comment above the
       canonical version */
    ff_imdct_half(nbits,output+n2,input);

    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    /* First pass: 8 samples per iteration.
       ldmdb/stmdb copy a descending 8-word chunk of BBBB into place below
       n2; the rsb (reverse-subtract from 0, i.e. negate) sequence builds
       the reversed, sign-flipped AAAA chunk stored ascending via stmia. */
    out_r = output;
    out_r2 = output+n2;
    in_r  = output+n2+n4;
    while(out_r<out_r2)
    {
        asm volatile(
            "ldmdb %[in_r]!, {r0-r7}\n\t"
            "stmdb %[out_r2]!, {r0-r7}\n\t"
            "rsb r8,r0,#0\n\t"
            "rsb r0,r7,#0\n\t"
            "rsb r7,r1,#0\n\t"
            "rsb r1,r6,#0\n\t"
            "rsb r6,r2,#0\n\t"
            "rsb r2,r5,#0\n\t"
            "rsb r5,r3,#0\n\t"
            "rsb r3,r4,#0\n\t"
            "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
            : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }

    /* Second pass: copy CCCC down to [n2 .. n2+n4-1] and reflect
       [n2+n4 .. n-1] in place, 4 words from each end per iteration —
       same ^a/^b/^c/^d scheme as the canonical version's second loop;
       the mov shuffles reverse each 4-word group before writeback. */
    in_r = output + n2+n4;
    in_r2 = output + n;
    out_r = output + n2;
    out_r2 = output + n2 + n4;
    while(in_r<in_r2)
    {
        asm volatile(
            "ldmia %[in_r], {r0-r3}\n\t"
            "stmia %[out_r]!, {r0-r3}\n\t"
            "ldmdb %[in_r2], {r5-r8}\n\t"
            "stmdb %[out_r2]!, {r5-r8}\n\t"
            "mov r4,r0\n\t"
            "mov r0,r3\n\t"
            "mov r3,r1\n\t"
            "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
            "mov r4,r8\n\t"
            "mov r8,r5\n\t"
            "mov r5,r7\n\t"
            "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
            :
            [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }
}
644 | #endif | ||