diff options
author | Dave Hooper <dave@beermex.com> | 2010-02-17 00:49:53 +0000 |
---|---|---|
committer | Dave Hooper <dave@beermex.com> | 2010-02-17 00:49:53 +0000 |
commit | 42774d3128b91d5a37344cb40d56d3c4d147e5f2 (patch) | |
tree | bf336b407992ec9a5e454556f3351e3f8a0d10de /apps/codecs/lib/mdct.c | |
parent | 62257ebc38bc0a3095b25dd0f58c4c8215edf602 (diff) | |
download | rockbox-42774d3128b91d5a37344cb40d56d3c4d147e5f2.tar.gz rockbox-42774d3128b91d5a37344cb40d56d3c4d147e5f2.zip |
Merge from branches/mdctexp - faster ifft+imdct in codec lib
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24712 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/lib/mdct.c')
-rw-r--r-- | apps/codecs/lib/mdct.c | 414 |
1 files changed, 414 insertions, 0 deletions
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c new file mode 100644 index 0000000000..03baa4db4a --- /dev/null +++ b/apps/codecs/lib/mdct.c | |||
@@ -0,0 +1,414 @@ | |||
1 | /* | ||
2 | * Fixed Point IMDCT | ||
3 | * Copyright (c) 2002 The FFmpeg Project. | ||
4 | * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli | ||
5 | * | ||
6 | * This library is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This library is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with this library; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include "codeclib.h" | ||
22 | #include "mdct.h" | ||
23 | #include "asm_arm.h" | ||
24 | #include "asm_mcf5249.h" | ||
25 | #include "codeclib_misc.h" | ||
26 | #include "mdct_lookup.h" | ||
27 | |||
28 | /** | ||
29 | * Compute the middle half of the inverse MDCT of size N = 2^nbits | ||
30 | * thus excluding the parts that can be derived by symmetry | ||
31 | * @param output N/2 samples | ||
32 | * @param input N/2 samples | ||
33 | * | ||
34 | * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must | ||
35 | * not overlap or intersect at all) | ||
36 | */ | ||
37 | void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) | ||
38 | { | ||
39 | int n8, n4, n2, n, j; | ||
40 | const fixed32 *in1, *in2; | ||
41 | |||
42 | n = 1 << nbits; | ||
43 | |||
44 | n2 = n >> 1; | ||
45 | n4 = n >> 2; | ||
46 | n8 = n >> 3; | ||
47 | |||
48 | FFTComplex *z = (FFTComplex *)output; | ||
49 | |||
50 | /* pre rotation */ | ||
51 | in1 = input; | ||
52 | in2 = input + n2 - 1; | ||
53 | |||
54 | /* revtab comes from the fft; revtab table is sized for N=4096 size fft = 2^12. | ||
55 | The fft is size N/4 so s->nbits-2, so our shift needs to be (12-(nbits-2)) */ | ||
56 | const int revtab_shift = (14- nbits); | ||
57 | |||
58 | /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */ | ||
59 | /* (note that when using the current split radix, the bitreverse ordering is | ||
60 | complex, meaning that this reordering cannot easily be done in-place) */ | ||
61 | /* Using the following pdf, you can see that it is possible to rearrange | ||
62 | the 'classic' pre/post rotate with an alternative one that enables | ||
63 | us to use fewer distinct twiddle factors. | ||
64 | http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf | ||
65 | |||
66 | For prerotation, the factors are just sin,cos(2PI*i/N) | ||
67 | For postrotation, the factors are sin,cos(2PI*(i+1/4)/N) | ||
68 | |||
69 | Therefore, prerotation can immediately reuse the same twiddles as fft | ||
70 | (for postrotation it's still a bit complex, so this is still using | ||
71 | an mdct-local set of twiddles to do that part) | ||
72 | */ | ||
73 | const int32_t *T = sincos_lookup0; | ||
74 | const int step = 2<<(12-nbits); | ||
75 | const uint16_t * p_revtab=revtab; | ||
76 | { | ||
77 | const uint16_t * const p_revtab_end = p_revtab + n8; | ||
78 | while(LIKELY(p_revtab < p_revtab_end)) | ||
79 | { | ||
80 | j = (*p_revtab)>>revtab_shift; | ||
81 | XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im ); | ||
82 | T += step; | ||
83 | in1 += 2; | ||
84 | in2 -= 2; | ||
85 | p_revtab++; | ||
86 | j = (*p_revtab)>>revtab_shift; | ||
87 | XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im ); | ||
88 | T += step; | ||
89 | in1 += 2; | ||
90 | in2 -= 2; | ||
91 | p_revtab++; | ||
92 | } | ||
93 | } | ||
94 | { | ||
95 | const uint16_t * const p_revtab_end = p_revtab + n8; | ||
96 | while(LIKELY(p_revtab < p_revtab_end)) | ||
97 | { | ||
98 | j = (*p_revtab)>>revtab_shift; | ||
99 | XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im); | ||
100 | T -= step; | ||
101 | in1 += 2; | ||
102 | in2 -= 2; | ||
103 | p_revtab++; | ||
104 | j = (*p_revtab)>>revtab_shift; | ||
105 | XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im); | ||
106 | T -= step; | ||
107 | in1 += 2; | ||
108 | in2 -= 2; | ||
109 | p_revtab++; | ||
110 | } | ||
111 | } | ||
112 | |||
113 | |||
114 | /* ... and so fft runs in OUTPUT buffer */ | ||
115 | ff_fft_calc_c(nbits-2, z); | ||
116 | |||
117 | /* post rotation + reordering. now keeps the result within the OUTPUT buffer */ | ||
118 | switch( nbits ) | ||
119 | { | ||
120 | default: | ||
121 | { | ||
122 | fixed32 * z1 = (fixed32 *)(&z[0]); | ||
123 | fixed32 * z2 = (fixed32 *)(&z[n4-1]); | ||
124 | int magic_step = step>>2; | ||
125 | int newstep; | ||
126 | if(n<=1024) | ||
127 | { | ||
128 | T = sincos_lookup0 + magic_step; | ||
129 | newstep = step>>1; | ||
130 | } | ||
131 | else | ||
132 | { | ||
133 | T = sincos_lookup1; | ||
134 | newstep = 2; | ||
135 | } | ||
136 | |||
137 | while(z1<z2) | ||
138 | { | ||
139 | fixed32 r0,i0,r1,i1; | ||
140 | XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep; | ||
141 | XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep; | ||
142 | z1[0] = r0; | ||
143 | z1[1] = i0; | ||
144 | z2[0] = r1; | ||
145 | z2[1] = i1; | ||
146 | z1+=2; | ||
147 | z2-=2; | ||
148 | } | ||
149 | |||
150 | break; | ||
151 | } | ||
152 | |||
153 | case 12: /* n=4096 */ | ||
154 | { | ||
155 | /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */ | ||
156 | const int32_t * V = sincos_lookup1; | ||
157 | T = sincos_lookup0; | ||
158 | int32_t t0,t1,v0,v1; | ||
159 | fixed32 * z1 = (fixed32 *)(&z[0]); | ||
160 | fixed32 * z2 = (fixed32 *)(&z[n4-1]); | ||
161 | |||
162 | t0 = T[0]>>1; t1=T[1]>>1; | ||
163 | |||
164 | while(z1<z2) | ||
165 | { | ||
166 | fixed32 r0,i0,r1,i1; | ||
167 | t0 += (v0 = (V[0]>>1)); | ||
168 | t1 += (v1 = (V[1]>>1)); | ||
169 | XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 ); | ||
170 | T+=2; | ||
171 | v0 += (t0 = (T[0]>>1)); | ||
172 | v1 += (t1 = (T[1]>>1)); | ||
173 | XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 ); | ||
174 | z1[0] = r0; | ||
175 | z1[1] = i0; | ||
176 | z2[0] = r1; | ||
177 | z2[1] = i1; | ||
178 | z1+=2; | ||
179 | z2-=2; | ||
180 | V+=2; | ||
181 | } | ||
182 | |||
183 | break; | ||
184 | } | ||
185 | |||
186 | case 13: /* n = 8192 */ | ||
187 | { | ||
188 | /* weight linear interpolation between sincos_lookup0 and sincos_lookup1 | ||
189 | specifically: 25:75 for first twiddle and 75:25 for second twiddle */ | ||
190 | const int32_t * V = sincos_lookup1; | ||
191 | T = sincos_lookup0; | ||
192 | int32_t t0,t1,v0,v1,q0,q1; | ||
193 | fixed32 * z1 = (fixed32 *)(&z[0]); | ||
194 | fixed32 * z2 = (fixed32 *)(&z[n4-1]); | ||
195 | |||
196 | t0 = T[0]; t1=T[1]; | ||
197 | |||
198 | while(z1<z2) | ||
199 | { | ||
200 | fixed32 r0,i0,r1,i1; | ||
201 | v0 = V[0]; v1 = V[1]; | ||
202 | t0 += (q0 = (v0-t0)>>1); | ||
203 | t1 += (q1 = (v1-t1)>>1); | ||
204 | XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 ); | ||
205 | t0 = v0-q0; | ||
206 | t1 = v1-q1; | ||
207 | XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 ); | ||
208 | z1[0] = r0; | ||
209 | z1[1] = i0; | ||
210 | z2[0] = r1; | ||
211 | z2[1] = i1; | ||
212 | z1+=2; | ||
213 | z2-=2; | ||
214 | T+=2; | ||
215 | |||
216 | t0 = T[0]; t1 = T[1]; | ||
217 | v0 += (q0 = (t0-v0)>>1); | ||
218 | v1 += (q1 = (t1-v1)>>1); | ||
219 | XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 ); | ||
220 | v0 = t0-q0; | ||
221 | v1 = t1-q1; | ||
222 | XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 ); | ||
223 | z1[0] = r0; | ||
224 | z1[1] = i0; | ||
225 | z2[0] = r1; | ||
226 | z2[1] = i1; | ||
227 | z1+=2; | ||
228 | z2-=2; | ||
229 | V+=2; | ||
230 | } | ||
231 | |||
232 | break; | ||
233 | } | ||
234 | } | ||
235 | } | ||
236 | |||
237 | /** | ||
238 | * Compute inverse MDCT of size N = 2^nbits | ||
239 | * @param output N samples | ||
240 | * @param input N/2 samples | ||
241 | * "In-place" processing can be achieved provided that: | ||
242 | * [0 .. N/2-1 | N/2 .. N-1 ] | ||
243 | * <----input----> | ||
244 | * <-----------output-----------> | ||
245 | * | ||
246 | */ | ||
247 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) | ||
248 | { | ||
249 | const int n = (1<<nbits); | ||
250 | const int n2 = (n>>1); | ||
251 | const int n4 = (n>>2); | ||
252 | |||
253 | ff_imdct_half(nbits,output+n2,input); | ||
254 | |||
255 | /* reflect the half imdct into the full N samples */ | ||
256 | /* TODO: this could easily be optimised more! */ | ||
257 | fixed32 * in_r, * in_r2, * out_r, * out_r2; | ||
258 | |||
259 | out_r = output; | ||
260 | out_r2 = output+n2-8; | ||
261 | in_r = output+n2+n4-8; | ||
262 | while(out_r<out_r2) | ||
263 | { | ||
264 | out_r[0] = -(out_r2[7] = in_r[7]); | ||
265 | out_r[1] = -(out_r2[6] = in_r[6]); | ||
266 | out_r[2] = -(out_r2[5] = in_r[5]); | ||
267 | out_r[3] = -(out_r2[4] = in_r[4]); | ||
268 | out_r[4] = -(out_r2[3] = in_r[3]); | ||
269 | out_r[5] = -(out_r2[2] = in_r[2]); | ||
270 | out_r[6] = -(out_r2[1] = in_r[1]); | ||
271 | out_r[7] = -(out_r2[0] = in_r[0]); | ||
272 | in_r -= 8; | ||
273 | out_r += 8; | ||
274 | out_r2 -= 8; | ||
275 | } | ||
276 | |||
277 | in_r = output + n2+n4; | ||
278 | in_r2 = output + n-4; | ||
279 | out_r = output + n2; | ||
280 | out_r2 = output + n2 + n4 - 4; | ||
281 | while(in_r<in_r2) | ||
282 | { | ||
283 | register fixed32 t0,t1,t2,t3; | ||
284 | register fixed32 s0,s1,s2,s3; | ||
285 | |||
286 | //simultaneously do the following things: | ||
287 | // 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1] | ||
288 | // 2. reflect range from [n2+n4 .. n-1] inplace | ||
289 | // | ||
290 | // [ | ] | ||
291 | // ^a -> <- ^b ^c -> <- ^d | ||
292 | // | ||
293 | // #1: copy from ^c to ^a | ||
294 | // #2: copy from ^d to ^b | ||
295 | // #3: swap ^c and ^d in place | ||
296 | // | ||
297 | // #1 pt1 : load 4 words from ^c. | ||
298 | t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3]; | ||
299 | // #1 pt2 : write to ^a | ||
300 | out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3; | ||
301 | // #2 pt1 : load 4 words from ^d | ||
302 | s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3]; | ||
303 | // #2 pt2 : write to ^b | ||
304 | out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3; | ||
305 | // #3 pt1 : write words from #2 to ^c | ||
306 | in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0; | ||
307 | // #3 pt2 : write words from #1 to ^d | ||
308 | in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0; | ||
309 | |||
310 | in_r += 4; | ||
311 | in_r2 -= 4; | ||
312 | out_r += 4; | ||
313 | out_r2 -= 4; | ||
314 | } | ||
315 | } | ||
316 | |||
317 | static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */ | ||
318 | |||
319 | /* Table of values of atan(2^-i) in 0.32 format fractions of pi where pi = 0xffffffff / 2 */ | ||
320 | static const unsigned long atan_table[] = { | ||
321 | 0x1fffffff, /* +0.785398163 (or pi/4) */ | ||
322 | 0x12e4051d, /* +0.463647609 */ | ||
323 | 0x09fb385b, /* +0.244978663 */ | ||
324 | 0x051111d4, /* +0.124354995 */ | ||
325 | 0x028b0d43, /* +0.062418810 */ | ||
326 | 0x0145d7e1, /* +0.031239833 */ | ||
327 | 0x00a2f61e, /* +0.015623729 */ | ||
328 | 0x00517c55, /* +0.007812341 */ | ||
329 | 0x0028be53, /* +0.003906230 */ | ||
330 | 0x00145f2e, /* +0.001953123 */ | ||
331 | 0x000a2f98, /* +0.000976562 */ | ||
332 | 0x000517cc, /* +0.000488281 */ | ||
333 | 0x00028be6, /* +0.000244141 */ | ||
334 | 0x000145f3, /* +0.000122070 */ | ||
335 | 0x0000a2f9, /* +0.000061035 */ | ||
336 | 0x0000517c, /* +0.000030518 */ | ||
337 | 0x000028be, /* +0.000015259 */ | ||
338 | 0x0000145f, /* +0.000007629 */ | ||
339 | 0x00000a2f, /* +0.000003815 */ | ||
340 | 0x00000517, /* +0.000001907 */ | ||
341 | 0x0000028b, /* +0.000000954 */ | ||
342 | 0x00000145, /* +0.000000477 */ | ||
343 | 0x000000a2, /* +0.000000238 */ | ||
344 | 0x00000051, /* +0.000000119 */ | ||
345 | 0x00000028, /* +0.000000060 */ | ||
346 | 0x00000014, /* +0.000000030 */ | ||
347 | 0x0000000a, /* +0.000000015 */ | ||
348 | 0x00000005, /* +0.000000007 */ | ||
349 | 0x00000002, /* +0.000000004 */ | ||
350 | 0x00000001, /* +0.000000002 */ | ||
351 | 0x00000000, /* +0.000000001 */ | ||
352 | 0x00000000, /* +0.000000000 */ | ||
353 | }; | ||
354 | |||
355 | /** | ||
356 | * Implements sin and cos using CORDIC rotation. | ||
357 | * | ||
358 | * @param phase has range from 0 to 0xffffffff, representing 0 and | ||
359 | * 2*pi respectively. | ||
360 | * @param cos return address for cos | ||
361 | * @return sin of phase, value is a signed value from LONG_MIN to LONG_MAX, | ||
362 | * representing -1 and 1 respectively. | ||
363 | * | ||
364 | * Gives at least 24 bits precision (last 2-8 bits or so are probably off) | ||
365 | */ | ||
366 | |||
367 | long fsincos(unsigned long phase, fixed32 *cos) | ||
368 | { | ||
369 | int32_t x, x1, y, y1; | ||
370 | unsigned long z, z1; | ||
371 | int i; | ||
372 | |||
373 | /* Setup initial vector */ | ||
374 | x = cordic_circular_gain; | ||
375 | y = 0; | ||
376 | z = phase; | ||
377 | |||
378 | /* The phase has to be somewhere between 0..pi for this to work right */ | ||
379 | if (z < 0xffffffff / 4) { | ||
380 | /* z in first quadrant, z += pi/2 to correct */ | ||
381 | x = -x; | ||
382 | z += 0xffffffff / 4; | ||
383 | } else if (z < 3 * (0xffffffff / 4)) { | ||
384 | /* z in third quadrant, z -= pi/2 to correct */ | ||
385 | z -= 0xffffffff / 4; | ||
386 | } else { | ||
387 | /* z in fourth quadrant, z -= 3pi/2 to correct */ | ||
388 | x = -x; | ||
389 | z -= 3 * (0xffffffff / 4); | ||
390 | } | ||
391 | |||
392 | /* Each iteration adds roughly 1-bit of extra precision */ | ||
393 | for (i = 0; i < 31; i++) { | ||
394 | x1 = x >> i; | ||
395 | y1 = y >> i; | ||
396 | z1 = atan_table[i]; | ||
397 | |||
398 | /* Decided which direction to rotate vector. Pivot point is pi/2 */ | ||
399 | if (z >= 0xffffffff / 4) { | ||
400 | x -= y1; | ||
401 | y += x1; | ||
402 | z -= z1; | ||
403 | } else { | ||
404 | x += y1; | ||
405 | y -= x1; | ||
406 | z += z1; | ||
407 | } | ||
408 | } | ||
409 | |||
410 | if (cos) | ||
411 | *cos = x; | ||
412 | |||
413 | return y; | ||
414 | } | ||