summaryrefslogtreecommitdiff
path: root/apps/codecs/lib/mdct.c
diff options
context:
space:
mode:
Diffstat (limited to 'apps/codecs/lib/mdct.c')
-rw-r--r--apps/codecs/lib/mdct.c414
1 files changed, 414 insertions, 0 deletions
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c
new file mode 100644
index 0000000000..03baa4db4a
--- /dev/null
+++ b/apps/codecs/lib/mdct.c
@@ -0,0 +1,414 @@
1/*
2 * Fixed Point IMDCT
3 * Copyright (c) 2002 The FFmpeg Project.
4 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include "codeclib.h"
22#include "mdct.h"
23#include "asm_arm.h"
24#include "asm_mcf5249.h"
25#include "codeclib_misc.h"
26#include "mdct_lookup.h"
27
28/**
29 * Compute the middle half of the inverse MDCT of size N = 2^nbits
30 * thus excluding the parts that can be derived by symmetry
31 * @param output N/2 samples
32 * @param input N/2 samples
33 *
34 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
35 * not overlap or intersect at all)
36 */
37void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
38{
39 int n8, n4, n2, n, j;
40 const fixed32 *in1, *in2;
41
42 n = 1 << nbits;
43
44 n2 = n >> 1;
45 n4 = n >> 2;
46 n8 = n >> 3;
47
48 FFTComplex *z = (FFTComplex *)output;
49
50 /* pre rotation */
51 in1 = input;
52 in2 = input + n2 - 1;
53
54 /* revtab comes from the fft; revtab table is sized for N=4096 size fft = 2^12.
55 The fft is size N/4 so s->nbits-2, so our shift needs to be (12-(nbits-2)) */
56 const int revtab_shift = (14- nbits);
57
58 /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
59 /* (note that when using the current split radix, the bitreverse ordering is
60 complex, meaning that this reordering cannot easily be done in-place) */
61 /* Using the following pdf, you can see that it is possible to rearrange
62 the 'classic' pre/post rotate with an alternative one that enables
63 us to use fewer distinct twiddle factors.
64 http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf
65
66 For prerotation, the factors are just sin,cos(2PI*i/N)
67 For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
68
69 Therefore, prerotation can immediately reuse the same twiddles as fft
70 (for postrotation it's still a bit complex, so this is still using
71 an mdct-local set of twiddles to do that part)
72 */
73 const int32_t *T = sincos_lookup0;
74 const int step = 2<<(12-nbits);
75 const uint16_t * p_revtab=revtab;
76 {
77 const uint16_t * const p_revtab_end = p_revtab + n8;
78 while(LIKELY(p_revtab < p_revtab_end))
79 {
80 j = (*p_revtab)>>revtab_shift;
81 XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
82 T += step;
83 in1 += 2;
84 in2 -= 2;
85 p_revtab++;
86 j = (*p_revtab)>>revtab_shift;
87 XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
88 T += step;
89 in1 += 2;
90 in2 -= 2;
91 p_revtab++;
92 }
93 }
94 {
95 const uint16_t * const p_revtab_end = p_revtab + n8;
96 while(LIKELY(p_revtab < p_revtab_end))
97 {
98 j = (*p_revtab)>>revtab_shift;
99 XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
100 T -= step;
101 in1 += 2;
102 in2 -= 2;
103 p_revtab++;
104 j = (*p_revtab)>>revtab_shift;
105 XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
106 T -= step;
107 in1 += 2;
108 in2 -= 2;
109 p_revtab++;
110 }
111 }
112
113
114 /* ... and so fft runs in OUTPUT buffer */
115 ff_fft_calc_c(nbits-2, z);
116
117 /* post rotation + reordering. now keeps the result within the OUTPUT buffer */
118 switch( nbits )
119 {
120 default:
121 {
122 fixed32 * z1 = (fixed32 *)(&z[0]);
123 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
124 int magic_step = step>>2;
125 int newstep;
126 if(n<=1024)
127 {
128 T = sincos_lookup0 + magic_step;
129 newstep = step>>1;
130 }
131 else
132 {
133 T = sincos_lookup1;
134 newstep = 2;
135 }
136
137 while(z1<z2)
138 {
139 fixed32 r0,i0,r1,i1;
140 XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
141 XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
142 z1[0] = r0;
143 z1[1] = i0;
144 z2[0] = r1;
145 z2[1] = i1;
146 z1+=2;
147 z2-=2;
148 }
149
150 break;
151 }
152
153 case 12: /* n=4096 */
154 {
155 /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
156 const int32_t * V = sincos_lookup1;
157 T = sincos_lookup0;
158 int32_t t0,t1,v0,v1;
159 fixed32 * z1 = (fixed32 *)(&z[0]);
160 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
161
162 t0 = T[0]>>1; t1=T[1]>>1;
163
164 while(z1<z2)
165 {
166 fixed32 r0,i0,r1,i1;
167 t0 += (v0 = (V[0]>>1));
168 t1 += (v1 = (V[1]>>1));
169 XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
170 T+=2;
171 v0 += (t0 = (T[0]>>1));
172 v1 += (t1 = (T[1]>>1));
173 XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
174 z1[0] = r0;
175 z1[1] = i0;
176 z2[0] = r1;
177 z2[1] = i1;
178 z1+=2;
179 z2-=2;
180 V+=2;
181 }
182
183 break;
184 }
185
186 case 13: /* n = 8192 */
187 {
188 /* weight linear interpolation between sincos_lookup0 and sincos_lookup1
189 specifically: 25:75 for first twiddle and 75:25 for second twiddle */
190 const int32_t * V = sincos_lookup1;
191 T = sincos_lookup0;
192 int32_t t0,t1,v0,v1,q0,q1;
193 fixed32 * z1 = (fixed32 *)(&z[0]);
194 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
195
196 t0 = T[0]; t1=T[1];
197
198 while(z1<z2)
199 {
200 fixed32 r0,i0,r1,i1;
201 v0 = V[0]; v1 = V[1];
202 t0 += (q0 = (v0-t0)>>1);
203 t1 += (q1 = (v1-t1)>>1);
204 XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
205 t0 = v0-q0;
206 t1 = v1-q1;
207 XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
208 z1[0] = r0;
209 z1[1] = i0;
210 z2[0] = r1;
211 z2[1] = i1;
212 z1+=2;
213 z2-=2;
214 T+=2;
215
216 t0 = T[0]; t1 = T[1];
217 v0 += (q0 = (t0-v0)>>1);
218 v1 += (q1 = (t1-v1)>>1);
219 XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
220 v0 = t0-q0;
221 v1 = t1-q1;
222 XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
223 z1[0] = r0;
224 z1[1] = i0;
225 z2[0] = r1;
226 z2[1] = i1;
227 z1+=2;
228 z2-=2;
229 V+=2;
230 }
231
232 break;
233 }
234 }
235}
236
237/**
238 * Compute inverse MDCT of size N = 2^nbits
239 * @param output N samples
240 * @param input N/2 samples
241 * "In-place" processing can be achieved provided that:
242 * [0 .. N/2-1 | N/2 .. N-1 ]
243 * <----input---->
244 * <-----------output----------->
245 *
246 */
247void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
248{
249 const int n = (1<<nbits);
250 const int n2 = (n>>1);
251 const int n4 = (n>>2);
252
253 ff_imdct_half(nbits,output+n2,input);
254
255 /* reflect the half imdct into the full N samples */
256 /* TODO: this could easily be optimised more! */
257 fixed32 * in_r, * in_r2, * out_r, * out_r2;
258
259 out_r = output;
260 out_r2 = output+n2-8;
261 in_r = output+n2+n4-8;
262 while(out_r<out_r2)
263 {
264 out_r[0] = -(out_r2[7] = in_r[7]);
265 out_r[1] = -(out_r2[6] = in_r[6]);
266 out_r[2] = -(out_r2[5] = in_r[5]);
267 out_r[3] = -(out_r2[4] = in_r[4]);
268 out_r[4] = -(out_r2[3] = in_r[3]);
269 out_r[5] = -(out_r2[2] = in_r[2]);
270 out_r[6] = -(out_r2[1] = in_r[1]);
271 out_r[7] = -(out_r2[0] = in_r[0]);
272 in_r -= 8;
273 out_r += 8;
274 out_r2 -= 8;
275 }
276
277 in_r = output + n2+n4;
278 in_r2 = output + n-4;
279 out_r = output + n2;
280 out_r2 = output + n2 + n4 - 4;
281 while(in_r<in_r2)
282 {
283 register fixed32 t0,t1,t2,t3;
284 register fixed32 s0,s1,s2,s3;
285
286 //simultaneously do the following things:
287 // 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
288 // 2. reflect range from [n2+n4 .. n-1] inplace
289 //
290 // [ | ]
291 // ^a -> <- ^b ^c -> <- ^d
292 //
293 // #1: copy from ^c to ^a
294 // #2: copy from ^d to ^b
295 // #3: swap ^c and ^d in place
296 //
297 // #1 pt1 : load 4 words from ^c.
298 t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
299 // #1 pt2 : write to ^a
300 out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
301 // #2 pt1 : load 4 words from ^d
302 s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
303 // #2 pt2 : write to ^b
304 out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
305 // #3 pt1 : write words from #2 to ^c
306 in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
307 // #3 pt2 : write words from #1 to ^d
308 in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;
309
310 in_r += 4;
311 in_r2 -= 4;
312 out_r += 4;
313 out_r2 -= 4;
314 }
315}
316
317static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */
318
319/* Table of values of atan(2^-i) in 0.32 format fractions of pi where pi = 0xffffffff / 2 */
320static const unsigned long atan_table[] = {
321 0x1fffffff, /* +0.785398163 (or pi/4) */
322 0x12e4051d, /* +0.463647609 */
323 0x09fb385b, /* +0.244978663 */
324 0x051111d4, /* +0.124354995 */
325 0x028b0d43, /* +0.062418810 */
326 0x0145d7e1, /* +0.031239833 */
327 0x00a2f61e, /* +0.015623729 */
328 0x00517c55, /* +0.007812341 */
329 0x0028be53, /* +0.003906230 */
330 0x00145f2e, /* +0.001953123 */
331 0x000a2f98, /* +0.000976562 */
332 0x000517cc, /* +0.000488281 */
333 0x00028be6, /* +0.000244141 */
334 0x000145f3, /* +0.000122070 */
335 0x0000a2f9, /* +0.000061035 */
336 0x0000517c, /* +0.000030518 */
337 0x000028be, /* +0.000015259 */
338 0x0000145f, /* +0.000007629 */
339 0x00000a2f, /* +0.000003815 */
340 0x00000517, /* +0.000001907 */
341 0x0000028b, /* +0.000000954 */
342 0x00000145, /* +0.000000477 */
343 0x000000a2, /* +0.000000238 */
344 0x00000051, /* +0.000000119 */
345 0x00000028, /* +0.000000060 */
346 0x00000014, /* +0.000000030 */
347 0x0000000a, /* +0.000000015 */
348 0x00000005, /* +0.000000007 */
349 0x00000002, /* +0.000000004 */
350 0x00000001, /* +0.000000002 */
351 0x00000000, /* +0.000000001 */
352 0x00000000, /* +0.000000000 */
353};
354
355/**
356 * Implements sin and cos using CORDIC rotation.
357 *
358 * @param phase has range from 0 to 0xffffffff, representing 0 and
359 * 2*pi respectively.
360 * @param cos return address for cos
361 * @return sin of phase, value is a signed value from LONG_MIN to LONG_MAX,
362 * representing -1 and 1 respectively.
363 *
364 * Gives at least 24 bits precision (last 2-8 bits or so are probably off)
365 */
366
367long fsincos(unsigned long phase, fixed32 *cos)
368{
369 int32_t x, x1, y, y1;
370 unsigned long z, z1;
371 int i;
372
373 /* Setup initial vector */
374 x = cordic_circular_gain;
375 y = 0;
376 z = phase;
377
378 /* The phase has to be somewhere between 0..pi for this to work right */
379 if (z < 0xffffffff / 4) {
380 /* z in first quadrant, z += pi/2 to correct */
381 x = -x;
382 z += 0xffffffff / 4;
383 } else if (z < 3 * (0xffffffff / 4)) {
384 /* z in third quadrant, z -= pi/2 to correct */
385 z -= 0xffffffff / 4;
386 } else {
387 /* z in fourth quadrant, z -= 3pi/2 to correct */
388 x = -x;
389 z -= 3 * (0xffffffff / 4);
390 }
391
392 /* Each iteration adds roughly 1-bit of extra precision */
393 for (i = 0; i < 31; i++) {
394 x1 = x >> i;
395 y1 = y >> i;
396 z1 = atan_table[i];
397
398 /* Decided which direction to rotate vector. Pivot point is pi/2 */
399 if (z >= 0xffffffff / 4) {
400 x -= y1;
401 y += x1;
402 z -= z1;
403 } else {
404 x += y1;
405 y -= x1;
406 z += z1;
407 }
408 }
409
410 if (cos)
411 *cos = x;
412
413 return y;
414}