diff options
Diffstat (limited to 'apps/codecs/lib/fft-ffmpeg.c')
-rw-r--r-- | apps/codecs/lib/fft-ffmpeg.c | 467 |
1 files changed, 467 insertions, 0 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg.c b/apps/codecs/lib/fft-ffmpeg.c new file mode 100644 index 0000000000..f08b7fa2eb --- /dev/null +++ b/apps/codecs/lib/fft-ffmpeg.c | |||
@@ -0,0 +1,467 @@ | |||
1 | /* | ||
2 | * FFT/IFFT transforms converted to integer precision | ||
3 | * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli | ||
4 | * Copyright (c) 2008 Loren Merritt | ||
5 | * Copyright (c) 2002 Fabrice Bellard | ||
6 | * Partly based on libdjbfft by D. J. Bernstein | ||
7 | * | ||
8 | * This file is part of FFmpeg. | ||
9 | * | ||
10 | * FFmpeg is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU Lesser General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2.1 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * FFmpeg is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * Lesser General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU Lesser General Public | ||
21 | * License along with FFmpeg; if not, write to the Free Software | ||
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
23 | */ | ||
24 | |||
25 | /** | ||
26 | * @file libavcodec/fft.c | ||
27 | * FFT/IFFT transforms. | ||
28 | */ | ||
29 | |||
30 | |||
/*
 * CONFIG_SMALL is forced off on every target (the ARM and non-ARM cases were
 * identical): we definitely want it undefined for ipod so that the inlined
 * version of fft16 is used, which is measurably faster.
 */
#undef CONFIG_SMALL
38 | |||
39 | #include "fft.h" | ||
40 | #include <string.h> | ||
41 | #include <stdlib.h> | ||
42 | #include <math.h> | ||
43 | #include <inttypes.h> | ||
44 | #include <time.h> | ||
45 | #include <codecs/lib/codeclib.h> | ||
46 | |||
47 | #include "asm_arm.h" | ||
48 | #include "asm_mcf5249.h" | ||
49 | #include "codeclib_misc.h" | ||
50 | #include "mdct_lookup.h" | ||
51 | |||
52 | static void ff_fft_permute_c(FFTContext *s, FFTComplex *z); | ||
53 | |||
54 | /* constants for fft_16 (same constants as in mdct_arm.S ... ) */ | ||
55 | #define cPI1_8 (0x7641af3d) /* cos(pi/8) s.31 */ | ||
56 | #define cPI2_8 (0x5a82799a) /* cos(2pi/8) = 1/sqrt(2) s.31 */ | ||
57 | #define cPI3_8 (0x30fbc54d) /* cos(3pi/8) s.31 */ | ||
58 | |||
59 | /* asm-optimised functions and/or macros */ | ||
60 | #include "fft-ffmpeg_arm.h" | ||
61 | |||
/*
 * Recursively compute the split-radix permutation value for index i of an
 * n-point transform.
 *
 *   i        index to permute
 *   n        transform size of the current recursion level
 *   inverse  0 or 1; selects which of the two odd quarter-size branches
 *            receives the "+1" offset and which the "-1" offset
 *
 * Returns i&1 once n <= 2.  Indices whose n/2 bit is clear recurse into the
 * half-size transform (result doubled); the others recurse into a
 * quarter-size transform (result times four, plus or minus one).  The result
 * may be negative.
 */
static int split_radix_permutation(int i, int n, int inverse)
{
    if (n <= 2)
        return i & 1;

    const int half = n >> 1;
    if ((i & half) == 0)
        return 2 * split_radix_permutation(i, half, inverse);

    const int quarter = half >> 1;
    const int base = 4 * split_radix_permutation(i, quarter, inverse);

    return (inverse == !(i & quarter)) ? base + 1 : base - 1;
}
72 | |||
73 | static void ff_fft_permute_c(FFTContext *s, FFTComplex *z) | ||
74 | { | ||
75 | int j, k, np; | ||
76 | FFTComplex tmp; | ||
77 | //const uint16_t *revtab = s->revtab; | ||
78 | np = 1 << s->nbits; | ||
79 | |||
80 | const int revtab_shift = (12 - s->nbits); | ||
81 | |||
82 | /* reverse */ | ||
83 | for(j=0;j<np;j++) { | ||
84 | k = revtab[j]>>revtab_shift; | ||
85 | if (k < j) { | ||
86 | tmp = z[k]; | ||
87 | z[k] = z[j]; | ||
88 | z[j] = tmp; | ||
89 | } | ||
90 | } | ||
91 | } | ||
92 | |||
/*
 * Two-point butterfly: x = a - b, y = a + b.
 *
 * x and y must be lvalues.  a and b are each evaluated twice, so they must
 * be free of side effects; they are parenthesised so that compound
 * expressions (e.g. "p + q" or "-v") keep their meaning inside the
 * subtraction.  x is stored before y is computed, so x must not alias a or
 * b; y may alias either of them (callers in this file rely on that).
 *
 * The expansion is a plain brace block, as before, so existing call sites
 * keep working with or without a trailing semicolon.
 */
#define BF(x,y,a,b) {\
    x = (a) - (b);\
    y = (a) + (b);\
}

/*
 * Same butterfly with the outputs swapped: x = a + b, y = a - b.
 * The same evaluation and aliasing rules as for BF apply.
 */
#define BF_REV(x,y,a,b) {\
    x = (a) + (b);\
    y = (a) - (b);\
}
102 | |||
#ifndef FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES
/*
 * Combine the four complex values a0..a3 of one split-radix butterfly in
 * place.  The macro reads t1, t2, t5 and t6 from the enclosing scope (the
 * twiddled a2/a3 terms prepared by the caller) and stores:
 *
 *   a0.re += t5 + t1      a2.re = old a0.re - (t5 + t1)
 *   a0.im += t2 + t6      a2.im = old a0.im - (t2 + t6)
 *   a1.re += t2 - t6      a3.re = old a1.re - (t2 - t6)
 *   a1.im += t5 - t1      a3.im = old a1.im - (t5 - t1)
 *
 * Each a2/a3 field is written before the matching a0/a1 field is updated.
 * The arguments are evaluated several times and must have no side effects.
 */
#define BUTTERFLIES(a0,a1,a2,a3) {\
    const FFTSample bf_re_sum = t5 + t1;\
    const FFTSample bf_im_dif = t5 - t1;\
    const FFTSample bf_im_sum = t2 + t6;\
    const FFTSample bf_re_dif = t2 - t6;\
    (a2).re = (a0).re - bf_re_sum;\
    (a0).re = (a0).re + bf_re_sum;\
    (a3).im = (a1).im - bf_im_dif;\
    (a1).im = (a1).im + bf_im_dif;\
    (a3).re = (a1).re - bf_re_dif;\
    (a1).re = (a1).re + bf_re_dif;\
    (a2).im = (a0).im - bf_im_sum;\
    (a0).im = (a0).im + bf_im_sum;\
}

/*
 * Same result as BUTTERFLIES, but every a0/a1 input is loaded into a local
 * before anything is stored.
 * This is slightly slower for small data, but avoids store->load aliasing
 * for addresses separated by large powers of 2.
 */
#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
    const FFTSample bf_r0 = (a0).re, bf_i0 = (a0).im;\
    const FFTSample bf_r1 = (a1).re, bf_i1 = (a1).im;\
    const FFTSample bf_re_sum = t5 + t1;\
    const FFTSample bf_im_dif = t5 - t1;\
    const FFTSample bf_im_sum = t2 + t6;\
    const FFTSample bf_re_dif = t2 - t6;\
    (a2).re = bf_r0 - bf_re_sum;\
    (a0).re = bf_r0 + bf_re_sum;\
    (a3).im = bf_i1 - bf_im_dif;\
    (a1).im = bf_i1 + bf_im_dif;\
    (a3).re = bf_r1 - bf_re_dif;\
    (a1).re = bf_r1 + bf_re_dif;\
    (a2).im = bf_i0 - bf_im_sum;\
    (a0).im = bf_i0 + bf_im_sum;\
}
#endif
138 | |||
139 | /* | ||
140 | see conjugate pair description in | ||
141 | http://www.fftw.org/newsplit.pdf | ||
142 | |||
143 | a0 = z[k] | ||
144 | a1 = z[k+N/4] | ||
145 | a2 = z[k+2N/4] | ||
146 | a3 = z[k+3N/4] | ||
147 | |||
148 | result: | ||
149 | y[k] = z[k]+w(z[k+2N/4])+w'(z[k+3N/4]) | ||
150 | y[k+N/4] = z[k+N/4]-iw(z[k+2N/4])+iw'(z[k+3N/4]) | ||
151 | y[k+2N/4] = z[k]-w(z[k+2N/4])-w'(z[k+3N/4]) | ||
152 | y[k+3N/4] = z[k+N/4]+iw(z[k+2N/4])-iw'(z[k+3N/4]) | ||
153 | |||
154 | i.e. | ||
155 | |||
156 | a0 = a0 + (w.a2 + w'.a3) | ||
157 | a1 = a1 - i(w.a2 - w'.a3) | ||
158 | a2 = a0 - (w.a2 + w'.a3) | ||
159 | a3 = a1 + i(w.a2 - w'.a3) | ||
160 | |||
161 | note re(w') = re(w) and im(w') = -im(w) | ||
162 | |||
163 | so therefore | ||
164 | |||
165 | re(a0) = re(a0) + re(w.a2) + re(w.a3) | ||
166 | im(a0) = im(a0) + im(w.a2) - im(w.a3) etc | ||
167 | |||
168 | and remember also that | ||
169 | Re([s+it][u+iv]) = su-tv | ||
170 | Im([s+it][u+iv]) = sv+tu | ||
171 | |||
172 | so | ||
173 | Re(w'.(s+it)) = Re(w').s - Im(w').t = Re(w).s + Im(w).t | ||
174 | Im(w'.(s+it)) = Re(w').t + Im(w').s = Re(w).t - Im(w).s | ||
175 | |||
176 | For inverse dft we take the complex conjugate of all twiddle factors. | ||
177 | Hence | ||
178 | |||
179 | a0 = a0 + (w'.a2 + w.a3) | ||
180 | a1 = a1 - i(w'.a2 - w.a3) | ||
181 | a2 = a0 - (w'.a2 + w.a3) | ||
182 | a3 = a1 + i(w'.a2 - w.a3) | ||
183 | |||
184 | Define t1 = Re(w'.a2) = Re(w)*Re(a2) + Im(w)*Im(a2) | ||
185 | t2 = Im(w'.a2) = Re(w)*Im(a2) - Im(w)*Re(a2) | ||
186 | t5 = Re(w.a3) = Re(w)*Re(a3) - Im(w)*Im(a3) | ||
187 | t6 = Im(w.a3) = Re(w)*Im(a3) + Im(w)*Re(a3) | ||
188 | |||
189 | Then we just output: | ||
190 | a0.re = a0.re + ( t1 + t5 ) | ||
191 | a0.im = a0.im + ( t2 + t6 ) | ||
192 | a1.re = a1.re + ( t2 - t6 ) // since we multiply by -i and i(-i) = 1 | ||
193 | a1.im = a1.im - ( t1 - t5 ) // since we multiply by -i and 1(-i) = -i | ||
194 | a2.re = a0.re - ( t1 + t5 ) | ||
195 | a2.im = a0.im - ( t2 + t6 ) | ||
196 | a3.re = a1.re - ( t2 - t6 ) // since we multiply by +i and i(+i) = -1 | ||
197 | a3.im = a1.im + ( t1 - t5 ) // since we multiply by +i and 1(+i) = i | ||
198 | |||
199 | |||
200 | */ | ||
201 | |||
202 | #ifndef FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM | ||
203 | static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim) | ||
204 | { | ||
205 | register FFTSample t1,t2,t5,t6,r_re,r_im; | ||
206 | r_re = z[n*2].re; | ||
207 | r_im = z[n*2].im; | ||
208 | XPROD31_R(r_re, r_im, wre, wim, t1,t2); | ||
209 | r_re = z[n*3].re; | ||
210 | r_im = z[n*3].im; | ||
211 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | ||
212 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | ||
213 | } | ||
214 | |||
215 | static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w) | ||
216 | { | ||
217 | register const FFTSample wre=w[0],wim=w[1]; | ||
218 | register FFTSample t1,t2,t5,t6,r_re,r_im; | ||
219 | r_re = z[n*2].re; | ||
220 | r_im = z[n*2].im; | ||
221 | XPROD31_R(r_re, r_im, wre, wim, t1,t2); | ||
222 | r_re = z[n*3].re; | ||
223 | r_im = z[n*3].im; | ||
224 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | ||
225 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | ||
226 | } | ||
227 | |||
228 | static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w) | ||
229 | { | ||
230 | register const FFTSample wim=w[0],wre=w[1]; | ||
231 | register FFTSample t1,t2,t5,t6,r_re,r_im; | ||
232 | r_re = z[n*2].re; | ||
233 | r_im = z[n*2].im; | ||
234 | XPROD31_R(r_re, r_im, wre, wim, t1,t2); | ||
235 | r_re = z[n*3].re; | ||
236 | r_im = z[n*3].im; | ||
237 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | ||
238 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | ||
239 | } | ||
240 | |||
241 | static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n) | ||
242 | { | ||
243 | register FFTSample t1,t2,t5,t6,temp1,temp2; | ||
244 | register FFTSample * my_z = (FFTSample *)(z); | ||
245 | my_z += n*4; | ||
246 | t2 = MULT31(my_z[0], cPI2_8); | ||
247 | temp1 = MULT31(my_z[1], cPI2_8); | ||
248 | my_z += n*2; | ||
249 | temp2 = MULT31(my_z[0], cPI2_8); | ||
250 | t5 = MULT31(my_z[1], cPI2_8); | ||
251 | t1 = ( temp1 + t2 ); | ||
252 | t2 = ( temp1 - t2 ); | ||
253 | t6 = ( temp2 + t5 ); | ||
254 | t5 = ( temp2 - t5 ); | ||
255 | my_z -= n*6; | ||
256 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | ||
257 | } | ||
258 | |||
259 | static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n) | ||
260 | { | ||
261 | FFTSample t1,t2,t5,t6; | ||
262 | t1 = z[n*2].re; | ||
263 | t2 = z[n*2].im; | ||
264 | t5 = z[n*3].re; | ||
265 | t6 = z[n*3].im; | ||
266 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | ||
267 | } | ||
268 | #endif | ||
269 | |||
270 | /* z[0...8n-1], w[1...2n-1] */ | ||
271 | static void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg) | ||
272 | { | ||
273 | register FFTComplex * z = z_arg; | ||
274 | register unsigned int STEP = STEP_arg; | ||
275 | register unsigned int n = n_arg; | ||
276 | |||
277 | register const FFTSample *w = sincos_lookup0+STEP; | ||
278 | /* wre = *(wim+1) . ordering is sin,cos */ | ||
279 | register const FFTSample *w_end = sincos_lookup0+1024; | ||
280 | |||
281 | /* first two are special (well, first one is special, but we need to do pairs) */ | ||
282 | TRANSFORM_ZERO(z,n); | ||
283 | z++; | ||
284 | TRANSFORM_W10(z,n,w); | ||
285 | w += STEP; | ||
286 | /* first pass forwards through sincos_lookup0*/ | ||
287 | do { | ||
288 | z++; | ||
289 | TRANSFORM_W10(z,n,w); | ||
290 | w += STEP; | ||
291 | z++; | ||
292 | TRANSFORM_W10(z,n,w); | ||
293 | w += STEP; | ||
294 | } while(LIKELY(w < w_end)); | ||
295 | /* second half: pass backwards through sincos_lookup0*/ | ||
296 | /* wim and wre are now in opposite places so ordering now [0],[1] */ | ||
297 | w_end=sincos_lookup0; | ||
298 | while(LIKELY(w>w_end)) | ||
299 | { | ||
300 | z++; | ||
301 | TRANSFORM_W01(z,n,w); | ||
302 | w -= STEP; | ||
303 | z++; | ||
304 | TRANSFORM_W01(z,n,w); | ||
305 | w -= STEP; | ||
306 | } | ||
307 | } | ||
308 | |||
309 | /* what is STEP? | ||
310 | sincos_lookup0 has sin,cos pairs for 1/4 cycle, in 1024 points | ||
311 | so half cycle would be 2048 points | ||
312 | ff_cos_16 has 8 elements corresponding to 4 cos points and 4 sin points | ||
313 | so each of the 4 points pairs corresponds to a 256*2-byte jump in sincos_lookup0 | ||
314 | 8192/16 (from "ff_cos_16") is 512 bytes. | ||
315 | i.e. for fft16, STEP = 8192/16 */ | ||
/*
 * Define "static void fft<n>(FFTComplex *z)" in terms of smaller transforms:
 * one fft<n2> on z, one fft<n4> each on z + 2*n4 and z + 3*n4, followed by
 * pass() with a table step of 8192/n and a butterfly stride of n4.
 *
 * n, n2 and n4 are pasted into identifiers and must therefore be plain
 * integer literals (the in-file uses pass n2 = n/2 and n4 = n/4).
 */
#define DECL_FFT(n,n2,n4)\
static void fft##n(FFTComplex *z)\
{\
    fft##n2(z);\
    fft##n4(z + (n4)*2);\
    fft##n4(z + (n4)*3);\
    pass(z, 8192/(n), (n4));\
}
324 | |||
325 | #ifndef FFT_FFMPEG_INCL_OPTIMISED_FFT4 | ||
326 | static inline void fft4(FFTComplex *z) | ||
327 | { | ||
328 | FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | ||
329 | |||
330 | BF(t3, t1, z[0].re, z[1].re); // t3=r1-r3 ; t1 = r1+r3 | ||
331 | BF(t8, t6, z[3].re, z[2].re); // t8=r7-r5 ; t6 = r7+r5 | ||
332 | |||
333 | BF(z[2].re, z[0].re, t1, t6); // r5=t1-t6 ; r1 = t1+t6 | ||
334 | |||
335 | BF(t4, t2, z[0].im, z[1].im); // t4=r2-r4 ; t2 = r2+r4 | ||
336 | BF(t7, t5, z[2].im, z[3].im); // t7=r6-r8 ; t5 = r6+r8 | ||
337 | |||
338 | BF(z[3].im, z[1].im, t4, t8); // r8=t4-t8 ; r4 = t4+t8 | ||
339 | BF(z[3].re, z[1].re, t3, t7); // r7=t3-t7 ; r3 = t3+t7 | ||
340 | BF(z[2].im, z[0].im, t2, t5); // r6=t2-t5 ; r2 = t2+t5 | ||
341 | } | ||
342 | #endif | ||
343 | |||
344 | static void fft4_dispatch(FFTComplex *z) | ||
345 | { | ||
346 | fft4(z); | ||
347 | } | ||
348 | |||
349 | #ifndef FFT_FFMPEG_INCL_OPTIMISED_FFT8 | ||
350 | static inline void fft8(FFTComplex *z) | ||
351 | { | ||
352 | fft4(z); | ||
353 | FFTSample t1,t2,t3,t4,t7,t8; | ||
354 | |||
355 | BF(t1, z[5].re, z[4].re, -z[5].re); | ||
356 | BF(t2, z[5].im, z[4].im, -z[5].im); | ||
357 | BF(t3, z[7].re, z[6].re, -z[7].re); | ||
358 | BF(t4, z[7].im, z[6].im, -z[7].im); | ||
359 | BF(t8, t1, t3, t1); | ||
360 | BF(t7, t2, t2, t4); | ||
361 | BF(z[4].re, z[0].re, z[0].re, t1); | ||
362 | BF(z[4].im, z[0].im, z[0].im, t2); | ||
363 | BF(z[6].re, z[2].re, z[2].re, t7); | ||
364 | BF(z[6].im, z[2].im, z[2].im, t8); | ||
365 | |||
366 | z++; | ||
367 | TRANSFORM_EQUAL(z,2); | ||
368 | } | ||
369 | #endif | ||
370 | |||
371 | static void fft8_dispatch(FFTComplex *z) | ||
372 | { | ||
373 | fft8(z); | ||
374 | } | ||
375 | |||
376 | #ifndef CONFIG_SMALL | ||
377 | static void fft16(FFTComplex *z) | ||
378 | { | ||
379 | fft8(z); | ||
380 | fft4(z+8); | ||
381 | fft4(z+12); | ||
382 | |||
383 | TRANSFORM_ZERO(z,4); | ||
384 | z+=2; | ||
385 | TRANSFORM_EQUAL(z,4); | ||
386 | z-=1; | ||
387 | TRANSFORM(z,4,cPI1_8,cPI3_8); | ||
388 | z+=2; | ||
389 | TRANSFORM(z,4,cPI3_8,cPI1_8); | ||
390 | } | ||
391 | #else | ||
392 | DECL_FFT(16,8,4) | ||
393 | #endif | ||
394 | DECL_FFT(32,16,8) | ||
395 | DECL_FFT(64,32,16) | ||
396 | DECL_FFT(128,64,32) | ||
397 | DECL_FFT(256,128,64) | ||
398 | DECL_FFT(512,256,128) | ||
399 | DECL_FFT(1024,512,256) | ||
400 | DECL_FFT(2048,1024,512) | ||
401 | DECL_FFT(4096,2048,1024) | ||
402 | |||
403 | static void (*fft_dispatch[])(FFTComplex*) = { | ||
404 | fft4_dispatch, fft8_dispatch, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, | ||
405 | fft2048, fft4096 | ||
406 | }; | ||
407 | |||
408 | void ff_fft_calc_c(int nbits, FFTComplex *z) | ||
409 | { | ||
410 | fft_dispatch[nbits-2](z); | ||
411 | } | ||
412 | |||
#if 0
/*
 * Disabled stand-alone smoke test: fills a 1024-point buffer with a cosine
 * of period 64 samples, runs the permutation and the transform, and prints
 * the magnitude of every output bin.
 *
 * Fixes relative to the previous version of this harness:
 *  - ff_fft_calc_c() is called with (nbits, z) to match its definition
 *    above; it used to be passed the FFTContext pointer.
 *  - the whole buffer is cleared, not only its first 64 entries, so no
 *    imaginary part is read uninitialised.
 *  - unused locals and the commented-out timing / saw-tooth code are gone,
 *    and main() returns a value.
 *  - the "muls/adds" report was dropped: no such counters are declared in
 *    this file.
 *
 * NOTE(review): fixed32, fixtof32, ff_fft_init and printf are not declared
 * in this file; confirm the headers provide them before enabling this block.
 */
int main (void)
{
#define PRECISION 16
#define FFT_NBITS 10
#define FFT_SIZE 1024
#define ftofix32(x) ((fixed32)((x) * (float)(1 << PRECISION) + ((x) < 0 ? -0.5 : 0.5)))
#define itofix32(x) ((x) << PRECISION)
#define fixtoi32(x) ((x) >> PRECISION)

    int n;
    FFTContext s;
    FFTComplex z[FFT_SIZE];

    memset(z, 0, sizeof(z));

    /* Generate cosine test data (imaginary parts stay zero) */
    for (n = 0; n < FFT_SIZE; n++)
    {
        z[n].re = ftofix32(cos(2*M_PI*n/64));
    }

    ff_fft_init(&s, FFT_NBITS, 1);
    ff_fft_permute_c(&s, z);
    ff_fft_calc_c(FFT_NBITS, z);

    for (n = 0; n < FFT_SIZE; n++)
    {
        printf("%8.4f\n", sqrt(pow(fixtof32(z[n].re),2)+ pow(fixtof32(z[n].im), 2)));
    }

    return 0;
}
#endif