diff options
Diffstat (limited to 'apps/codecs/lib/fft-ffmpeg_arm.h')
-rw-r--r-- | apps/codecs/lib/fft-ffmpeg_arm.h | 342 |
1 file changed, 342 insertions, 0 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg_arm.h b/apps/codecs/lib/fft-ffmpeg_arm.h new file mode 100644 index 0000000000..94969b4b3d --- /dev/null +++ b/apps/codecs/lib/fft-ffmpeg_arm.h | |||
@@ -0,0 +1,342 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id: $ | ||
9 | * | ||
10 | * Copyright (C) 2010 Dave Hooper | ||
11 | * | ||
12 | * ARM optimisations for ffmpeg's fft (used in fft-ffmpeg.c) | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public License | ||
16 | * as published by the Free Software Foundation; either version 2 | ||
17 | * of the License, or (at your option) any later version. | ||
18 | * | ||
19 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
20 | * KIND, either express or implied. | ||
21 | * | ||
22 | ****************************************************************************/ | ||
23 | |||
24 | #ifdef CPU_ARM | ||
25 | |||
26 | /* Start off with optimised variants of the butterflies that work | ||
27 | nicely on arm */ | ||
/* 1. Butterfly where y and a share the same variable/register.
 *
 * Computes x = a - b and y = a + b.  The sum y is formed first so the
 * macro remains correct when y aliases a; the difference is then
 * recovered as y - 2*b without re-reading a.
 *
 * NOTE: b is evaluated twice, so it must be side-effect free.
 * do/while(0) keeps the macro safe inside unbraced if/else bodies.
 */
#define BF_OPT(x,y,a,b) do {\
    y = (a) + (b);\
    x = y - ((b)<<1);\
} while (0)
33 | |||
/* 2. Butterfly where y and b share the same variable/register.
 *
 * Computes x = a - b and y = a + b.  The difference x is formed first;
 * y is then rebuilt as x + 2*b, and since y is written last the macro
 * remains correct when y aliases b.
 *
 * NOTE: b is evaluated twice, so it must be side-effect free.
 * do/while(0) keeps the macro safe inside unbraced if/else bodies.
 */
#define BF_OPT2(x,y,a,b) do {\
    x = (a) - (b);\
    y = x + ((b)<<1);\
} while (0)
39 | |||
/* 3. Butterfly where y and b share the same variable/register, with
 * the outputs reversed relative to BF_OPT2 (x gets the sum, y the
 * difference).
 *
 * Computes x = a + b and y = a - b.  x is formed first; y is then
 * rebuilt as x - 2*b, and since y is written last the macro remains
 * correct when y aliases b.
 *
 * NOTE: b is evaluated twice, so it must be side-effect free.
 * do/while(0) keeps the macro safe inside unbraced if/else bodies.
 */
#define BF_OPT2_REV(x,y,a,b) do {\
    x = (a) + (b);\
    y = x - ((b)<<1);\
} while (0)
45 | |||
/* standard BUTTERFLIES package. Note, we actually manually inline this
   in all the TRANSFORM macros below anyway */
#define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES
/* Radix-4 butterfly stage: combines the partial products t1,t2,t5,t6
   (free variables supplied by the enclosing scope) into a0..a3 in
   place using the aliasing-safe BF_OPT form.  The first two BF_OPTs
   fold the t-values into sums/differences; the remaining four write
   the results.  Statement order matters: each BF_OPT consumes values
   produced by the previous one. */
#define BUTTERFLIES(a0,a1,a2,a3) {\
    {\
        BF_OPT(t1, t5, t5, t1);\
        BF_OPT(t6, t2, t2, t6);\
        BF_OPT(a2.re, a0.re, a0.re, t5);\
        BF_OPT(a2.im, a0.im, a0.im, t2);\
        BF_OPT(a3.re, a1.re, a1.re, t6);\
        BF_OPT(a3.im, a1.im, a1.im, t1);\
    }\
}
59 | |||
#define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM

/* on ARM, all the TRANSFORM_etc inlines use the following registers:
   r5,r6,r7,r8,r9,r10,r4,r12

   inputs are: z, n, STEP

   NOTE THAT THESE MACROS ACTUALLY CHANGE z INPUT INPLACE-
   so sequential actions, z += n*3, z -= n*2 etc etc matter
*/


/* Common tail of every TRANSFORM_* variant below: entered with z
   pointing at z[o3] and with t1,t2,t5,t6 prepared (the caller has
   already applied the first two BF_OPTs of BUTTERFLIES).  Applies the
   remaining butterflies against z[o0] and z[o1] and stores all four
   outputs; on exit z points back at z[o0].
   NOTE(review): the stmia statements carry no "memory" clobber -- the
   code appears to rely on every subsequent read of those locations
   also going through volatile asm; confirm before reusing this
   pattern elsewhere. */
#define TRANSFORM_POST_STORE( z, n ) {\
    /* The following is BUTTERFLIES, manually inlined, minus the first
       two BF_OPTs which the caller has already performed: */\
    /*{*/\
    /* BF_OPT(t1, t5, t5, t1);*/\
    /* BF_OPT(t6, t2, t2, t6);*/\
    /* BF_OPT(a2.re, a0.re, a0.re, t5);*/\
    /* BF_OPT(a2.im, a0.im, a0.im, t2);*/\
    /* BF_OPT(a3.re, a1.re, a1.re, t6);*/\
    /* BF_OPT(a3.im, a1.im, a1.im, t1);*/\
    /*}*/\
    z -= n*3; /* rewind from z[o3] back to z[o0] */\
    /* r_re = my_z[0]; r_im = my_z[1]; */\
    {\
        register FFTSample rt0temp asm("r4");\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        BF_OPT(rt0temp, r_re, r_re, t5);\
        BF_OPT(t2, r_im, r_im, t2);\
        /* my_z[0] = r_re; my_z[1] = r_im; */\
        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\
        z += n;\
        /* r_re = my_z[0]; r_im = my_z[1]; */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        BF_OPT(t5, r_re, r_re, t6);\
        BF_OPT(t6, r_im, r_im, t1);\
        /* my_z[0] = r_re; my_z[1] = r_im; */\
        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\
        z += n;\
        /* my_z[0] = rt0temp; my_z[1] = t2; */\
        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2));\
        z += n;\
    }\
    /* my_z[0] = t5; my_z[1] = t6; */\
    asm volatile( "stmia %[my_z], {%[t5],%[t6]}\n\t"::[my_z] "r" (z), [t5] "r" (t5), [t6] "r" (t6));\
    z -= n*3;\
}
106 | |||
/* Generic transform step with an arbitrary twiddle (wre,wim):
   t1,t2 := z[o2] rotated via XPROD31_R, t5,t6 := z[o3] rotated via
   XNPROD31_R (both helpers defined elsewhere), then the first two
   butterflies, with the rest deferred to TRANSFORM_POST_STORE.
   z is advanced in place -- see the note above. */
#define TRANSFORM( z, n, wre_arg, wim_arg )\
{\
    FFTSample wre = wre_arg, wim = wim_arg;\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
    z += n*2; /* z[o2] */\
    /* r_re = z[o2].re; r_im = z[o2].im */\
    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
    XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
    \
    z += n; /* z[o3] */\
    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
    XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
    \
    /* first two butterflies of BUTTERFLIES; the remainder happen
       inside TRANSFORM_POST_STORE */\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
123 | |||
/* As TRANSFORM, but the twiddle is loaded from memory with a single
   ldmia: wre = w[0], wim = w[1].  (ldmia fills registers in register-
   number order, so r4/wre gets the lower address.) */
#define TRANSFORM_W01( z, n, w )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
    \
    {\
        register FFTSample wre asm("r4"),wim asm("r12");\
        /* wre = w[0]; wim = w[1]; */\
        asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (wre), [wim] "=r" (wim):[w] "r" (w));\
        z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
        \
        z += n; /* z[o3] */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
    }\
    \
    /* first two butterflies; the rest happen in TRANSFORM_POST_STORE */\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
144 | |||
//static inline void TRANSFORM_W10(int32_t * z, unsigned int n, const int32_t * w)
/* As TRANSFORM_W01, but the twiddle is stored in the opposite order:
   wim = w[0], wre = w[1].  (wim is pinned to r4, the lower register
   number, so ldmia gives it the lower address.) */
#define TRANSFORM_W10( z, n, w )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
    \
    {\
        register FFTSample wim asm("r4"),wre asm("r12");\
        /* wim = w[0]; wre = w[1]; */\
        asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (wim), [wre] "=r" (wre):[w] "r" (w));\
        z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
        \
        z += n; /* z[o3] */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
    }\
    \
    /* first two butterflies; the rest happen in TRANSFORM_POST_STORE */\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
166 | |||
/* Transform step for the wre == wim == cPI2_8 case: the generic
   XPROD31_R/XNPROD31_R calls reduce to four MULT31s plus add/sub
   (cPI2_8 defined elsewhere). */
#define TRANSFORM_EQUAL( z, n )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
    \
    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\
    z += n; /* z[o3] */\
    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
    \
    /* Reference (MULT32-based) formulation the live code replaces: */\
    /**/\
    /*t2 = MULT32(cPI2_8, t5);*/\
    /*t1 = MULT31(cPI2_8, t6);*/\
    /*t6 = MULT31(cPI2_8, r_re);*/\
    /*t5 = MULT32(cPI2_8, r_im);*/\
    \
    /*t1 = ( t1 + (t2<<1) );*/\
    /*t2 = ( t1 - (t2<<2) );*/\
    /*t6 = ( t6 + (t5<<1) );*/\
    /*t5 = ( t6 - (t5<<2) );*/\
    /**/\
    t2 = MULT31(cPI2_8, t5);\
    t6 = MULT31(cPI2_8, t6);\
    r_re = MULT31(cPI2_8, r_re);\
    t5 = MULT31(cPI2_8, r_im);\
    \
    t1 = ( t6 + t2 );\
    t2 = ( t6 - t2 );\
    t6 = ( r_re + t5 );\
    t5 = ( r_re - t5 );\
    \
    /* first two butterflies; the rest happen in TRANSFORM_POST_STORE */\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
201 | |||
/* Transform step for the unity twiddle: the complex products reduce
   to plain loads of z[o2] and z[o3]. */
#define TRANSFORM_ZERO( z,n )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
    \
    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
    asm volatile( "ldmia %[my_z], {%[t1],%[t2]}\n\t":[t1] "=r" (t1), [t2] "=r" (t2):[my_z] "r" (z));\
    z += n; /* z[o3] */\
    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\
    \
    /* first two butterflies; the rest happen in TRANSFORM_POST_STORE */\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
215 | |||
#define FFT_FFMPEG_INCL_OPTIMISED_FFT4
/* 4-point FFT over z_arg[0..3] (eight fixed32 words), performed in a
   single asm block: load all eight words, butterfly in registers,
   store back in place.  (Stale register names in the original per-line
   comments have been corrected to match the instructions.) */
#define fft4(z_arg)\
{\
    /* input[0..7] -> output[0..7] */\
    fixed32 * m = (fixed32 *) ( ( z_arg ) );\
    /* load r1=z[0],r2=z[1],...,r8=z[7] */\
    asm volatile(\
        "ldmia %[z], {r1-r8}\n\t"\
        "add r1,r1,r3\n\t"          /* r1 :=t1 */\
        "sub r3,r1,r3, lsl #1\n\t"  /* r3 :=t3 */\
        "sub r7,r7,r5\n\t"          /* r7 :=t8 */\
        "add r5,r7,r5, lsl #1\n\t"  /* r5 :=t6 */\
        \
        "add r1,r1,r5\n\t"          /* r1 = o[0] */\
        "sub r5,r1,r5, lsl #1\n\t"  /* r5 = o[4] */\
        \
        "add r2,r2,r4\n\t"          /* r2 :=t2 */\
        "sub r4,r2,r4, lsl #1\n\t"  /* r4 :=t4 */\
        \
        "add r12,r6,r8\n\t"         /* r12:=t5 */\
        "sub r6,r6,r8\n\t"          /* r6 :=t7 */\
        \
        "sub r8,r4,r7\n\t"          /* r8 = o[7]*/ \
        "add r4,r4,r7\n\t"          /* r4 = o[3]*/ \
        "sub r7,r3,r6\n\t"          /* r7 = o[6]*/ \
        "add r3,r3,r6\n\t"          /* r3 = o[2]*/ \
        "sub r6,r2,r12\n\t"         /* r6 = o[5]*/ \
        "add r2,r2,r12\n\t"         /* r2 = o[1]*/ \
        \
        "stmia %[z], {r1-r8}\n\t"\
        : /* outputs */\
        : /* inputs */ [z] "r" (m)\
        : /* clobbers */\
          "r1","r2","r3","r4","r5","r6","r7","r8","r12","memory"\
    );\
}
252 | |||
253 | |||
#define FFT_FFMPEG_INCL_OPTIMISED_FFT8
/* The chunk of asm below is equivalent to the following:

   // first load in z[4].re thru z[7].im into local registers
   // ...
   BF_OPT2_REV(z[4].re, z[5].re, z[4].re, z[5].re); // x=a+b; y=x-(b<<1)
   BF_OPT2_REV(z[4].im, z[5].im, z[4].im, z[5].im);
   BF_REV (temp, z[7].re, z[6].re, z[7].re); // x=a+b; y=a-b;
   BF_REV (z[6].re, z[7].im, z[6].im, z[7].im);
   // save z[7].re and z[7].im as those are complete now
   // z[5].re and z[5].im are also complete now but save these later on

   BF(z[6].im, z[4].re, temp, z[4].re); // x=a-b; y=a+b
   BF_OPT(z[6].re, z[4].im, z[4].im, z[6].re); // y=a+b; x=y-(b<<1)
   // now load z[2].re and z[2].im
   // ...
   BF_OPT(z[6].re, z[2].re, z[2].re, z[6].re); // y=a+b; x=y-(b<<1)
   BF_OPT(z[6].im, z[2].im, z[2].im, z[6].im); // y=a+b; x=y-(b<<1)
   // Now save z[6].re and z[6].im, along with z[5].re and z[5].im
   // for efficiency. Also save z[2].re and z[2].im.
   // Now load z[0].re and z[0].im
   // ...

   BF_OPT(z[4].re, z[0].re, z[0].re, z[4].re); // y=a+b; x=y-(b<<1)
   BF_OPT(z[4].im, z[0].im, z[0].im, z[4].im); // y=a+b; x=y-(b<<1)
   // Finally save out z[4].re, z[4].im, z[0].re and z[0].im
   // ...
*/
/* 8-point FFT: fft4 on z[0..3], then the asm block combines z[4..7]
   with z[0] and z[2] (see the equivalence comment above), and finally
   TRANSFORM_EQUAL finishes the odd-index elements starting at z[1]. */
static inline void fft8( FFTComplex * z )
{
    fft4(z);
    {
        FFTSample temp;
        fixed32 * m4 = (fixed32 *)(&(z[4].re));

        asm volatile(
            "add %[z_ptr], %[z_ptr], #16\n\t" /* point to &z[2].re */
            /* read in z[4].re thru z[7].im */
            "ldmia %[z4_ptr]!, {r1,r2,r3,r4,r5,r6,r7,r8}\n\t"
            /* (now points one word past &z[7].im) */
            "add r1,r1,r3\n\t"
            "sub r3,r1,r3,lsl #1\n\t"
            "add r2,r2,r4\n\t"
            "sub r4,r2,r4,lsl #1\n\t"
            "add %[temp],r5,r7\n\t"
            "sub r7,r5,r7\n\t"
            "add r5,r6,r8\n\t"
            "sub r8,r6,r8\n\t"

            "stmdb %[z4_ptr]!, {r7,r8}\n\t" /* write z[7].re,z[7].im straight away */
            /* Note, registers r7 & r8 now free */

            "sub r6,%[temp],r1\n\t"
            "add r1,%[temp],r1\n\t"
            "add r2,r2,r5\n\t"
            "sub r5,r2,r5,lsl #1\n\t"

            "ldmia %[z_ptr],{r7,r8}\n\t" /* load z[2].re and z[2].im */
            "add r7,r7,r5\n\t"
            "sub r5,r7,r5,lsl #1\n\t"
            "add r8,r8,r6\n\t"
            "sub r6,r8,r6,lsl #1\n\t"

            /* write out z[5].re, z[5].im, z[6].re, z[6].im in one go*/
            "stmdb %[z4_ptr]!, {r3,r4,r5,r6}\n\t"
            "stmia %[z_ptr],{r7,r8}\n\t" /* write out z[2].re, z[2].im */
            "sub %[z_ptr],%[z_ptr], #16\n\t" /* point z_ptr back to &z[0].re */
            "ldmia %[z_ptr],{r7,r8}\n\t" /* load z[0].re, z[0].im */

            "add r7,r7,r1\n\t"
            "sub r1,r7,r1,lsl #1\n\t"
            "add r8,r8,r2\n\t"
            "sub r2,r8,r2,lsl #1\n\t"

            "stmia %[z_ptr],{r7,r8}\n\t" /* write out z[0].re, z[0].im */
            "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */
            : [z4_ptr] "+r" (m4), [z_ptr] "+r" (z), [temp] "=r" (temp)
            :
            : "r1","r2","r3","r4","r5","r6","r7","r8","memory"
        );
    }

    z++;
    TRANSFORM_EQUAL(z,2);
}
339 | |||
340 | |||
341 | #endif // CPU_ARM | ||
342 | |||