diff options
Diffstat (limited to 'apps')
-rw-r--r-- | apps/codecs/lib/fft-ffmpeg.c | 32 | ||||
-rw-r--r-- | apps/codecs/lib/fft-ffmpeg_arm.h | 490 |
2 files changed, 319 insertions, 203 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg.c b/apps/codecs/lib/fft-ffmpeg.c index a5ffab9086..c00abde694 100644 --- a/apps/codecs/lib/fft-ffmpeg.c +++ b/apps/codecs/lib/fft-ffmpeg.c | |||
@@ -202,7 +202,7 @@ static void ff_fft_permute_c(FFTContext *s, FFTComplex *z) | |||
202 | */ | 202 | */ |
203 | 203 | ||
204 | #ifndef FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM | 204 | #ifndef FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM |
205 | static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim) | 205 | static inline FFTComplex* TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim) |
206 | { | 206 | { |
207 | register FFTSample t1,t2,t5,t6,r_re,r_im; | 207 | register FFTSample t1,t2,t5,t6,r_re,r_im; |
208 | r_re = z[n*2].re; | 208 | r_re = z[n*2].re; |
@@ -212,9 +212,10 @@ static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTS | |||
212 | r_im = z[n*3].im; | 212 | r_im = z[n*3].im; |
213 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | 213 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); |
214 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | 214 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); |
215 | return z+1; | ||
215 | } | 216 | } |
216 | 217 | ||
217 | static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w) | 218 | static inline FFTComplex* TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w) |
218 | { | 219 | { |
219 | register const FFTSample wre=w[0],wim=w[1]; | 220 | register const FFTSample wre=w[0],wim=w[1]; |
220 | register FFTSample t1,t2,t5,t6,r_re,r_im; | 221 | register FFTSample t1,t2,t5,t6,r_re,r_im; |
@@ -225,9 +226,10 @@ static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample | |||
225 | r_im = z[n*3].im; | 226 | r_im = z[n*3].im; |
226 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | 227 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); |
227 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | 228 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); |
229 | return z+1; | ||
228 | } | 230 | } |
229 | 231 | ||
230 | static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w) | 232 | static inline FFTComplex* TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w) |
231 | { | 233 | { |
232 | register const FFTSample wim=w[0],wre=w[1]; | 234 | register const FFTSample wim=w[0],wre=w[1]; |
233 | register FFTSample t1,t2,t5,t6,r_re,r_im; | 235 | register FFTSample t1,t2,t5,t6,r_re,r_im; |
@@ -238,9 +240,10 @@ static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample | |||
238 | r_im = z[n*3].im; | 240 | r_im = z[n*3].im; |
239 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | 241 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); |
240 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | 242 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); |
243 | return z+1; | ||
241 | } | 244 | } |
242 | 245 | ||
243 | static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n) | 246 | static inline FFTComplex* TRANSFORM_EQUAL(FFTComplex * z, unsigned int n) |
244 | { | 247 | { |
245 | register FFTSample t1,t2,t5,t6,temp1,temp2; | 248 | register FFTSample t1,t2,t5,t6,temp1,temp2; |
246 | register FFTSample * my_z = (FFTSample *)(z); | 249 | register FFTSample * my_z = (FFTSample *)(z); |
@@ -256,9 +259,10 @@ static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n) | |||
256 | t5 = ( temp2 - t5 ); | 259 | t5 = ( temp2 - t5 ); |
257 | my_z -= n*6; | 260 | my_z -= n*6; |
258 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | 261 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); |
262 | return z+1; | ||
259 | } | 263 | } |
260 | 264 | ||
261 | static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n) | 265 | static inline FFTComplex* TRANSFORM_ZERO(FFTComplex * z, unsigned int n) |
262 | { | 266 | { |
263 | FFTSample t1,t2,t5,t6; | 267 | FFTSample t1,t2,t5,t6; |
264 | t1 = z[n*2].re; | 268 | t1 = z[n*2].re; |
@@ -266,6 +270,7 @@ static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n) | |||
266 | t5 = z[n*3].re; | 270 | t5 = z[n*3].re; |
267 | t6 = z[n*3].im; | 271 | t6 = z[n*3].im; |
268 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); | 272 | BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); |
273 | return z+1; | ||
269 | } | 274 | } |
270 | #endif | 275 | #endif |
271 | 276 | ||
@@ -282,17 +287,14 @@ void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg) | |||
282 | register const FFTSample *w_end = sincos_lookup0+1024; | 287 | register const FFTSample *w_end = sincos_lookup0+1024; |
283 | 288 | ||
284 | /* first two are special (well, first one is special, but we need to do pairs) */ | 289 | /* first two are special (well, first one is special, but we need to do pairs) */ |
285 | TRANSFORM_ZERO(z,n); | 290 | z = TRANSFORM_ZERO(z,n); |
286 | z++; | 291 | z = TRANSFORM_W10(z,n,w); |
287 | TRANSFORM_W10(z,n,w); | ||
288 | w += STEP; | 292 | w += STEP; |
289 | /* first pass forwards through sincos_lookup0*/ | 293 | /* first pass forwards through sincos_lookup0*/ |
290 | do { | 294 | do { |
291 | z++; | 295 | z = TRANSFORM_W10(z,n,w); |
292 | TRANSFORM_W10(z,n,w); | ||
293 | w += STEP; | 296 | w += STEP; |
294 | z++; | 297 | z = TRANSFORM_W10(z,n,w); |
295 | TRANSFORM_W10(z,n,w); | ||
296 | w += STEP; | 298 | w += STEP; |
297 | } while(LIKELY(w < w_end)); | 299 | } while(LIKELY(w < w_end)); |
298 | /* second half: pass backwards through sincos_lookup0*/ | 300 | /* second half: pass backwards through sincos_lookup0*/ |
@@ -300,11 +302,9 @@ void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg) | |||
300 | w_end=sincos_lookup0; | 302 | w_end=sincos_lookup0; |
301 | while(LIKELY(w>w_end)) | 303 | while(LIKELY(w>w_end)) |
302 | { | 304 | { |
303 | z++; | 305 | z = TRANSFORM_W01(z,n,w); |
304 | TRANSFORM_W01(z,n,w); | ||
305 | w -= STEP; | 306 | w -= STEP; |
306 | z++; | 307 | z = TRANSFORM_W01(z,n,w); |
307 | TRANSFORM_W01(z,n,w); | ||
308 | w -= STEP; | 308 | w -= STEP; |
309 | } | 309 | } |
310 | } | 310 | } |
diff --git a/apps/codecs/lib/fft-ffmpeg_arm.h b/apps/codecs/lib/fft-ffmpeg_arm.h index 9d396a3fc0..073ad8ee46 100644 --- a/apps/codecs/lib/fft-ffmpeg_arm.h +++ b/apps/codecs/lib/fft-ffmpeg_arm.h | |||
@@ -43,6 +43,7 @@ | |||
43 | y = x - (b<<1);\ | 43 | y = x - (b<<1);\ |
44 | } | 44 | } |
45 | 45 | ||
46 | |||
46 | /* standard BUTTERFLIES package. Note, we actually manually inline this | 47 | /* standard BUTTERFLIES package. Note, we actually manually inline this |
47 | in all the TRANSFORM macros below anyway */ | 48 | in all the TRANSFORM macros below anyway */ |
48 | #define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES | 49 | #define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES |
@@ -59,198 +60,314 @@ | |||
59 | 60 | ||
60 | #define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM | 61 | #define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM |
61 | 62 | ||
62 | /* on ARM, all the TRANSFORM_etc inlines use the following registers: | 63 | static inline FFTComplex* TRANSFORM( FFTComplex* z, int n, FFTSample wre, FFTSample wim ) |
63 | r5,r6,r7,r8,r9,r10,r4,r12 | 64 | { |
64 | 65 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); | |
65 | inputs are: z, n, STEP | 66 | z += n*2; /* z[o2] */ |
66 | 67 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | |
67 | NOTE THAT THESE MACROS ACTUALLY CHANGE z INPUT INPLACE- | 68 | XPROD31_R(r_re, r_im, wre, wim, t1,t2); |
68 | so sequential actions, z += n*3, z -= n*2 etc etc matter | 69 | |
69 | */ | 70 | z += n; /* z[o3] */ |
70 | 71 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | |
72 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | ||
73 | |||
74 | BF_OPT(t1, t5, t5, t1); | ||
75 | BF_OPT(t6, t2, t2, t6); | ||
71 | 76 | ||
72 | #define TRANSFORM_POST_STORE( z, n ) {\ | 77 | { |
73 | /*{*/\ | 78 | register FFTSample rt0temp asm("r4"); |
74 | /* BF_OPT(t1, t5, t5, t1);*/\ | 79 | /*{*/ |
75 | /* BF_OPT(t6, t2, t2, t6);*/\ | 80 | /* BF_OPT(t1, t5, t5, t1);*/ |
76 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/\ | 81 | /* BF_OPT(t6, t2, t2, t6);*/ |
77 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/\ | 82 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ |
78 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/\ | 83 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ |
79 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/\ | 84 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ |
80 | /*}*/\ | 85 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ |
81 | z -= n*3;\ | 86 | /*}*/ |
82 | /* r_re = my_z[0]; r_im = my_z[1]; */\ | 87 | z -= n*3; |
83 | {\ | 88 | /* r_re = my_z[0]; r_im = my_z[1]; */ |
84 | register FFTSample rt0temp asm("r4");\ | 89 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
85 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 90 | BF_OPT(rt0temp, r_re, r_re, t5); |
86 | BF_OPT(rt0temp, r_re, r_re, t5);\ | 91 | BF_OPT(t2, r_im, r_im, t2); |
87 | BF_OPT(t2, r_im, r_im, t2);\ | 92 | /* my_z[0] = r_re; my_z[1] = r_im; */ |
88 | /* my_z[0] = r_re; my_z[1] = r_im; */\ | 93 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory" ); |
89 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\ | 94 | z += n; |
90 | z += n;\ | 95 | /* r_re = my_z[0]; r_im = my_z[1]; */ |
91 | /* r_re = my_z[0]; r_im = my_z[1]; */\ | 96 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
92 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 97 | BF_OPT(t5, r_re, r_re, t6); |
93 | BF_OPT(t5, r_re, r_re, t6);\ | 98 | BF_OPT(t6, r_im, r_im, t1); |
94 | BF_OPT(t6, r_im, r_im, t1);\ | 99 | /* my_z[0] = r_re; my_z[1] = r_im; */ |
95 | /* my_z[0] = r_re; my_z[1] = r_im; */\ | 100 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); |
96 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\ | 101 | z += n; |
97 | z += n;\ | 102 | /* my_z[0] = rt0temp; my_z[1] = t2; */ |
98 | /* my_z[0] = rt0temp; my_z[1] = t2; */\ | 103 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); |
99 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2));\ | 104 | } |
100 | z += n;\ | 105 | z += n; |
101 | }\ | 106 | |
102 | /* my_z[0] = t5; my_z[1] = t6; */\ | 107 | /* my_z[0] = t5; my_z[1] = t6; */ |
103 | asm volatile( "stmia %[my_z], {%[t5],%[t6]}\n\t"::[my_z] "r" (z), [t5] "r" (t5), [t6] "r" (t6));\ | 108 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); |
104 | z -= n*3;\ | 109 | z -= n*3; |
110 | return(z); | ||
105 | } | 111 | } |
106 | 112 | ||
107 | #define TRANSFORM( z, n, wre_arg, wim_arg )\ | 113 | static inline FFTComplex* TRANSFORM_W01( FFTComplex* z, int n, const FFTSample* w ) |
108 | {\ | 114 | { |
109 | FFTSample wre = wre_arg, wim = wim_arg;\ | 115 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); |
110 | register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ | 116 | |
111 | z += n*2; /* z[o2] */\ | 117 | /* load wre,wim into t5,t6 */ |
112 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 118 | asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (t5), [wim] "=r" (t6):[w] "r" (w)); |
113 | XPROD31_R(r_re, r_im, wre, wim, t1,t2);\ | 119 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ |
114 | \ | 120 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
115 | z += n; /* z[o3] */\ | 121 | XPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t1,t2); |
116 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | ||
117 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\ | ||
118 | \ | ||
119 | BF_OPT(t1, t5, t5, t1);\ | ||
120 | BF_OPT(t6, t2, t2, t6);\ | ||
121 | TRANSFORM_POST_STORE( z, n );\ | ||
122 | } | ||
123 | 122 | ||
124 | #define TRANSFORM_W01( z, n, w )\ | 123 | z += n; /* z[o3] */ |
125 | {\ | 124 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
126 | register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ | 125 | XNPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t5,t6); |
127 | \ | 126 | |
128 | {\ | 127 | BF_OPT(t1, t5, t5, t1); |
129 | register FFTSample wre asm("r4"),wim asm("r12");\ | 128 | BF_OPT(t6, t2, t2, t6); |
130 | asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (wre), [wim] "=r" (wim):[w] "r" (w));\ | 129 | { |
131 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ | 130 | register FFTSample rt0temp asm("r4"); |
132 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 131 | /*{*/ |
133 | XPROD31_R(r_re, r_im, wre, wim, t1,t2);\ | 132 | /* BF_OPT(t1, t5, t5, t1);*/ |
134 | \ | 133 | /* BF_OPT(t6, t2, t2, t6);*/ |
135 | z += n; /* z[o3] */\ | 134 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ |
136 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 135 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ |
137 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\ | 136 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ |
138 | }\ | 137 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ |
139 | \ | 138 | /*}*/ |
140 | BF_OPT(t1, t5, t5, t1);\ | 139 | z -= n*3; |
141 | BF_OPT(t6, t2, t2, t6);\ | 140 | /* r_re = my_z[0]; r_im = my_z[1]; */ |
142 | TRANSFORM_POST_STORE( z, n );\ | 141 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
142 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
143 | BF_OPT(t2, r_im, r_im, t2); | ||
144 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
145 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
146 | z += n; | ||
147 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
148 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
149 | BF_OPT(t5, r_re, r_re, t6); | ||
150 | BF_OPT(t6, r_im, r_im, t1); | ||
151 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
152 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
153 | z += n; | ||
154 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
155 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
156 | } | ||
157 | z += n; | ||
158 | |||
159 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
160 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
161 | z -= n*3; | ||
162 | return(z); | ||
143 | } | 163 | } |
144 | 164 | ||
145 | //static inline void TRANSFORM_W10(int32_t * z, unsigned int n, const int32_t * w) | 165 | static inline FFTComplex* TRANSFORM_W10( FFTComplex* z, int n, const FFTSample* w ) |
146 | #define TRANSFORM_W10( z, n, w )\ | 166 | { |
147 | {\ | 167 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); |
148 | register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ | 168 | |
149 | \ | 169 | /* load wim,wre into t5,t6 */ |
150 | {\ | 170 | asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (t5), [wre] "=r" (t6):[w] "r" (w)); |
151 | register FFTSample wim asm("r4"),wre asm("r12");\ | 171 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ |
152 | asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (wim), [wre] "=r" (wre):[w] "r" (w));\ | 172 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
153 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ | 173 | XPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t1,t2); |
154 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 174 | |
155 | XPROD31_R(r_re, r_im, wre, wim, t1,t2);\ | 175 | z += n; /* z[o3] */ |
156 | \ | 176 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
157 | z += n; /* z[o3] */\ | 177 | XNPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t5,t6); |
158 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 178 | |
159 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\ | 179 | BF_OPT(t1, t5, t5, t1); |
160 | }\ | 180 | BF_OPT(t6, t2, t2, t6); |
161 | \ | 181 | { |
162 | BF_OPT(t1, t5, t5, t1);\ | 182 | register FFTSample rt0temp asm("r4"); |
163 | BF_OPT(t6, t2, t2, t6);\ | 183 | /*{*/ |
164 | TRANSFORM_POST_STORE( z, n );\ | 184 | /* BF_OPT(t1, t5, t5, t1);*/ |
185 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
186 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
187 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
188 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
189 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
190 | /*}*/ | ||
191 | z -= n*3; | ||
192 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
193 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
194 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
195 | BF_OPT(t2, r_im, r_im, t2); | ||
196 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
197 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
198 | z += n; | ||
199 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
200 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
201 | BF_OPT(t5, r_re, r_re, t6); | ||
202 | BF_OPT(t6, r_im, r_im, t1); | ||
203 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
204 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
205 | z += n; | ||
206 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
207 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
208 | } | ||
209 | z += n; | ||
210 | |||
211 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
212 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
213 | z -= n*3; | ||
214 | return(z); | ||
165 | } | 215 | } |
166 | 216 | ||
167 | #define TRANSFORM_EQUAL( z, n )\ | 217 | static inline FFTComplex* TRANSFORM_EQUAL( FFTComplex* z, int n ) |
168 | {\ | 218 | { |
169 | register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ | 219 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); |
170 | \ | 220 | |
171 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ | 221 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ |
172 | asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\ | 222 | asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z)); |
173 | z += n; /* z[o3] */\ | 223 | z += n; /* z[o3] */ |
174 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ | 224 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
175 | \ | 225 | |
176 | /**/\ | 226 | /**/ |
177 | /*t2 = MULT32(cPI2_8, t5);*/\ | 227 | /*t2 = MULT32(cPI2_8, t5);*/ |
178 | /*t1 = MULT31(cPI2_8, t6);*/\ | 228 | /*t1 = MULT31(cPI2_8, t6);*/ |
179 | /*t6 = MULT31(cPI2_8, r_re);*/\ | 229 | /*t6 = MULT31(cPI2_8, r_re);*/ |
180 | /*t5 = MULT32(cPI2_8, r_im);*/\ | 230 | /*t5 = MULT32(cPI2_8, r_im);*/ |
181 | \ | 231 | |
182 | /*t1 = ( t1 + (t2<<1) );*/\ | 232 | /*t1 = ( t1 + (t2<<1) );*/ |
183 | /*t2 = ( t1 - (t2<<2) );*/\ | 233 | /*t2 = ( t1 - (t2<<2) );*/ |
184 | /*t6 = ( t6 + (t5<<1) );*/\ | 234 | /*t6 = ( t6 + (t5<<1) );*/ |
185 | /*t5 = ( t6 - (t5<<2) );*/\ | 235 | /*t5 = ( t6 - (t5<<2) );*/ |
186 | /**/\ | 236 | /**/ |
187 | t2 = MULT31(cPI2_8, t5);\ | 237 | t2 = MULT31(cPI2_8, t5); |
188 | t6 = MULT31(cPI2_8, t6);\ | 238 | t6 = MULT31(cPI2_8, t6); |
189 | r_re = MULT31(cPI2_8, r_re);\ | 239 | r_re = MULT31(cPI2_8, r_re); |
190 | t5 = MULT31(cPI2_8, r_im);\ | 240 | t5 = MULT31(cPI2_8, r_im); |
191 | \ | 241 | |
192 | t1 = ( t6 + t2 );\ | 242 | t1 = ( t6 + t2 ); |
193 | t2 = ( t6 - t2 );\ | 243 | t2 = ( t6 - t2 ); |
194 | t6 = ( r_re + t5 );\ | 244 | t6 = ( r_re + t5 ); |
195 | t5 = ( r_re - t5 );\ | 245 | t5 = ( r_re - t5 ); |
196 | \ | 246 | |
197 | BF_OPT(t1, t5, t5, t1);\ | 247 | BF_OPT(t1, t5, t5, t1); |
198 | BF_OPT(t6, t2, t2, t6);\ | 248 | BF_OPT(t6, t2, t2, t6); |
199 | TRANSFORM_POST_STORE( z, n );\ | 249 | { |
250 | register FFTSample rt0temp asm("r4"); | ||
251 | /*{*/ | ||
252 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
253 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
254 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
255 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
256 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
257 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
258 | /*}*/ | ||
259 | z -= n*3; | ||
260 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
261 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
262 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
263 | BF_OPT(t2, r_im, r_im, t2); | ||
264 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
265 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
266 | z += n; | ||
267 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
268 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
269 | BF_OPT(t5, r_re, r_re, t6); | ||
270 | BF_OPT(t6, r_im, r_im, t1); | ||
271 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
272 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
273 | z += n; | ||
274 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
275 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
276 | } | ||
277 | z += n; | ||
278 | |||
279 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
280 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
281 | z -= n*3; | ||
282 | return(z); | ||
200 | } | 283 | } |
201 | 284 | ||
202 | #define TRANSFORM_ZERO( z,n )\ | 285 | static inline FFTComplex* TRANSFORM_ZERO( FFTComplex* z, int n ) |
203 | {\ | 286 | { |
204 | register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ | 287 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"), r_re asm("r8"), r_im asm("r9"); |
205 | \ | 288 | |
206 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ | 289 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ |
207 | asm volatile( "ldmia %[my_z], {%[t1],%[t2]}\n\t":[t1] "=r" (t1), [t2] "=r" (t2):[my_z] "r" (z));\ | 290 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); |
208 | z += n; /* z[o3] */\ | 291 | z += n; /* z[o3] */ |
209 | asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\ | 292 | asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z)); |
210 | \ | 293 | |
211 | BF_OPT(t1, t5, t5, t1);\ | 294 | BF_OPT(t1, t5, t5, r_re); |
212 | BF_OPT(t6, t2, t2, t6);\ | 295 | BF_OPT(t6, t2, r_im, t6); |
213 | TRANSFORM_POST_STORE( z, n );\ | 296 | { |
297 | register FFTSample rt0temp asm("r4"); | ||
298 | /*{*/ | ||
299 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
300 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
301 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
302 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
303 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
304 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
305 | /*}*/ | ||
306 | z -= n*3; | ||
307 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
308 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
309 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
310 | BF_OPT(t2, r_im, r_im, t2); | ||
311 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
312 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
313 | z += n; | ||
314 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
315 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
316 | BF_OPT(t5, r_re, r_re, t6); | ||
317 | BF_OPT(t6, r_im, r_im, t1); | ||
318 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
319 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
320 | z += n; | ||
321 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
322 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
323 | } | ||
324 | z += n; | ||
325 | |||
326 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
327 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
328 | z -= n*3; | ||
329 | return(z); | ||
214 | } | 330 | } |
215 | 331 | ||
216 | #define FFT_FFMPEG_INCL_OPTIMISED_FFT4 | 332 | #define FFT_FFMPEG_INCL_OPTIMISED_FFT4 |
217 | #define fft4(z_arg)\ | 333 | static inline FFTComplex* fft4(FFTComplex * z) |
218 | {\ | 334 | { |
219 | /* input[0..7] -> output[0..7] */\ | 335 | FFTSample temp; |
220 | fixed32 * m = (fixed32 *) ( ( z_arg ) );\ | 336 | |
221 | /* load r1=z[0],r2=z[1],...,r8=z[7] */\ | 337 | /* input[0..7] -> output[0..7] */ |
222 | asm volatile(\ | 338 | /* load r1=z[0],r2=z[1],...,r8=z[7] */ |
223 | "ldmia %[z], {r1-r8}\n\t"\ | 339 | asm volatile( |
224 | "add r1,r1,r3\n\t" /* r1 :=t1 */\ | 340 | "ldmia %[z], {r1-r8}\n\t" |
225 | "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */\ | 341 | "add r1,r1,r3\n\t" /* r1 :=t1 */ |
226 | "sub r7,r7,r5\n\t" /* r10:=t8 */\ | 342 | "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */ |
227 | "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */\ | 343 | "sub r7,r7,r5\n\t" /* r10:=t8 */ |
228 | \ | 344 | "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */ |
229 | "add r1,r1,r5\n\t" /* r1 = o[0] */\ | 345 | |
230 | "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */\ | 346 | "add r1,r1,r5\n\t" /* r1 = o[0] */ |
231 | \ | 347 | "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */ |
232 | "add r2,r2,r4\n\t" /* r2 :=t2 */\ | 348 | |
233 | "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */\ | 349 | "add r2,r2,r4\n\t" /* r2 :=t2 */ |
234 | \ | 350 | "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */ |
235 | "add r12,r6,r8\n\t" /* r10:=t5 */\ | 351 | |
236 | "sub r6,r6,r8\n\t" /* r6 :=t7 */\ | 352 | "add %[temp],r6,r8\n\t" /* r10:=t5 */ |
237 | \ | 353 | "sub r6,r6,r8\n\t" /* r6 :=t7 */ |
238 | "sub r8,r4,r7\n\t" /* r8 = o[7]*/ \ | 354 | |
239 | "add r4,r4,r7\n\t" /* r4 = o[3]*/ \ | 355 | "sub r8,r4,r7\n\t" /* r8 = o[7]*/ |
240 | "sub r7,r3,r6\n\t" /* r7 = o[6]*/ \ | 356 | "add r4,r4,r7\n\t" /* r4 = o[3]*/ |
241 | "add r3,r3,r6\n\t" /* r3 = o[2]*/ \ | 357 | "sub r7,r3,r6\n\t" /* r7 = o[6]*/ |
242 | "sub r6,r2,r12\n\t" /* r6 = o[5]*/ \ | 358 | "add r3,r3,r6\n\t" /* r3 = o[2]*/ |
243 | "add r2,r2,r12\n\t" /* r2 = o[1]*/ \ | 359 | "sub r6,r2,%[temp]\n\t" /* r6 = o[5]*/ |
244 | \ | 360 | "add r2,r2,%[temp]\n\t" /* r2 = o[1]*/ |
245 | "stmia %[z], {r1-r8}\n\t"\ | 361 | |
246 | : /* outputs */\ | 362 | "stmia %[z]!, {r1-r8}\n\t" |
247 | : /* inputs */ [z] "r" (m)\ | 363 | : /* outputs */ [z] "+r" (z), [temp] "=r" (temp) |
248 | : /* clobbers */\ | 364 | : /* inputs */ |
249 | "r1","r2","r3","r4","r5","r6","r7","r8","r12","memory"\ | 365 | : /* clobbers */ |
250 | );\ | 366 | "r1","r2","r3","r4","r5","r6","r7","r8","memory" |
367 | ); | ||
368 | return z; | ||
251 | } | 369 | } |
252 | 370 | ||
253 | |||
254 | #define FFT_FFMPEG_INCL_OPTIMISED_FFT8 | 371 | #define FFT_FFMPEG_INCL_OPTIMISED_FFT8 |
255 | /* The chunk of asm below is equivalent to the following: | 372 | /* The chunk of asm below is equivalent to the following: |
256 | 373 | ||
@@ -279,12 +396,14 @@ | |||
279 | // Finally save out z[4].re, z[4].im, z[0].re and z[0].im | 396 | // Finally save out z[4].re, z[4].im, z[0].re and z[0].im |
280 | // ... | 397 | // ... |
281 | */ | 398 | */ |
282 | static inline void fft8( FFTComplex * z ) | 399 | static inline void fft8(FFTComplex * z) |
283 | { | 400 | { |
284 | fft4(z); | 401 | FFTComplex* m4 = fft4(z); |
285 | { | 402 | { |
286 | FFTSample temp; | 403 | /* note that we increment z_ptr on the final stmia, which |
287 | fixed32 * m4 = (fixed32 *)(&(z[4].re)); | 404 | leaves z_ptr pointing to z[1].re ready for the Transform step */ |
405 | |||
406 | register FFTSample temp; | ||
288 | 407 | ||
289 | asm volatile( | 408 | asm volatile( |
290 | /* read in z[4].re thru z[7].im */ | 409 | /* read in z[4].re thru z[7].im */ |
@@ -323,18 +442,15 @@ static inline void fft8( FFTComplex * z ) | |||
323 | "add r8,r8,r2\n\t" | 442 | "add r8,r8,r2\n\t" |
324 | "sub r2,r8,r2,lsl #1\n\t" | 443 | "sub r2,r8,r2,lsl #1\n\t" |
325 | 444 | ||
326 | "stmia %[z_ptr],{r7,r8}\n\t" /* write out z[0].re, z[0].im */ | 445 | "stmia %[z_ptr]!,{r7,r8}\n\t" /* write out z[0].re, z[0].im */ |
327 | "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */ | 446 | "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */ |
328 | : [z4_ptr] "+r" (m4), [temp] "=r" (temp) | 447 | : [z4_ptr] "+r" (m4), [temp] "=r" (temp), [z_ptr] "+r" (z) |
329 | : [z_ptr] "r" (z) | 448 | : |
330 | : "r1","r2","r3","r4","r5","r6","r7","r8","memory" | 449 | : "r1","r2","r3","r4","r5","r6","r7","r8","memory" |
331 | ); | 450 | ); |
332 | } | 451 | } |
333 | 452 | ||
334 | z++; | ||
335 | TRANSFORM_EQUAL(z,2); | 453 | TRANSFORM_EQUAL(z,2); |
336 | } | 454 | } |
337 | 455 | ||
338 | |||
339 | #endif // CPU_ARM | 456 | #endif // CPU_ARM |
340 | |||