summary | refs | log | tree | commit | diff
path: root/apps
diff options
context:
space:
mode:
Diffstat (limited to 'apps')
-rw-r--r-- apps/codecs/lib/fft-ffmpeg.c 32
-rw-r--r-- apps/codecs/lib/fft-ffmpeg_arm.h 490
2 files changed, 319 insertions, 203 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg.c b/apps/codecs/lib/fft-ffmpeg.c
index a5ffab9086..c00abde694 100644
--- a/apps/codecs/lib/fft-ffmpeg.c
+++ b/apps/codecs/lib/fft-ffmpeg.c
@@ -202,7 +202,7 @@ static void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
202*/ 202*/
203 203
204#ifndef FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM 204#ifndef FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM
205static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim) 205static inline FFTComplex* TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim)
206{ 206{
207 register FFTSample t1,t2,t5,t6,r_re,r_im; 207 register FFTSample t1,t2,t5,t6,r_re,r_im;
208 r_re = z[n*2].re; 208 r_re = z[n*2].re;
@@ -212,9 +212,10 @@ static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTS
212 r_im = z[n*3].im; 212 r_im = z[n*3].im;
213 XNPROD31_R(r_re, r_im, wre, wim, t5,t6); 213 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
214 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); 214 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
215 return z+1;
215} 216}
216 217
217static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w) 218static inline FFTComplex* TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w)
218{ 219{
219 register const FFTSample wre=w[0],wim=w[1]; 220 register const FFTSample wre=w[0],wim=w[1];
220 register FFTSample t1,t2,t5,t6,r_re,r_im; 221 register FFTSample t1,t2,t5,t6,r_re,r_im;
@@ -225,9 +226,10 @@ static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample
225 r_im = z[n*3].im; 226 r_im = z[n*3].im;
226 XNPROD31_R(r_re, r_im, wre, wim, t5,t6); 227 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
227 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); 228 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
229 return z+1;
228} 230}
229 231
230static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w) 232static inline FFTComplex* TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w)
231{ 233{
232 register const FFTSample wim=w[0],wre=w[1]; 234 register const FFTSample wim=w[0],wre=w[1];
233 register FFTSample t1,t2,t5,t6,r_re,r_im; 235 register FFTSample t1,t2,t5,t6,r_re,r_im;
@@ -238,9 +240,10 @@ static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample
238 r_im = z[n*3].im; 240 r_im = z[n*3].im;
239 XNPROD31_R(r_re, r_im, wre, wim, t5,t6); 241 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
240 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); 242 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
243 return z+1;
241} 244}
242 245
243static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n) 246static inline FFTComplex* TRANSFORM_EQUAL(FFTComplex * z, unsigned int n)
244{ 247{
245 register FFTSample t1,t2,t5,t6,temp1,temp2; 248 register FFTSample t1,t2,t5,t6,temp1,temp2;
246 register FFTSample * my_z = (FFTSample *)(z); 249 register FFTSample * my_z = (FFTSample *)(z);
@@ -256,9 +259,10 @@ static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n)
256 t5 = ( temp2 - t5 ); 259 t5 = ( temp2 - t5 );
257 my_z -= n*6; 260 my_z -= n*6;
258 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); 261 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
262 return z+1;
259} 263}
260 264
261static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n) 265static inline FFTComplex* TRANSFORM_ZERO(FFTComplex * z, unsigned int n)
262{ 266{
263 FFTSample t1,t2,t5,t6; 267 FFTSample t1,t2,t5,t6;
264 t1 = z[n*2].re; 268 t1 = z[n*2].re;
@@ -266,6 +270,7 @@ static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n)
266 t5 = z[n*3].re; 270 t5 = z[n*3].re;
267 t6 = z[n*3].im; 271 t6 = z[n*3].im;
268 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]); 272 BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
273 return z+1;
269} 274}
270#endif 275#endif
271 276
@@ -282,17 +287,14 @@ void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg)
282 register const FFTSample *w_end = sincos_lookup0+1024; 287 register const FFTSample *w_end = sincos_lookup0+1024;
283 288
284 /* first two are special (well, first one is special, but we need to do pairs) */ 289 /* first two are special (well, first one is special, but we need to do pairs) */
285 TRANSFORM_ZERO(z,n); 290 z = TRANSFORM_ZERO(z,n);
286 z++; 291 z = TRANSFORM_W10(z,n,w);
287 TRANSFORM_W10(z,n,w);
288 w += STEP; 292 w += STEP;
289 /* first pass forwards through sincos_lookup0*/ 293 /* first pass forwards through sincos_lookup0*/
290 do { 294 do {
291 z++; 295 z = TRANSFORM_W10(z,n,w);
292 TRANSFORM_W10(z,n,w);
293 w += STEP; 296 w += STEP;
294 z++; 297 z = TRANSFORM_W10(z,n,w);
295 TRANSFORM_W10(z,n,w);
296 w += STEP; 298 w += STEP;
297 } while(LIKELY(w < w_end)); 299 } while(LIKELY(w < w_end));
298 /* second half: pass backwards through sincos_lookup0*/ 300 /* second half: pass backwards through sincos_lookup0*/
@@ -300,11 +302,9 @@ void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg)
300 w_end=sincos_lookup0; 302 w_end=sincos_lookup0;
301 while(LIKELY(w>w_end)) 303 while(LIKELY(w>w_end))
302 { 304 {
303 z++; 305 z = TRANSFORM_W01(z,n,w);
304 TRANSFORM_W01(z,n,w);
305 w -= STEP; 306 w -= STEP;
306 z++; 307 z = TRANSFORM_W01(z,n,w);
307 TRANSFORM_W01(z,n,w);
308 w -= STEP; 308 w -= STEP;
309 } 309 }
310} 310}
diff --git a/apps/codecs/lib/fft-ffmpeg_arm.h b/apps/codecs/lib/fft-ffmpeg_arm.h
index 9d396a3fc0..073ad8ee46 100644
--- a/apps/codecs/lib/fft-ffmpeg_arm.h
+++ b/apps/codecs/lib/fft-ffmpeg_arm.h
@@ -43,6 +43,7 @@
43 y = x - (b<<1);\ 43 y = x - (b<<1);\
44} 44}
45 45
46
46/* standard BUTTERFLIES package. Note, we actually manually inline this 47/* standard BUTTERFLIES package. Note, we actually manually inline this
47 in all the TRANSFORM macros below anyway */ 48 in all the TRANSFORM macros below anyway */
48#define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES 49#define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES
@@ -59,198 +60,314 @@
59 60
60#define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM 61#define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM
61 62
62/* on ARM, all the TRANSFORM_etc inlines use the following registers: 63static inline FFTComplex* TRANSFORM( FFTComplex* z, int n, FFTSample wre, FFTSample wim )
63 r5,r6,r7,r8,r9,r10,r4,r12 64{
64 65 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
65 inputs are: z, n, STEP 66 z += n*2; /* z[o2] */
66 67 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
67 NOTE THAT THESE MACROS ACTUALLY CHANGE z INPUT INPLACE- 68 XPROD31_R(r_re, r_im, wre, wim, t1,t2);
68 so sequential actions, z += n*3, z -= n*2 etc etc matter 69
69*/ 70 z += n; /* z[o3] */
70 71 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
72 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
73
74 BF_OPT(t1, t5, t5, t1);
75 BF_OPT(t6, t2, t2, t6);
71 76
72#define TRANSFORM_POST_STORE( z, n ) {\ 77 {
73 /*{*/\ 78 register FFTSample rt0temp asm("r4");
74 /* BF_OPT(t1, t5, t5, t1);*/\ 79 /*{*/
75 /* BF_OPT(t6, t2, t2, t6);*/\ 80 /* BF_OPT(t1, t5, t5, t1);*/
76 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/\ 81 /* BF_OPT(t6, t2, t2, t6);*/
77 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/\ 82 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
78 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/\ 83 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
79 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/\ 84 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
80 /*}*/\ 85 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
81 z -= n*3;\ 86 /*}*/
82 /* r_re = my_z[0]; r_im = my_z[1]; */\ 87 z -= n*3;
83 {\ 88 /* r_re = my_z[0]; r_im = my_z[1]; */
84 register FFTSample rt0temp asm("r4");\ 89 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
85 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 90 BF_OPT(rt0temp, r_re, r_re, t5);
86 BF_OPT(rt0temp, r_re, r_re, t5);\ 91 BF_OPT(t2, r_im, r_im, t2);
87 BF_OPT(t2, r_im, r_im, t2);\ 92 /* my_z[0] = r_re; my_z[1] = r_im; */
88 /* my_z[0] = r_re; my_z[1] = r_im; */\ 93 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory" );
89 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\ 94 z += n;
90 z += n;\ 95 /* r_re = my_z[0]; r_im = my_z[1]; */
91 /* r_re = my_z[0]; r_im = my_z[1]; */\ 96 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
92 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 97 BF_OPT(t5, r_re, r_re, t6);
93 BF_OPT(t5, r_re, r_re, t6);\ 98 BF_OPT(t6, r_im, r_im, t1);
94 BF_OPT(t6, r_im, r_im, t1);\ 99 /* my_z[0] = r_re; my_z[1] = r_im; */
95 /* my_z[0] = r_re; my_z[1] = r_im; */\ 100 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
96 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\ 101 z += n;
97 z += n;\ 102 /* my_z[0] = rt0temp; my_z[1] = t2; */
98 /* my_z[0] = rt0temp; my_z[1] = t2; */\ 103 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
99 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2));\ 104 }
100 z += n;\ 105 z += n;
101 }\ 106
102 /* my_z[0] = t5; my_z[1] = t6; */\ 107 /* my_z[0] = t5; my_z[1] = t6; */
103 asm volatile( "stmia %[my_z], {%[t5],%[t6]}\n\t"::[my_z] "r" (z), [t5] "r" (t5), [t6] "r" (t6));\ 108 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
104 z -= n*3;\ 109 z -= n*3;
110 return(z);
105} 111}
106 112
107#define TRANSFORM( z, n, wre_arg, wim_arg )\ 113static inline FFTComplex* TRANSFORM_W01( FFTComplex* z, int n, const FFTSample* w )
108{\ 114{
109 FFTSample wre = wre_arg, wim = wim_arg;\ 115 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
110 register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ 116
111 z += n*2; /* z[o2] */\ 117 /* load wre,wim into t5,t6 */
112 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 118 asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (t5), [wim] "=r" (t6):[w] "r" (w));
113 XPROD31_R(r_re, r_im, wre, wim, t1,t2);\ 119 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
114 \ 120 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
115 z += n; /* z[o3] */\ 121 XPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t1,t2);
116 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
117 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
118 \
119 BF_OPT(t1, t5, t5, t1);\
120 BF_OPT(t6, t2, t2, t6);\
121 TRANSFORM_POST_STORE( z, n );\
122}
123 122
124#define TRANSFORM_W01( z, n, w )\ 123 z += n; /* z[o3] */
125{\ 124 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
126 register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ 125 XNPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t5,t6);
127 \ 126
128 {\ 127 BF_OPT(t1, t5, t5, t1);
129 register FFTSample wre asm("r4"),wim asm("r12");\ 128 BF_OPT(t6, t2, t2, t6);
130 asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (wre), [wim] "=r" (wim):[w] "r" (w));\ 129 {
131 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ 130 register FFTSample rt0temp asm("r4");
132 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 131 /*{*/
133 XPROD31_R(r_re, r_im, wre, wim, t1,t2);\ 132 /* BF_OPT(t1, t5, t5, t1);*/
134\ 133 /* BF_OPT(t6, t2, t2, t6);*/
135 z += n; /* z[o3] */\ 134 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
136 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 135 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
137 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\ 136 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
138 }\ 137 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
139 \ 138 /*}*/
140 BF_OPT(t1, t5, t5, t1);\ 139 z -= n*3;
141 BF_OPT(t6, t2, t2, t6);\ 140 /* r_re = my_z[0]; r_im = my_z[1]; */
142 TRANSFORM_POST_STORE( z, n );\ 141 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
142 BF_OPT(rt0temp, r_re, r_re, t5);
143 BF_OPT(t2, r_im, r_im, t2);
144 /* my_z[0] = r_re; my_z[1] = r_im; */
145 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
146 z += n;
147 /* r_re = my_z[0]; r_im = my_z[1]; */
148 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
149 BF_OPT(t5, r_re, r_re, t6);
150 BF_OPT(t6, r_im, r_im, t1);
151 /* my_z[0] = r_re; my_z[1] = r_im; */
152 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
153 z += n;
154 /* my_z[0] = rt0temp; my_z[1] = t2; */
155 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
156 }
157 z += n;
158
159 /* my_z[0] = t5; my_z[1] = t6; */
160 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
161 z -= n*3;
162 return(z);
143} 163}
144 164
145//static inline void TRANSFORM_W10(int32_t * z, unsigned int n, const int32_t * w) 165static inline FFTComplex* TRANSFORM_W10( FFTComplex* z, int n, const FFTSample* w )
146#define TRANSFORM_W10( z, n, w )\ 166{
147{\ 167 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
148 register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ 168
149 \ 169 /* load wim,wre into t5,t6 */
150 {\ 170 asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (t5), [wre] "=r" (t6):[w] "r" (w));
151 register FFTSample wim asm("r4"),wre asm("r12");\ 171 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
152 asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (wim), [wre] "=r" (wre):[w] "r" (w));\ 172 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
153 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ 173 XPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t1,t2);
154 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 174
155 XPROD31_R(r_re, r_im, wre, wim, t1,t2);\ 175 z += n; /* z[o3] */
156\ 176 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
157 z += n; /* z[o3] */\ 177 XNPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t5,t6);
158 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 178
159 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\ 179 BF_OPT(t1, t5, t5, t1);
160 }\ 180 BF_OPT(t6, t2, t2, t6);
161 \ 181 {
162 BF_OPT(t1, t5, t5, t1);\ 182 register FFTSample rt0temp asm("r4");
163 BF_OPT(t6, t2, t2, t6);\ 183 /*{*/
164 TRANSFORM_POST_STORE( z, n );\ 184 /* BF_OPT(t1, t5, t5, t1);*/
185 /* BF_OPT(t6, t2, t2, t6);*/
186 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
187 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
188 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
189 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
190 /*}*/
191 z -= n*3;
192 /* r_re = my_z[0]; r_im = my_z[1]; */
193 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
194 BF_OPT(rt0temp, r_re, r_re, t5);
195 BF_OPT(t2, r_im, r_im, t2);
196 /* my_z[0] = r_re; my_z[1] = r_im; */
197 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
198 z += n;
199 /* r_re = my_z[0]; r_im = my_z[1]; */
200 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
201 BF_OPT(t5, r_re, r_re, t6);
202 BF_OPT(t6, r_im, r_im, t1);
203 /* my_z[0] = r_re; my_z[1] = r_im; */
204 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
205 z += n;
206 /* my_z[0] = rt0temp; my_z[1] = t2; */
207 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
208 }
209 z += n;
210
211 /* my_z[0] = t5; my_z[1] = t6; */
212 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
213 z -= n*3;
214 return(z);
165} 215}
166 216
167#define TRANSFORM_EQUAL( z, n )\ 217static inline FFTComplex* TRANSFORM_EQUAL( FFTComplex* z, int n )
168{\ 218{
169 register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ 219 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
170\ 220
171 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ 221 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
172 asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\ 222 asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));
173 z += n; /* z[o3] */\ 223 z += n; /* z[o3] */
174 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\ 224 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
175\ 225
176/**/\ 226/**/
177/*t2 = MULT32(cPI2_8, t5);*/\ 227/*t2 = MULT32(cPI2_8, t5);*/
178/*t1 = MULT31(cPI2_8, t6);*/\ 228/*t1 = MULT31(cPI2_8, t6);*/
179/*t6 = MULT31(cPI2_8, r_re);*/\ 229/*t6 = MULT31(cPI2_8, r_re);*/
180/*t5 = MULT32(cPI2_8, r_im);*/\ 230/*t5 = MULT32(cPI2_8, r_im);*/
181\ 231
182/*t1 = ( t1 + (t2<<1) );*/\ 232/*t1 = ( t1 + (t2<<1) );*/
183/*t2 = ( t1 - (t2<<2) );*/\ 233/*t2 = ( t1 - (t2<<2) );*/
184/*t6 = ( t6 + (t5<<1) );*/\ 234/*t6 = ( t6 + (t5<<1) );*/
185/*t5 = ( t6 - (t5<<2) );*/\ 235/*t5 = ( t6 - (t5<<2) );*/
186/**/\ 236/**/
187 t2 = MULT31(cPI2_8, t5);\ 237 t2 = MULT31(cPI2_8, t5);
188 t6 = MULT31(cPI2_8, t6);\ 238 t6 = MULT31(cPI2_8, t6);
189 r_re = MULT31(cPI2_8, r_re);\ 239 r_re = MULT31(cPI2_8, r_re);
190 t5 = MULT31(cPI2_8, r_im);\ 240 t5 = MULT31(cPI2_8, r_im);
191 \ 241
192 t1 = ( t6 + t2 );\ 242 t1 = ( t6 + t2 );
193 t2 = ( t6 - t2 );\ 243 t2 = ( t6 - t2 );
194 t6 = ( r_re + t5 );\ 244 t6 = ( r_re + t5 );
195 t5 = ( r_re - t5 );\ 245 t5 = ( r_re - t5 );
196 \ 246
197 BF_OPT(t1, t5, t5, t1);\ 247 BF_OPT(t1, t5, t5, t1);
198 BF_OPT(t6, t2, t2, t6);\ 248 BF_OPT(t6, t2, t2, t6);
199 TRANSFORM_POST_STORE( z, n );\ 249 {
250 register FFTSample rt0temp asm("r4");
251 /*{*/
252 /* BF_OPT(t1, t5, t5, t1);*/
253 /* BF_OPT(t6, t2, t2, t6);*/
254 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
255 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
256 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
257 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
258 /*}*/
259 z -= n*3;
260 /* r_re = my_z[0]; r_im = my_z[1]; */
261 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
262 BF_OPT(rt0temp, r_re, r_re, t5);
263 BF_OPT(t2, r_im, r_im, t2);
264 /* my_z[0] = r_re; my_z[1] = r_im; */
265 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
266 z += n;
267 /* r_re = my_z[0]; r_im = my_z[1]; */
268 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
269 BF_OPT(t5, r_re, r_re, t6);
270 BF_OPT(t6, r_im, r_im, t1);
271 /* my_z[0] = r_re; my_z[1] = r_im; */
272 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
273 z += n;
274 /* my_z[0] = rt0temp; my_z[1] = t2; */
275 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
276 }
277 z += n;
278
279 /* my_z[0] = t5; my_z[1] = t6; */
280 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
281 z -= n*3;
282 return(z);
200} 283}
201 284
202#define TRANSFORM_ZERO( z,n )\ 285static inline FFTComplex* TRANSFORM_ZERO( FFTComplex* z, int n )
203{\ 286{
204 register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\ 287 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"), r_re asm("r8"), r_im asm("r9");
205\ 288
206 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\ 289 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
207 asm volatile( "ldmia %[my_z], {%[t1],%[t2]}\n\t":[t1] "=r" (t1), [t2] "=r" (t2):[my_z] "r" (z));\ 290 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
208 z += n; /* z[o3] */\ 291 z += n; /* z[o3] */
209 asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\ 292 asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));
210\ 293
211 BF_OPT(t1, t5, t5, t1);\ 294 BF_OPT(t1, t5, t5, r_re);
212 BF_OPT(t6, t2, t2, t6);\ 295 BF_OPT(t6, t2, r_im, t6);
213 TRANSFORM_POST_STORE( z, n );\ 296 {
297 register FFTSample rt0temp asm("r4");
298 /*{*/
299 /* BF_OPT(t1, t5, t5, t1);*/
300 /* BF_OPT(t6, t2, t2, t6);*/
301 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
302 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
303 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
304 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
305 /*}*/
306 z -= n*3;
307 /* r_re = my_z[0]; r_im = my_z[1]; */
308 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
309 BF_OPT(rt0temp, r_re, r_re, t5);
310 BF_OPT(t2, r_im, r_im, t2);
311 /* my_z[0] = r_re; my_z[1] = r_im; */
312 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
313 z += n;
314 /* r_re = my_z[0]; r_im = my_z[1]; */
315 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
316 BF_OPT(t5, r_re, r_re, t6);
317 BF_OPT(t6, r_im, r_im, t1);
318 /* my_z[0] = r_re; my_z[1] = r_im; */
319 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
320 z += n;
321 /* my_z[0] = rt0temp; my_z[1] = t2; */
322 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
323 }
324 z += n;
325
326 /* my_z[0] = t5; my_z[1] = t6; */
327 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
328 z -= n*3;
329 return(z);
214} 330}
215 331
216#define FFT_FFMPEG_INCL_OPTIMISED_FFT4 332#define FFT_FFMPEG_INCL_OPTIMISED_FFT4
217#define fft4(z_arg)\ 333static inline FFTComplex* fft4(FFTComplex * z)
218{\ 334{
219 /* input[0..7] -> output[0..7] */\ 335 FFTSample temp;
220 fixed32 * m = (fixed32 *) ( ( z_arg ) );\ 336
221 /* load r1=z[0],r2=z[1],...,r8=z[7] */\ 337 /* input[0..7] -> output[0..7] */
222 asm volatile(\ 338 /* load r1=z[0],r2=z[1],...,r8=z[7] */
223 "ldmia %[z], {r1-r8}\n\t"\ 339 asm volatile(
224 "add r1,r1,r3\n\t" /* r1 :=t1 */\ 340 "ldmia %[z], {r1-r8}\n\t"
225 "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */\ 341 "add r1,r1,r3\n\t" /* r1 :=t1 */
226 "sub r7,r7,r5\n\t" /* r10:=t8 */\ 342 "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */
227 "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */\ 343 "sub r7,r7,r5\n\t" /* r10:=t8 */
228 \ 344 "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */
229 "add r1,r1,r5\n\t" /* r1 = o[0] */\ 345
230 "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */\ 346 "add r1,r1,r5\n\t" /* r1 = o[0] */
231 \ 347 "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */
232 "add r2,r2,r4\n\t" /* r2 :=t2 */\ 348
233 "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */\ 349 "add r2,r2,r4\n\t" /* r2 :=t2 */
234 \ 350 "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */
235 "add r12,r6,r8\n\t" /* r10:=t5 */\ 351
236 "sub r6,r6,r8\n\t" /* r6 :=t7 */\ 352 "add %[temp],r6,r8\n\t" /* r10:=t5 */
237 \ 353 "sub r6,r6,r8\n\t" /* r6 :=t7 */
238 "sub r8,r4,r7\n\t" /* r8 = o[7]*/ \ 354
239 "add r4,r4,r7\n\t" /* r4 = o[3]*/ \ 355 "sub r8,r4,r7\n\t" /* r8 = o[7]*/
240 "sub r7,r3,r6\n\t" /* r7 = o[6]*/ \ 356 "add r4,r4,r7\n\t" /* r4 = o[3]*/
241 "add r3,r3,r6\n\t" /* r3 = o[2]*/ \ 357 "sub r7,r3,r6\n\t" /* r7 = o[6]*/
242 "sub r6,r2,r12\n\t" /* r6 = o[5]*/ \ 358 "add r3,r3,r6\n\t" /* r3 = o[2]*/
243 "add r2,r2,r12\n\t" /* r2 = o[1]*/ \ 359 "sub r6,r2,%[temp]\n\t" /* r6 = o[5]*/
244 \ 360 "add r2,r2,%[temp]\n\t" /* r2 = o[1]*/
245 "stmia %[z], {r1-r8}\n\t"\ 361
246 : /* outputs */\ 362 "stmia %[z]!, {r1-r8}\n\t"
247 : /* inputs */ [z] "r" (m)\ 363 : /* outputs */ [z] "+r" (z), [temp] "=r" (temp)
248 : /* clobbers */\ 364 : /* inputs */
249 "r1","r2","r3","r4","r5","r6","r7","r8","r12","memory"\ 365 : /* clobbers */
250 );\ 366 "r1","r2","r3","r4","r5","r6","r7","r8","memory"
367 );
368 return z;
251} 369}
252 370
253
254#define FFT_FFMPEG_INCL_OPTIMISED_FFT8 371#define FFT_FFMPEG_INCL_OPTIMISED_FFT8
255 /* The chunk of asm below is equivalent to the following: 372 /* The chunk of asm below is equivalent to the following:
256 373
@@ -279,12 +396,14 @@
279 // Finally save out z[4].re, z[4].im, z[0].re and z[0].im 396 // Finally save out z[4].re, z[4].im, z[0].re and z[0].im
280 // ... 397 // ...
281 */ 398 */
282static inline void fft8( FFTComplex * z ) 399static inline void fft8(FFTComplex * z)
283{ 400{
284 fft4(z); 401 FFTComplex* m4 = fft4(z);
285 { 402 {
286 FFTSample temp; 403 /* note that we increment z_ptr on the final stmia, which
287 fixed32 * m4 = (fixed32 *)(&(z[4].re)); 404 leaves z_ptr pointing to z[1].re ready for the Transform step */
405
406 register FFTSample temp;
288 407
289 asm volatile( 408 asm volatile(
290 /* read in z[4].re thru z[7].im */ 409 /* read in z[4].re thru z[7].im */
@@ -323,18 +442,15 @@ static inline void fft8( FFTComplex * z )
323 "add r8,r8,r2\n\t" 442 "add r8,r8,r2\n\t"
324 "sub r2,r8,r2,lsl #1\n\t" 443 "sub r2,r8,r2,lsl #1\n\t"
325 444
326 "stmia %[z_ptr],{r7,r8}\n\t" /* write out z[0].re, z[0].im */ 445 "stmia %[z_ptr]!,{r7,r8}\n\t" /* write out z[0].re, z[0].im */
327 "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */ 446 "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */
328 : [z4_ptr] "+r" (m4), [temp] "=r" (temp) 447 : [z4_ptr] "+r" (m4), [temp] "=r" (temp), [z_ptr] "+r" (z)
329 : [z_ptr] "r" (z) 448 :
330 : "r1","r2","r3","r4","r5","r6","r7","r8","memory" 449 : "r1","r2","r3","r4","r5","r6","r7","r8","memory"
331 ); 450 );
332 } 451 }
333 452
334 z++;
335 TRANSFORM_EQUAL(z,2); 453 TRANSFORM_EQUAL(z,2);
336} 454}
337 455
338
339#endif // CPU_ARM 456#endif // CPU_ARM
340