author      Dave Hooper <dave@beermex.com>  2010-10-12 23:29:17 +0000
committer   Dave Hooper <dave@beermex.com>  2010-10-12 23:29:17 +0000
commit      a5b17b45113bc023367b5b470d634fec66c8b374 (patch)
tree        1fbd90be7c1883c1d97ac782345f4e1809f3eb13
parent      986910175cfc8a91668c54274a660ec853935a6b (diff)
download    rockbox-a5b17b45113bc023367b5b470d634fec66c8b374.tar.gz
            rockbox-a5b17b45113bc023367b5b470d634fec66c8b374.zip
Rearrange and remove some manual register assignments, make use of pointer address increments for free on ARM, and remove macros in favour of explicit inline functions. Also add memory clobbers to all uses of stm in inline asm. This appears to resolve issues with codeclib failing to work as expected when ARM_ASM optimisations are used on targets with native position-independent code (e.g. Android Rockbox targets, but also Tremor on non-Rockbox targets patched with the fastermdct patches).
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28262 a1c6a512-1295-4272-9138-f99709370657
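
Note on the memory clobbers mentioned above: GCC treats an asm statement with no "memory" clobber as touching no memory it knows about, so it may cache values in registers across an stm or reorder surrounding loads and stores. A minimal sketch of the pattern (illustrative helper, not code from this commit):

    #include <stdint.h>

    /* Pin the pair to ascending registers so the stm register list is
       valid -- the same trick the rockbox sources use with asm("rN")
       bindings on register variables. */
    static inline void store_pair(int32_t *p, int32_t re, int32_t im)
    {
        register int32_t a asm("r2") = re;
        register int32_t b asm("r3") = im;
        asm volatile("stmia %[ptr], {r2, r3}"
                     : /* no outputs */
                     : [ptr] "r" (p), "r" (a), "r" (b)
                     : "memory"); /* the store is now visible to the optimiser */
    }
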
-rw-r--r--   apps/codecs/lib/fft-ffmpeg.c       32
-rw-r--r--   apps/codecs/lib/fft-ffmpeg_arm.h  490
2 files changed, 319 insertions, 203 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg.c b/apps/codecs/lib/fft-ffmpeg.c
index a5ffab9086..c00abde694 100644
--- a/apps/codecs/lib/fft-ffmpeg.c
+++ b/apps/codecs/lib/fft-ffmpeg.c
@@ -202,7 +202,7 @@ static void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
  */
 
 #ifndef FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM
-static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim)
+static inline FFTComplex* TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim)
 {
     register FFTSample t1,t2,t5,t6,r_re,r_im;
     r_re = z[n*2].re;
@@ -212,9 +212,10 @@ static inline void TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTS
     r_im = z[n*3].im;
     XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
     BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
+    return z+1;
 }
 
-static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w)
+static inline FFTComplex* TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w)
 {
     register const FFTSample wre=w[0],wim=w[1];
     register FFTSample t1,t2,t5,t6,r_re,r_im;
@@ -225,9 +226,10 @@ static inline void TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample
     r_im = z[n*3].im;
     XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
     BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
+    return z+1;
 }
 
-static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w)
+static inline FFTComplex* TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w)
 {
     register const FFTSample wim=w[0],wre=w[1];
     register FFTSample t1,t2,t5,t6,r_re,r_im;
@@ -238,9 +240,10 @@ static inline void TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample
     r_im = z[n*3].im;
     XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
     BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
+    return z+1;
 }
 
-static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n)
+static inline FFTComplex* TRANSFORM_EQUAL(FFTComplex * z, unsigned int n)
 {
     register FFTSample t1,t2,t5,t6,temp1,temp2;
     register FFTSample * my_z = (FFTSample *)(z);
@@ -256,9 +259,10 @@ static inline void TRANSFORM_EQUAL(FFTComplex * z, unsigned int n)
     t5 = ( temp2 - t5 );
     my_z -= n*6;
     BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
+    return z+1;
 }
 
-static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n)
+static inline FFTComplex* TRANSFORM_ZERO(FFTComplex * z, unsigned int n)
 {
     FFTSample t1,t2,t5,t6;
     t1 = z[n*2].re;
@@ -266,6 +270,7 @@ static inline void TRANSFORM_ZERO(FFTComplex * z, unsigned int n)
     t5 = z[n*3].re;
     t6 = z[n*3].im;
     BUTTERFLIES(z[0],z[n],z[n*2],z[n*3]);
+    return z+1;
 }
 #endif
 
@@ -282,17 +287,14 @@ void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg)
     register const FFTSample *w_end = sincos_lookup0+1024;
 
     /* first two are special (well, first one is special, but we need to do pairs) */
-    TRANSFORM_ZERO(z,n);
-    z++;
-    TRANSFORM_W10(z,n,w);
+    z = TRANSFORM_ZERO(z,n);
+    z = TRANSFORM_W10(z,n,w);
     w += STEP;
     /* first pass forwards through sincos_lookup0*/
     do {
-        z++;
-        TRANSFORM_W10(z,n,w);
+        z = TRANSFORM_W10(z,n,w);
         w += STEP;
-        z++;
-        TRANSFORM_W10(z,n,w);
+        z = TRANSFORM_W10(z,n,w);
         w += STEP;
     } while(LIKELY(w < w_end));
     /* second half: pass backwards through sincos_lookup0*/
@@ -300,11 +302,9 @@ void pass(FFTComplex *z_arg, unsigned int STEP_arg, unsigned int n_arg)
     w_end=sincos_lookup0;
     while(LIKELY(w>w_end))
     {
-        z++;
-        TRANSFORM_W01(z,n,w);
+        z = TRANSFORM_W01(z,n,w);
         w -= STEP;
-        z++;
-        TRANSFORM_W01(z,n,w);
+        z = TRANSFORM_W01(z,n,w);
         w -= STEP;
     }
 }
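
For reference, the new calling convention above can be sketched as follows (hypothetical stub names, assuming the codec lib's fixed-point complex layout): each TRANSFORM_* step now hands back the advanced pointer, so pass() chains steps instead of interleaving separate z++ statements.

    #include <stdint.h>

    typedef int32_t FFTSample;
    typedef struct { FFTSample re, im; } FFTComplex;

    /* each step works on z[0], z[n], z[2n], z[3n], then returns z+1 */
    static inline FFTComplex* transform_step(FFTComplex *z, unsigned int n)
    {
        /* ... butterfly work ... */
        return z + 1; /* advance is done by the callee, free on ARM */
    }

    static void pass_sketch(FFTComplex *z, unsigned int n, unsigned int reps)
    {
        while (reps--)
            z = transform_step(z, n); /* chained, no separate z++ */
    }
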
diff --git a/apps/codecs/lib/fft-ffmpeg_arm.h b/apps/codecs/lib/fft-ffmpeg_arm.h
index 9d396a3fc0..073ad8ee46 100644
--- a/apps/codecs/lib/fft-ffmpeg_arm.h
+++ b/apps/codecs/lib/fft-ffmpeg_arm.h
@@ -43,6 +43,7 @@
     y = x - (b<<1);\
 }
 
+
 /* standard BUTTERFLIES package. Note, we actually manually inline this
    in all the TRANSFORM macros below anyway */
 #define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES
@@ -59,198 +60,314 @@
 
 #define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM
 
-/* on ARM, all the TRANSFORM_etc inlines use the following registers:
-   r5,r6,r7,r8,r9,r10,r4,r12
-
-   inputs are: z, n, STEP
-
-   NOTE THAT THESE MACROS ACTUALLY CHANGE z INPUT INPLACE-
-   so sequential actions, z += n*3, z -= n*2 etc etc matter
-*/
-
+static inline FFTComplex* TRANSFORM( FFTComplex* z, int n, FFTSample wre, FFTSample wim )
+{
+    register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
+    z += n*2; /* z[o2] */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    XPROD31_R(r_re, r_im, wre, wim, t1,t2);
+
+    z += n; /* z[o3] */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
+
+    BF_OPT(t1, t5, t5, t1);
+    BF_OPT(t6, t2, t2, t6);
 
-#define TRANSFORM_POST_STORE( z, n ) {\
-    /*{*/\
-    /* BF_OPT(t1, t5, t5, t1);*/\
-    /* BF_OPT(t6, t2, t2, t6);*/\
-    /* BF_OPT(a2.re, a0.re, a0.re, t5);*/\
-    /* BF_OPT(a2.im, a0.im, a0.im, t2);*/\
-    /* BF_OPT(a3.re, a1.re, a1.re, t6);*/\
-    /* BF_OPT(a3.im, a1.im, a1.im, t1);*/\
-    /*}*/\
-    z -= n*3;\
-    /* r_re = my_z[0]; r_im = my_z[1]; */\
-    {\
-        register FFTSample rt0temp asm("r4");\
-        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-        BF_OPT(rt0temp, r_re, r_re, t5);\
-        BF_OPT(t2, r_im, r_im, t2);\
-        /* my_z[0] = r_re; my_z[1] = r_im; */\
-        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\
-        z += n;\
-        /* r_re = my_z[0]; r_im = my_z[1]; */\
-        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-        BF_OPT(t5, r_re, r_re, t6);\
-        BF_OPT(t6, r_im, r_im, t1);\
-        /* my_z[0] = r_re; my_z[1] = r_im; */\
-        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\
-        z += n;\
-        /* my_z[0] = rt0temp; my_z[1] = t2; */\
-        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2));\
-        z += n;\
-    }\
-    /* my_z[0] = t5; my_z[1] = t6; */\
-    asm volatile( "stmia %[my_z], {%[t5],%[t6]}\n\t"::[my_z] "r" (z), [t5] "r" (t5), [t6] "r" (t6));\
-    z -= n*3;\
+    {
+        register FFTSample rt0temp asm("r4");
+        /*{*/
+        /* BF_OPT(t1, t5, t5, t1);*/
+        /* BF_OPT(t6, t2, t2, t6);*/
+        /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
+        /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
+        /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
+        /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
+        /*}*/
+        z -= n*3;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(rt0temp, r_re, r_re, t5);
+        BF_OPT(t2, r_im, r_im, t2);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory" );
+        z += n;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(t5, r_re, r_re, t6);
+        BF_OPT(t6, r_im, r_im, t1);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* my_z[0] = rt0temp; my_z[1] = t2; */
+        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
+    }
+    z += n;
+
+    /* my_z[0] = t5; my_z[1] = t6; */
+    asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
+    z -= n*3;
+    return(z);
 }
 
-#define TRANSFORM( z, n, wre_arg, wim_arg )\
-{\
-    FFTSample wre = wre_arg, wim = wim_arg;\
-    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
-    z += n*2; /* z[o2] */\
-    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-    XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
-    \
-    z += n; /* z[o3] */\
-    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-    XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
-    \
-    BF_OPT(t1, t5, t5, t1);\
-    BF_OPT(t6, t2, t2, t6);\
-    TRANSFORM_POST_STORE( z, n );\
-}
+static inline FFTComplex* TRANSFORM_W01( FFTComplex* z, int n, const FFTSample* w )
+{
+    register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
+
+    /* load wre,wim into t5,t6 */
+    asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (t5), [wim] "=r" (t6):[w] "r" (w));
+    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    XPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t1,t2);
 
-#define TRANSFORM_W01( z, n, w )\
-{\
-    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
-    \
-    {\
-        register FFTSample wre asm("r4"),wim asm("r12");\
-        asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (wre), [wim] "=r" (wim):[w] "r" (w));\
-        z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
-        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-        XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
-\
-        z += n; /* z[o3] */\
-        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-        XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
-    }\
-    \
-    BF_OPT(t1, t5, t5, t1);\
-    BF_OPT(t6, t2, t2, t6);\
-    TRANSFORM_POST_STORE( z, n );\
+    z += n; /* z[o3] */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    XNPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t5,t6);
+
+    BF_OPT(t1, t5, t5, t1);
+    BF_OPT(t6, t2, t2, t6);
+    {
+        register FFTSample rt0temp asm("r4");
+        /*{*/
+        /* BF_OPT(t1, t5, t5, t1);*/
+        /* BF_OPT(t6, t2, t2, t6);*/
+        /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
+        /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
+        /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
+        /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
+        /*}*/
+        z -= n*3;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(rt0temp, r_re, r_re, t5);
+        BF_OPT(t2, r_im, r_im, t2);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(t5, r_re, r_re, t6);
+        BF_OPT(t6, r_im, r_im, t1);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* my_z[0] = rt0temp; my_z[1] = t2; */
+        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
+    }
+    z += n;
+
+    /* my_z[0] = t5; my_z[1] = t6; */
+    asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
+    z -= n*3;
+    return(z);
 }
 
-//static inline void TRANSFORM_W10(int32_t * z, unsigned int n, const int32_t * w)
-#define TRANSFORM_W10( z, n, w )\
-{\
-    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
-    \
-    {\
-        register FFTSample wim asm("r4"),wre asm("r12");\
-        asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (wim), [wre] "=r" (wre):[w] "r" (w));\
-        z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
-        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-        XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
-\
-        z += n; /* z[o3] */\
-        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-        XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
-    }\
-    \
-    BF_OPT(t1, t5, t5, t1);\
-    BF_OPT(t6, t2, t2, t6);\
-    TRANSFORM_POST_STORE( z, n );\
+static inline FFTComplex* TRANSFORM_W10( FFTComplex* z, int n, const FFTSample* w )
+{
+    register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
+
+    /* load wim,wre into t5,t6 */
+    asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (t5), [wre] "=r" (t6):[w] "r" (w));
+    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    XPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t1,t2);
+
+    z += n; /* z[o3] */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    XNPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t5,t6);
+
+    BF_OPT(t1, t5, t5, t1);
+    BF_OPT(t6, t2, t2, t6);
+    {
+        register FFTSample rt0temp asm("r4");
+        /*{*/
+        /* BF_OPT(t1, t5, t5, t1);*/
+        /* BF_OPT(t6, t2, t2, t6);*/
+        /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
+        /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
+        /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
+        /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
+        /*}*/
+        z -= n*3;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(rt0temp, r_re, r_re, t5);
+        BF_OPT(t2, r_im, r_im, t2);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(t5, r_re, r_re, t6);
+        BF_OPT(t6, r_im, r_im, t1);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* my_z[0] = rt0temp; my_z[1] = t2; */
+        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
+    }
+    z += n;
+
+    /* my_z[0] = t5; my_z[1] = t6; */
+    asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
+    z -= n*3;
+    return(z);
 }
 
-#define TRANSFORM_EQUAL( z, n )\
-{\
-    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
-\
-    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
-    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\
-    z += n; /* z[o3] */\
-    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
-\
-/**/\
-/*t2 = MULT32(cPI2_8, t5);*/\
-/*t1 = MULT31(cPI2_8, t6);*/\
-/*t6 = MULT31(cPI2_8, r_re);*/\
-/*t5 = MULT32(cPI2_8, r_im);*/\
-\
-/*t1 = ( t1 + (t2<<1) );*/\
-/*t2 = ( t1 - (t2<<2) );*/\
-/*t6 = ( t6 + (t5<<1) );*/\
-/*t5 = ( t6 - (t5<<2) );*/\
-/**/\
-    t2 = MULT31(cPI2_8, t5);\
-    t6 = MULT31(cPI2_8, t6);\
-    r_re = MULT31(cPI2_8, r_re);\
-    t5 = MULT31(cPI2_8, r_im);\
-    \
-    t1 = ( t6 + t2 );\
-    t2 = ( t6 - t2 );\
-    t6 = ( r_re + t5 );\
-    t5 = ( r_re - t5 );\
-    \
-    BF_OPT(t1, t5, t5, t1);\
-    BF_OPT(t6, t2, t2, t6);\
-    TRANSFORM_POST_STORE( z, n );\
+static inline FFTComplex* TRANSFORM_EQUAL( FFTComplex* z, int n )
+{
+    register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
+
+    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
+    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));
+    z += n; /* z[o3] */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+
+/**/
+/*t2 = MULT32(cPI2_8, t5);*/
+/*t1 = MULT31(cPI2_8, t6);*/
+/*t6 = MULT31(cPI2_8, r_re);*/
+/*t5 = MULT32(cPI2_8, r_im);*/
+
+/*t1 = ( t1 + (t2<<1) );*/
+/*t2 = ( t1 - (t2<<2) );*/
+/*t6 = ( t6 + (t5<<1) );*/
+/*t5 = ( t6 - (t5<<2) );*/
+/**/
+    t2 = MULT31(cPI2_8, t5);
+    t6 = MULT31(cPI2_8, t6);
+    r_re = MULT31(cPI2_8, r_re);
+    t5 = MULT31(cPI2_8, r_im);
+
+    t1 = ( t6 + t2 );
+    t2 = ( t6 - t2 );
+    t6 = ( r_re + t5 );
+    t5 = ( r_re - t5 );
+
+    BF_OPT(t1, t5, t5, t1);
+    BF_OPT(t6, t2, t2, t6);
+    {
+        register FFTSample rt0temp asm("r4");
+        /*{*/
+        /* BF_OPT(t1, t5, t5, t1);*/
+        /* BF_OPT(t6, t2, t2, t6);*/
+        /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
+        /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
+        /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
+        /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
+        /*}*/
+        z -= n*3;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(rt0temp, r_re, r_re, t5);
+        BF_OPT(t2, r_im, r_im, t2);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(t5, r_re, r_re, t6);
+        BF_OPT(t6, r_im, r_im, t1);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* my_z[0] = rt0temp; my_z[1] = t2; */
+        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
+    }
+    z += n;
+
+    /* my_z[0] = t5; my_z[1] = t6; */
+    asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
+    z -= n*3;
+    return(z);
 }
 
-#define TRANSFORM_ZERO( z,n )\
-{\
-    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
-\
-    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
-    asm volatile( "ldmia %[my_z], {%[t1],%[t2]}\n\t":[t1] "=r" (t1), [t2] "=r" (t2):[my_z] "r" (z));\
-    z += n; /* z[o3] */\
-    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\
-\
-    BF_OPT(t1, t5, t5, t1);\
-    BF_OPT(t6, t2, t2, t6);\
-    TRANSFORM_POST_STORE( z, n );\
+static inline FFTComplex* TRANSFORM_ZERO( FFTComplex* z, int n )
+{
+    register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"), r_re asm("r8"), r_im asm("r9");
+
+    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
+    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+    z += n; /* z[o3] */
+    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));
+
+    BF_OPT(t1, t5, t5, r_re);
+    BF_OPT(t6, t2, r_im, t6);
+    {
+        register FFTSample rt0temp asm("r4");
+        /*{*/
+        /* BF_OPT(t1, t5, t5, t1);*/
+        /* BF_OPT(t6, t2, t2, t6);*/
+        /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
+        /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
+        /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
+        /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
+        /*}*/
+        z -= n*3;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(rt0temp, r_re, r_re, t5);
+        BF_OPT(t2, r_im, r_im, t2);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* r_re = my_z[0]; r_im = my_z[1]; */
+        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
+        BF_OPT(t5, r_re, r_re, t6);
+        BF_OPT(t6, r_im, r_im, t1);
+        /* my_z[0] = r_re; my_z[1] = r_im; */
+        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
+        z += n;
+        /* my_z[0] = rt0temp; my_z[1] = t2; */
+        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
+    }
+    z += n;
+
+    /* my_z[0] = t5; my_z[1] = t6; */
+    asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
+    z -= n*3;
+    return(z);
 }
 
 #define FFT_FFMPEG_INCL_OPTIMISED_FFT4
-#define fft4(z_arg)\
-{\
-    /* input[0..7] -> output[0..7] */\
-    fixed32 * m = (fixed32 *) ( ( z_arg ) );\
-    /* load r1=z[0],r2=z[1],...,r8=z[7] */\
-    asm volatile(\
-        "ldmia %[z], {r1-r8}\n\t"\
-        "add r1,r1,r3\n\t" /* r1 :=t1 */\
-        "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */\
-        "sub r7,r7,r5\n\t" /* r10:=t8 */\
-        "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */\
-        \
-        "add r1,r1,r5\n\t" /* r1 = o[0] */\
-        "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */\
-        \
-        "add r2,r2,r4\n\t" /* r2 :=t2 */\
-        "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */\
-        \
-        "add r12,r6,r8\n\t" /* r10:=t5 */\
-        "sub r6,r6,r8\n\t" /* r6 :=t7 */\
-        \
-        "sub r8,r4,r7\n\t" /* r8 = o[7]*/ \
-        "add r4,r4,r7\n\t" /* r4 = o[3]*/ \
-        "sub r7,r3,r6\n\t" /* r7 = o[6]*/ \
-        "add r3,r3,r6\n\t" /* r3 = o[2]*/ \
-        "sub r6,r2,r12\n\t" /* r6 = o[5]*/ \
-        "add r2,r2,r12\n\t" /* r2 = o[1]*/ \
-        \
-        "stmia %[z], {r1-r8}\n\t"\
-        : /* outputs */\
-        : /* inputs */ [z] "r" (m)\
-        : /* clobbers */\
-        "r1","r2","r3","r4","r5","r6","r7","r8","r12","memory"\
-    );\
+static inline FFTComplex* fft4(FFTComplex * z)
+{
+    FFTSample temp;
+
+    /* input[0..7] -> output[0..7] */
+    /* load r1=z[0],r2=z[1],...,r8=z[7] */
+    asm volatile(
+        "ldmia %[z], {r1-r8}\n\t"
+        "add r1,r1,r3\n\t" /* r1 :=t1 */
+        "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */
+        "sub r7,r7,r5\n\t" /* r10:=t8 */
+        "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */
+
+        "add r1,r1,r5\n\t" /* r1 = o[0] */
+        "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */
+
+        "add r2,r2,r4\n\t" /* r2 :=t2 */
+        "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */
+
+        "add %[temp],r6,r8\n\t" /* r10:=t5 */
+        "sub r6,r6,r8\n\t" /* r6 :=t7 */
+
+        "sub r8,r4,r7\n\t" /* r8 = o[7]*/
+        "add r4,r4,r7\n\t" /* r4 = o[3]*/
+        "sub r7,r3,r6\n\t" /* r7 = o[6]*/
+        "add r3,r3,r6\n\t" /* r3 = o[2]*/
+        "sub r6,r2,%[temp]\n\t" /* r6 = o[5]*/
+        "add r2,r2,%[temp]\n\t" /* r2 = o[1]*/
+
+        "stmia %[z]!, {r1-r8}\n\t"
+        : /* outputs */ [z] "+r" (z), [temp] "=r" (temp)
+        : /* inputs */
+        : /* clobbers */
+        "r1","r2","r3","r4","r5","r6","r7","r8","memory"
+    );
+    return z;
 }
 
-
 #define FFT_FFMPEG_INCL_OPTIMISED_FFT8
     /* The chunk of asm below is equivalent to the following:
 
@@ -279,12 +396,14 @@
     // Finally save out z[4].re, z[4].im, z[0].re and z[0].im
     // ...
     */
-static inline void fft8( FFTComplex * z )
+static inline void fft8(FFTComplex * z)
 {
-    fft4(z);
+    FFTComplex* m4 = fft4(z);
     {
-        FFTSample temp;
-        fixed32 * m4 = (fixed32 *)(&(z[4].re));
+        /* note that we increment z_ptr on the final stmia, which
+           leaves z_ptr pointing to z[1].re ready for the Transform step */
+
+        register FFTSample temp;
 
         asm volatile(
             /* read in z[4].re thru z[7].im */
@@ -323,18 +442,15 @@ static inline void fft8( FFTComplex * z )
323 "add r8,r8,r2\n\t" 442 "add r8,r8,r2\n\t"
324 "sub r2,r8,r2,lsl #1\n\t" 443 "sub r2,r8,r2,lsl #1\n\t"
325 444
326 "stmia %[z_ptr],{r7,r8}\n\t" /* write out z[0].re, z[0].im */ 445 "stmia %[z_ptr]!,{r7,r8}\n\t" /* write out z[0].re, z[0].im */
327 "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */ 446 "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */
328 : [z4_ptr] "+r" (m4), [temp] "=r" (temp) 447 : [z4_ptr] "+r" (m4), [temp] "=r" (temp), [z_ptr] "+r" (z)
329 : [z_ptr] "r" (z) 448 :
330 : "r1","r2","r3","r4","r5","r6","r7","r8","memory" 449 : "r1","r2","r3","r4","r5","r6","r7","r8","memory"
331 ); 450 );
332 } 451 }
333 452
334 z++;
335 TRANSFORM_EQUAL(z,2); 453 TRANSFORM_EQUAL(z,2);
336} 454}
337 455
338
339#endif // CPU_ARM 456#endif // CPU_ARM
340
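
The "pointer address increments for free" in the commit message refer to ARM's store-multiple write-back form, used above as "stmia %[my_z]!, ...". Roughly (illustrative helper, not part of the patch):

    #include <stdint.h>

    /* stmia with "!" stores the pair and updates the base register in
       the same instruction; the "+r" constraint makes the new pointer
       visible to the compiler, so no separate add is needed. */
    static inline int32_t* store_pair_advance(int32_t *p, int32_t re, int32_t im)
    {
        register int32_t a asm("r2") = re;
        register int32_t b asm("r3") = im;
        asm volatile("stmia %[ptr]!, {r2, r3}"
                     : [ptr] "+r" (p)
                     : "r" (a), "r" (b)
                     : "memory");
        return p; /* now points just past the stored pair */
    }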