diff options
author | Sean Bartell <wingedtachikoma@gmail.com> | 2011-06-25 21:32:25 -0400 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2012-04-25 22:13:20 +0200 |
commit | f40bfc9267b13b54e6379dfe7539447662879d24 (patch) | |
tree | 9b20069d5e62809ff434061ad730096836f916f2 /apps/codecs/lib/fft-ffmpeg_arm.h | |
parent | a0009907de7a0107d49040d8a180f140e2eff299 (diff) | |
download | rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.tar.gz rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.zip |
Add codecs to librbcodec.
Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97
Reviewed-on: http://gerrit.rockbox.org/137
Reviewed-by: Nils Wallménius <nils@rockbox.org>
Tested-by: Nils Wallménius <nils@rockbox.org>
Diffstat (limited to 'apps/codecs/lib/fft-ffmpeg_arm.h')
-rw-r--r-- | apps/codecs/lib/fft-ffmpeg_arm.h | 456 |
1 files changed, 0 insertions, 456 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg_arm.h b/apps/codecs/lib/fft-ffmpeg_arm.h deleted file mode 100644 index 073ad8ee46..0000000000 --- a/apps/codecs/lib/fft-ffmpeg_arm.h +++ /dev/null | |||
@@ -1,456 +0,0 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2010 Dave Hooper | ||
11 | * | ||
12 | * ARM optimisations for ffmpeg's fft (used in fft-ffmpeg.c) | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public License | ||
16 | * as published by the Free Software Foundation; either version 2 | ||
17 | * of the License, or (at your option) any later version. | ||
18 | * | ||
19 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
20 | * KIND, either express or implied. | ||
21 | * | ||
22 | ****************************************************************************/ | ||
23 | |||
24 | #ifdef CPU_ARM | ||
25 | |||
26 | /* Start off with optimised variants of the butterflies that work | ||
27 | nicely on arm */ | ||
28 | /* 1. where y and a share the same variable/register */ | ||
/* BF_OPT(x,y,a,b): butterfly giving y = a + b and x = a - b.
   The difference is rebuilt from the freshly written sum as y - 2*b, so a is
   read only once and y may be the same variable as a.  b is read again after
   y has been assigned, so y must NOT be the same variable as b (x may be).
   Arguments are pasted unparenthesised and b is evaluated twice: pass plain
   lvalues only. */
29 | #define BF_OPT(x,y,a,b) {\ | ||
30 | y = a + b;\ | ||
31 | x = y - (b<<1);\ | ||
32 | } | ||
33 | |||
34 | /* 2. where y and b share the same variable/register */ | ||
/* BF_OPT2(x,y,a,b): butterfly giving x = a - b and y = a + b.
   The sum is rebuilt from the freshly written difference as x + 2*b, so y may
   be the same variable as b (b is read in the statement that overwrites it).
   b is read again after x has been assigned, so x must NOT be the same
   variable as b.  Arguments are pasted unparenthesised and b is evaluated
   twice: pass plain lvalues only. */
35 | #define BF_OPT2(x,y,a,b) {\ | ||
36 | x = a - b;\ | ||
37 | y = x + (b<<1);\ | ||
38 | } | ||
39 | |||
40 | /* 3. where y and b share the same variable/register (but y=(-b)) */ | ||
/* BF_OPT2_REV(x,y,a,b): butterfly giving x = a + b and y = a - b.
   The difference is rebuilt from the freshly written sum as x - 2*b, so y may
   be the same variable as b.  b is read again after x has been assigned, so
   x must NOT be the same variable as b.  Arguments are pasted unparenthesised
   and b is evaluated twice: pass plain lvalues only. */
41 | #define BF_OPT2_REV(x,y,a,b) {\ | ||
42 | x = a + b;\ | ||
43 | y = x - (b<<1);\ | ||
44 | } | ||
45 | |||
46 | |||
47 | /* standard BUTTERFLIES package. Note, we actually manually inline this | ||
48 | in all the TRANSFORM functions below anyway */ | ||
/* Flag announcing that this header provides BUTTERFLIES (presumably tested by
   fft-ffmpeg.c to skip a generic version -- confirm there). */
49 | #define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES | ||
/* BUTTERFLIES(a0,a1,a2,a3): combine four complex values in place.
   Besides its arguments the macro reads and overwrites t1, t2, t5 and t6,
   which are not parameters: they must already exist in the scope where the
   macro is expanded.
   Step 1 (first two BF_OPT lines): t5 <- t5 + t1, t1 <- old t5 - old t1,
                                    t2 <- t2 + t6, t6 <- old t2 - old t6.
   Step 2 (last four lines), using the updated temporaries:
     a0.re += t5, a2.re = old a0.re - t5;   a0.im += t2, a2.im = old a0.im - t2;
     a1.re += t6, a3.re = old a1.re - t6;   a1.im += t1, a3.im = old a1.im - t1.
   Each line relies on BF_OPT allowing its second and third arguments to be
   the same variable, so the order of the lines must not change. */
50 | #define BUTTERFLIES(a0,a1,a2,a3) {\ | ||
51 | {\ | ||
52 | BF_OPT(t1, t5, t5, t1);\ | ||
53 | BF_OPT(t6, t2, t2, t6);\ | ||
54 | BF_OPT(a2.re, a0.re, a0.re, t5);\ | ||
55 | BF_OPT(a2.im, a0.im, a0.im, t2);\ | ||
56 | BF_OPT(a3.re, a1.re, a1.re, t6);\ | ||
57 | BF_OPT(a3.im, a1.im, a1.im, t1);\ | ||
58 | }\ | ||
59 | } | ||
60 | |||
/* Flag announcing that this header provides the TRANSFORM* helpers
   (presumably tested by fft-ffmpeg.c to skip generic versions -- confirm
   there). */
61 | #define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM | ||
62 | |||
/* TRANSFORM: one butterfly step over the four elements z[0], z[n], z[2n] and
   z[3n] (offsets in FFTComplex units) with the twiddle pair (wre, wim).

   - z[2n] goes through XPROD31_R and z[3n] through XNPROD31_R (both macros
     are defined elsewhere), producing (t1,t2) and (t5,t6).
   - The rest is the BUTTERFLIES body written out by hand with a0 = z[0],
     a1 = z[n], a2 = z[2n], a3 = z[3n]; the commented-out BF_OPT list inside
     the inner block is kept as a reference to that body.
   - Every element is moved with a two-register ldmia/stmia.  LDM/STM transfer
     registers in ascending register-number order, which is why the locals
     are pinned: (r_re,r_im) = (r8,r9), (rt0temp,t2) = (r4,r5) and
     (t5,t6) = (r6,r7) are each an ascending pair.
   - The last stmia uses write-back, so after the final "z -= n*3" the
     returned pointer is the input advanced by two words, i.e. z + 1 assuming
     FFTComplex is two 32-bit words (type is declared elsewhere).

   NOTE(review): the ldmia statements read memory but declare neither a
   memory operand nor a "memory" clobber; only the stmia statements do.
   Confirm this ordering assumption is acceptable for the compilers in use. */
63 | static inline FFTComplex* TRANSFORM( FFTComplex* z, int n, FFTSample wre, FFTSample wim ) | ||
64 | { | ||
65 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); | ||
66 | z += n*2; /* z[o2] */ | ||
67 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
68 | XPROD31_R(r_re, r_im, wre, wim, t1,t2); | ||
69 | |||
70 | z += n; /* z[o3] */ | ||
71 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
72 | XNPROD31_R(r_re, r_im, wre, wim, t5,t6); | ||
73 | |||
74 | BF_OPT(t1, t5, t5, t1); | ||
75 | BF_OPT(t6, t2, t2, t6); | ||
76 | |||
77 | { | ||
78 | register FFTSample rt0temp asm("r4"); | ||
79 | /*{*/ | ||
80 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
81 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
82 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
83 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
84 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
85 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
86 | /*}*/ | ||
87 | z -= n*3; | ||
88 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
89 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
90 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
91 | BF_OPT(t2, r_im, r_im, t2); | ||
92 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
93 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory" ); | ||
94 | z += n; | ||
95 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
96 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
97 | BF_OPT(t5, r_re, r_re, t6); | ||
98 | BF_OPT(t6, r_im, r_im, t1); | ||
99 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
100 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
101 | z += n; | ||
102 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
103 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
104 | } | ||
105 | z += n; | ||
106 | |||
107 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
108 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
109 | z -= n*3; | ||
110 | return(z); | ||
111 | } | ||
112 | |||
/* TRANSFORM_W01: same butterfly step as TRANSFORM over z[0], z[n], z[2n] and
   z[3n], but the twiddle pair is read from memory: w[0] is used as wre and
   w[1] as wim.

   - The pair is loaded into t5/t6 (r6/r7) with one ldmia.
   - The XNPROD31_R call writes its results into t5/t6, the same variables
     that hold its twiddle inputs.  This relies on the macro (defined
     elsewhere) consuming its inputs before assigning its outputs -- TODO
     confirm against its definition.
   - Locals are pinned so that every ldmia/stmia register pair is ascending:
     (t5,t6) = (r6,r7), (r_re,r_im) = (r8,r9), (rt0temp,t2) = (r4,r5).
   - The last stmia uses write-back, so the returned pointer is the input
     advanced by two words (z + 1 assuming FFTComplex is two 32-bit words). */
113 | static inline FFTComplex* TRANSFORM_W01( FFTComplex* z, int n, const FFTSample* w ) | ||
114 | { | ||
115 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); | ||
116 | |||
117 | /* load wre,wim into t5,t6 */ | ||
118 | asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (t5), [wim] "=r" (t6):[w] "r" (w)); | ||
119 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ | ||
120 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
121 | XPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t1,t2); | ||
122 | |||
123 | z += n; /* z[o3] */ | ||
124 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
125 | XNPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t5,t6); | ||
126 | |||
127 | BF_OPT(t1, t5, t5, t1); | ||
128 | BF_OPT(t6, t2, t2, t6); | ||
129 | { | ||
130 | register FFTSample rt0temp asm("r4"); | ||
131 | /*{*/ | ||
132 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
133 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
134 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
135 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
136 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
137 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
138 | /*}*/ | ||
139 | z -= n*3; | ||
140 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
141 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
142 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
143 | BF_OPT(t2, r_im, r_im, t2); | ||
144 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
145 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
146 | z += n; | ||
147 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
148 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
149 | BF_OPT(t5, r_re, r_re, t6); | ||
150 | BF_OPT(t6, r_im, r_im, t1); | ||
151 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
152 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
153 | z += n; | ||
154 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
155 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
156 | } | ||
157 | z += n; | ||
158 | |||
159 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
160 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
161 | z -= n*3; | ||
162 | return(z); | ||
163 | } | ||
164 | |||
/* TRANSFORM_W10: same butterfly step as TRANSFORM over z[0], z[n], z[2n] and
   z[3n], with the twiddle pair read from memory in swapped order: w[0] is
   used as wim and w[1] as wre.

   - One ldmia puts w[0] in t5 (r6) and w[1] in t6 (r7); the product macros
     are then called with t6 in the wre position and t5 in the wim position.
   - The XNPROD31_R call writes its results into t5/t6, the same variables
     that hold its twiddle inputs.  This relies on the macro (defined
     elsewhere) consuming its inputs before assigning its outputs -- TODO
     confirm against its definition.
   - Locals are pinned so that every ldmia/stmia register pair is ascending:
     (t5,t6) = (r6,r7), (r_re,r_im) = (r8,r9), (rt0temp,t2) = (r4,r5).
   - The last stmia uses write-back, so the returned pointer is the input
     advanced by two words (z + 1 assuming FFTComplex is two 32-bit words). */
165 | static inline FFTComplex* TRANSFORM_W10( FFTComplex* z, int n, const FFTSample* w ) | ||
166 | { | ||
167 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); | ||
168 | |||
169 | /* load wim,wre into t5,t6 */ | ||
170 | asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (t5), [wre] "=r" (t6):[w] "r" (w)); | ||
171 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ | ||
172 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
173 | XPROD31_R(r_re, r_im, t6 /*wre*/, t5 /*wim*/, t1,t2); | ||
174 | |||
175 | z += n; /* z[o3] */ | ||
176 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
177 | XNPROD31_R(r_re, r_im, t6 /*wre*/, t5 /*wim*/, t5,t6); | ||
178 | |||
179 | BF_OPT(t1, t5, t5, t1); | ||
180 | BF_OPT(t6, t2, t2, t6); | ||
181 | { | ||
182 | register FFTSample rt0temp asm("r4"); | ||
183 | /*{*/ | ||
184 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
185 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
186 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
187 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
188 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
189 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
190 | /*}*/ | ||
191 | z -= n*3; | ||
192 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
193 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
194 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
195 | BF_OPT(t2, r_im, r_im, t2); | ||
196 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
197 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
198 | z += n; | ||
199 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
200 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
201 | BF_OPT(t5, r_re, r_re, t6); | ||
202 | BF_OPT(t6, r_im, r_im, t1); | ||
203 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
204 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
205 | z += n; | ||
206 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
207 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
208 | } | ||
209 | z += n; | ||
210 | |||
211 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
212 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
213 | z -= n*3; | ||
214 | return(z); | ||
215 | } | ||
216 | |||
/* TRANSFORM_EQUAL: butterfly step over z[0], z[n], z[2n] and z[3n] for the
   case where a single constant, cPI2_8 (defined elsewhere), serves as both
   twiddle components.

   - z[2n] is loaded straight into (t5,t6) and z[3n] into (r_re,r_im).
   - All four values are scaled with MULT31(cPI2_8, ...); the rotated terms
     are then formed by plain sums and differences:
       t1 = t6 + t2, t2 = t6 - t2, t6 = r_re + t5, t5 = r_re - t5.
     The order of these four assignments matters because each pair reuses a
     variable it has just read.
   - The first commented-out group below is an earlier MULT32/MULT31
     formulation kept for reference; it is not compiled.
   - The remainder is the BUTTERFLIES body written out by hand with
     a0 = z[0], a1 = z[n], a2 = z[2n], a3 = z[3n].
   - Locals are pinned so that every ldmia/stmia register pair is ascending:
     (t5,t6) = (r6,r7), (r_re,r_im) = (r8,r9), (rt0temp,t2) = (r4,r5).
   - The last stmia uses write-back, so the returned pointer is the input
     advanced by two words (z + 1 assuming FFTComplex is two 32-bit words). */
217 | static inline FFTComplex* TRANSFORM_EQUAL( FFTComplex* z, int n ) | ||
218 | { | ||
219 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9"); | ||
220 | |||
221 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ | ||
222 | asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z)); | ||
223 | z += n; /* z[o3] */ | ||
224 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
225 | |||
226 | /**/ | ||
227 | /*t2 = MULT32(cPI2_8, t5);*/ | ||
228 | /*t1 = MULT31(cPI2_8, t6);*/ | ||
229 | /*t6 = MULT31(cPI2_8, r_re);*/ | ||
230 | /*t5 = MULT32(cPI2_8, r_im);*/ | ||
231 | |||
232 | /*t1 = ( t1 + (t2<<1) );*/ | ||
233 | /*t2 = ( t1 - (t2<<2) );*/ | ||
234 | /*t6 = ( t6 + (t5<<1) );*/ | ||
235 | /*t5 = ( t6 - (t5<<2) );*/ | ||
236 | /**/ | ||
237 | t2 = MULT31(cPI2_8, t5); | ||
238 | t6 = MULT31(cPI2_8, t6); | ||
239 | r_re = MULT31(cPI2_8, r_re); | ||
240 | t5 = MULT31(cPI2_8, r_im); | ||
241 | |||
242 | t1 = ( t6 + t2 ); | ||
243 | t2 = ( t6 - t2 ); | ||
244 | t6 = ( r_re + t5 ); | ||
245 | t5 = ( r_re - t5 ); | ||
246 | |||
247 | BF_OPT(t1, t5, t5, t1); | ||
248 | BF_OPT(t6, t2, t2, t6); | ||
249 | { | ||
250 | register FFTSample rt0temp asm("r4"); | ||
251 | /*{*/ | ||
252 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
253 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
254 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
255 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
256 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
257 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
258 | /*}*/ | ||
259 | z -= n*3; | ||
260 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
261 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
262 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
263 | BF_OPT(t2, r_im, r_im, t2); | ||
264 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
265 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
266 | z += n; | ||
267 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
268 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
269 | BF_OPT(t5, r_re, r_re, t6); | ||
270 | BF_OPT(t6, r_im, r_im, t1); | ||
271 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
272 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
273 | z += n; | ||
274 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
275 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
276 | } | ||
277 | z += n; | ||
278 | |||
279 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
280 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
281 | z -= n*3; | ||
282 | return(z); | ||
283 | } | ||
284 | |||
/* TRANSFORM_ZERO: butterfly step over z[0], z[n], z[2n] and z[3n] that
   performs no multiplication at all (presumably the unit-twiddle case --
   confirm against the caller).

   - z[2n] is loaded into (r_re,r_im) and z[3n] into (t5,t6).
   - The two BF_OPT lines that follow combine those loads directly:
       t5 <- z[3n].re + z[2n].re,  t1 <- z[3n].re - z[2n].re,
       t2 <- z[2n].im + z[3n].im,  t6 <- z[2n].im - z[3n].im.
     They therefore differ from the first two lines of the commented-out
     BUTTERFLIES reference list, which assume pre-rotated temporaries.
   - The remainder is the BUTTERFLIES body written out by hand with
     a0 = z[0], a1 = z[n], a2 = z[2n], a3 = z[3n].
   - Locals are pinned so that every ldmia/stmia register pair is ascending:
     (t5,t6) = (r6,r7), (r_re,r_im) = (r8,r9), (rt0temp,t2) = (r4,r5).
   - The last stmia uses write-back, so the returned pointer is the input
     advanced by two words (z + 1 assuming FFTComplex is two 32-bit words). */
285 | static inline FFTComplex* TRANSFORM_ZERO( FFTComplex* z, int n ) | ||
286 | { | ||
287 | register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"), r_re asm("r8"), r_im asm("r9"); | ||
288 | |||
289 | z += n*2; /* z[o2] -- 2n * 2 since complex numbers */ | ||
290 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
291 | z += n; /* z[o3] */ | ||
292 | asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z)); | ||
293 | |||
294 | BF_OPT(t1, t5, t5, r_re); | ||
295 | BF_OPT(t6, t2, r_im, t6); | ||
296 | { | ||
297 | register FFTSample rt0temp asm("r4"); | ||
298 | /*{*/ | ||
299 | /* BF_OPT(t1, t5, t5, t1);*/ | ||
300 | /* BF_OPT(t6, t2, t2, t6);*/ | ||
301 | /* BF_OPT(a2.re, a0.re, a0.re, t5);*/ | ||
302 | /* BF_OPT(a2.im, a0.im, a0.im, t2);*/ | ||
303 | /* BF_OPT(a3.re, a1.re, a1.re, t6);*/ | ||
304 | /* BF_OPT(a3.im, a1.im, a1.im, t1);*/ | ||
305 | /*}*/ | ||
306 | z -= n*3; | ||
307 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
308 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
309 | BF_OPT(rt0temp, r_re, r_re, t5); | ||
310 | BF_OPT(t2, r_im, r_im, t2); | ||
311 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
312 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
313 | z += n; | ||
314 | /* r_re = my_z[0]; r_im = my_z[1]; */ | ||
315 | asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z)); | ||
316 | BF_OPT(t5, r_re, r_re, t6); | ||
317 | BF_OPT(t6, r_im, r_im, t1); | ||
318 | /* my_z[0] = r_re; my_z[1] = r_im; */ | ||
319 | asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory"); | ||
320 | z += n; | ||
321 | /* my_z[0] = rt0temp; my_z[1] = t2; */ | ||
322 | asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory"); | ||
323 | } | ||
324 | z += n; | ||
325 | |||
326 | /* my_z[0] = t5; my_z[1] = t6; */ | ||
327 | asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory"); | ||
328 | z -= n*3; | ||
329 | return(z); | ||
330 | } | ||
331 | |||
/* Flag announcing that this header provides fft4 (presumably tested by
   fft-ffmpeg.c to skip a generic version -- confirm there). */
332 | #define FFT_FFMPEG_INCL_OPTIMISED_FFT4 | ||
/* fft4: in-place 4-point transform done entirely in registers.
   One ldmia pulls eight consecutive words starting at z into r1-r8, the
   add/sub sequence combines them (the "lsl #1" forms rebuild a difference
   from a sum, or a sum from a difference, exactly as BF_OPT / BF_OPT2 do),
   and one stmia writes r1-r8 back to the same eight words.
   temp is a scratch register chosen by the compiler; r1-r8 are listed as
   clobbers so neither z nor temp can be allocated to them.
   Returns z as left by the write-back of the final stmia: eight words past
   the input, i.e. z + 4 assuming FFTComplex is two 32-bit words. */
333 | static inline FFTComplex* fft4(FFTComplex * z) | ||
334 | { | ||
335 | FFTSample temp; | ||
336 | |||
337 | /* input[0..7] -> output[0..7] */ | ||
338 | /* load r1=z[0],r2=z[1],...,r8=z[7] */ | ||
339 | asm volatile( | ||
340 | "ldmia %[z], {r1-r8}\n\t" | ||
341 | "add r1,r1,r3\n\t" /* r1 :=t1 */ | ||
342 | "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */ | ||
343 | "sub r7,r7,r5\n\t" /* r7 :=t8 */ | ||
344 | "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */ | ||
345 | |||
346 | "add r1,r1,r5\n\t" /* r1 = o[0] */ | ||
347 | "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */ | ||
348 | |||
349 | "add r2,r2,r4\n\t" /* r2 :=t2 */ | ||
350 | "sub r4,r2,r4, lsl #1\n\t" /* r4 :=t4 */ | ||
351 | |||
352 | "add %[temp],r6,r8\n\t" /* temp:=t5 */ | ||
353 | "sub r6,r6,r8\n\t" /* r6 :=t7 */ | ||
354 | |||
355 | "sub r8,r4,r7\n\t" /* r8 = o[7]*/ | ||
356 | "add r4,r4,r7\n\t" /* r4 = o[3]*/ | ||
357 | "sub r7,r3,r6\n\t" /* r7 = o[6]*/ | ||
358 | "add r3,r3,r6\n\t" /* r3 = o[2]*/ | ||
359 | "sub r6,r2,%[temp]\n\t" /* r6 = o[5]*/ | ||
360 | "add r2,r2,%[temp]\n\t" /* r2 = o[1]*/ | ||
361 | |||
362 | "stmia %[z]!, {r1-r8}\n\t" | ||
363 | : /* outputs */ [z] "+r" (z), [temp] "=r" (temp) | ||
364 | : /* inputs */ | ||
365 | : /* clobbers */ | ||
366 | "r1","r2","r3","r4","r5","r6","r7","r8","memory" | ||
367 | ); | ||
368 | return z; | ||
369 | } | ||
370 | |||
/* Flag announcing that this header provides fft8 (presumably tested by
   fft-ffmpeg.c to skip a generic version -- confirm there). */
371 | #define FFT_FFMPEG_INCL_OPTIMISED_FFT8 | ||
372 | /* The chunk of asm below is equivalent to the following: | ||
373 | |||
374 | // first load in z[4].re thru z[7].im into local registers | ||
375 | // ... | ||
376 | BF_OPT2_REV(z[4].re, z[5].re, z[4].re, z[5].re); // x=a+b; y=x-(b<<1) | ||
377 | BF_OPT2_REV(z[4].im, z[5].im, z[4].im, z[5].im); | ||
378 | BF_REV (temp, z[7].re, z[6].re, z[7].re); // x=a+b; y=a-b; | ||
379 | BF_REV (z[6].re, z[7].im, z[6].im, z[7].im); | ||
380 | // save z[7].re and z[7].im as those are complete now | ||
381 | // z[5].re and z[5].im are also complete now but save these later on | ||
382 | |||
383 | BF(z[6].im, z[4].re, temp, z[4].re); // x=a-b; y=a+b | ||
384 | BF_OPT(z[6].re, z[4].im, z[4].im, z[6].re); // y=a+b; x=y-(b<<1) | ||
385 | // now load z[2].re and z[2].im | ||
386 | // ... | ||
387 | BF_OPT(z[6].re, z[2].re, z[2].re, z[6].re); // y=a+b; x=y-(b<<1) | ||
388 | BF_OPT(z[6].im, z[2].im, z[2].im, z[6].im); // y=a+b; x=y-(b<<1) | ||
389 | // Now save z[6].re and z[6].im, along with z[5].re and z[5].im | ||
390 | // for efficiency. Also save z[2].re and z[2].im. | ||
391 | // Now load z[0].re and z[0].im | ||
392 | // ... | ||
393 | |||
394 | BF_OPT(z[4].re, z[0].re, z[0].re, z[4].re); // y=a+b; x=y-(b<<1) | ||
395 | BF_OPT(z[4].im, z[0].im, z[0].im, z[4].im); // y=a+b; x=y-(b<<1) | ||
396 | // Finally save out z[4].re, z[4].im, z[0].re and z[0].im | ||
397 | // ... | ||
398 | */ | ||
/* fft8: in-place 8-point transform of z[0..7].
   1. fft4(z) transforms z[0..3]; its return value (the pointer left just
      past those elements) is kept in m4 and used as z4_ptr for z[4..7].
   2. The asm block loads z[4..7] into r1-r8, combines them, and merges the
      results with z[2] and z[0], storing each value as soon as it is final.
      temp is declared FFTSample but is also reused to hold the address
      z_ptr + 16 (labelled &z[2].re) -- this assumes FFTSample is as wide as
      a pointer on the target.
   3. The stmia that writes z[0] uses write-back, so on exit z has advanced
      by two words; TRANSFORM_EQUAL(z,2) then runs from that position and
      its return value is ignored. */
399 | static inline void fft8(FFTComplex * z) | ||
400 | { | ||
401 | FFTComplex* m4 = fft4(z); | ||
402 | { | ||
403 | /* note that we increment z_ptr on the final stmia, which | ||
404 | leaves z_ptr pointing to z[1].re ready for the Transform step */ | ||
405 | |||
406 | register FFTSample temp; | ||
407 | |||
408 | asm volatile( | ||
409 | /* read in z[4].re thru z[7].im */ | ||
410 | "ldmia %[z4_ptr]!, {r1-r8}\n\t" | ||
411 | /* (now points one word past &z[7].im) */ | ||
412 | "add r1,r1,r3\n\t" | ||
413 | "sub r3,r1,r3,lsl #1\n\t" | ||
414 | "add r2,r2,r4\n\t" | ||
415 | "sub r4,r2,r4,lsl #1\n\t" | ||
416 | "add %[temp],r5,r7\n\t" | ||
417 | "sub r7,r5,r7\n\t" | ||
418 | "add r5,r6,r8\n\t" | ||
419 | "sub r8,r6,r8\n\t" | ||
420 | |||
421 | "stmdb %[z4_ptr]!, {r7,r8}\n\t" /* write z[7].re,z[7].im straight away */ | ||
422 | /* Note, registers r7 & r8 now free */ | ||
423 | |||
424 | "sub r6,%[temp],r1\n\t" | ||
425 | "add r1,%[temp],r1\n\t" | ||
426 | "add r2,r2,r5\n\t" | ||
427 | "sub r5,r2,r5,lsl #1\n\t" | ||
428 | "add %[temp], %[z_ptr], #16\n\t" /* point to &z[2].re */ | ||
429 | "ldmia %[temp],{r7,r8}\n\t" /* load z[2].re and z[2].im */ | ||
430 | "add r7,r7,r5\n\t" | ||
431 | "sub r5,r7,r5,lsl #1\n\t" | ||
432 | "add r8,r8,r6\n\t" | ||
433 | "sub r6,r8,r6,lsl #1\n\t" | ||
434 | |||
435 | /* write out z[5].re, z[5].im, z[6].re, z[6].im in one go*/ | ||
436 | "stmdb %[z4_ptr]!, {r3-r6}\n\t" | ||
437 | "stmia %[temp],{r7,r8}\n\t" /* write out z[2].re, z[2].im */ | ||
438 | "ldmia %[z_ptr],{r7,r8}\n\t" /* load z[0].re, z[0].im */ | ||
439 | |||
440 | "add r7,r7,r1\n\t" | ||
441 | "sub r1,r7,r1,lsl #1\n\t" | ||
442 | "add r8,r8,r2\n\t" | ||
443 | "sub r2,r8,r2,lsl #1\n\t" | ||
444 | |||
445 | "stmia %[z_ptr]!,{r7,r8}\n\t" /* write out z[0].re, z[0].im */ | ||
446 | "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */ | ||
447 | : [z4_ptr] "+r" (m4), [temp] "=r" (temp), [z_ptr] "+r" (z) | ||
448 | : | ||
449 | : "r1","r2","r3","r4","r5","r6","r7","r8","memory" | ||
450 | ); | ||
451 | } | ||
452 | |||
453 | TRANSFORM_EQUAL(z,2); | ||
454 | } | ||
455 | |||
456 | #endif // CPU_ARM | ||