diff options
author | Nils Wallménius <nils@rockbox.org> | 2011-09-12 11:27:48 +0000 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2011-09-12 11:27:48 +0000 |
commit | f93530c4badf0811110baaee1e196a67f0e98eb8 (patch) | |
tree | fe3ead8bb0b78131c75936efa236f76295ebdc04 /apps/codecs/lib | |
parent | edf06b7324e1d5fd6d2e342fe4c069727ced22dd (diff) | |
download | rockbox-f93530c4badf0811110baaee1e196a67f0e98eb8.tar.gz rockbox-f93530c4badf0811110baaee1e196a67f0e98eb8.zip |
codeclib: coldfire asm for the TRANSFORM* functions in the fft and a little for the mdct, speeds up codecs using the codeclib mdct 0.5-1.5MHz on h300.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@30513 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/lib')
-rw-r--r-- | apps/codecs/lib/fft-ffmpeg_cf.h | 164 | ||||
-rw-r--r-- | apps/codecs/lib/mdct.c | 43 |
2 files changed, 205 insertions, 2 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg_cf.h b/apps/codecs/lib/fft-ffmpeg_cf.h index 3a419eb51f..710e1dd1af 100644 --- a/apps/codecs/lib/fft-ffmpeg_cf.h +++ b/apps/codecs/lib/fft-ffmpeg_cf.h | |||
@@ -203,4 +203,168 @@ static inline void fft8(FFTComplex *z) | |||
203 | : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | 203 | : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", |
204 | "a0", "a1", "a2", "a3", "a4", "cc", "memory"); | 204 | "a0", "a1", "a2", "a3", "a4", "cc", "memory"); |
205 | } | 205 | } |
206 | |||
207 | #define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM | ||
208 | |||
209 | static inline FFTComplex* TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim) | ||
210 | { /* Combines z[0], z[n], z[2*n] and z[3*n] in place using the factors wre/wim and returns z+1. NOTE(review): acc0-acc3 are accumulated into without being cleared first, so presumably they must be zero on entry - confirm. */ | ||
211 | asm volatile ("move.l (%[z2]), %%d5\n\t" /* d5 = z2re */ | ||
212 | "mac.l %%d5, %[wre], (4, %[z2]), %%d4, %%acc0\n\t" /* acc0 += z2re*wre; d4 = z2im */ | ||
213 | "mac.l %%d4, %[wim], %%acc0\n\t" /* acc0 += z2im*wim => t1 */ | ||
214 | "mac.l %%d4, %[wre], (%[z3]), %%d6, %%acc1\n\t" /* acc1 += z2im*wre; d6 = z3re */ | ||
215 | "msac.l %%d5, %[wim], (4,%[z3]), %%d7, %%acc1\n\t" /* acc1 -= z2re*wim => t2; d7 = z3im */ | ||
216 | "mac.l %%d6, %[wre], %%acc2\n\t" /* acc2 += z3re*wre */ | ||
217 | "msac.l %%d7, %[wim], %%acc2\n\t" /* acc2 -= z3im*wim => t5 */ | ||
218 | "mac.l %%d7, %[wre], %%acc3\n\t" /* acc3 += z3im*wre */ | ||
219 | "mac.l %%d6, %[wim], %%acc3\n\t" /* acc3 += z3re*wim => t6 */ | ||
220 | ||
221 | "movclr.l %%acc0, %[wre]\n\t" /* t1 (wre is no longer needed as a factor) */ | ||
222 | "movclr.l %%acc2, %[wim]\n\t" /* t5 (wim is no longer needed as a factor) */ | ||
223 | ||
224 | "movem.l (%[z]), %%d4-%%d5\n\t" /* load z0 */ | ||
225 | "move.l %%d4, %%d6\n\t" /* d6 = a0re */ | ||
226 | "move.l %[wim], %%d7\n\t" /* d7 = t5 */ | ||
227 | "sub.l %[wre], %[wim]\n\t" /* t5 = t5-t1 */ | ||
228 | "add.l %[wre], %%d7\n\t" /* d7 = t5+t1 */ | ||
229 | "sub.l %%d7, %%d6\n\t" /* d6 = a0re - (t5+t1) => a2re */ | ||
230 | "add.l %%d7, %%d4\n\t" /* d4 = a0re + (t5+t1) => a0re */ | ||
231 | ||
232 | "movclr.l %%acc3, %%d7\n\t" /* t6 */ | ||
233 | "movclr.l %%acc1, %%d3\n\t" /* t2 */ | ||
234 | ||
235 | "move.l %%d3, %[wre]\n\t" /* wre = t2 */ | ||
236 | "add.l %%d7, %[wre]\n\t" /* wre = t2+t6 */ | ||
237 | "sub.l %%d7, %%d3\n\t" /* t2 = t2-t6 */ | ||
238 | "move.l %%d5, %%d7\n\t" /* d7 = a0im */ | ||
239 | "sub.l %[wre], %%d7\n\t" /* d7 = a0im - (t2+t6) => a2im */ | ||
240 | ||
241 | "movem.l %%d6-%%d7, (%[z2])\n\t" /* store z2 */ | ||
242 | "add.l %[wre], %%d5\n\t" /* d5 = a0im + (t2+t6) => a0im */ | ||
243 | "movem.l %%d4-%%d5, (%[z])\n\t" /* store z0 */ | ||
244 | ||
245 | "movem.l (%[z1]), %%d4-%%d5\n\t" /* load z1 */ | ||
246 | "move.l %%d4, %%d6\n\t" /* d6 = a1re */ | ||
247 | ||
248 | "sub.l %%d3, %%d6\n\t" /* d6 = a1re - (t2-t6) => a3re */ | ||
249 | "add.l %%d3, %%d4\n\t" /* d4 = a1re + (t2-t6) => a1re */ | ||
250 | ||
251 | "move.l %%d5, %%d7\n\t" /* d7 = a1im */ | ||
252 | "sub.l %[wim], %%d7\n\t" /* d7 = a1im - (t5-t1) => a3im */ | ||
253 | "movem.l %%d6-%%d7, (%[z3])\n\t" /* store z3 */ | ||
254 | "add.l %[wim], %%d5\n\t" /* d5 = a1im + (t5-t1) => a1im */ | ||
255 | "movem.l %%d4-%%d5, (%[z1])\n\t" /* store z1 */ | ||
256 | ||
257 | : [wre] "+r" (wre), [wim] "+r" (wim) /* we clobber these after using them */ | ||
258 | : [z] "a" (z), [z1] "a" (&z[n]), [z2] "a" (&z[2*n]), [z3] "a" (&z[3*n]) | ||
259 | : "d3", "d4", "d5", "d6", "d7", "cc", "memory"); | ||
260 | return z+1; | ||
261 | } | ||
262 | |||
263 | static inline FFTComplex* TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w) | ||
264 | { /* Forwards to TRANSFORM, taking the two factors from w in stored order. */ | ||
265 | return TRANSFORM(z, n, w[0], w[1]); /* wre = w[0], wim = w[1] */ | ||
266 | } | ||
267 | |||
268 | static inline FFTComplex* TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w) | ||
269 | { /* Forwards to TRANSFORM, taking the two factors from w in swapped order. */ | ||
270 | return TRANSFORM(z, n, w[1], w[0]); /* wre = w[1], wim = w[0] */ | ||
271 | } | ||
272 | |||
273 | static inline FFTComplex* TRANSFORM_ZERO(FFTComplex * z, unsigned int n) | ||
274 | { /* Combines z[0], z[n], z[2*n] and z[3*n] in place using only adds and subtracts (z2 and z3 are used as loaded, no multiplies) and returns z+1. In the comments below t1 = z2re, t2 = z2im, t5 = z3re, t6 = z3im. */ | ||
275 | asm volatile("movem.l (%[z]), %%d4-%%d5\n\t" /* load z0 */ | ||
276 | "move.l %%d4, %%d6\n\t" /* d6 = a0re */ | ||
277 | "movem.l (%[z2]), %%d2-%%d3\n\t" /* load z2 */ | ||
278 | "movem.l (%[z3]), %%d0-%%d1\n\t" /* load z3 */ | ||
279 | "move.l %%d0, %%d7\n\t" /* d7 = z3re */ | ||
280 | "sub.l %%d2, %%d0\n\t" /* d0 = z3re - z2re (t5-t1) */ | ||
281 | "add.l %%d2, %%d7\n\t" /* d7 = z3re + z2re (t5+t1) */ | ||
282 | "sub.l %%d7, %%d6\n\t" /* d6 = a0re - (t5+t1) => a2re */ | ||
283 | "add.l %%d7, %%d4\n\t" /* d4 = a0re + (t5+t1) => a0re */ | ||
284 | ||
285 | "move.l %%d5, %%d7\n\t" /* d7 = a0im */ | ||
286 | "move.l %%d3, %%d2\n\t" /* d2 = z2im */ | ||
287 | "add.l %%d1, %%d2\n\t" /* d2 = z2im + z3im (t2+t6) */ | ||
288 | "sub.l %%d2, %%d7\n\t" /* d7 = a0im - (t2+t6) => a2im */ | ||
289 | "movem.l %%d6-%%d7, (%[z2])\n\t" /* store z2 */ | ||
290 | "add.l %%d2, %%d5\n\t" /* d5 = a0im + (t2+t6) => a0im */ | ||
291 | "movem.l %%d4-%%d5, (%[z])\n\t" /* store z0 */ | ||
292 | ||
293 | "movem.l (%[z1]), %%d4-%%d5\n\t" /* load z1 */ | ||
294 | "move.l %%d4, %%d6\n\t" /* d6 = a1re */ | ||
295 | "sub.l %%d1, %%d3\n\t" /* d3 = z2im - z3im (t2-t6) */ | ||
296 | "sub.l %%d3, %%d6\n\t" /* d6 = a1re - (t2-t6) => a3re */ | ||
297 | "add.l %%d3, %%d4\n\t" /* d4 = a1re + (t2-t6) => a1re */ | ||
298 | ||
299 | "move.l %%d5, %%d7\n\t" /* d7 = a1im */ | ||
300 | "sub.l %%d0, %%d7\n\t" /* d7 = a1im - (t5-t1) => a3im */ | ||
301 | "movem.l %%d6-%%d7, (%[z3])\n\t" /* store z3 */ | ||
302 | "add.l %%d0, %%d5\n\t" /* d5 = a1im + (t5-t1) => a1im */ | ||
303 | ||
304 | "movem.l %%d4-%%d5, (%[z1])\n\t" /* store z1 */ | ||
305 | ||
306 | : | ||
307 | : [z] "a" (z), [z1] "a" (&z[n]), [z2] "a" (&z[2*n]), [z3] "a" (&z[3*n]) | ||
308 | : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"); | ||
309 | return z+1; | ||
310 | } | ||
311 | |||
312 | static inline FFTComplex* TRANSFORM_EQUAL(FFTComplex * z, unsigned int n) | ||
313 | { /* Combines z[0], z[n], z[2*n] and z[3*n] in place with all four z2/z3 components multiplied by the single constant cPI2_8, and returns z+1. NOTE(review): acc0-acc3 are accumulated into without being cleared first, so presumably they must be zero on entry - confirm. */ | ||
314 | asm volatile ("move.l (%[z2]), %%d5\n\t" /* d5 = z2re */ | ||
315 | "mac.l %%d5, %[PI2_8], (4, %[z2]), %%d5, %%acc0\n\t" /* acc0 += z2re*PI2_8; d5 = z2im */ | ||
316 | "mac.l %%d5, %[PI2_8], (%[z3]), %%d5, %%acc1\n\t" /* acc1 += z2im*PI2_8; d5 = z3re */ | ||
317 | "mac.l %%d5, %[PI2_8], (4,%[z3]), %%d5, %%acc2\n\t" /* acc2 += z3re*PI2_8; d5 = z3im */ | ||
318 | "mac.l %%d5, %[PI2_8], %%acc3\n\t" /* acc3 += z3im*PI2_8 */ | ||
319 | ||
320 | "movclr.l %%acc0, %%d0\n\t" /* d0 = scaled z2re */ | ||
321 | "movclr.l %%acc1, %%d1\n\t" /* d1 = scaled z2im */ | ||
322 | "movclr.l %%acc2, %%d2\n\t" /* d2 = scaled z3re */ | ||
323 | "movclr.l %%acc3, %%d3\n\t" /* d3 = scaled z3im */ | ||
324 | ||
325 | "move.l %%d0, %%d7\n\t" /* d7 = scaled z2re */ | ||
326 | "add.l %%d1, %%d0\n\t" /* d0 == t1 */ | ||
327 | "sub.l %%d7, %%d1\n\t" /* d1 == t2 */ | ||
328 | ||
329 | "move.l %%d3, %%d7\n\t" /* d7 = scaled z3im */ | ||
330 | "add.l %%d2, %%d3\n\t" /* d3 == t6 */ | ||
331 | "sub.l %%d7, %%d2\n\t" /* d2 == t5 */ | ||
332 | ||
333 | "movem.l (%[z]), %%d4-%%d5\n\t" /* load z0 */ | ||
334 | "move.l %%d4, %%d6\n\t" /* d6 = a0re */ | ||
335 | "move.l %%d2, %%d7\n\t" /* d7 = t5 */ | ||
336 | "sub.l %%d0, %%d2\n\t" /* t5 = t5-t1 */ | ||
337 | "add.l %%d0, %%d7\n\t" /* d7 = t5+t1 */ | ||
338 | "sub.l %%d7, %%d6\n\t" /* d6 = a0re - (t5+t1) => a2re */ | ||
339 | "add.l %%d7, %%d4\n\t" /* d4 = a0re + (t5+t1) => a0re */ | ||
340 | ||
341 | "move.l %%d1, %%d0\n\t" /* d0 = t2 */ | ||
342 | "add.l %%d3, %%d0\n\t" /* d0 = t2+t6 */ | ||
343 | "sub.l %%d3, %%d1\n\t" /* t2 = t2-t6 */ | ||
344 | "move.l %%d5, %%d7\n\t" /* d7 = a0im */ | ||
345 | "sub.l %%d0, %%d7\n\t" /* d7 = a0im - (t2+t6) => a2im */ | ||
346 | ||
347 | "movem.l %%d6-%%d7, (%[z2])\n\t" /* store z2 */ | ||
348 | "add.l %%d0, %%d5\n\t" /* d5 = a0im + (t2+t6) => a0im */ | ||
349 | "movem.l %%d4-%%d5, (%[z])\n\t" /* store z0 */ | ||
350 | ||
351 | "movem.l (%[z1]), %%d4-%%d5\n\t" /* load z1 */ | ||
352 | "move.l %%d4, %%d6\n\t" /* d6 = a1re */ | ||
353 | ||
354 | "sub.l %%d1, %%d6\n\t" /* d6 = a1re - (t2-t6) => a3re */ | ||
355 | "add.l %%d1, %%d4\n\t" /* d4 = a1re + (t2-t6) => a1re */ | ||
356 | ||
357 | "move.l %%d5, %%d7\n\t" /* d7 = a1im */ | ||
358 | "sub.l %%d2, %%d7\n\t" /* d7 = a1im - (t5-t1) => a3im */ | ||
359 | "movem.l %%d6-%%d7, (%[z3])\n\t" /* store z3 */ | ||
360 | "add.l %%d2, %%d5\n\t" /* d5 = a1im + (t5-t1) => a1im */ | ||
361 | "movem.l %%d4-%%d5, (%[z1])\n\t" /* store z1 */ | ||
362 | ||
363 | : | ||
364 | : [z] "a" (z), [z1] "a" (&z[n]), [z2] "a" (&z[2*n]), [z3] "a" (&z[3*n]), [PI2_8] "r" (cPI2_8) | ||
365 | : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"); | ||
366 | ||
367 | return z+1; | ||
368 | } | ||
369 | |||
206 | #endif /* CPU_COLDFIRE */ | 370 | #endif /* CPU_COLDFIRE */ |
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c index 8382a7213d..621b9cbbb8 100644 --- a/apps/codecs/lib/mdct.c +++ b/apps/codecs/lib/mdct.c | |||
@@ -134,12 +134,50 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
134 | } | 134 | } |
135 | else | 135 | else |
136 | { | 136 | { |
137 | T = sincos_lookup1; | 137 | T = sincos_lookup1; |
138 | newstep = 2; | 138 | newstep = 2; |
139 | } | 139 | } |
140 | 140 | ||
141 | while(z1<z2) | 141 | while(z1<z2) |
142 | { | 142 | { |
143 | #ifdef CPU_COLDFIRE | ||
144 | asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t" | ||
145 | "movem.l (%[T]), %%d2-%%d3\n\t" | ||
146 | "mac.l %%d1, %%d2, %%acc0\n\t" | ||
147 | "msac.l %%d0, %%d3, %%acc0\n\t" | ||
148 | "mac.l %%d0, %%d2, %%acc1\n\t" | ||
149 | "mac.l %%d1, %%d3, %%acc1\n\t" | ||
150 | |||
151 | "lea (%[newstep]*4, %[T]), %[T]\n\t" | ||
152 | |||
153 | "movem.l (%[z2]), %%d0-%%d1\n\t" | ||
154 | "movem.l (%[T]), %%d2-%%d3\n\t" | ||
155 | "mac.l %%d1, %%d3, %%acc2\n\t" | ||
156 | "msac.l %%d0, %%d2, %%acc2\n\t" | ||
157 | "mac.l %%d0, %%d3, %%acc3\n\t" | ||
158 | "mac.l %%d1, %%d2, %%acc3\n\t" | ||
159 | |||
160 | "lea (%[newstep]*4, %[T]), %[T]\n\t" | ||
161 | |||
162 | "movclr.l %%acc0, %%d0\n\t" | ||
163 | "movclr.l %%acc1, %%d2\n\t" | ||
164 | "movclr.l %%acc2, %%d1\n\t" | ||
165 | "movclr.l %%acc3, %%d3\n\t" | ||
166 | |||
167 | "neg.l %%d0\n\t" | ||
168 | "neg.l %%d1\n\t" | ||
169 | "neg.l %%d2\n\t" | ||
170 | "neg.l %%d3\n\t" | ||
171 | |||
172 | "movem.l %%d0/%%d3, (%[z1])\n\t" | ||
173 | "movem.l %%d1/%%d2, (%[z2])\n\t" | ||
174 | |||
175 | "addq.l #8, %[z1]\n\t" | ||
176 | "subq.l #8, %[z2]\n\t" | ||
177 | : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T) | ||
178 | : [newstep] "d" (newstep) | ||
179 | : "d0", "d1", "d2", "d3", "cc", "memory"); | ||
180 | #else | ||
143 | fixed32 r0,i0,r1,i1; | 181 | fixed32 r0,i0,r1,i1; |
144 | XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep; | 182 | XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep; |
145 | XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep; | 183 | XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep; |
@@ -149,6 +187,7 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
149 | z2[1] = -i1; | 187 | z2[1] = -i1; |
150 | z1+=2; | 188 | z1+=2; |
151 | z2-=2; | 189 | z2-=2; |
190 | #endif | ||
152 | } | 191 | } |
153 | 192 | ||
154 | break; | 193 | break; |