summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNils Wallménius <nils@rockbox.org>2011-09-12 11:27:48 +0000
committerNils Wallménius <nils@rockbox.org>2011-09-12 11:27:48 +0000
commitf93530c4badf0811110baaee1e196a67f0e98eb8 (patch)
treefe3ead8bb0b78131c75936efa236f76295ebdc04
parentedf06b7324e1d5fd6d2e342fe4c069727ced22dd (diff)
downloadrockbox-f93530c4badf0811110baaee1e196a67f0e98eb8.tar.gz
rockbox-f93530c4badf0811110baaee1e196a67f0e98eb8.zip
codeclib: coldfire asm for the TRANSFORM* functions in the fft and a little for the mdct, speeds up codecs using the codeclib mdct 0.5-1.5MHz on h300.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@30513 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/lib/fft-ffmpeg_cf.h164
-rw-r--r--apps/codecs/lib/mdct.c43
2 files changed, 205 insertions, 2 deletions
diff --git a/apps/codecs/lib/fft-ffmpeg_cf.h b/apps/codecs/lib/fft-ffmpeg_cf.h
index 3a419eb51f..710e1dd1af 100644
--- a/apps/codecs/lib/fft-ffmpeg_cf.h
+++ b/apps/codecs/lib/fft-ffmpeg_cf.h
@@ -203,4 +203,168 @@ static inline void fft8(FFTComplex *z)
203 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 203 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
204 "a0", "a1", "a2", "a3", "a4", "cc", "memory"); 204 "a0", "a1", "a2", "a3", "a4", "cc", "memory");
205} 205}
206
207#define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM
208
209static inline FFTComplex* TRANSFORM(FFTComplex * z, unsigned int n, FFTSample wre, FFTSample wim)
210{
211 asm volatile ("move.l (%[z2]), %%d5\n\t"
212 "mac.l %%d5, %[wre], (4, %[z2]), %%d4, %%acc0\n\t"
213 "mac.l %%d4, %[wim], %%acc0\n\t"
214 "mac.l %%d4, %[wre], (%[z3]), %%d6, %%acc1\n\t"
215 "msac.l %%d5, %[wim], (4,%[z3]), %%d7, %%acc1\n\t"
216 "mac.l %%d6, %[wre], %%acc2\n\t"
217 "msac.l %%d7, %[wim], %%acc2\n\t"
218 "mac.l %%d7, %[wre], %%acc3\n\t"
219 "mac.l %%d6, %[wim], %%acc3\n\t"
220
221 "movclr.l %%acc0, %[wre]\n\t" /* t1 */
222 "movclr.l %%acc2, %[wim]\n\t" /* t5 */
223
224 "movem.l (%[z]), %%d4-%%d5\n\t" /* load z0 */
225 "move.l %%d4, %%d6\n\t"
226 "move.l %[wim], %%d7\n\t"
227 "sub.l %[wre], %[wim]\n\t" /* t5 = t5-t1 */
228 "add.l %[wre], %%d7\n\t"
229 "sub.l %%d7, %%d6\n\t" /* d6 = a0re - (t5+t1) => a2re */
230 "add.l %%d7, %%d4\n\t" /* d4 = a0re + (t5+t1) => a0re */
231
232 "movclr.l %%acc3, %%d7\n\t" /* t6 */
233 "movclr.l %%acc1, %%d3\n\t" /* t2 */
234
235 "move.l %%d3, %[wre]\n\t"
236 "add.l %%d7, %[wre]\n\t"
237 "sub.l %%d7, %%d3\n\t" /* t2 = t6-t2 */
238 "move.l %%d5, %%d7\n\t"
239 "sub.l %[wre], %%d7\n\t" /* d7 = a0im - (t2+t6) => a2im */
240
241 "movem.l %%d6-%%d7, (%[z2])\n\t" /* store z2 */
242 "add.l %[wre], %%d5\n\t" /* d5 = a0im + (t2+t6) => a0im */
243 "movem.l %%d4-%%d5, (%[z])\n\t" /* store z0 */
244
245 "movem.l (%[z1]), %%d4-%%d5\n\t" /* load z1 */
246 "move.l %%d4, %%d6\n\t"
247
248 "sub.l %%d3, %%d6\n\t" /* d6 = a1re - (t2-t6) => a3re */
249 "add.l %%d3, %%d4\n\t" /* d4 = a1re + (t2-t6) => a1re */
250
251 "move.l %%d5, %%d7\n\t"
252 "sub.l %[wim], %%d7\n\t"
253 "movem.l %%d6-%%d7, (%[z3])\n\t" /* store z3 */
254 "add.l %[wim], %%d5\n\t"
255 "movem.l %%d4-%%d5, (%[z1])\n\t" /* store z1 */
256
257 : [wre] "+r" (wre), [wim] "+r" (wim) /* we clobber these after using them */
258 : [z] "a" (z), [z1] "a" (&z[n]), [z2] "a" (&z[2*n]), [z3] "a" (&z[3*n])
259 : "d3", "d4", "d5", "d6", "d7", "cc", "memory");
260 return z+1;
261}
262
263static inline FFTComplex* TRANSFORM_W01(FFTComplex * z, unsigned int n, const FFTSample * w)
264{
265 return TRANSFORM(z, n, w[0], w[1]);
266}
267
268static inline FFTComplex* TRANSFORM_W10(FFTComplex * z, unsigned int n, const FFTSample * w)
269{
270 return TRANSFORM(z, n, w[1], w[0]);
271}
272
273static inline FFTComplex* TRANSFORM_ZERO(FFTComplex * z, unsigned int n)
274{
275 asm volatile("movem.l (%[z]), %%d4-%%d5\n\t" /* load z0 */
276 "move.l %%d4, %%d6\n\t"
277 "movem.l (%[z2]), %%d2-%%d3\n\t" /* load z2 */
278	            "movem.l (%[z3]), %%d0-%%d1\n\t" /* load z3 */
279 "move.l %%d0, %%d7\n\t"
280 "sub.l %%d2, %%d0\n\t"
281 "add.l %%d2, %%d7\n\t"
282 "sub.l %%d7, %%d6\n\t" /* d6 = a0re - (t5+t1) => a2re */
283 "add.l %%d7, %%d4\n\t" /* d4 = a0re + (t5+t1) => a0re */
284
285 "move.l %%d5, %%d7\n\t"
286 "move.l %%d3, %%d2\n\t"
287 "add.l %%d1, %%d2\n\t"
288 "sub.l %%d2, %%d7\n\t" /* d7 = a0im - (t2+t6) => a2im */
289 "movem.l %%d6-%%d7, (%[z2])\n\t" /* store z2 */
290 "add.l %%d2, %%d5\n\t" /* d5 = a0im + (t2+t6) => a0im */
291 "movem.l %%d4-%%d5, (%[z])\n\t" /* store z0 */
292
293 "movem.l (%[z1]), %%d4-%%d5\n\t" /* load z1 */
294 "move.l %%d4, %%d6\n\t"
295 "sub.l %%d1, %%d3\n\t"
296 "sub.l %%d3, %%d6\n\t" /* d6 = a1re - (t2-t6) => a3re */
297 "add.l %%d3, %%d4\n\t" /* d4 = a1re + (t2-t6) => a1re */
298
299 "move.l %%d5, %%d7\n\t"
300 "sub.l %%d0, %%d7\n\t"
301 "movem.l %%d6-%%d7, (%[z3])\n\t" /* store z3 */
302 "add.l %%d0, %%d5\n\t"
303
304 "movem.l %%d4-%%d5, (%[z1])\n\t" /* store z1 */
305
306 :
307 : [z] "a" (z), [z1] "a" (&z[n]), [z2] "a" (&z[2*n]), [z3] "a" (&z[3*n])
308 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
309 return z+1;
310}
311
312static inline FFTComplex* TRANSFORM_EQUAL(FFTComplex * z, unsigned int n)
313{
314 asm volatile ("move.l (%[z2]), %%d5\n\t"
315 "mac.l %%d5, %[PI2_8], (4, %[z2]), %%d5, %%acc0\n\t"
316 "mac.l %%d5, %[PI2_8], (%[z3]), %%d5, %%acc1\n\t"
317 "mac.l %%d5, %[PI2_8], (4,%[z3]), %%d5, %%acc2\n\t"
318 "mac.l %%d5, %[PI2_8], %%acc3\n\t"
319
320 "movclr.l %%acc0, %%d0\n\t"
321 "movclr.l %%acc1, %%d1\n\t"
322 "movclr.l %%acc2, %%d2\n\t"
323 "movclr.l %%acc3, %%d3\n\t"
324
325 "move.l %%d0, %%d7\n\t"
326 "add.l %%d1, %%d0\n\t" /* d0 == t1 */
327 "sub.l %%d7, %%d1\n\t" /* d1 == t2 */
328
329 "move.l %%d3, %%d7\n\t"
330 "add.l %%d2, %%d3\n\t" /* d3 == t6 */
331 "sub.l %%d7, %%d2\n\t" /* d2 == t5 */
332
333 "movem.l (%[z]), %%d4-%%d5\n\t" /* load z0 */
334 "move.l %%d4, %%d6\n\t"
335 "move.l %%d2, %%d7\n\t"
336 "sub.l %%d0, %%d2\n\t" /* t5 = t5-t1 */
337 "add.l %%d0, %%d7\n\t"
338 "sub.l %%d7, %%d6\n\t" /* d6 = a0re - (t5+t1) => a2re */
339 "add.l %%d7, %%d4\n\t" /* d4 = a0re + (t5+t1) => a0re */
340
341 "move.l %%d1, %%d0\n\t"
342 "add.l %%d3, %%d0\n\t"
343 "sub.l %%d3, %%d1\n\t" /* t2 = t6-t2 */
344 "move.l %%d5, %%d7\n\t"
345 "sub.l %%d0, %%d7\n\t" /* d7 = a0im - (t2+t6) => a2im */
346
347 "movem.l %%d6-%%d7, (%[z2])\n\t" /* store z2 */
348 "add.l %%d0, %%d5\n\t" /* d5 = a0im + (t2+t6) => a0im */
349 "movem.l %%d4-%%d5, (%[z])\n\t" /* store z0 */
350
351 "movem.l (%[z1]), %%d4-%%d5\n\t" /* load z1 */
352 "move.l %%d4, %%d6\n\t"
353
354 "sub.l %%d1, %%d6\n\t" /* d6 = a1re - (t2-t6) => a3re */
355 "add.l %%d1, %%d4\n\t" /* d4 = a1re + (t2-t6) => a1re */
356
357 "move.l %%d5, %%d7\n\t"
358 "sub.l %%d2, %%d7\n\t"
359 "movem.l %%d6-%%d7, (%[z3])\n\t" /* store z3 */
360 "add.l %%d2, %%d5\n\t"
361 "movem.l %%d4-%%d5, (%[z1])\n\t" /* store z1 */
362
363 :
364 : [z] "a" (z), [z1] "a" (&z[n]), [z2] "a" (&z[2*n]), [z3] "a" (&z[3*n]), [PI2_8] "r" (cPI2_8)
365 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
366
367 return z+1;
368}
369
206#endif /* CPU_COLDFIRE */	370#endif /* CPU_COLDFIRE */
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c
index 8382a7213d..621b9cbbb8 100644
--- a/apps/codecs/lib/mdct.c
+++ b/apps/codecs/lib/mdct.c
@@ -134,12 +134,50 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
134 } 134 }
135 else 135 else
136 { 136 {
137 T = sincos_lookup1; 137 T = sincos_lookup1;
138 newstep = 2; 138 newstep = 2;
139 } 139 }
140 140
141 while(z1<z2) 141 while(z1<z2)
142 { 142 {
143#ifdef CPU_COLDFIRE
144 asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
145 "movem.l (%[T]), %%d2-%%d3\n\t"
146 "mac.l %%d1, %%d2, %%acc0\n\t"
147 "msac.l %%d0, %%d3, %%acc0\n\t"
148 "mac.l %%d0, %%d2, %%acc1\n\t"
149 "mac.l %%d1, %%d3, %%acc1\n\t"
150
151 "lea (%[newstep]*4, %[T]), %[T]\n\t"
152
153 "movem.l (%[z2]), %%d0-%%d1\n\t"
154 "movem.l (%[T]), %%d2-%%d3\n\t"
155 "mac.l %%d1, %%d3, %%acc2\n\t"
156 "msac.l %%d0, %%d2, %%acc2\n\t"
157 "mac.l %%d0, %%d3, %%acc3\n\t"
158 "mac.l %%d1, %%d2, %%acc3\n\t"
159
160 "lea (%[newstep]*4, %[T]), %[T]\n\t"
161
162 "movclr.l %%acc0, %%d0\n\t"
163 "movclr.l %%acc1, %%d2\n\t"
164 "movclr.l %%acc2, %%d1\n\t"
165 "movclr.l %%acc3, %%d3\n\t"
166
167 "neg.l %%d0\n\t"
168 "neg.l %%d1\n\t"
169 "neg.l %%d2\n\t"
170 "neg.l %%d3\n\t"
171
172 "movem.l %%d0/%%d3, (%[z1])\n\t"
173 "movem.l %%d1/%%d2, (%[z2])\n\t"
174
175 "addq.l #8, %[z1]\n\t"
176 "subq.l #8, %[z2]\n\t"
177 : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T)
178 : [newstep] "d" (newstep)
179 : "d0", "d1", "d2", "d3", "cc", "memory");
180#else
143 fixed32 r0,i0,r1,i1; 181 fixed32 r0,i0,r1,i1;
144 XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep; 182 XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
145 XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep; 183 XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
@@ -149,6 +187,7 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
149 z2[1] = -i1; 187 z2[1] = -i1;
150 z1+=2; 188 z1+=2;
151 z2-=2; 189 z2-=2;
190#endif
152 } 191 }
153 192
154 break; 193 break;