diff options
author | Dave Hooper <dave@beermex.com> | 2010-02-21 21:14:40 +0000 |
---|---|---|
committer | Dave Hooper <dave@beermex.com> | 2010-02-21 21:14:40 +0000 |
commit | 3c52395b570d5abc394b5a1320d642057e6f4174 (patch) | |
tree | e23bd14af0f37a591fa960a60b436fa3f0a8c1fa | |
parent | 8aae18b3cce0412c4beeae6fdf95fe190df6ea25 (diff) | |
download | rockbox-3c52395b570d5abc394b5a1320d642057e6f4174.tar.gz rockbox-3c52395b570d5abc394b5a1320d642057e6f4174.zip |
Get a few more percent of speedup on ARM (measured on iPod Video): improve the final symmetries of the full imdct using ldm/stm and simple register swapping. Also add more comments (and improve/update some of the existing ones) regarding the layout of imdct_half and imdct_full.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24819 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- | apps/codecs/lib/SOURCES | 4 | ||||
-rw-r--r-- | apps/codecs/lib/asm_arm.h | 13 | ||||
-rw-r--r-- | apps/codecs/lib/codeclib.h | 2 | ||||
-rw-r--r-- | apps/codecs/lib/mdct.c | 131 |
4 files changed, 116 insertions, 34 deletions
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES index da77f97d30..438cd1fc62 100644 --- a/apps/codecs/lib/SOURCES +++ b/apps/codecs/lib/SOURCES | |||
@@ -4,14 +4,14 @@ fixedpoint.c | |||
4 | 4 | ||
5 | /* OLD MDCT */ | 5 | /* OLD MDCT */ |
6 | /* (when all other codecs are remediated this can be removed) */ | 6 | /* (when all other codecs are remediated this can be removed) */ |
7 | mdct2.c | 7 | /* mdct2.c */ |
8 | mdct_lookup.c | 8 | mdct_lookup.c |
9 | 9 | ||
10 | fft-ffmpeg.c | 10 | fft-ffmpeg.c |
11 | mdct.c | 11 | mdct.c |
12 | 12 | ||
13 | #ifdef CPU_ARM | 13 | #ifdef CPU_ARM |
14 | mdct_arm.S | 14 | /*mdct_arm.S*/ |
15 | setjmp_arm.S | 15 | setjmp_arm.S |
16 | ../../../firmware/target/arm/support-arm.S | 16 | ../../../firmware/target/arm/support-arm.S |
17 | #endif | 17 | #endif |
diff --git a/apps/codecs/lib/asm_arm.h b/apps/codecs/lib/asm_arm.h index 4f31f80c3e..9dcbcef755 100644 --- a/apps/codecs/lib/asm_arm.h +++ b/apps/codecs/lib/asm_arm.h | |||
@@ -226,14 +226,11 @@ void vect_mult_bw(int32_t *data, int32_t *window, int n) | |||
226 | #define _V_CLIP_MATH | 226 | #define _V_CLIP_MATH |
227 | 227 | ||
228 | static inline int32_t CLIP_TO_15(int32_t x) { | 228 | static inline int32_t CLIP_TO_15(int32_t x) { |
229 | int tmp; | 229 | const int32_t mask = 0xffff7fff; |
230 | asm volatile("subs %1, %0, #32768\n\t" | 230 | asm volatile("teq %0,%0,asr #31\n\t" |
231 | "movpl %0, #0x7f00\n\t" | 231 | "eorne %0,%1,%0,asr #31\n\t" |
232 | "orrpl %0, %0, #0xff\n" | 232 | : "+r"(x) |
233 | "adds %1, %0, #32768\n\t" | 233 | : "r" (mask) |
234 | "movmi %0, #0x8000" | ||
235 | : "+r"(x),"=r"(tmp) | ||
236 | : | ||
237 | : "cc"); | 234 | : "cc"); |
238 | return(x); | 235 | return(x); |
239 | } | 236 | } |
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h index 817d86a6a3..32a4696b9d 100644 --- a/apps/codecs/lib/codeclib.h +++ b/apps/codecs/lib/codeclib.h | |||
@@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con | |||
65 | 65 | ||
66 | /*MDCT library functions*/ | 66 | /*MDCT library functions*/ |
67 | /* -1- Tremor mdct */ | 67 | /* -1- Tremor mdct */ |
68 | extern void mdct_backward(int n, int32_t *in, int32_t *out); | 68 | /* extern void mdct_backward(int n, int32_t *in, int32_t *out); */ |
69 | /* -2- ffmpeg fft-based mdct */ | 69 | /* -2- ffmpeg fft-based mdct */ |
70 | extern void ff_imdct_half(unsigned int nbits, int32_t *output, const int32_t *input); | 70 | extern void ff_imdct_half(unsigned int nbits, int32_t *output, const int32_t *input); |
71 | extern void ff_imdct_calc(unsigned int nbits, int32_t *output, const int32_t *input); | 71 | extern void ff_imdct_calc(unsigned int nbits, int32_t *output, const int32_t *input); |
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c index aefd553f25..9747bd14d9 100644 --- a/apps/codecs/lib/mdct.c +++ b/apps/codecs/lib/mdct.c | |||
@@ -72,8 +72,9 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
72 | For postrotation, the factors are sin,cos(2PI*(i+1/4)/N) | 72 | For postrotation, the factors are sin,cos(2PI*(i+1/4)/N) |
73 | 73 | ||
74 | Therefore, prerotation can immediately reuse the same twiddles as fft | 74 | Therefore, prerotation can immediately reuse the same twiddles as fft |
75 | (for postrotation it's still a bit complex, so this is still using | 75 | (for postrotation it's still a bit complex, we reuse the fft trig tables |
76 | an mdct-local set of twiddles to do that part) | 76 | where we can, or a special table for N=2048, or interpolate between |
77 | trig tables for N>2048) | ||
77 | */ | 78 | */ |
78 | const int32_t *T = sincos_lookup0; | 79 | const int32_t *T = sincos_lookup0; |
79 | const int step = 2<<(12-nbits); | 80 | const int step = 2<<(12-nbits); |
@@ -248,25 +249,49 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
248 | * <----input----> | 249 | * <----input----> |
249 | * <-----------output-----------> | 250 | * <-----------output-----------> |
250 | * | 251 | * |
252 | * The result of ff_imdct_half is to put the 'half' imdct here | ||
253 | * | ||
254 | * N/2 N-1 | ||
255 | * <--half imdct--> | ||
256 | * | ||
257 | * We want it here for the full imdct: | ||
258 | * N/4 3N/4-1 | ||
259 | * <--------------> | ||
260 | * | ||
261 | * In addition we need to apply two symmetries to get the full imdct: | ||
262 | * | ||
263 | * <AAAAAA> <DDDDDD> | ||
264 | * <BBBBBB><CCCCCC> | ||
265 | * | ||
266 | * D is a reflection of C | ||
267 | * A is a reflection of B (but with sign flipped) | ||
268 | * | ||
269 | * We process the symmetries at the same time as we 'move' the half imdct | ||
270 | * from [N/2,N-1] to [N/4,3N/4-1] | ||
271 | * | ||
272 | * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1] | ||
273 | * This would require being able to use revtab 'inplace' (since the input | ||
274 | * and output of imdct_half would then overlap somewhat) | ||
251 | */ | 275 | */ |
252 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT; | 276 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT; |
277 | #ifndef CPU_ARM | ||
253 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) | 278 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) |
254 | { | 279 | { |
255 | const int n = (1<<nbits); | 280 | const int n = (1<<nbits); |
256 | const int n2 = (n>>1); | 281 | const int n2 = (n>>1); |
257 | const int n4 = (n>>2); | 282 | const int n4 = (n>>2); |
258 | 283 | ||
284 | /* tell imdct_half to put the output in [N/2..N-1] i.e. output+n2 */ | ||
259 | ff_imdct_half(nbits,output+n2,input); | 285 | ff_imdct_half(nbits,output+n2,input); |
260 | 286 | ||
261 | /* reflect the half imdct into the full N samples */ | ||
262 | /* TODO: this could easily be optimised more! */ | ||
263 | fixed32 * in_r, * in_r2, * out_r, * out_r2; | 287 | fixed32 * in_r, * in_r2, * out_r, * out_r2; |
264 | 288 | ||
289 | /* Copy BBBB to AAAA, reflected and sign-flipped. | ||
290 | Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */ | ||
265 | out_r = output; | 291 | out_r = output; |
266 | out_r2 = output+n2-8; | 292 | out_r2 = output+n2-8; |
267 | in_r = output+n2+n4-8; | 293 | in_r = output+n2+n4-8; |
268 | while(out_r<out_r2) | 294 | while(out_r<out_r2) |
269 | { | ||
270 | out_r[0] = -(out_r2[7] = in_r[7]); | 295 | out_r[0] = -(out_r2[7] = in_r[7]); |
271 | out_r[1] = -(out_r2[6] = in_r[6]); | 296 | out_r[1] = -(out_r2[6] = in_r[6]); |
272 | out_r[2] = -(out_r2[5] = in_r[5]); | 297 | out_r[2] = -(out_r2[5] = in_r[5]); |
@@ -279,7 +304,6 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
279 | out_r += 8; | 304 | out_r += 8; |
280 | out_r2 -= 8; | 305 | out_r2 -= 8; |
281 | } | 306 | } |
282 | |||
283 | in_r = output + n2+n4; | 307 | in_r = output + n2+n4; |
284 | in_r2 = output + n-4; | 308 | in_r2 = output + n-4; |
285 | out_r = output + n2; | 309 | out_r = output + n2; |
@@ -289,28 +313,30 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
289 | register fixed32 t0,t1,t2,t3; | 313 | register fixed32 t0,t1,t2,t3; |
290 | register fixed32 s0,s1,s2,s3; | 314 | register fixed32 s0,s1,s2,s3; |
291 | 315 | ||
292 | //simultaneously do the following things: | 316 | /* Copy and reflect CCCC to DDDD. Because CCCC is already where |
293 | // 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1] | 317 | we actually want to put DDDD this is a bit complicated. |
294 | // 2. reflect range from [n2+n4 .. n-1] inplace | 318 | * So simultaneously do the following things: |
295 | // | 319 | * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1] |
296 | // [ | ] | 320 | * 2. reflect range from [n2+n4 .. n-1] inplace |
297 | // ^a -> <- ^b ^c -> <- ^d | 321 | * |
298 | // | 322 | * [ | ] |
299 | // #1: copy from ^c to ^a | 323 | * ^a -> <- ^b ^c -> <- ^d |
300 | // #2: copy from ^d to ^b | 324 | * |
301 | // #3: swap ^c and ^d in place | 325 | * #1: copy from ^c to ^a |
302 | // | 326 | * #2: copy from ^d to ^b |
303 | // #1 pt1 : load 4 words from ^c. | 327 | * #3: swap ^c and ^d in place |
328 | */ | ||
329 | /* #1 pt1 : load 4 words from ^c. */ | ||
304 | t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3]; | 330 | t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3]; |
305 | // #1 pt2 : write to ^a | 331 | /* #1 pt2 : write to ^a */ |
306 | out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3; | 332 | out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3; |
307 | // #2 pt1 : load 4 words from ^d | 333 | /* #2 pt1 : load 4 words from ^d */ |
308 | s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3]; | 334 | s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3]; |
309 | // #2 pt2 : write to ^b | 335 | /* #2 pt2 : write to ^b */ |
310 | out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3; | 336 | out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3; |
311 | // #3 pt1 : write words from #2 to ^c | 337 | /* #3 pt1 : write words from #2 to ^c */ |
312 | in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0; | 338 | in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0; |
313 | // #3 pt2 : write words from #1 to ^d | 339 | /* #3 pt2 : write words from #1 to ^d */ |
314 | in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0; | 340 | in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0; |
315 | 341 | ||
316 | in_r += 4; | 342 | in_r += 4; |
@@ -319,6 +345,65 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) | |||
319 | out_r2 -= 4; | 345 | out_r2 -= 4; |
320 | } | 346 | } |
321 | } | 347 | } |
348 | #else | ||
349 | /* Follows the same structure as the canonical version above */ | ||
350 | void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) | ||
351 | { | ||
352 | const int n = (1<<nbits); | ||
353 | const int n2 = (n>>1); | ||
354 | const int n4 = (n>>2); | ||
355 | |||
356 | ff_imdct_half(nbits,output+n2,input); | ||
357 | |||
358 | fixed32 * in_r, * in_r2, * out_r, * out_r2; | ||
359 | |||
360 | out_r = output; | ||
361 | out_r2 = output+n2; | ||
362 | in_r = output+n2+n4; | ||
363 | while(out_r<out_r2) | ||
364 | { | ||
365 | asm volatile( | ||
366 | "ldmdb %[in_r]!, {r0-r7}\n\t" | ||
367 | "stmdb %[out_r2]!, {r0-r7}\n\t" | ||
368 | "rsb r8,r0,#0\n\t" | ||
369 | "rsb r0,r7,#0\n\t" | ||
370 | "rsb r7,r1,#0\n\t" | ||
371 | "rsb r1,r6,#0\n\t" | ||
372 | "rsb r6,r2,#0\n\t" | ||
373 | "rsb r2,r5,#0\n\t" | ||
374 | "rsb r5,r3,#0\n\t" | ||
375 | "rsb r3,r4,#0\n\t" | ||
376 | "stmia %[out_r]!, {r0-r3,r5-r8}\n\t" | ||
377 | : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2) | ||
378 | : | ||
379 | : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); | ||
380 | } | ||
381 | in_r = output + n2+n4; | ||
382 | in_r2 = output + n; | ||
383 | out_r = output + n2; | ||
384 | out_r2 = output + n2 + n4; | ||
385 | while(in_r<in_r2) | ||
386 | { | ||
387 | asm volatile( | ||
388 | "ldmia %[in_r], {r0-r3}\n\t" | ||
389 | "stmia %[out_r]!, {r0-r3}\n\t" | ||
390 | "ldmdb %[in_r2], {r5-r8}\n\t" | ||
391 | "stmdb %[out_r2]!, {r5-r8}\n\t" | ||
392 | "mov r4,r0\n\t" | ||
393 | "mov r0,r3\n\t" | ||
394 | "mov r3,r1\n\t" | ||
395 | "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t" | ||
396 | "mov r4,r8\n\t" | ||
397 | "mov r8,r5\n\t" | ||
398 | "mov r5,r7\n\t" | ||
399 | "stmia %[in_r]!, {r4,r5,r6,r8}\n\t" | ||
400 | : | ||
401 | [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2) | ||
402 | : | ||
403 | : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); | ||
404 | } | ||
405 | } | ||
406 | #endif | ||
322 | 407 | ||
323 | static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */ | 408 | static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */ |
324 | 409 | ||