summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndree Buschmann <AndreeBuschmann@t-online.de>2010-11-02 21:00:34 +0000
committerAndree Buschmann <AndreeBuschmann@t-online.de>2010-11-02 21:00:34 +0000
commit48bb3d00db6f8e1c628ffc517fcee01cf1d909e5 (patch)
treef61e2b9695dcf5d4c8dec5abf4af19bb9d5d0b89
parent4999100e21fccbaab65ec1b4f0724080e41b9b36 (diff)
downloadrockbox-48bb3d00db6f8e1c628ffc517fcee01cf1d909e5.tar.gz
rockbox-48bb3d00db6f8e1c628ffc517fcee01cf1d909e5.zip
Optimize mpc's synthesis filter by reducing stalls for arm9 and above. The speedup ranges from 4% (nano 2g) to 11% (beast).
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28452 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libmusepack/synth_filter_arm.S210
1 files changed, 208 insertions, 2 deletions
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 731a21ce21..5bdae93561 100644
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -92,7 +92,7 @@ mpc_decoder_windowing_D:
92 bgt .loop32 92 bgt .loop32
93 93
94 ldmpc regs=r4-r8 94 ldmpc regs=r4-r8
95#else 95#elif defined(CPU_ARM7TDMI) /* arm7 only */
96mpc_decoder_windowing_D: 96mpc_decoder_windowing_D:
97 /* r0 = Data[] */ 97 /* r0 = Data[] */
98 /* r1 = V[] */ 98 /* r1 = V[] */
@@ -106,6 +106,7 @@ mpc_decoder_windowing_D:
106 * saved at the cost of 15 x 4 + 1 add's. 106 * saved at the cost of 15 x 4 + 1 add's.
107 * The row V[16] can be extracted as it has symmetries within this single 107 * The row V[16] can be extracted as it has symmetries within this single
108 * row. 8 smull/mlal and 8 ldr's can be saved. 108 * row. 8 smull/mlal and 8 ldr's can be saved.
109 * Used for arm7 only. For arm9 and above see implementation below.
109 ***********************************************************************/ 110 ***********************************************************************/
110 stmfd sp!, {r4-r11, lr} 111 stmfd sp!, {r4-r11, lr}
111 112
@@ -152,7 +153,7 @@ mpc_decoder_windowing_D:
152 add r2, r2, #7*4 /* D+=7, r2 = D[16] */ 153 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
153 154
154 /****************************************** 155 /******************************************
155 * rows 01..15 are symmetrc to rows 31..17 156 * rows 01..15 are symmetric to rows 31..17
156 * r8 = lo, r9 = hi of 01..15 157 * r8 = lo, r9 = hi of 01..15
157 * r1 = V[01..15] 158 * r1 = V[01..15]
158 * r10 = lo, r11 = hi of 31..17 159 * r10 = lo, r11 = hi of 31..17
@@ -290,6 +291,211 @@ mpc_decoder_windowing_D:
290 add r1, r1, #4 /* V++ */ 291 add r1, r1, #4 /* V++ */
291 292
292 ldmpc regs=r4-r11 293 ldmpc regs=r4-r11
294#else /* arm9 and above */
295 mpc_decoder_windowing_D:
296 /* r0 = Data[] (output, written with str) */
297 /* r1 = V[] (input, read only) */
298 /* r2 = D[] (window coefficients, read only) */
299 /* lr = counter (row counter of .loop15) */
300 /************************************************************************
301 * Further speed up through making use of symmetries within D[]-window.
302 * The row V[00] can be extracted as it has symmetries within this single
303 * row. 8 smull/smlal and 8 ldr's can be saved at the cost of 2 add's.
304 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
305 * saved at the cost of 15 x 4 + 1 add's.
306 * The row V[16] can be extracted as it has symmetries within this single
307 * row. 8 smull/smlal and 8 ldr's can be saved.
308 * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds
309 * up decoding even though several ldm-calls are replaced with ldr to free
310 * 2 registers.
 *
 * Notes for reading the code below:
 * - The bare number in the comment after each V-load is the tap index k.
 *   Tap k is read at word offset 64*k for even k and 64*k + 32 for odd k
 *   (0, 96, 128, 224, ..., 896, 992) relative to the row pointer.
 * - Every row accumulates a 64 bit sum (smull/smlal) and stores
 *   bits 16..47 of it as one 32 bit word of Data[].
 * - In total 32 words are stored: Data[00], Data[01..15] together with
 *   Data[31..17], and finally Data[16].
311 ***********************************************************************/
312 stmfd sp!, {r4-r11, lr}
313
314 /******************************************
315 * row 0 with internal symmetry
 * odd tap pairs are subtracted, even tap pairs are added,
 * tap 8 has no partner
316 *****************************************/
317 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
318 ldmia r2!, { r3-r6 } /* load D[01..04] */
319 ldr r7 , [r1, #96*4] /* 1 */
320 ldr r10, [r1, #992*4] /* 15 */
321 ldr r11, [r1, #128*4] /* 2 */
322 ldr r12, [r1, #896*4] /* 14 */
323 rsb r10, r10, r7 /* V[01] - V[15] */
324 smull r8, r9, r10, r3 /* r8 = lo, r9 = hi of the row sum */
325 ldr r7 , [r1, #224*4] /* 3 */
326 ldr r10, [r1, #864*4] /* 13 */
327 add r12, r12, r11 /* V[02] + V[14] */
328 smlal r8, r9, r12, r4
329 ldr r11, [r1, #256*4] /* 4 */
330 ldr r12, [r1, #768*4] /* 12 */
331 rsb r10, r10, r7 /* V[03] - V[13] */
332 smlal r8, r9, r10, r5
333 ldr r7 , [r1, #352*4] /* 5 */
334 ldr r10, [r1, #736*4] /* 11 */
335 add r12, r12, r11 /* V[04] + V[12] */
336 smlal r8, r9, r12, r6
337 ldmia r2!, { r3-r6 } /* load D[05..08] */
338 ldr r11, [r1, #384*4] /* 6 */
339 ldr r12, [r1, #640*4] /* 10 */
340 rsb r10, r10, r7 /* V[05] - V[11] */
341 smlal r8, r9, r10, r3
342 ldr r7 , [r1, #480*4] /* 7 */
343 ldr r10, [r1, #608*4] /* 9 */
344 add r12, r12, r11 /* V[06] + V[10] */
345 smlal r8, r9, r12, r4
346 ldr r11, [r1, #512*4] /* 8 */
347 rsb r10, r10, r7 /* V[07] - V[09] */
348 smlal r8, r9, r10, r5
349 smlal r8, r9, r11, r6 /* V[08] * D[08], unpaired tap */
350 mov r8, r8, lsr #16
351 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
352 str r8, [r0], #4 /* store Data[00], r0 = Data[01] */
353 add r1, r1, #4 /* V+=1, r1 = V[01] */
354 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
355
356 /******************************************
357 * rows 01..15 are symmetric to rows 31..17
358 * r8 = lo, r9 = hi of 01..15
359 * r1 = V[01..15]
360 * r10 = lo, r11 = hi of 31..17
361 * r12 = V[31..17]
 * Each pass loads 16 coefficients D[00..15] (indices relative to the
 * current row) two at a time and uses every pair for both rows.
 * The mirrored row (r12) uses them in reverse order: its tap k is
 * multiplied with D[15-k].
362 *****************************************/
363 mov lr, #15
364 add r12, r1, #30*4 /* r12 = V[31] */
365.loop15:
366 ldmia r2!, { r3-r4 } /* load D[00..01] */
367 ldr r7, [r12, #896*4] /* 14 */
368 ldr r5, [r12, #992*4] /* 15 */
369 smull r10, r11, r7, r4 /* start sum of row 31..17 */
370 ldr r7, [r1] /* 0 */
371 smlal r10, r11, r5, r3
372 ldr r5, [r1, #96*4] /* 1 */
373 smull r8, r9, r7, r3 /* start sum of row 01..15 */
374 ldr r7, [r12, #768*4] /* 12 */
375 smlal r8, r9, r5, r4
376 ldmia r2!, { r3-r4 } /* load D[02..03] */
377 ldr r5, [r12, #864*4] /* 13 */
378 smlal r10, r11, r7, r4
379 ldr r7, [r1, #128*4] /* 2 */
380 smlal r10, r11, r5, r3
381 ldr r5, [r1, #224*4] /* 3 */
382 smlal r8, r9, r7, r3
383 ldr r7, [r1, #256*4] /* 4 */
384 smlal r8, r9, r5, r4
385 ldmia r2!, { r3-r4 } /* load D[04..05] */
386 ldr r5, [r1, #352*4] /* 5 */
387 smlal r8, r9, r7, r3
388 ldr r7, [r12, #640*4] /* 10 */
389 smlal r8, r9, r5, r4
390 ldr r5, [r12, #736*4] /* 11 */
391 smlal r10, r11, r7, r4
392 ldr r7, [r1, #384*4] /* 6 */
393 smlal r10, r11, r5, r3
394 ldmia r2!, { r3-r4 } /* load D[06..07] */
395 ldr r5, [r1, #480*4] /* 7 */
396 smlal r8, r9, r7, r3
397 ldr r7, [r12, #512*4] /* 8 */
398 smlal r8, r9, r5, r4
399 ldr r5, [r12, #608*4] /* 9 */
400 smlal r10, r11, r7, r4
401 ldr r7, [r12, #384*4] /* 6 */
402 smlal r10, r11, r5, r3
403 ldmia r2!, { r3-r4 } /* load D[08..09] */
404 ldr r5, [r12, #480*4] /* 7 */
405 smlal r10, r11, r7, r4
406 ldr r7, [r1, #512*4] /* 8 */
407 smlal r10, r11, r5, r3
408 ldr r5, [r1, #608*4] /* 9 */
409 smlal r8, r9, r7, r3
410 ldr r7, [r1, #640*4] /* 10 */
411 smlal r8, r9, r5, r4
412 ldmia r2!, { r3-r4 } /* load D[10..11] */
413 ldr r5, [r1, #736*4] /* 11 */
414 smlal r8, r9, r7, r3
415 ldr r7, [r12, #256*4] /* 4 */
416 smlal r8, r9, r5, r4
417 ldr r5, [r12, #352*4] /* 5 */
418 smlal r10, r11, r7, r4
419 ldr r7, [r1, #768*4] /* 12 */
420 smlal r10, r11, r5, r3
421 ldmia r2!, { r3-r4 } /* load D[12..13] */
422 ldr r5, [r1, #864*4] /* 13 */
423 smlal r8, r9, r7, r3
424 ldr r7, [r12, #128*4] /* 2 */
425 smlal r8, r9, r5, r4
426 ldr r5, [r12, #224*4] /* 3 */
427 smlal r10, r11, r7, r4
428 ldr r7, [r12] /* 0 */
429 smlal r10, r11, r5, r3
430 ldmia r2!, { r3-r4 } /* load D[14..15] */
431 ldr r5, [r12, #96*4] /* 1 */
432 smlal r10, r11, r7, r4
433 ldr r7, [r1, #896*4] /* 14 */
434 smlal r10, r11, r5, r3
435 ldr r5, [r1, #992*4] /* 15 */
436 smlal r8, r9, r7, r3
437 smlal r8, r9, r5, r4
438 /* store Data[01..15] */
439 mov r8, r8, lsr #16
440 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
441 str r8, [r0] /* store Data[16-lr] */
442 /* store Data[31..17] */
443 add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */
444 mov r10, r10, lsr #16
445 orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */
446 rsb r10, r10, #0 /* r10 = -r10, mirrored row is stored negated */
447 str r10, [r0], #4 /* store Data[16+lr], r0 += 1 word */
448 sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words], next slot of Data[01..15] */
449 /* correct addresses for next loop */
450 sub r12, r12, #4 /* r12 = V-- */
451 add r1, r1, #4 /* r1 = V++ */
452 /* next loop */
453 subs lr, lr, #1
454 bgt .loop15
455
456 /******************************************
457 * V[16] with internal symmetry
 * here r0 = Data[16] and r1 = V[16]; all 8 tap pairs are subtracted
458 *****************************************/
459 ldmia r2!, { r3-r6 } /* load D[00..03] */
460 ldr r7 , [r1] /* 0 */
461 ldr r10, [r1, #992*4] /* 15 */
462 ldr r11, [r1, #96*4] /* 1 */
463 ldr r12, [r1, #896*4] /* 14 */
464 rsb r10, r10, r7 /* V[00] - V[15] */
465 smull r8, r9, r10, r3 /* r8 = lo, r9 = hi of the row sum */
466 ldr r7 , [r1, #128*4] /* 2 */
467 ldr r10, [r1, #864*4] /* 13 */
468 rsb r12, r12, r11 /* V[01] - V[14] */
469 smlal r8, r9, r12, r4
470 ldr r11, [r1, #224*4] /* 3 */
471 ldr r12, [r1, #768*4] /* 12 */
472 rsb r10, r10, r7 /* V[02] - V[13] */
473 smlal r8, r9, r10, r5
474 ldr r7 , [r1, #256*4] /* 4 */
475 ldr r10, [r1, #736*4] /* 11 */
476 rsb r12, r12, r11 /* V[03] - V[12] */
477 smlal r8, r9, r12, r6
478 ldmia r2!, { r3-r6 } /* load D[04..07] */
479 ldr r11, [r1, #352*4] /* 5 */
480 ldr r12, [r1, #640*4] /* 10 */
481 rsb r10, r10, r7 /* V[04] - V[11] */
482 smlal r8, r9, r10, r3
483 ldr r7 , [r1, #384*4] /* 6 */
484 ldr r10, [r1, #608*4] /* 9 */
485 rsb r12, r12, r11 /* V[05] - V[10] */
486 smlal r8, r9, r12, r4
487 ldr r11, [r1, #480*4] /* 7 */
488 ldr r12, [r1, #512*4] /* 8 */
489 rsb r10, r10, r7 /* V[06] - V[09] */
490 smlal r8, r9, r10, r5
491 rsb r12, r12, r11 /* V[07] - V[08] */
492 smlal r8, r9, r12, r6
493 mov r8, r8, lsr #16
494 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
495 str r8, [r0], #4 /* store Data[16] */
496 add r1, r1, #4 /* V++ */
497
498 ldmpc regs=r4-r11
293#endif 499#endif
294.mpc_dewindowing_end: 500.mpc_dewindowing_end:
295 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D 501 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D