summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNils Wallménius <nils@rockbox.org>2010-11-05 11:20:50 +0000
committerNils Wallménius <nils@rockbox.org>2010-11-05 11:20:50 +0000
commitdbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27 (patch)
tree7f18e2158a33462af7eae0d068e437be01ce5805
parent8404c53ee6d2f828fb7ea2b0713d2cd0afcfeeca (diff)
downloadrockbox-dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27.tar.gz
rockbox-dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27.zip
libmusepack: ARMv6 assembler for mpc_decoder_windowing_D, speeds up decoding of a 128kbps sample file by 2MHz, or 8%, on the Gigabeat S. The output differs from that of the C implementation and the other ARM implementation by +/-1 in less than 0.1% of the output samples.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28487 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libmusepack/synth_filter.c14
-rw-r--r--apps/codecs/libmusepack/synth_filter_arm.S196
2 files changed, 205 insertions, 5 deletions
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index 9a79328106..94c57eb213 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -46,10 +46,16 @@
46 46
47#if defined(MPC_FIXED_POINT) 47#if defined(MPC_FIXED_POINT)
48 #if defined(CPU_ARM) 48 #if defined(CPU_ARM)
49 // do not up-scale D-values to achieve higher speed in smull/mlal 49 #if ARM_ARCH >= 6
50 // operations. saves ~14/8 = 1.75 cycles per multiplication 50 // on ARMv6 we use 32*32=64>>32 multiplies (smmul/smmla) so we need to scale up the D coefficients
51 #define D(value) (value) 51 // the ARM11 multiplier doesn't have early termination so the magnitude of the multiplicands does not
52 52 // matter for speed.
53 #define D(value) (value << (14))
54 #else
55 // do not up-scale D-values to achieve higher speed in smull/mlal
56 // operations. saves ~14/8 = 1.75 cycles per multiplication
57 #define D(value) (value)
58 #endif
53 // in this configuration a post-shift by >>16 is needed after synthesis 59 // in this configuration a post-shift by >>16 is needed after synthesis
54 #else 60 #else
55 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 61 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index b44e029a43..598f218e45 100644
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -296,7 +296,7 @@ mpc_decoder_windowing_D:
296 add r1, r1, #4 /* V++ */ 296 add r1, r1, #4 /* V++ */
297 297
298 ldmpc regs=r4-r11 298 ldmpc regs=r4-r11
299#else /* arm9 and above */ 299#elif ARM_ARCH < 6 /* arm9 and above */
300 mpc_decoder_windowing_D: 300 mpc_decoder_windowing_D:
301 /* r0 = Data[] */ 301 /* r0 = Data[] */
302 /* r1 = V[] */ 302 /* r1 = V[] */
@@ -501,6 +501,200 @@ mpc_decoder_windowing_D:
501 add r1, r1, #4 /* V++ */ 501 add r1, r1, #4 /* V++ */
502 502
503 ldmpc regs=r4-r11 503 ldmpc regs=r4-r11
504#else
505 mpc_decoder_windowing_D:
506 /* r0 = Data[] (output pointer, advanced by the post-indexed/explicit adds below) */
507 /* r1 = V[] */
508 /* r2 = D[] (window coefficients, consumed sequentially via ldmia) */
509 /* lr = loop counter, also used as byte offset for the mirrored store in the rows 01..15 loop */
510 /************************************************************************
511 * Further speed up through making use of symmetries within D[]-window.
512 * The row V[00] can be extracted as it has symmetries within this single
513 * row. 8 multiply-accumulates and 8 ldr's can be saved at the cost of 2 add's.
514 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
515 * saved at the cost of 15 x 4 + 1 add's.
516 * The row V[16] can be extracted as it has symmetries within this single
517 * row. 8 multiply-accumulates and 8 ldr's can be saved.
518 * On armv6 use smmulr/smmlar, which are faster than smull/smlal and only
519 * accumulate the top 32 bits of the result. That frees up 2
520 * registers, so we can ldm larger blocks.
521 ***********************************************************************/
522 stmfd sp!, {r4-r11, lr} /* push r4-r11 and lr; presumably popped again by the ldmpc macro at the end */
523
524 /******************************************
525 * row 0 with internal symmetry
526 *****************************************/
527 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
528 ldmia r2!, { r3-r6 } /* load D[01..04] */
529 ldr r7 , [r1, #96*4] /* 1 */
530 ldr r10, [r1, #992*4] /* 15 */
531 ldr r11, [r1, #128*4] /* 2 */
532 rsb r10, r10, r7 /* V[01] - V[15] */
533 ldr r12, [r1, #896*4] /* 14 */
534 smmulr r9, r10, r3 /* r9 = start of accumulator: top 32 bits (rounded) of the product */
535 ldr r7 , [r1, #224*4] /* 3 */
536 add r12, r12, r11 /* V[02] + V[14] */
537 ldr r10, [r1, #864*4] /* 13 */
538 smmlar r9, r12, r4, r9
539 ldr r11, [r1, #256*4] /* 4 */
540 rsb r10, r10, r7 /* V[03] - V[13] */
541 ldr r12, [r1, #768*4] /* 12 */
542 smmlar r9, r10, r5, r9
543 ldr r7 , [r1, #352*4] /* 5 */
544 add r12, r12, r11 /* V[04] + V[12] */
545 ldr r10, [r1, #736*4] /* 11 */
546 smmlar r9, r12, r6, r9
547 ldmia r2!, { r3-r6 } /* load D[05..08] */
548 ldr r11, [r1, #384*4] /* 6 */
549 rsb r10, r10, r7 /* V[05] - V[11] */
550 ldr r12, [r1, #640*4] /* 10 */
551 smmlar r9, r10, r3, r9
552 ldr r7 , [r1, #480*4] /* 7 */
553 add r12, r12, r11 /* V[06] + V[10] */
554 ldr r10, [r1, #608*4] /* 9 */
555 smmlar r9, r12, r4, r9
556 rsb r10, r10, r7 /* V[07] - V[09] */
557 ldr r11, [r1, #512*4] /* 8 */
558 smmlar r9, r10, r5, r9
559 add r1, r1, #4 /* V+=1, r1 = V[01] */
560 smmlar r9, r11, r6, r9
561 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
562 mov r9, r9, lsl #2 /* post-scale: D is pre-scaled by <<14 for ARMv6 and smmul/smmla keep >>32, so <<2 presumably yields the net >>16 the C side expects */
563 str r9, [r0], #4 /* store Data */
564
565 /******************************************
566 * rows 01..15 are symmetric to rows 31..17
567 * r9 = acc of 01..15
568 * r1 = V[01..15]
569 * r11 = acc of 31..17
570 * r12 = V[31..17]
571 *****************************************/
572 mov lr, #15*8 /* 15 iterations, 8 per step: also the byte distance from Data[n] to Data[32-n] */
573 add r12, r1, #30*4 /* r12 = V[31] */
574.loop15:
575 ldmia r2!, { r3-r6 } /* load D[00..03] */
576 ldr r7, [r12, #896*4] /* 14 */
577 ldr r8, [r12, #992*4] /* 15 */
578 smmulr r11, r7, r4
579 ldr r7, [r1] /* 0 */
580 smmlar r11, r8, r3, r11
581 ldr r8, [r1, #96*4] /* 1 */
582 smmulr r9, r7, r3
583 ldr r7, [r12, #768*4] /* 12 */
584 smmlar r9, r8, r4, r9
585 ldr r8, [r12, #864*4] /* 13 */
586 smmlar r11, r7, r6, r11
587 ldr r7, [r1, #128*4] /* 2 */
588 smmlar r11, r8, r5, r11
589 ldr r8, [r1, #224*4] /* 3 */
590 smmlar r9, r7, r5, r9
591 ldr r7, [r1, #256*4] /* 4 */
592 smmlar r9, r8, r6, r9
593 ldmia r2!, { r3-r6 } /* load D[04..07] */
594 ldr r8, [r1, #352*4] /* 5 */
595 smmlar r9, r7, r3, r9
596 ldr r7, [r12, #640*4] /* 10 */
597 smmlar r9, r8, r4, r9
598 ldr r8, [r12, #736*4] /* 11 */
599 smmlar r11, r7, r4, r11
600 ldr r7, [r1, #384*4] /* 6 */
601 smmlar r11, r8, r3, r11
602 ldr r8, [r1, #480*4] /* 7 */
603 smmlar r9, r7, r5, r9
604 ldr r7, [r12, #512*4] /* 8 */
605 smmlar r9, r8, r6, r9
606 ldr r8, [r12, #608*4] /* 9 */
607 smmlar r11, r7, r6, r11
608 ldr r7, [r12, #384*4] /* 6 */
609 smmlar r11, r8, r5, r11
610 ldmia r2!, { r3-r6 } /* load D[08..11] */
611 ldr r8, [r12, #480*4] /* 7 */
612 smmlar r11, r7, r4, r11
613 ldr r7, [r1, #512*4] /* 8 */
614 smmlar r11, r8, r3, r11
615 ldr r8, [r1, #608*4] /* 9 */
616 smmlar r9, r7, r3, r9
617 ldr r7, [r1, #640*4] /* 10 */
618 smmlar r9, r8, r4, r9
619 ldr r8, [r1, #736*4] /* 11 */
620 smmlar r9, r7, r5, r9
621 ldr r7, [r12, #256*4] /* 4 */
622 smmlar r9, r8, r6, r9
623 ldr r8, [r12, #352*4] /* 5 */
624 smmlar r11, r7, r6, r11
625 ldr r7, [r1, #768*4] /* 12 */
626 smmlar r11, r8, r5, r11
627 ldmia r2!, { r3-r6 } /* load D[12..15] */
628 ldr r8, [r1, #864*4] /* 13 */
629 smmlar r9, r7, r3, r9
630 ldr r7, [r12, #128*4] /* 2 */
631 smmlar r9, r8, r4, r9
632 ldr r8, [r12, #224*4] /* 3 */
633 smmlar r11, r7, r4, r11
634 ldr r7, [r12] /* 0 */
635 smmlar r11, r8, r3, r11
636 ldr r8, [r12, #96*4] /* 1 */
637 smmlar r11, r7, r6, r11
638 ldr r7, [r1, #896*4] /* 14 */
639 smmlar r11, r8, r5, r11
640 ldr r8, [r1, #992*4] /* 15 */
641 smmlar r9, r7, r5, r9
642 sub r12, r12, #4 /* r12 = V-- correct addresses for next loop */
643 smmlar r9, r8, r6, r9
644 add r1, r1, #4 /* r1 = V++ correct addresses for next loop */
645 rsb r11, r11, #0 /* r11 = -r11 */
646 /* store Data[01..15] */
647 mov r9, r9, lsl #2 /* same <<2 post-scale as for row 0 */
648 str r9, [r0] /* store Data */
649 /* store Data[31..17] */
650 mov r11, r11, lsl #2 /* same <<2 post-scale as for row 0 */
651 str r11, [r0, lr] /* store Data at the mirrored position, lr bytes above r0 */
652 add r0, r0, #4 /* r0++ */
653 /* next loop */
654 subs lr, lr, #8 /* offset shrinks by 8 per row because r0 also advanced by 4 */
655 bgt .loop15
656
657 /******************************************
658 * V[16] with internal symmetry
659 *****************************************/
660 ldmia r2!, { r3-r6 } /* load D[00..03] */
661 ldr r7 , [r1] /* 0 */
662 ldr r10, [r1, #992*4] /* 15 */
663 ldr r11, [r1, #96*4] /* 1 */
664 rsb r10, r10, r7 /* V[00] - V[15] */
665 ldr r12, [r1, #896*4] /* 14 */
666 smmulr r9, r10, r3
667 ldr r7 , [r1, #128*4] /* 2 */
668 rsb r12, r12, r11 /* V[01] - V[14] */
669 ldr r10, [r1, #864*4] /* 13 */
670 smmlar r9, r12, r4, r9
671 ldr r11, [r1, #224*4] /* 3 */
672 rsb r10, r10, r7 /* V[02] - V[13] */
673 ldr r12, [r1, #768*4] /* 12 */
674 smmlar r9, r10, r5, r9
675 ldr r7 , [r1, #256*4] /* 4 */
676 rsb r12, r12, r11 /* V[03] - V[12] */
677 ldr r10, [r1, #736*4] /* 11 */
678 smmlar r9, r12, r6, r9
679 ldmia r2!, { r3-r6 } /* load D[04..07] */
680 ldr r11, [r1, #352*4] /* 5 */
681 rsb r10, r10, r7 /* V[04] - V[11] */
682 ldr r12, [r1, #640*4] /* 10 */
683 smmlar r9, r10, r3, r9
684 ldr r7 , [r1, #384*4] /* 6 */
685 rsb r12, r12, r11 /* V[05] - V[10] */
686 ldr r10, [r1, #608*4] /* 9 */
687 smmlar r9, r12, r4, r9
688 ldr r11, [r1, #480*4] /* 7 */
689 rsb r10, r10, r7 /* V[06] - V[09] */
690 ldr r12, [r1, #512*4] /* 8 */
691 smmlar r9, r10, r5, r9
692 rsb r12, r12, r11 /* V[07] - V[08] */
693 smmlar r9, r12, r6, r9
694 mov r9, r9, lsl #2 /* same <<2 post-scale as for row 0 */
695 str r9, [r0], #4 /* store Data */
696
697 ldmpc regs=r4-r11
504#endif 698#endif
505.mpc_dewindowing_end: 699.mpc_dewindowing_end:
506 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D 700 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D