summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Sevakis <jethead71@rockbox.org> 2010-05-02 02:44:45 +0000
committer Michael Sevakis <jethead71@rockbox.org> 2010-05-02 02:44:45 +0000
commit 9f157ad584f50402e44a8055c2dd42b9839d9f2f (patch)
tree 836772d5973a0782d1a1dd4345e5508234b92fe6
parent 3adac47c6160f62e990e5704151ea73d44791f71 (diff)
download rockbox-9f157ad584f50402e44a8055c2dd42b9839d9f2f.tar.gz
rockbox-9f157ad584f50402e44a8055c2dd42b9839d9f2f.zip
Do some SPC codec optimizing for ARMv6 (as a training exercise), tweak realtime BRR for all CPUs that use it, add Gaussian ASM optimization for all ARM targets that can use it. Add some LIKELY/UNLIKELY branch hints. On the Gigabeat S this gives a +22% speedup. For the Gigabeat F, about a +5% speedup. For less-powerful players, no real change aside possibly from the branch hints.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25771 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libspc/spc_codec.h32
-rw-r--r--apps/codecs/libspc/spc_dsp.c510
2 files changed, 452 insertions, 90 deletions
diff --git a/apps/codecs/libspc/spc_codec.h b/apps/codecs/libspc/spc_codec.h
index cf72f90af4..95d09fa091 100644
--- a/apps/codecs/libspc/spc_codec.h
+++ b/apps/codecs/libspc/spc_codec.h
@@ -37,6 +37,10 @@
37 37
38/** Basic configuration options **/ 38/** Basic configuration options **/
39 39
40#ifndef ARM_ARCH
41#define ARM_ARCH 0
42#endif
43
40#define SPC_DUAL_CORE 1 44#define SPC_DUAL_CORE 1
41 45
42#if !defined(SPC_DUAL_CORE) || NUM_CORES == 1 46#if !defined(SPC_DUAL_CORE) || NUM_CORES == 1
@@ -293,6 +297,15 @@ enum
293 FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) 297 FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1))
294}; 298};
295#elif defined (CPU_ARM) 299#elif defined (CPU_ARM)
300#if ARM_ARCH >= 6
301enum
302{
303 FIR_BUF_CNT = FIR_BUF_HALF * 2,
304 FIR_BUF_SIZE = FIR_BUF_CNT * sizeof ( int32_t ),
305 FIR_BUF_ALIGN = FIR_BUF_SIZE,
306 FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1))
307};
308#else
296enum 309enum
297{ 310{
298 FIR_BUF_CNT = FIR_BUF_HALF * 2 * 2, 311 FIR_BUF_CNT = FIR_BUF_HALF * 2 * 2,
@@ -300,6 +313,7 @@ enum
300 FIR_BUF_ALIGN = FIR_BUF_SIZE, 313 FIR_BUF_ALIGN = FIR_BUF_SIZE,
301 FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1)) 314 FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1))
302}; 315};
316#endif /* ARM_ARCH */
303#endif /* CPU_* */ 317#endif /* CPU_* */
304 318
305struct Spc_Dsp 319struct Spc_Dsp
@@ -318,7 +332,8 @@ struct Spc_Dsp
318 uint16_t noise; /* also read as int16_t */ 332 uint16_t noise; /* also read as int16_t */
319 333
320#if defined(CPU_COLDFIRE) 334#if defined(CPU_COLDFIRE)
321 /* circularly hardware masked address */ 335 /* FIR history is interleaved. Hardware handles wrapping by mask.
336 * |LR|LR|LR|LR|LR|LR|LR|LR| */
322 int32_t *fir_ptr; 337 int32_t *fir_ptr;
323 /* wrapped address just behind current position - 338 /* wrapped address just behind current position -
324 allows mac.w to increment and mask fir_ptr */ 339 allows mac.w to increment and mask fir_ptr */
@@ -328,9 +343,22 @@ struct Spc_Dsp
328#elif defined (CPU_ARM) 343#elif defined (CPU_ARM)
329 /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ 344 /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
330 int32_t *fir_ptr; 345 int32_t *fir_ptr;
346#if ARM_ARCH >= 6
347 /* FIR history is interleaved with guard to eliminate wrap checking
348 * when convolving.
349 * |LR|LR|LR|LR|LR|LR|LR|LR|--|--|--|--|--|--|--|--| */
350 /* copy of echo FIR constants as int16_t, loaded as int32 for
351 * halfword, packed multiplies */
352 int16_t fir_coeff [VOICE_COUNT];
353#else
354 /* FIR history is interleaved with guard to eliminate wrap checking
355 * when convolving.
356 * |LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|...
357 * |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--| */
331 /* copy of echo FIR constants as int32_t, for faster access */ 358 /* copy of echo FIR constants as int32_t, for faster access */
332 int32_t fir_coeff [VOICE_COUNT]; 359 int32_t fir_coeff [VOICE_COUNT];
333#else 360#endif /* ARM_ARCH */
361#else /* Unoptimized CPU */
334 /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ 362 /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
335 int fir_pos; /* (0 to 7) */ 363 int fir_pos; /* (0 to 7) */
336 int fir_buf [FIR_BUF_HALF * 2] [2]; 364 int fir_buf [FIR_BUF_HALF * 2] [2];
diff --git a/apps/codecs/libspc/spc_dsp.c b/apps/codecs/libspc/spc_dsp.c
index 5ea6514783..0d07e5f04e 100644
--- a/apps/codecs/libspc/spc_dsp.c
+++ b/apps/codecs/libspc/spc_dsp.c
@@ -57,6 +57,16 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
57 } 57 }
58} 58}
59 59
60#if ARM_ARCH >= 6
61/* if ( n < -32768 ) out = -32768; */
62/* if ( n > 32767 ) out = 32767; */
63#define CLAMP16( n ) \
64 ({ \
65 asm ("ssat %0, #16, %1" \
66 : "=r" ( n ) : "r"( n ) ); \
67 n; \
68 })
69#else
60/* if ( n < -32768 ) out = -32768; */ 70/* if ( n < -32768 ) out = -32768; */
61/* if ( n > 32767 ) out = 32767; */ 71/* if ( n > 32767 ) out = 32767; */
62#define CLAMP16( n ) \ 72#define CLAMP16( n ) \
@@ -65,6 +75,7 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
65 n = 0x7FFF ^ (n >> 31); \ 75 n = 0x7FFF ^ (n >> 31); \
66 n; \ 76 n; \
67}) 77})
78#endif
68 79
69#if SPC_BRRCACHE 80#if SPC_BRRCACHE
70static void decode_brr( struct Spc_Dsp* this, unsigned start_addr, 81static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
@@ -418,7 +429,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
418 /* Key on events are delayed */ 429 /* Key on events are delayed */
419 int key_on_delay = voice->key_on_delay; 430 int key_on_delay = voice->key_on_delay;
420 431
421 if ( --key_on_delay >= 0 ) /* <1% of the time */ 432 if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */
422 { 433 {
423 key_on(this,voice,sd,raw_voice,key_on_delay,vbit); 434 key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
424 } 435 }
@@ -438,13 +449,13 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
438 int env_mode = voice->env_mode; 449 int env_mode = voice->env_mode;
439 int adsr0 = raw_voice->adsr [0]; 450 int adsr0 = raw_voice->adsr [0];
440 int env_timer; 451 int env_timer;
441 if ( env_mode != state_release ) /* 99% of the time */ 452 if ( LIKELY ( env_mode != state_release ) ) /* 99% of the time */
442 { 453 {
443 env_timer = voice->env_timer; 454 env_timer = voice->env_timer;
444 if ( adsr0 & 0x80 ) /* 79% of the time */ 455 if ( LIKELY ( adsr0 & 0x80 ) ) /* 79% of the time */
445 { 456 {
446 int adsr1 = raw_voice->adsr [1]; 457 int adsr1 = raw_voice->adsr [1];
447 if ( env_mode == state_sustain ) /* 74% of the time */ 458 if ( LIKELY ( env_mode == state_sustain ) ) /* 74% of the time */
448 { 459 {
449 if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 ) 460 if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 )
450 goto write_env_timer; 461 goto write_env_timer;
@@ -607,25 +618,12 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
607 goto skip_decode; 618 goto skip_decode;
608 } 619 }
609 } 620 }
610 621
611 /* header */ 622 /* header */
612 int const block_header = *addr; 623 int const block_header = *addr;
613 addr += 9; 624 addr += 9;
614 voice->addr = addr; 625 voice->addr = addr;
615 voice->block_header = block_header; 626 voice->block_header = block_header;
616 int const filter = (block_header & 0x0C) - 0x08;
617
618 /* scaling (invalid scaling gives -4096 for neg nybble,
619 0 for pos) */
620 static unsigned char const right_shifts [16] = {
621 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 29, 29, 29,
622 };
623 static unsigned char const left_shifts [16] = {
624 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
625 };
626 int const scale = block_header >> 4;
627 int const right_shift = right_shifts [scale];
628 int const left_shift = left_shifts [scale];
629 627
630 /* previous samples */ 628 /* previous samples */
631 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1]; 629 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
@@ -650,54 +648,117 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
650 /* force sample to end on next decode */ 648 /* force sample to end on next decode */
651 voice->block_header = 1; 649 voice->block_header = 1;
652 } 650 }
653 651
654 do /* decode and filter 16 samples */ 652 int const filter = block_header & 0x0c;
653 int const scale = block_header >> 4;
654
655 if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */
655 { 656 {
656 /* Get nybble, sign-extend, then scale 657 /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */
657 get byte, select which nybble, sign-extend, then shift 658 do /* decode and filter 16 samples */
658 based on scaling. also handles invalid scaling values.*/
659 int delta = (int) (int8_t) (addr [offset >> 3] <<
660 (offset & 4)) >> right_shift << left_shift;
661
662 out [offset >> 2] = smp2;
663
664 if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
665 { 659 {
660 /* Get nybble, sign-extend, then scale
661 get byte, select which nybble, sign-extend, then shift
662 based on scaling. */
663 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
664 delta = (delta << scale) >> 1;
665
666 if (scale > 0xc)
667 delta = (delta >> 17) << 11;
668
669 out [offset >> 2] = smp2;
670
666 delta -= smp2 >> 1; 671 delta -= smp2 >> 1;
667 delta += smp2 >> 5; 672 delta += smp2 >> 5;
668 smp2 = smp1;
669 delta += smp1; 673 delta += smp1;
670 delta += (-smp1 - (smp1 >> 1)) >> 5; 674 delta += (-smp1 - (smp1 >> 1)) >> 5;
675
676 delta = CLAMP16( delta );
677 smp2 = smp1;
678 smp1 = (int16_t) (delta * 2); /* sign-extend */
671 } 679 }
672 else 680 while ( (offset += 4) != 0 );
681 }
682 else if ( filter == 0x04 ) /* filter 1 */
683 {
684 /* y[n] = x[n] + 15/16 * y[n-1] */
685 do /* decode and filter 16 samples */
673 { 686 {
674 if ( filter == -4 ) /* mode 0x04 */ 687 /* Get nybble, sign-extend, then scale
675 { 688 get byte, select which nybble, sign-extend, then shift
676 delta += smp1 >> 1; 689 based on scaling. */
677 delta += (-smp1) >> 5; 690 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
678 } 691 delta = (delta << scale) >> 1;
679 else if ( filter > -4 ) /* mode 0x0C */ 692
680 { 693 if (scale > 0xc)
681 delta -= smp2 >> 1; 694 delta = (delta >> 17) << 11;
682 delta += (smp2 + (smp2 >> 1)) >> 4; 695
683 delta += smp1; 696 out [offset >> 2] = smp2;
684 delta += (-smp1 * 13) >> 7; 697
685 } 698 delta += smp1 >> 1;
699 delta += (-smp1) >> 5;
700
701 delta = CLAMP16( delta );
686 smp2 = smp1; 702 smp2 = smp1;
703 smp1 = (int16_t) (delta * 2); /* sign-extend */
687 } 704 }
688 705 while ( (offset += 4) != 0 );
689 delta = CLAMP16( delta );
690 smp1 = (int16_t) (delta * 2); /* sign-extend */
691 } 706 }
692 while ( (offset += 4) != 0 ); 707 else if ( filter == 0x0c ) /* filter 3 */
693 708 {
709 /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */
710 do /* decode and filter 16 samples */
711 {
712 /* Get nybble, sign-extend, then scale
713 get byte, select which nybble, sign-extend, then shift
714 based on scaling. */
715 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
716 delta = (delta << scale) >> 1;
717
718 if (scale > 0xc)
719 delta = (delta >> 17) << 11;
720
721 out [offset >> 2] = smp2;
722
723 delta -= smp2 >> 1;
724 delta += (smp2 + (smp2 >> 1)) >> 4;
725 delta += smp1;
726 delta += (-smp1 * 13) >> 7;
727
728 delta = CLAMP16( delta );
729 smp2 = smp1;
730 smp1 = (int16_t) (delta * 2); /* sign-extend */
731 }
732 while ( (offset += 4) != 0 );
733 }
734 else /* filter 0 */
735 {
736 /* y[n] = x[n] */
737 do /* decode and filter 16 samples */
738 {
739 /* Get nybble, sign-extend, then scale
740 get byte, select which nybble, sign-extend, then shift
741 based on scaling. */
742 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
743 delta = (delta << scale) >> 1;
744
745 if (scale > 0xc)
746 delta = (delta >> 17) << 11;
747
748 out [offset >> 2] = smp2;
749
750 smp2 = smp1;
751 smp1 = delta * 2;
752 }
753 while ( (offset += 4) != 0 );
754 }
755
694 out [0] = smp2; 756 out [0] = smp2;
695 out [1] = smp1; 757 out [1] = smp1;
696 758
697 skip_decode:; 759 skip_decode:;
698 } 760 }
699 #endif 761 #endif /* !SPC_BRRCACHE */
700
701 /* Get rate (with possible modulation) */ 762 /* Get rate (with possible modulation) */
702 int rate = VOICE_RATE(vr); 763 int rate = VOICE_RATE(vr);
703 if ( this->r.g.pitch_mods & vbit ) 764 if ( this->r.g.pitch_mods & vbit )
@@ -754,13 +815,87 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
754 815
755 /* Use faster gaussian interpolation when exact result isn't needed 816 /* Use faster gaussian interpolation when exact result isn't needed
756 by pitch modulator of next channel */ 817 by pitch modulator of next channel */
757 int amp_0, amp_1; 818 int amp_0, amp_1; /* Also serve as temps _0 and _1 */
758 if ( !(slow_gaussian & vbit) ) /* 99% of the time */ 819 if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */
759 { 820 {
760 /* Main optimization is lack of clamping. Not a problem since 821 /* Main optimization is lack of clamping. Not a problem since
761 output never goes more than +/- 16 outside 16-bit range and 822 output never goes more than +/- 16 outside 16-bit range and
762 things are clamped later anyway. Other optimization is to 823 things are clamped later anyway. Other optimization is to
763 preserve fractional accuracy, eliminating several masks. */ 824 preserve fractional accuracy, eliminating several masks. */
825 #if defined (CPU_ARM)
826 int output;
827 int _2, _3; /* All-purpose temps */
828 /* Multiple ASM blocks keep regs free and reduce result
829 * latency issues. */
830 #if ARM_ARCH >= 6
831 /* Interpolate */
832 asm volatile (
833 "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */
834 "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */
835 "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */
836 "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */
837 "smuad %[out], %[_0], %[_2] \r\n" /* out=f0*i0 + f1*i1 */
838 "smladx %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */
839 : [out]"=&r"(output),
840 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
841 [_2]"=&r"(_2), [_3]"=&r"(_3)
842 : [fwd]"r"(fwd), [rev]"r"(rev),
843 [interp]"r"(interp));
844 /* Apply voice envelope */
845 asm volatile (
846 "mov %[_2], %[out], asr #(11-5) \r\n" /* To do >> 16 later */
847 "mul %[out], %[_2], %[envx] \r\n" /* and avoid exp. shift */
848 : [out]"+r"(output), [_2]"=&r"(_2)
849 : [envx]"r"((int)voice->envx));
850 /* Apply left and right volume */
851 asm volatile (
852 "smulwb %[amp_0], %[out], %[vvol_0] \r\n" /* (32x16->48)[47:16]->[31:0] */
853 "smulwb %[amp_1], %[out], %[vvol_1] \r\n"
854 : [out]"+r"(output),
855 [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
856 : [vvol_0]"r"(voice->volume[0]),
857 [vvol_1]"r"(voice->volume[1]));
858
859 raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */
860 #else /* ARM_ARCH < 6 */
861 /* Perform gaussian interpolation on four samples */
862 asm volatile (
863 "ldrsh %[_0], [%[interp]] \r\n"
864 "ldrsh %[_2], [%[fwd]] \r\n"
865 "ldrsh %[_1], [%[interp], #2] \r\n"
866 "ldrsh %[_3], [%[fwd], #2] \r\n"
867 "mul %[out], %[_0], %[_2] \r\n" /* out= fwd[0]*interp[0] */
868 "ldrsh %[_0], [%[interp], #4] \r\n"
869 "ldrsh %[_2], [%[rev], #2] \r\n"
870 "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */
871 "ldrsh %[_1], [%[interp], #6] \r\n"
872 "ldrsh %[_3], [%[rev]] \r\n"
873 "mla %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */
874 "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */
875 : [out]"=&r"(output),
876 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
877 [_2]"=&r"(_2), [_3]"=&r"(_3)
878 : [fwd]"r"(fwd), [rev]"r"(rev),
879 [interp]"r"(interp));
880 /* Apply voice envelope */
881 asm volatile (
882 "mov %[_2], %[out], asr #11 \r\n"
883 "mul %[out], %[_2], %[envx] \r\n"
884 : [out]"+r"(output), [_2]"=&r"(_2)
885 : [envx]"r"((int)voice->envx));
886 /* Reduce and apply left and right volume */
887 asm volatile (
888 "mov %[out], %[out], asr #11 \r\n"
889 "mul %[amp_0], %[out], %[vvol_0] \r\n"
890 "mul %[amp_1], %[out], %[vvol_1] \r\n"
891 : [out]"+r"(output),
892 [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
893 : [vvol_0]"r"((int)voice->volume[0]),
894 [vvol_1]"r"((int)voice->volume[1]));
895
896 raw_voice->outx = output >> 8;
897 #endif /* ARM_ARCH */
898 #else /* Unoptimized CPU */
764 int output = (((fwd [0] * interp [0] + 899 int output = (((fwd [0] * interp [0] +
765 fwd [1] * interp [1] + 900 fwd [1] * interp [1] +
766 rev [1] * interp [2] + 901 rev [1] * interp [2] +
@@ -769,11 +904,121 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
769 /* duplicated here to give compiler more to run in parallel */ 904 /* duplicated here to give compiler more to run in parallel */
770 amp_0 = voice->volume [0] * output; 905 amp_0 = voice->volume [0] * output;
771 amp_1 = voice->volume [1] * output; 906 amp_1 = voice->volume [1] * output;
907
772 raw_voice->outx = output >> 8; 908 raw_voice->outx = output >> 8;
909 #endif /* CPU_* */
773 } 910 }
774 else 911 else /* slow gaussian */
775 { 912 {
913 #if defined(CPU_ARM)
914 #if ARM_ARCH >= 6
915 int output = *(int16_t*) &this->noise;
916
917 if ( !(this->r.g.noise_enables & vbit) )
918 {
919 /* Interpolate */
920 int _2, _3;
921 asm volatile (
922 /* NOTE: often-unaligned accesses */
923 "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */
924 "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */
925 "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */
926 "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */
927 "smulbb %[out], %[_0], %[_2] \r\n" /* out=f0*i0 */
928 "smultt %[_0], %[_0], %[_2] \r\n" /* _0=f1*i1 */
929 "smulbt %[_2], %[_1], %[_3] \r\n" /* _2=r1*i2 */
930 "smultb %[_3], %[_1], %[_3] \r\n" /* _3=r0*i3 */
931 : [out]"=r"(output),
932 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
933 [_2]"=&r"(_2), [_3]"=&r"(_3)
934 : [fwd]"r"(fwd), [rev]"r"(rev),
935 [interp]"r"(interp));
936 asm volatile (
937 "mov %[out], %[out], asr#12 \r\n"
938 "add %[_0], %[out], %[_0], asr #12 \r\n"
939 "add %[_2], %[_0], %[_2], asr #12 \r\n"
940 "pkhbt %[_0], %[_2], %[_3], asl #4 \r\n" /* _3[31:16], _2[15:0] */
941 "sadd16 %[_0], %[_0], %[_0] \r\n" /* _3[31:16]*2, _2[15:0]*2 */
942 "qsubaddx %[out], %[_0], %[_0] \r\n" /* out[15:0]=
943 * sat16(_3[31:16]+_2[15:0]) */
944 : [out]"+r"(output),
945 [_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3));
946 }
947 /* Apply voice envelope */
948 asm volatile (
949 "smulbb %[out], %[out], %[envx] \r\n"
950 : [out]"+r"(output)
951 : [envx]"r"(voice->envx));
952 /* Reduce and apply left and right volume */
953 asm volatile (
954 "mov %[out], %[out], asr #11 \r\n"
955 "bic %[out], %[out], #0x1 \r\n"
956 "mul %[amp_0], %[out], %[vvol_0] \r\n"
957 "mul %[amp_1], %[out], %[vvol_1] \r\n"
958 : [out]"+r"(output),
959 [amp_1]"=r"(amp_1), [amp_0]"=r"(amp_0)
960 : [vvol_0]"r"((int)voice->volume[0]),
961 [vvol_1]"r"((int)voice->volume[1]));
962
963 prev_outx = output;
964 raw_voice->outx = output >> 8;
965 #else /* ARM_ARCH < 6 */
966 int output = *(int16_t*) &this->noise;
967
968 if ( !(this->r.g.noise_enables & vbit) )
969 {
970 /* Interpolate */
971 int _2, _3;
972 asm volatile (
973 "ldrsh %[_0], [%[interp]] \r\n"
974 "ldrsh %[_2], [%[fwd]] \r\n"
975 "ldrsh %[_1], [%[interp], #2] \r\n"
976 "ldrsh %[_3], [%[fwd], #2] \r\n"
977 "mul %[out], %[_2], %[_0] \r\n" /* fwd[0]*interp[0] */
978 "ldrsh %[_2], [%[rev], #2] \r\n"
979 "mul %[_0], %[_3], %[_1] \r\n" /* fwd[1]*interp[1] */
980 "ldrsh %[_1], [%[interp], #4] \r\n"
981 "mov %[out], %[out], asr #12 \r\n"
982 "ldrsh %[_3], [%[rev]] \r\n"
983 "mul %[_2], %[_1], %[_2] \r\n" /* rev[1]*interp[2] */
984 "ldrsh %[_1], [%[interp], #6] \r\n"
985 "add %[_0], %[out], %[_0], asr #12 \r\n"
986 "mul %[_3], %[_1], %[_3] \r\n" /* rev[0]*interp[3] */
987 "add %[_2], %[_0], %[_2], asr #12 \r\n"
988 "mov %[_2], %[_2], lsl #17 \r\n"
989 "mov %[_3], %[_3], asr #12 \r\n"
990 "mov %[_3], %[_3], asl #1 \r\n"
991 "add %[out], %[_3], %[_2], asr #16 \r\n"
992 : [out]"=r"(output),
993 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
994 [_2]"=&r"(_2), [_3]"=&r"(_3)
995 : [fwd]"r"(fwd), [rev]"r"(rev),
996 [interp]"r"(interp));
997
998 output = CLAMP16(output);
999 }
1000 /* Apply voice envelope */
1001 asm volatile (
1002 "mul %[_0], %[out], %[envx] \r\n"
1003 : [_0]"=r"(amp_0)
1004 : [out]"r"(output), [envx]"r"((int)voice->envx));
1005 /* Reduce and apply left and right volume */
1006 asm volatile (
1007 "mov %[out], %[amp_0], asr #11 \r\n" /* amp_0 = _0 */
1008 "bic %[out], %[out], #0x1 \r\n"
1009 "mul %[amp_0], %[out], %[vvol_0] \r\n"
1010 "mul %[amp_1], %[out], %[vvol_1] \r\n"
1011 : [out]"+r"(output), [amp_0]"+r"(amp_0),
1012 [amp_1]"=r"(amp_1)
1013 : [vvol_0]"r"((int)voice->volume[0]),
1014 [vvol_1]"r"((int)voice->volume[1]));
1015
1016 prev_outx = output;
1017 raw_voice->outx = output >> 8;
1018 #endif /* ARM_ARCH >= 6 */
1019 #else /* Unoptimized CPU */
776 int output = *(int16_t*) &this->noise; 1020 int output = *(int16_t*) &this->noise;
1021
777 if ( !(this->r.g.noise_enables & vbit) ) 1022 if ( !(this->r.g.noise_enables & vbit) )
778 { 1023 {
779 output = (fwd [0] * interp [0]) & ~0xFFF; 1024 output = (fwd [0] * interp [0]) & ~0xFFF;
@@ -788,8 +1033,10 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
788 /* duplicated here to give compiler more to run in parallel */ 1033 /* duplicated here to give compiler more to run in parallel */
789 amp_0 = voice->volume [0] * output; 1034 amp_0 = voice->volume [0] * output;
790 amp_1 = voice->volume [1] * output; 1035 amp_1 = voice->volume [1] * output;
1036
791 prev_outx = output; 1037 prev_outx = output;
792 raw_voice->outx = (int8_t) (output >> 8); 1038 raw_voice->outx = output >> 8;
1039 #endif /* CPU_* */
793 } 1040 }
794 #else /* SPCNOINTERP */ 1041 #else /* SPCNOINTERP */
795 /* two-point linear interpolation */ 1042 /* two-point linear interpolation */
@@ -826,16 +1073,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
826 "asr.l %[sh], %[y1] \r\n" 1073 "asr.l %[sh], %[y1] \r\n"
827 "add.l %[y0], %[y1] \r\n" 1074 "add.l %[y0], %[y1] \r\n"
828 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0) 1075 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
829 : [s]"a"(voice->samples), [sh]"d"(12) 1076 : [s]"a"(voice->samples), [sh]"d"(12));
830 );
831 } 1077 }
832 1078
833 /* apply voice envelope to output */ 1079 /* apply voice envelope to output */
834 asm volatile ( 1080 asm volatile (
835 "mac.w %[output]l, %[envx]l, %%acc0 \r\n" 1081 "mac.w %[out]l, %[envx]l, %%acc0 \r\n"
836 : 1082 :
837 : [output]"r"(amp_0), [envx]"r"(voice->envx) 1083 : [out]"r"(amp_0), [envx]"r"(voice->envx));
838 );
839 1084
840 /* advance voice position */ 1085 /* advance voice position */
841 voice->position += rate; 1086 voice->position += rate;
@@ -843,15 +1088,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
843 /* fetch output, scale and apply left and right 1088 /* fetch output, scale and apply left and right
844 voice volume */ 1089 voice volume */
845 asm volatile ( 1090 asm volatile (
846 "movclr.l %%acc0, %[output] \r\n" 1091 "movclr.l %%acc0, %[out] \r\n"
847 "asr.l %[sh], %[output] \r\n" 1092 "asr.l %[sh], %[out] \r\n"
848 "mac.l %[vvol_0], %[output], %%acc0 \r\n" 1093 "mac.l %[vvol_0], %[out], %%acc0 \r\n"
849 "mac.l %[vvol_1], %[output], %%acc1 \r\n" 1094 "mac.l %[vvol_1], %[out], %%acc1 \r\n"
850 : [output]"=&d"(amp_0) 1095 : [out]"=&d"(amp_0)
851 : [vvol_0]"r"((int)voice->volume[0]), 1096 : [vvol_0]"r"((int)voice->volume[0]),
852 [vvol_1]"r"((int)voice->volume[1]), 1097 [vvol_1]"r"((int)voice->volume[1]),
853 [sh]"d"(11) 1098 [sh]"d"(11));
854 );
855 1099
856 /* save this output into previous, scale and save in 1100 /* save this output into previous, scale and save in
857 output register */ 1101 output register */
@@ -862,14 +1106,16 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
862 asm volatile ( 1106 asm volatile (
863 "movclr.l %%acc0, %[amp_0] \r\n" 1107 "movclr.l %%acc0, %[amp_0] \r\n"
864 "movclr.l %%acc1, %[amp_1] \r\n" 1108 "movclr.l %%acc1, %[amp_1] \r\n"
865 : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) 1109 : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1));
866 );
867 #elif defined (CPU_ARM) 1110 #elif defined (CPU_ARM)
868 int amp_0, amp_1; 1111 int amp_0, amp_1;
869 1112
870 if ( (this->r.g.noise_enables & vbit) != 0 ) { 1113 if ( (this->r.g.noise_enables & vbit) != 0 )
1114 {
871 amp_0 = *(int16_t *)&this->noise; 1115 amp_0 = *(int16_t *)&this->noise;
872 } else { 1116 }
1117 else
1118 {
873 uint32_t f = voice->position; 1119 uint32_t f = voice->position;
874 amp_0 = (uint32_t)voice->samples; 1120 amp_0 = (uint32_t)voice->samples;
875 1121
@@ -882,8 +1128,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
882 "sub %[y1], %[y1], %[y0] \r\n" 1128 "sub %[y1], %[y1], %[y0] \r\n"
883 "mul %[f], %[y1], %[f] \r\n" 1129 "mul %[f], %[y1], %[f] \r\n"
884 "add %[y0], %[y0], %[f], asr #12 \r\n" 1130 "add %[y0], %[y0], %[f], asr #12 \r\n"
885 : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1) 1131 : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1));
886 );
887 } 1132 }
888 1133
889 voice->position += rate; 1134 voice->position += rate;
@@ -893,8 +1138,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
893 "mov %[amp_0], %[amp_1], asr #11 \r\n" 1138 "mov %[amp_0], %[amp_1], asr #11 \r\n"
894 "mov %[amp_1], %[amp_0], asr #8 \r\n" 1139 "mov %[amp_1], %[amp_0], asr #8 \r\n"
895 : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1) 1140 : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
896 : [envx]"r"(voice->envx) 1141 : [envx]"r"(voice->envx));
897 );
898 1142
899 prev_outx = amp_0; 1143 prev_outx = amp_0;
900 raw_voice->outx = (int8_t)amp_1; 1144 raw_voice->outx = (int8_t)amp_1;
@@ -904,8 +1148,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
904 "mul %[amp_0], %[vol_0], %[amp_0] \r\n" 1148 "mul %[amp_0], %[vol_0], %[amp_0] \r\n"
905 : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) 1149 : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
906 : [vol_0]"r"((int)voice->volume[0]), 1150 : [vol_0]"r"((int)voice->volume[0]),
907 [vol_1]"r"((int)voice->volume[1]) 1151 [vol_1]"r"((int)voice->volume[1]));
908 );
909 #else /* Unoptimized CPU */ 1152 #else /* Unoptimized CPU */
910 int output; 1153 int output;
911 1154
@@ -1089,25 +1332,116 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
1089 echo_pos = 0; 1332 echo_pos = 0;
1090 this->echo_pos = echo_pos; 1333 this->echo_pos = echo_pos;
1091 1334
1335 #if ARM_ARCH >= 6
1336 int32_t *fir_ptr, *fir_coeff;
1337 int fb_0, fb_1;
1338
1339 /* Apply FIR */
1340 fb_0 = *(uint32_t *)echo_ptr;
1341
1342 /* Keep last 8 samples */
1343 asm volatile (
1344 "add %[fir_p], %[t_fir_p], #4 \r\n"
1345 "bic %[t_fir_p], %[fir_p], %[mask] \r\n"
1346 "str %[fb_0], [%[fir_p], #-4] \r\n"
1347 /* duplicate at +8 eliminates wrap checking below */
1348 "str %[fb_0], [%[fir_p], #28] \r\n"
1349 : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
1350 : [fb_0]"r"(fb_0), [mask]"i"(~FIR_BUF_MASK));
1351
1352 fir_coeff = (int32_t *)this->fir_coeff;
1353
1354 /* Fugly, but the best version found. */
1355 int _0;
1356 asm volatile ( /* L0R0 = acc0 */
1357 "ldmia %[fir_p]!, { r2-r5 } \r\n" /* L1R1-L4R4 = r2-r5 */
1358 "ldmia %[fir_c]!, { r0-r1 } \r\n" /* C0C1-C2C3 = r0-r1 */
1359 "pkhbt %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */
1360 "pkhtb r2, r2, %[acc0], asr #16 \r\n"
1361 "smuad %[acc0], %[_0], r0 \r\n" /* acc0=L0*C0+L1*C1 */
1362 "smuad %[acc1], r2, r0 \r\n" /* acc1=R0*C0+R1*C1 */
1363 "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L2R2,L3R3->L2L3,R2R3 */
1364 "pkhtb r4, r4, r3, asr #16 \r\n"
1365 "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3 */
1366 "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R2*C2+R3*C3 */
1367 "ldmia %[fir_p], { r2-r4 } \r\n" /* L5R5-L7R7 = r2-r4 */
1368 "ldmia %[fir_c], { r0-r1 } \r\n" /* C4C5-C6C7 = r0-r1 */
1369 "pkhbt %[_0], r5, r2, asl #16 \r\n" /* L4R4,L5R5->L4L5,R4R5 */
1370 "pkhtb r2, r2, r5, asr #16 \r\n"
1371 "smlad %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5 */
1372 "smlad %[acc1], r2, r0, %[acc1] \r\n" /* acc1+=R4*C4+R5*C5 */
1373 "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L6R6,L7R7->L6L7,R6R7 */
1374 "pkhtb r4, r4, r3, asr #16 \r\n"
1375 "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7 */
1376 "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R6*C6+R7*C7 */
1377 : [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0),
1378 [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1379 :
1380 : "r0", "r1", "r2", "r3", "r4", "r5");
1381
1382 /* Generate output */
1383 int amp_0, amp_1;
1384
1385 asm volatile (
1386 "mul %[amp_0], %[gvol_0], %[chans_0] \r\n"
1387 "mul %[amp_1], %[gvol_1], %[chans_1] \r\n"
1388 : [amp_0]"=&r"(amp_0), [amp_1]"=&r"(amp_1)
1389 : [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1),
1390 [chans_0]"r"(chans_0), [chans_1]"r"(chans_1));
1391 asm volatile (
1392 "mla %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n"
1393 "mla %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n"
1394 : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
1395 : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
1396 [ev_0]"r"((int)this->r.g.echo_volume_0),
1397 [ev_1]"r"((int)this->r.g.echo_volume_1));
1398
1399 out_buf [ 0] = amp_0 >> global_muting;
1400 out_buf [WAV_CHUNK_SIZE] = amp_1 >> global_muting;
1401 out_buf ++;
1402
1403 if ( !(this->r.g.flags & 0x20) )
1404 {
1405 /* Feedback into echo buffer */
1406 int e0, e1;
1407
1408 asm volatile (
1409 "mov %[e0], %[echo_0], asl #7 \r\n"
1410 "mov %[e1], %[echo_1], asl #7 \r\n"
1411 "mla %[e0], %[fb_0], %[efb], %[e0] \r\n"
1412 "mla %[e1], %[fb_1], %[efb], %[e1] \r\n"
1413 : [e0]"=&r"(e0), [e1]"=&r"(e1)
1414 : [echo_0]"r"(echo_0), [echo_1]"r"(echo_1),
1415 [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
1416 [efb]"r"((int)this->r.g.echo_feedback));
1417 asm volatile (
1418 "ssat %[e0], #16, %[e0], asr #14 \r\n"
1419 "ssat %[e1], #16, %[e1], asr #14 \r\n"
1420 "pkhbt %[e0], %[e0], %[e1], lsl #16 \r\n"
1421 "str %[e0], [%[echo_p]] \r\n"
1422 : [e0]"+r"(e0), [e1]"+r"(e1)
1423 : [echo_p]"r"(echo_ptr));
1424 }
1425 #else /* ARM_ARCH < 6 */
1092 int fb_0 = GET_LE16SA( echo_ptr ); 1426 int fb_0 = GET_LE16SA( echo_ptr );
1093 int fb_1 = GET_LE16SA( echo_ptr + 2 ); 1427 int fb_1 = GET_LE16SA( echo_ptr + 2 );
1428 int32_t *fir_ptr, *fir_coeff;
1094 1429
1095 /* Keep last 8 samples */ 1430 /* Keep last 8 samples */
1096 int32_t *fir_ptr = this->fir_ptr;
1097 1431
1098 /* Apply FIR */ 1432 /* Apply FIR */
1099 asm volatile ( 1433 asm volatile (
1100 "str %[fb_0], [%[fir_p]], #4 \r\n" 1434 "add %[fir_p], %[t_fir_p], #8 \r\n"
1101 "str %[fb_1], [%[fir_p]], #4 \r\n" 1435 "bic %[t_fir_p], %[fir_p], %[mask] \r\n"
1436 "str %[fb_0], [%[fir_p], #-8] \r\n"
1437 "str %[fb_1], [%[fir_p], #-4] \r\n"
1102 /* duplicate at +8 eliminates wrap checking below */ 1438 /* duplicate at +8 eliminates wrap checking below */
1103 "str %[fb_0], [%[fir_p], #56] \r\n" 1439 "str %[fb_0], [%[fir_p], #56] \r\n"
1104 "str %[fb_1], [%[fir_p], #60] \r\n" 1440 "str %[fb_1], [%[fir_p], #60] \r\n"
1105 : [fir_p]"+r"(fir_ptr) 1441 : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
1106 : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1) 1442 : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK));
1107 );
1108 1443
1109 this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK); 1444 fir_coeff = this->fir_coeff;
1110 int32_t *fir_coeff = this->fir_coeff;
1111 1445
1112 asm volatile ( 1446 asm volatile (
1113 "ldmia %[fir_c]!, { r0-r1 } \r\n" 1447 "ldmia %[fir_c]!, { r0-r1 } \r\n"
@@ -1137,8 +1471,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
1137 : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1), 1471 : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
1138 [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) 1472 [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1139 : 1473 :
1140 : "r0", "r1", "r2", "r3", "r4", "r5" 1474 : "r0", "r1", "r2", "r3", "r4", "r5");
1141 );
1142 1475
1143 /* Generate output */ 1476 /* Generate output */
1144 int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0) 1477 int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
@@ -1160,6 +1493,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
1160 e1 = CLAMP16( e1 ); 1493 e1 = CLAMP16( e1 );
1161 SET_LE16A( echo_ptr + 2, e1 ); 1494 SET_LE16A( echo_ptr + 2, e1 );
1162 } 1495 }
1496 #endif /* ARM_ARCH */
1163 #else /* Unoptimized CPU */ 1497 #else /* Unoptimized CPU */
1164 /* Read feedback from echo buffer */ 1498 /* Read feedback from echo buffer */
1165 int echo_pos = this->echo_pos; 1499 int echo_pos = this->echo_pos;