diff options
author | Michael Sevakis <jethead71@rockbox.org> | 2010-05-02 02:44:45 +0000 |
---|---|---|
committer | Michael Sevakis <jethead71@rockbox.org> | 2010-05-02 02:44:45 +0000 |
commit | 9f157ad584f50402e44a8055c2dd42b9839d9f2f (patch) | |
tree | 836772d5973a0782d1a1dd4345e5508234b92fe6 /apps | |
parent | 3adac47c6160f62e990e5704151ea73d44791f71 (diff) | |
download | rockbox-9f157ad584f50402e44a8055c2dd42b9839d9f2f.tar.gz rockbox-9f157ad584f50402e44a8055c2dd42b9839d9f2f.zip |
Do some SPC codec optimizing for ARMv6 (as a training exercise), tweak realtime BRR for all CPUs that use it, and add Gaussian ASM optimization for all ARM targets that can use it. Add some LIKELY/UNLIKELY branch hints. On Gigabeat S this gives a +22% speedup; on Gigabeat F, about +5%. For less-powerful players, there is no real change, aside possibly from the branch hints.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25771 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps')
-rw-r--r-- | apps/codecs/libspc/spc_codec.h | 32 | ||||
-rw-r--r-- | apps/codecs/libspc/spc_dsp.c | 510 |
2 files changed, 452 insertions, 90 deletions
diff --git a/apps/codecs/libspc/spc_codec.h b/apps/codecs/libspc/spc_codec.h index cf72f90af4..95d09fa091 100644 --- a/apps/codecs/libspc/spc_codec.h +++ b/apps/codecs/libspc/spc_codec.h | |||
@@ -37,6 +37,10 @@ | |||
37 | 37 | ||
38 | /** Basic configuration options **/ | 38 | /** Basic configuration options **/ |
39 | 39 | ||
40 | #ifndef ARM_ARCH | ||
41 | #define ARM_ARCH 0 | ||
42 | #endif | ||
43 | |||
40 | #define SPC_DUAL_CORE 1 | 44 | #define SPC_DUAL_CORE 1 |
41 | 45 | ||
42 | #if !defined(SPC_DUAL_CORE) || NUM_CORES == 1 | 46 | #if !defined(SPC_DUAL_CORE) || NUM_CORES == 1 |
@@ -293,6 +297,15 @@ enum | |||
293 | FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) | 297 | FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) |
294 | }; | 298 | }; |
295 | #elif defined (CPU_ARM) | 299 | #elif defined (CPU_ARM) |
300 | #if ARM_ARCH >= 6 | ||
301 | enum | ||
302 | { | ||
303 | FIR_BUF_CNT = FIR_BUF_HALF * 2, | ||
304 | FIR_BUF_SIZE = FIR_BUF_CNT * sizeof ( int32_t ), | ||
305 | FIR_BUF_ALIGN = FIR_BUF_SIZE, | ||
306 | FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) | ||
307 | }; | ||
308 | #else | ||
296 | enum | 309 | enum |
297 | { | 310 | { |
298 | FIR_BUF_CNT = FIR_BUF_HALF * 2 * 2, | 311 | FIR_BUF_CNT = FIR_BUF_HALF * 2 * 2, |
@@ -300,6 +313,7 @@ enum | |||
300 | FIR_BUF_ALIGN = FIR_BUF_SIZE, | 313 | FIR_BUF_ALIGN = FIR_BUF_SIZE, |
301 | FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1)) | 314 | FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1)) |
302 | }; | 315 | }; |
316 | #endif /* ARM_ARCH */ | ||
303 | #endif /* CPU_* */ | 317 | #endif /* CPU_* */ |
304 | 318 | ||
305 | struct Spc_Dsp | 319 | struct Spc_Dsp |
@@ -318,7 +332,8 @@ struct Spc_Dsp | |||
318 | uint16_t noise; /* also read as int16_t */ | 332 | uint16_t noise; /* also read as int16_t */ |
319 | 333 | ||
320 | #if defined(CPU_COLDFIRE) | 334 | #if defined(CPU_COLDFIRE) |
321 | /* circularly hardware masked address */ | 335 | /* FIR history is interleaved. Hardware handles wrapping by mask. |
336 | * |LR|LR|LR|LR|LR|LR|LR|LR| */ | ||
322 | int32_t *fir_ptr; | 337 | int32_t *fir_ptr; |
323 | /* wrapped address just behind current position - | 338 | /* wrapped address just behind current position - |
324 | allows mac.w to increment and mask fir_ptr */ | 339 | allows mac.w to increment and mask fir_ptr */ |
@@ -328,9 +343,22 @@ struct Spc_Dsp | |||
328 | #elif defined (CPU_ARM) | 343 | #elif defined (CPU_ARM) |
329 | /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ | 344 | /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ |
330 | int32_t *fir_ptr; | 345 | int32_t *fir_ptr; |
346 | #if ARM_ARCH >= 6 | ||
347 | /* FIR history is interleaved with guard to eliminate wrap checking | ||
348 | * when convolving. | ||
349 | * |LR|LR|LR|LR|LR|LR|LR|LR|--|--|--|--|--|--|--|--| */ | ||
350 | /* copy of echo FIR constants as int16_t, loaded as int32 for | ||
351 | * halfword, packed multiplies */ | ||
352 | int16_t fir_coeff [VOICE_COUNT]; | ||
353 | #else | ||
354 | /* FIR history is interleaved with guard to eliminate wrap checking | ||
355 | * when convolving. | ||
356 | * |LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|... | ||
357 | * |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--| */ | ||
331 | /* copy of echo FIR constants as int32_t, for faster access */ | 358 | /* copy of echo FIR constants as int32_t, for faster access */ |
332 | int32_t fir_coeff [VOICE_COUNT]; | 359 | int32_t fir_coeff [VOICE_COUNT]; |
333 | #else | 360 | #endif /* ARM_ARCH */ |
361 | #else /* Unoptimized CPU */ | ||
334 | /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ | 362 | /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ |
335 | int fir_pos; /* (0 to 7) */ | 363 | int fir_pos; /* (0 to 7) */ |
336 | int fir_buf [FIR_BUF_HALF * 2] [2]; | 364 | int fir_buf [FIR_BUF_HALF * 2] [2]; |
diff --git a/apps/codecs/libspc/spc_dsp.c b/apps/codecs/libspc/spc_dsp.c index 5ea6514783..0d07e5f04e 100644 --- a/apps/codecs/libspc/spc_dsp.c +++ b/apps/codecs/libspc/spc_dsp.c | |||
@@ -57,6 +57,16 @@ void DSP_write( struct Spc_Dsp* this, int i, int data ) | |||
57 | } | 57 | } |
58 | } | 58 | } |
59 | 59 | ||
60 | #if ARM_ARCH >= 6 | ||
61 | /* if ( n < -32768 ) out = -32768; */ | ||
62 | /* if ( n > 32767 ) out = 32767; */ | ||
63 | #define CLAMP16( n ) \ | ||
64 | ({ \ | ||
65 | asm ("ssat %0, #16, %1" \ | ||
66 | : "=r" ( n ) : "r"( n ) ); \ | ||
67 | n; \ | ||
68 | }) | ||
69 | #else | ||
60 | /* if ( n < -32768 ) out = -32768; */ | 70 | /* if ( n < -32768 ) out = -32768; */ |
61 | /* if ( n > 32767 ) out = 32767; */ | 71 | /* if ( n > 32767 ) out = 32767; */ |
62 | #define CLAMP16( n ) \ | 72 | #define CLAMP16( n ) \ |
@@ -65,6 +75,7 @@ void DSP_write( struct Spc_Dsp* this, int i, int data ) | |||
65 | n = 0x7FFF ^ (n >> 31); \ | 75 | n = 0x7FFF ^ (n >> 31); \ |
66 | n; \ | 76 | n; \ |
67 | }) | 77 | }) |
78 | #endif | ||
68 | 79 | ||
69 | #if SPC_BRRCACHE | 80 | #if SPC_BRRCACHE |
70 | static void decode_brr( struct Spc_Dsp* this, unsigned start_addr, | 81 | static void decode_brr( struct Spc_Dsp* this, unsigned start_addr, |
@@ -418,7 +429,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
418 | /* Key on events are delayed */ | 429 | /* Key on events are delayed */ |
419 | int key_on_delay = voice->key_on_delay; | 430 | int key_on_delay = voice->key_on_delay; |
420 | 431 | ||
421 | if ( --key_on_delay >= 0 ) /* <1% of the time */ | 432 | if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */ |
422 | { | 433 | { |
423 | key_on(this,voice,sd,raw_voice,key_on_delay,vbit); | 434 | key_on(this,voice,sd,raw_voice,key_on_delay,vbit); |
424 | } | 435 | } |
@@ -438,13 +449,13 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
438 | int env_mode = voice->env_mode; | 449 | int env_mode = voice->env_mode; |
439 | int adsr0 = raw_voice->adsr [0]; | 450 | int adsr0 = raw_voice->adsr [0]; |
440 | int env_timer; | 451 | int env_timer; |
441 | if ( env_mode != state_release ) /* 99% of the time */ | 452 | if ( LIKELY ( env_mode != state_release ) ) /* 99% of the time */ |
442 | { | 453 | { |
443 | env_timer = voice->env_timer; | 454 | env_timer = voice->env_timer; |
444 | if ( adsr0 & 0x80 ) /* 79% of the time */ | 455 | if ( LIKELY ( adsr0 & 0x80 ) ) /* 79% of the time */ |
445 | { | 456 | { |
446 | int adsr1 = raw_voice->adsr [1]; | 457 | int adsr1 = raw_voice->adsr [1]; |
447 | if ( env_mode == state_sustain ) /* 74% of the time */ | 458 | if ( LIKELY ( env_mode == state_sustain ) ) /* 74% of the time */ |
448 | { | 459 | { |
449 | if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 ) | 460 | if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 ) |
450 | goto write_env_timer; | 461 | goto write_env_timer; |
@@ -607,25 +618,12 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
607 | goto skip_decode; | 618 | goto skip_decode; |
608 | } | 619 | } |
609 | } | 620 | } |
610 | 621 | ||
611 | /* header */ | 622 | /* header */ |
612 | int const block_header = *addr; | 623 | int const block_header = *addr; |
613 | addr += 9; | 624 | addr += 9; |
614 | voice->addr = addr; | 625 | voice->addr = addr; |
615 | voice->block_header = block_header; | 626 | voice->block_header = block_header; |
616 | int const filter = (block_header & 0x0C) - 0x08; | ||
617 | |||
618 | /* scaling (invalid scaling gives -4096 for neg nybble, | ||
619 | 0 for pos) */ | ||
620 | static unsigned char const right_shifts [16] = { | ||
621 | 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 29, 29, 29, | ||
622 | }; | ||
623 | static unsigned char const left_shifts [16] = { | ||
624 | 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11 | ||
625 | }; | ||
626 | int const scale = block_header >> 4; | ||
627 | int const right_shift = right_shifts [scale]; | ||
628 | int const left_shift = left_shifts [scale]; | ||
629 | 627 | ||
630 | /* previous samples */ | 628 | /* previous samples */ |
631 | int smp2 = voice->samples [BRR_BLOCK_SIZE + 1]; | 629 | int smp2 = voice->samples [BRR_BLOCK_SIZE + 1]; |
@@ -650,54 +648,117 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
650 | /* force sample to end on next decode */ | 648 | /* force sample to end on next decode */ |
651 | voice->block_header = 1; | 649 | voice->block_header = 1; |
652 | } | 650 | } |
653 | 651 | ||
654 | do /* decode and filter 16 samples */ | 652 | int const filter = block_header & 0x0c; |
653 | int const scale = block_header >> 4; | ||
654 | |||
655 | if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */ | ||
655 | { | 656 | { |
656 | /* Get nybble, sign-extend, then scale | 657 | /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */ |
657 | get byte, select which nybble, sign-extend, then shift | 658 | do /* decode and filter 16 samples */ |
658 | based on scaling. also handles invalid scaling values.*/ | ||
659 | int delta = (int) (int8_t) (addr [offset >> 3] << | ||
660 | (offset & 4)) >> right_shift << left_shift; | ||
661 | |||
662 | out [offset >> 2] = smp2; | ||
663 | |||
664 | if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */ | ||
665 | { | 659 | { |
660 | /* Get nybble, sign-extend, then scale | ||
661 | get byte, select which nybble, sign-extend, then shift | ||
662 | based on scaling. */ | ||
663 | int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; | ||
664 | delta = (delta << scale) >> 1; | ||
665 | |||
666 | if (scale > 0xc) | ||
667 | delta = (delta >> 17) << 11; | ||
668 | |||
669 | out [offset >> 2] = smp2; | ||
670 | |||
666 | delta -= smp2 >> 1; | 671 | delta -= smp2 >> 1; |
667 | delta += smp2 >> 5; | 672 | delta += smp2 >> 5; |
668 | smp2 = smp1; | ||
669 | delta += smp1; | 673 | delta += smp1; |
670 | delta += (-smp1 - (smp1 >> 1)) >> 5; | 674 | delta += (-smp1 - (smp1 >> 1)) >> 5; |
675 | |||
676 | delta = CLAMP16( delta ); | ||
677 | smp2 = smp1; | ||
678 | smp1 = (int16_t) (delta * 2); /* sign-extend */ | ||
671 | } | 679 | } |
672 | else | 680 | while ( (offset += 4) != 0 ); |
681 | } | ||
682 | else if ( filter == 0x04 ) /* filter 1 */ | ||
683 | { | ||
684 | /* y[n] = x[n] + 15/16 * y[n-1] */ | ||
685 | do /* decode and filter 16 samples */ | ||
673 | { | 686 | { |
674 | if ( filter == -4 ) /* mode 0x04 */ | 687 | /* Get nybble, sign-extend, then scale |
675 | { | 688 | get byte, select which nybble, sign-extend, then shift |
676 | delta += smp1 >> 1; | 689 | based on scaling. */ |
677 | delta += (-smp1) >> 5; | 690 | int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; |
678 | } | 691 | delta = (delta << scale) >> 1; |
679 | else if ( filter > -4 ) /* mode 0x0C */ | 692 | |
680 | { | 693 | if (scale > 0xc) |
681 | delta -= smp2 >> 1; | 694 | delta = (delta >> 17) << 11; |
682 | delta += (smp2 + (smp2 >> 1)) >> 4; | 695 | |
683 | delta += smp1; | 696 | out [offset >> 2] = smp2; |
684 | delta += (-smp1 * 13) >> 7; | 697 | |
685 | } | 698 | delta += smp1 >> 1; |
699 | delta += (-smp1) >> 5; | ||
700 | |||
701 | delta = CLAMP16( delta ); | ||
686 | smp2 = smp1; | 702 | smp2 = smp1; |
703 | smp1 = (int16_t) (delta * 2); /* sign-extend */ | ||
687 | } | 704 | } |
688 | 705 | while ( (offset += 4) != 0 ); | |
689 | delta = CLAMP16( delta ); | ||
690 | smp1 = (int16_t) (delta * 2); /* sign-extend */ | ||
691 | } | 706 | } |
692 | while ( (offset += 4) != 0 ); | 707 | else if ( filter == 0x0c ) /* filter 3 */ |
693 | 708 | { | |
709 | /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */ | ||
710 | do /* decode and filter 16 samples */ | ||
711 | { | ||
712 | /* Get nybble, sign-extend, then scale | ||
713 | get byte, select which nybble, sign-extend, then shift | ||
714 | based on scaling. */ | ||
715 | int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; | ||
716 | delta = (delta << scale) >> 1; | ||
717 | |||
718 | if (scale > 0xc) | ||
719 | delta = (delta >> 17) << 11; | ||
720 | |||
721 | out [offset >> 2] = smp2; | ||
722 | |||
723 | delta -= smp2 >> 1; | ||
724 | delta += (smp2 + (smp2 >> 1)) >> 4; | ||
725 | delta += smp1; | ||
726 | delta += (-smp1 * 13) >> 7; | ||
727 | |||
728 | delta = CLAMP16( delta ); | ||
729 | smp2 = smp1; | ||
730 | smp1 = (int16_t) (delta * 2); /* sign-extend */ | ||
731 | } | ||
732 | while ( (offset += 4) != 0 ); | ||
733 | } | ||
734 | else /* filter 0 */ | ||
735 | { | ||
736 | /* y[n] = x[n] */ | ||
737 | do /* decode and filter 16 samples */ | ||
738 | { | ||
739 | /* Get nybble, sign-extend, then scale | ||
740 | get byte, select which nybble, sign-extend, then shift | ||
741 | based on scaling. */ | ||
742 | int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; | ||
743 | delta = (delta << scale) >> 1; | ||
744 | |||
745 | if (scale > 0xc) | ||
746 | delta = (delta >> 17) << 11; | ||
747 | |||
748 | out [offset >> 2] = smp2; | ||
749 | |||
750 | smp2 = smp1; | ||
751 | smp1 = delta * 2; | ||
752 | } | ||
753 | while ( (offset += 4) != 0 ); | ||
754 | } | ||
755 | |||
694 | out [0] = smp2; | 756 | out [0] = smp2; |
695 | out [1] = smp1; | 757 | out [1] = smp1; |
696 | 758 | ||
697 | skip_decode:; | 759 | skip_decode:; |
698 | } | 760 | } |
699 | #endif | 761 | #endif /* !SPC_BRRCACHE */ |
700 | |||
701 | /* Get rate (with possible modulation) */ | 762 | /* Get rate (with possible modulation) */ |
702 | int rate = VOICE_RATE(vr); | 763 | int rate = VOICE_RATE(vr); |
703 | if ( this->r.g.pitch_mods & vbit ) | 764 | if ( this->r.g.pitch_mods & vbit ) |
@@ -754,13 +815,87 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
754 | 815 | ||
755 | /* Use faster gaussian interpolation when exact result isn't needed | 816 | /* Use faster gaussian interpolation when exact result isn't needed |
756 | by pitch modulator of next channel */ | 817 | by pitch modulator of next channel */ |
757 | int amp_0, amp_1; | 818 | int amp_0, amp_1; /* Also serve as temps _0 and _1 */ |
758 | if ( !(slow_gaussian & vbit) ) /* 99% of the time */ | 819 | if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */ |
759 | { | 820 | { |
760 | /* Main optimization is lack of clamping. Not a problem since | 821 | /* Main optimization is lack of clamping. Not a problem since |
761 | output never goes more than +/- 16 outside 16-bit range and | 822 | output never goes more than +/- 16 outside 16-bit range and |
762 | things are clamped later anyway. Other optimization is to | 823 | things are clamped later anyway. Other optimization is to |
763 | preserve fractional accuracy, eliminating several masks. */ | 824 | preserve fractional accuracy, eliminating several masks. */ |
825 | #if defined (CPU_ARM) | ||
826 | int output; | ||
827 | int _2, _3; /* All-purpose temps */ | ||
828 | /* Multiple ASM blocks keep regs free and reduce result | ||
829 | * latency issues. */ | ||
830 | #if ARM_ARCH >= 6 | ||
831 | /* Interpolate */ | ||
832 | asm volatile ( | ||
833 | "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */ | ||
834 | "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */ | ||
835 | "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */ | ||
836 | "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */ | ||
837 | "smuad %[out], %[_0], %[_2] \r\n" /* out=f0*i0 + f1*i1 */ | ||
838 | "smladx %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */ | ||
839 | : [out]"=&r"(output), | ||
840 | [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), | ||
841 | [_2]"=&r"(_2), [_3]"=&r"(_3) | ||
842 | : [fwd]"r"(fwd), [rev]"r"(rev), | ||
843 | [interp]"r"(interp)); | ||
844 | /* Apply voice envelope */ | ||
845 | asm volatile ( | ||
846 | "mov %[_2], %[out], asr #(11-5) \r\n" /* To do >> 16 later */ | ||
847 | "mul %[out], %[_2], %[envx] \r\n" /* and avoid exp. shift */ | ||
848 | : [out]"+r"(output), [_2]"=&r"(_2) | ||
849 | : [envx]"r"((int)voice->envx)); | ||
850 | /* Apply left and right volume */ | ||
851 | asm volatile ( | ||
852 | "smulwb %[amp_0], %[out], %[vvol_0] \r\n" /* (32x16->48)[47:16]->[31:0] */ | ||
853 | "smulwb %[amp_1], %[out], %[vvol_1] \r\n" | ||
854 | : [out]"+r"(output), | ||
855 | [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) | ||
856 | : [vvol_0]"r"(voice->volume[0]), | ||
857 | [vvol_1]"r"(voice->volume[1])); | ||
858 | |||
859 | raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */ | ||
860 | #else /* ARM_ARCH < 6 */ | ||
861 | /* Perform gaussian interpolation on four samples */ | ||
862 | asm volatile ( | ||
863 | "ldrsh %[_0], [%[interp]] \r\n" | ||
864 | "ldrsh %[_2], [%[fwd]] \r\n" | ||
865 | "ldrsh %[_1], [%[interp], #2] \r\n" | ||
866 | "ldrsh %[_3], [%[fwd], #2] \r\n" | ||
867 | "mul %[out], %[_0], %[_2] \r\n" /* out= fwd[0]*interp[0] */ | ||
868 | "ldrsh %[_0], [%[interp], #4] \r\n" | ||
869 | "ldrsh %[_2], [%[rev], #2] \r\n" | ||
870 | "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */ | ||
871 | "ldrsh %[_1], [%[interp], #6] \r\n" | ||
872 | "ldrsh %[_3], [%[rev]] \r\n" | ||
873 | "mla %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */ | ||
874 | "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */ | ||
875 | : [out]"=&r"(output), | ||
876 | [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), | ||
877 | [_2]"=&r"(_2), [_3]"=&r"(_3) | ||
878 | : [fwd]"r"(fwd), [rev]"r"(rev), | ||
879 | [interp]"r"(interp)); | ||
880 | /* Apply voice envelope */ | ||
881 | asm volatile ( | ||
882 | "mov %[_2], %[out], asr #11 \r\n" | ||
883 | "mul %[out], %[_2], %[envx] \r\n" | ||
884 | : [out]"+r"(output), [_2]"=&r"(_2) | ||
885 | : [envx]"r"((int)voice->envx)); | ||
886 | /* Reduce and apply left and right volume */ | ||
887 | asm volatile ( | ||
888 | "mov %[out], %[out], asr #11 \r\n" | ||
889 | "mul %[amp_0], %[out], %[vvol_0] \r\n" | ||
890 | "mul %[amp_1], %[out], %[vvol_1] \r\n" | ||
891 | : [out]"+r"(output), | ||
892 | [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) | ||
893 | : [vvol_0]"r"((int)voice->volume[0]), | ||
894 | [vvol_1]"r"((int)voice->volume[1])); | ||
895 | |||
896 | raw_voice->outx = output >> 8; | ||
897 | #endif /* ARM_ARCH */ | ||
898 | #else /* Unoptimized CPU */ | ||
764 | int output = (((fwd [0] * interp [0] + | 899 | int output = (((fwd [0] * interp [0] + |
765 | fwd [1] * interp [1] + | 900 | fwd [1] * interp [1] + |
766 | rev [1] * interp [2] + | 901 | rev [1] * interp [2] + |
@@ -769,11 +904,121 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
769 | /* duplicated here to give compiler more to run in parallel */ | 904 | /* duplicated here to give compiler more to run in parallel */ |
770 | amp_0 = voice->volume [0] * output; | 905 | amp_0 = voice->volume [0] * output; |
771 | amp_1 = voice->volume [1] * output; | 906 | amp_1 = voice->volume [1] * output; |
907 | |||
772 | raw_voice->outx = output >> 8; | 908 | raw_voice->outx = output >> 8; |
909 | #endif /* CPU_* */ | ||
773 | } | 910 | } |
774 | else | 911 | else /* slow gaussian */ |
775 | { | 912 | { |
913 | #if defined(CPU_ARM) | ||
914 | #if ARM_ARCH >= 6 | ||
915 | int output = *(int16_t*) &this->noise; | ||
916 | |||
917 | if ( !(this->r.g.noise_enables & vbit) ) | ||
918 | { | ||
919 | /* Interpolate */ | ||
920 | int _2, _3; | ||
921 | asm volatile ( | ||
922 | /* NOTE: often-unaligned accesses */ | ||
923 | "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */ | ||
924 | "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */ | ||
925 | "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */ | ||
926 | "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */ | ||
927 | "smulbb %[out], %[_0], %[_2] \r\n" /* out=f0*i0 */ | ||
928 | "smultt %[_0], %[_0], %[_2] \r\n" /* _0=f1*i1 */ | ||
929 | "smulbt %[_2], %[_1], %[_3] \r\n" /* _2=r1*i2 */ | ||
930 | "smultb %[_3], %[_1], %[_3] \r\n" /* _3=r0*i3 */ | ||
931 | : [out]"=r"(output), | ||
932 | [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), | ||
933 | [_2]"=&r"(_2), [_3]"=&r"(_3) | ||
934 | : [fwd]"r"(fwd), [rev]"r"(rev), | ||
935 | [interp]"r"(interp)); | ||
936 | asm volatile ( | ||
937 | "mov %[out], %[out], asr#12 \r\n" | ||
938 | "add %[_0], %[out], %[_0], asr #12 \r\n" | ||
939 | "add %[_2], %[_0], %[_2], asr #12 \r\n" | ||
940 | "pkhbt %[_0], %[_2], %[_3], asl #4 \r\n" /* _3[31:16], _2[15:0] */ | ||
941 | "sadd16 %[_0], %[_0], %[_0] \r\n" /* _3[31:16]*2, _2[15:0]*2 */ | ||
942 | "qsubaddx %[out], %[_0], %[_0] \r\n" /* out[15:0]= | ||
943 | * sat16(_3[31:16]+_2[15:0]) */ | ||
944 | : [out]"+r"(output), | ||
945 | [_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3)); | ||
946 | } | ||
947 | /* Apply voice envelope */ | ||
948 | asm volatile ( | ||
949 | "smulbb %[out], %[out], %[envx] \r\n" | ||
950 | : [out]"+r"(output) | ||
951 | : [envx]"r"(voice->envx)); | ||
952 | /* Reduce and apply left and right volume */ | ||
953 | asm volatile ( | ||
954 | "mov %[out], %[out], asr #11 \r\n" | ||
955 | "bic %[out], %[out], #0x1 \r\n" | ||
956 | "mul %[amp_0], %[out], %[vvol_0] \r\n" | ||
957 | "mul %[amp_1], %[out], %[vvol_1] \r\n" | ||
958 | : [out]"+r"(output), | ||
959 | [amp_1]"=r"(amp_1), [amp_0]"=r"(amp_0) | ||
960 | : [vvol_0]"r"((int)voice->volume[0]), | ||
961 | [vvol_1]"r"((int)voice->volume[1])); | ||
962 | |||
963 | prev_outx = output; | ||
964 | raw_voice->outx = output >> 8; | ||
965 | #else /* ARM_ARCH < 6 */ | ||
966 | int output = *(int16_t*) &this->noise; | ||
967 | |||
968 | if ( !(this->r.g.noise_enables & vbit) ) | ||
969 | { | ||
970 | /* Interpolate */ | ||
971 | int _2, _3; | ||
972 | asm volatile ( | ||
973 | "ldrsh %[_0], [%[interp]] \r\n" | ||
974 | "ldrsh %[_2], [%[fwd]] \r\n" | ||
975 | "ldrsh %[_1], [%[interp], #2] \r\n" | ||
976 | "ldrsh %[_3], [%[fwd], #2] \r\n" | ||
977 | "mul %[out], %[_2], %[_0] \r\n" /* fwd[0]*interp[0] */ | ||
978 | "ldrsh %[_2], [%[rev], #2] \r\n" | ||
979 | "mul %[_0], %[_3], %[_1] \r\n" /* fwd[1]*interp[1] */ | ||
980 | "ldrsh %[_1], [%[interp], #4] \r\n" | ||
981 | "mov %[out], %[out], asr #12 \r\n" | ||
982 | "ldrsh %[_3], [%[rev]] \r\n" | ||
983 | "mul %[_2], %[_1], %[_2] \r\n" /* rev[1]*interp[2] */ | ||
984 | "ldrsh %[_1], [%[interp], #6] \r\n" | ||
985 | "add %[_0], %[out], %[_0], asr #12 \r\n" | ||
986 | "mul %[_3], %[_1], %[_3] \r\n" /* rev[0]*interp[3] */ | ||
987 | "add %[_2], %[_0], %[_2], asr #12 \r\n" | ||
988 | "mov %[_2], %[_2], lsl #17 \r\n" | ||
989 | "mov %[_3], %[_3], asr #12 \r\n" | ||
990 | "mov %[_3], %[_3], asl #1 \r\n" | ||
991 | "add %[out], %[_3], %[_2], asr #16 \r\n" | ||
992 | : [out]"=r"(output), | ||
993 | [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), | ||
994 | [_2]"=&r"(_2), [_3]"=&r"(_3) | ||
995 | : [fwd]"r"(fwd), [rev]"r"(rev), | ||
996 | [interp]"r"(interp)); | ||
997 | |||
998 | output = CLAMP16(output); | ||
999 | } | ||
1000 | /* Apply voice envelope */ | ||
1001 | asm volatile ( | ||
1002 | "mul %[_0], %[out], %[envx] \r\n" | ||
1003 | : [_0]"=r"(amp_0) | ||
1004 | : [out]"r"(output), [envx]"r"((int)voice->envx)); | ||
1005 | /* Reduce and apply left and right volume */ | ||
1006 | asm volatile ( | ||
1007 | "mov %[out], %[amp_0], asr #11 \r\n" /* amp_0 = _0 */ | ||
1008 | "bic %[out], %[out], #0x1 \r\n" | ||
1009 | "mul %[amp_0], %[out], %[vvol_0] \r\n" | ||
1010 | "mul %[amp_1], %[out], %[vvol_1] \r\n" | ||
1011 | : [out]"+r"(output), [amp_0]"+r"(amp_0), | ||
1012 | [amp_1]"=r"(amp_1) | ||
1013 | : [vvol_0]"r"((int)voice->volume[0]), | ||
1014 | [vvol_1]"r"((int)voice->volume[1])); | ||
1015 | |||
1016 | prev_outx = output; | ||
1017 | raw_voice->outx = output >> 8; | ||
1018 | #endif /* ARM_ARCH >= 6 */ | ||
1019 | #else /* Unoptimized CPU */ | ||
776 | int output = *(int16_t*) &this->noise; | 1020 | int output = *(int16_t*) &this->noise; |
1021 | |||
777 | if ( !(this->r.g.noise_enables & vbit) ) | 1022 | if ( !(this->r.g.noise_enables & vbit) ) |
778 | { | 1023 | { |
779 | output = (fwd [0] * interp [0]) & ~0xFFF; | 1024 | output = (fwd [0] * interp [0]) & ~0xFFF; |
@@ -788,8 +1033,10 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
788 | /* duplicated here to give compiler more to run in parallel */ | 1033 | /* duplicated here to give compiler more to run in parallel */ |
789 | amp_0 = voice->volume [0] * output; | 1034 | amp_0 = voice->volume [0] * output; |
790 | amp_1 = voice->volume [1] * output; | 1035 | amp_1 = voice->volume [1] * output; |
1036 | |||
791 | prev_outx = output; | 1037 | prev_outx = output; |
792 | raw_voice->outx = (int8_t) (output >> 8); | 1038 | raw_voice->outx = output >> 8; |
1039 | #endif /* CPU_* */ | ||
793 | } | 1040 | } |
794 | #else /* SPCNOINTERP */ | 1041 | #else /* SPCNOINTERP */ |
795 | /* two-point linear interpolation */ | 1042 | /* two-point linear interpolation */ |
@@ -826,16 +1073,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
826 | "asr.l %[sh], %[y1] \r\n" | 1073 | "asr.l %[sh], %[y1] \r\n" |
827 | "add.l %[y0], %[y1] \r\n" | 1074 | "add.l %[y0], %[y1] \r\n" |
828 | : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0) | 1075 | : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0) |
829 | : [s]"a"(voice->samples), [sh]"d"(12) | 1076 | : [s]"a"(voice->samples), [sh]"d"(12)); |
830 | ); | ||
831 | } | 1077 | } |
832 | 1078 | ||
833 | /* apply voice envelope to output */ | 1079 | /* apply voice envelope to output */ |
834 | asm volatile ( | 1080 | asm volatile ( |
835 | "mac.w %[output]l, %[envx]l, %%acc0 \r\n" | 1081 | "mac.w %[out]l, %[envx]l, %%acc0 \r\n" |
836 | : | 1082 | : |
837 | : [output]"r"(amp_0), [envx]"r"(voice->envx) | 1083 | : [out]"r"(amp_0), [envx]"r"(voice->envx)); |
838 | ); | ||
839 | 1084 | ||
840 | /* advance voice position */ | 1085 | /* advance voice position */ |
841 | voice->position += rate; | 1086 | voice->position += rate; |
@@ -843,15 +1088,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
843 | /* fetch output, scale and apply left and right | 1088 | /* fetch output, scale and apply left and right |
844 | voice volume */ | 1089 | voice volume */ |
845 | asm volatile ( | 1090 | asm volatile ( |
846 | "movclr.l %%acc0, %[output] \r\n" | 1091 | "movclr.l %%acc0, %[out] \r\n" |
847 | "asr.l %[sh], %[output] \r\n" | 1092 | "asr.l %[sh], %[out] \r\n" |
848 | "mac.l %[vvol_0], %[output], %%acc0 \r\n" | 1093 | "mac.l %[vvol_0], %[out], %%acc0 \r\n" |
849 | "mac.l %[vvol_1], %[output], %%acc1 \r\n" | 1094 | "mac.l %[vvol_1], %[out], %%acc1 \r\n" |
850 | : [output]"=&d"(amp_0) | 1095 | : [out]"=&d"(amp_0) |
851 | : [vvol_0]"r"((int)voice->volume[0]), | 1096 | : [vvol_0]"r"((int)voice->volume[0]), |
852 | [vvol_1]"r"((int)voice->volume[1]), | 1097 | [vvol_1]"r"((int)voice->volume[1]), |
853 | [sh]"d"(11) | 1098 | [sh]"d"(11)); |
854 | ); | ||
855 | 1099 | ||
856 | /* save this output into previous, scale and save in | 1100 | /* save this output into previous, scale and save in |
857 | output register */ | 1101 | output register */ |
@@ -862,14 +1106,16 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
862 | asm volatile ( | 1106 | asm volatile ( |
863 | "movclr.l %%acc0, %[amp_0] \r\n" | 1107 | "movclr.l %%acc0, %[amp_0] \r\n" |
864 | "movclr.l %%acc1, %[amp_1] \r\n" | 1108 | "movclr.l %%acc1, %[amp_1] \r\n" |
865 | : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) | 1109 | : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)); |
866 | ); | ||
867 | #elif defined (CPU_ARM) | 1110 | #elif defined (CPU_ARM) |
868 | int amp_0, amp_1; | 1111 | int amp_0, amp_1; |
869 | 1112 | ||
870 | if ( (this->r.g.noise_enables & vbit) != 0 ) { | 1113 | if ( (this->r.g.noise_enables & vbit) != 0 ) |
1114 | { | ||
871 | amp_0 = *(int16_t *)&this->noise; | 1115 | amp_0 = *(int16_t *)&this->noise; |
872 | } else { | 1116 | } |
1117 | else | ||
1118 | { | ||
873 | uint32_t f = voice->position; | 1119 | uint32_t f = voice->position; |
874 | amp_0 = (uint32_t)voice->samples; | 1120 | amp_0 = (uint32_t)voice->samples; |
875 | 1121 | ||
@@ -882,8 +1128,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
882 | "sub %[y1], %[y1], %[y0] \r\n" | 1128 | "sub %[y1], %[y1], %[y0] \r\n" |
883 | "mul %[f], %[y1], %[f] \r\n" | 1129 | "mul %[f], %[y1], %[f] \r\n" |
884 | "add %[y0], %[y0], %[f], asr #12 \r\n" | 1130 | "add %[y0], %[y0], %[f], asr #12 \r\n" |
885 | : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1) | 1131 | : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)); |
886 | ); | ||
887 | } | 1132 | } |
888 | 1133 | ||
889 | voice->position += rate; | 1134 | voice->position += rate; |
@@ -893,8 +1138,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
893 | "mov %[amp_0], %[amp_1], asr #11 \r\n" | 1138 | "mov %[amp_0], %[amp_1], asr #11 \r\n" |
894 | "mov %[amp_1], %[amp_0], asr #8 \r\n" | 1139 | "mov %[amp_1], %[amp_0], asr #8 \r\n" |
895 | : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1) | 1140 | : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1) |
896 | : [envx]"r"(voice->envx) | 1141 | : [envx]"r"(voice->envx)); |
897 | ); | ||
898 | 1142 | ||
899 | prev_outx = amp_0; | 1143 | prev_outx = amp_0; |
900 | raw_voice->outx = (int8_t)amp_1; | 1144 | raw_voice->outx = (int8_t)amp_1; |
@@ -904,8 +1148,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
904 | "mul %[amp_0], %[vol_0], %[amp_0] \r\n" | 1148 | "mul %[amp_0], %[vol_0], %[amp_0] \r\n" |
905 | : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) | 1149 | : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) |
906 | : [vol_0]"r"((int)voice->volume[0]), | 1150 | : [vol_0]"r"((int)voice->volume[0]), |
907 | [vol_1]"r"((int)voice->volume[1]) | 1151 | [vol_1]"r"((int)voice->volume[1])); |
908 | ); | ||
909 | #else /* Unoptimized CPU */ | 1152 | #else /* Unoptimized CPU */ |
910 | int output; | 1153 | int output; |
911 | 1154 | ||
@@ -1089,25 +1332,116 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
1089 | echo_pos = 0; | 1332 | echo_pos = 0; |
1090 | this->echo_pos = echo_pos; | 1333 | this->echo_pos = echo_pos; |
1091 | 1334 | ||
1335 | #if ARM_ARCH >= 6 | ||
1336 | int32_t *fir_ptr, *fir_coeff; | ||
1337 | int fb_0, fb_1; | ||
1338 | |||
1339 | /* Apply FIR */ | ||
1340 | fb_0 = *(uint32_t *)echo_ptr; | ||
1341 | |||
1342 | /* Keep last 8 samples */ | ||
1343 | asm volatile ( | ||
1344 | "add %[fir_p], %[t_fir_p], #4 \r\n" | ||
1345 | "bic %[t_fir_p], %[fir_p], %[mask] \r\n" | ||
1346 | "str %[fb_0], [%[fir_p], #-4] \r\n" | ||
1347 | /* duplicate at +8 eliminates wrap checking below */ | ||
1348 | "str %[fb_0], [%[fir_p], #28] \r\n" | ||
1349 | : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr) | ||
1350 | : [fb_0]"r"(fb_0), [mask]"i"(~FIR_BUF_MASK)); | ||
1351 | |||
1352 | fir_coeff = (int32_t *)this->fir_coeff; | ||
1353 | |||
1354 | /* Fugly, but the best version found. */ | ||
1355 | int _0; | ||
1356 | asm volatile ( /* L0R0 = acc0 */ | ||
1357 | "ldmia %[fir_p]!, { r2-r5 } \r\n" /* L1R1-L4R4 = r2-r5 */ | ||
1358 | "ldmia %[fir_c]!, { r0-r1 } \r\n" /* C0C1-C2C3 = r0-r1 */ | ||
1359 | "pkhbt %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */ | ||
1360 | "pkhtb r2, r2, %[acc0], asr #16 \r\n" | ||
1361 | "smuad %[acc0], %[_0], r0 \r\n" /* acc0=L0*C0+L1*C1 */ | ||
1362 | "smuad %[acc1], r2, r0 \r\n" /* acc1=R0*C0+R1*C1 */ | ||
1363 | "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L2R2,L3R3->L2L3,R2R3 */ | ||
1364 | "pkhtb r4, r4, r3, asr #16 \r\n" | ||
1365 | "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3 */ | ||
1366 | "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R2*C2+R3*C3 */ | ||
1367 | "ldmia %[fir_p], { r2-r4 } \r\n" /* L5R5-L7R7 = r2-r4 */ | ||
1368 | "ldmia %[fir_c], { r0-r1 } \r\n" /* C4C5-C6C7 = r0-r1 */ | ||
1369 | "pkhbt %[_0], r5, r2, asl #16 \r\n" /* L4R4,L5R5->L4L5,R4R5 */ | ||
1370 | "pkhtb r2, r2, r5, asr #16 \r\n" | ||
1371 | "smlad %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5 */ | ||
1372 | "smlad %[acc1], r2, r0, %[acc1] \r\n" /* acc1+=R4*C4+R5*C5 */ | ||
1373 | "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L6R6,L7R7->L6L7,R6R7 */ | ||
1374 | "pkhtb r4, r4, r3, asr #16 \r\n" | ||
1375 | "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7 */ | ||
1376 | "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R6*C6+R7*C7 */ | ||
1377 | : [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0), | ||
1378 | [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) | ||
1379 | : | ||
1380 | : "r0", "r1", "r2", "r3", "r4", "r5"); | ||
1381 | |||
1382 | /* Generate output */ | ||
1383 | int amp_0, amp_1; | ||
1384 | |||
1385 | asm volatile ( | ||
1386 | "mul %[amp_0], %[gvol_0], %[chans_0] \r\n" | ||
1387 | "mul %[amp_1], %[gvol_1], %[chans_1] \r\n" | ||
1388 | : [amp_0]"=&r"(amp_0), [amp_1]"=&r"(amp_1) | ||
1389 | : [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1), | ||
1390 | [chans_0]"r"(chans_0), [chans_1]"r"(chans_1)); | ||
1391 | asm volatile ( | ||
1392 | "mla %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n" | ||
1393 | "mla %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n" | ||
1394 | : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) | ||
1395 | : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), | ||
1396 | [ev_0]"r"((int)this->r.g.echo_volume_0), | ||
1397 | [ev_1]"r"((int)this->r.g.echo_volume_1)); | ||
1398 | |||
1399 | out_buf [ 0] = amp_0 >> global_muting; | ||
1400 | out_buf [WAV_CHUNK_SIZE] = amp_1 >> global_muting; | ||
1401 | out_buf ++; | ||
1402 | |||
1403 | if ( !(this->r.g.flags & 0x20) ) | ||
1404 | { | ||
1405 | /* Feedback into echo buffer */ | ||
1406 | int e0, e1; | ||
1407 | |||
1408 | asm volatile ( | ||
1409 | "mov %[e0], %[echo_0], asl #7 \r\n" | ||
1410 | "mov %[e1], %[echo_1], asl #7 \r\n" | ||
1411 | "mla %[e0], %[fb_0], %[efb], %[e0] \r\n" | ||
1412 | "mla %[e1], %[fb_1], %[efb], %[e1] \r\n" | ||
1413 | : [e0]"=&r"(e0), [e1]"=&r"(e1) | ||
1414 | : [echo_0]"r"(echo_0), [echo_1]"r"(echo_1), | ||
1415 | [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), | ||
1416 | [efb]"r"((int)this->r.g.echo_feedback)); | ||
1417 | asm volatile ( | ||
1418 | "ssat %[e0], #16, %[e0], asr #14 \r\n" | ||
1419 | "ssat %[e1], #16, %[e1], asr #14 \r\n" | ||
1420 | "pkhbt %[e0], %[e0], %[e1], lsl #16 \r\n" | ||
1421 | "str %[e0], [%[echo_p]] \r\n" | ||
1422 | : [e0]"+r"(e0), [e1]"+r"(e1) | ||
1423 | : [echo_p]"r"(echo_ptr)); | ||
1424 | } | ||
1425 | #else /* ARM_ARCH < 6 */ | ||
1092 | int fb_0 = GET_LE16SA( echo_ptr ); | 1426 | int fb_0 = GET_LE16SA( echo_ptr ); |
1093 | int fb_1 = GET_LE16SA( echo_ptr + 2 ); | 1427 | int fb_1 = GET_LE16SA( echo_ptr + 2 ); |
1428 | int32_t *fir_ptr, *fir_coeff; | ||
1094 | 1429 | ||
1095 | /* Keep last 8 samples */ | 1430 | /* Keep last 8 samples */ |
1096 | int32_t *fir_ptr = this->fir_ptr; | ||
1097 | 1431 | ||
1098 | /* Apply FIR */ | 1432 | /* Apply FIR */ |
1099 | asm volatile ( | 1433 | asm volatile ( |
1100 | "str %[fb_0], [%[fir_p]], #4 \r\n" | 1434 | "add %[fir_p], %[t_fir_p], #8 \r\n" |
1101 | "str %[fb_1], [%[fir_p]], #4 \r\n" | 1435 | "bic %[t_fir_p], %[fir_p], %[mask] \r\n" |
1436 | "str %[fb_0], [%[fir_p], #-8] \r\n" | ||
1437 | "str %[fb_1], [%[fir_p], #-4] \r\n" | ||
1102 | /* duplicate at +8 eliminates wrap checking below */ | 1438 | /* duplicate at +8 eliminates wrap checking below */ |
1103 | "str %[fb_0], [%[fir_p], #56] \r\n" | 1439 | "str %[fb_0], [%[fir_p], #56] \r\n" |
1104 | "str %[fb_1], [%[fir_p], #60] \r\n" | 1440 | "str %[fb_1], [%[fir_p], #60] \r\n" |
1105 | : [fir_p]"+r"(fir_ptr) | 1441 | : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr) |
1106 | : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1) | 1442 | : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK)); |
1107 | ); | ||
1108 | 1443 | ||
1109 | this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK); | 1444 | fir_coeff = this->fir_coeff; |
1110 | int32_t *fir_coeff = this->fir_coeff; | ||
1111 | 1445 | ||
1112 | asm volatile ( | 1446 | asm volatile ( |
1113 | "ldmia %[fir_c]!, { r0-r1 } \r\n" | 1447 | "ldmia %[fir_c]!, { r0-r1 } \r\n" |
@@ -1137,8 +1471,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
1137 | : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1), | 1471 | : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1), |
1138 | [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) | 1472 | [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) |
1139 | : | 1473 | : |
1140 | : "r0", "r1", "r2", "r3", "r4", "r5" | 1474 | : "r0", "r1", "r2", "r3", "r4", "r5"); |
1141 | ); | ||
1142 | 1475 | ||
1143 | /* Generate output */ | 1476 | /* Generate output */ |
1144 | int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0) | 1477 | int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0) |
@@ -1160,6 +1493,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
1160 | e1 = CLAMP16( e1 ); | 1493 | e1 = CLAMP16( e1 ); |
1161 | SET_LE16A( echo_ptr + 2, e1 ); | 1494 | SET_LE16A( echo_ptr + 2, e1 ); |
1162 | } | 1495 | } |
1496 | #endif /* ARM_ARCH */ | ||
1163 | #else /* Unoptimized CPU */ | 1497 | #else /* Unoptimized CPU */ |
1164 | /* Read feedback from echo buffer */ | 1498 | /* Read feedback from echo buffer */ |
1165 | int echo_pos = this->echo_pos; | 1499 | int echo_pos = this->echo_pos; |