1 files changed, 168 insertions, 64 deletions
diff --git a/apps/codecs/spc/spc_dsp.c b/apps/codecs/spc/spc_dsp.c
index 8881788cf1..19986fd8a8 100644
--- a/apps/codecs/spc/spc_dsp.c
+++ b/apps/codecs/spc/spc_dsp.c
@@ -25,14 +25,13 @@
 #include "spc_codec.h"
 #include "spc_profiler.h"
-#ifdef CPU_COLDFIRE
+#if defined(CPU_COLDFIRE) || defined (CPU_ARM)
-static int32_t fir_buf[FIR_BUF_HALF]
+int32_t fir_buf[FIR_BUF_CNT]
-    __attribute__ ((aligned (FIR_BUF_SIZE*2))) IBSS_ATTR;
+    __attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR;
 #endif
 #if SPC_BRRCACHE
 /* a little extra for samples that go past end */
-int16_t BRRcache [0x20000 + 32];
+int16_t BRRcache [BRR_CACHE_SIZE];
 #endif
 void DSP_write( struct Spc_Dsp* this, int i, int data )
@@ -58,11 +57,12 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
 /* if ( n < -32768 ) out = -32768; */
 /* if ( n >  32767 ) out =  32767; */
-#define CLAMP16( n, out )\
+#define CLAMP16( n ) \
-{\
+({                              \
-    if ( (int16_t) n != n )\
+    if ( (int16_t) n != n )     \
-        out = 0x7FFF ^ (n >> 31);\
+        n = 0x7FFF ^ (n >> 31); \
-}
+    n;                          \
+})
 #if SPC_BRRCACHE
 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
@@ -181,7 +181,7 @@ static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
                    smp2 = smp1;
                }
                
-                CLAMP16( delta, delta );
+                delta = CLAMP16( delta );
                smp1 = (int16_t) (delta * 2); /* sign-extend */
            }
            while ( (offset += 4) != 0 );
@@ -359,7 +359,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
        #define VOICE_RATE(x) (INT16A(raw_voice->rate) & 0x3FFF)
        #define IF_RBE(...)
    #endif /* ROCKBOX_BIG_ENDIAN */
-    
+   
 #if !SPC_NOINTERP
    int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
        this->r.g.noise_enables;
@@ -431,7 +431,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
            
            /* Envelope */
            {
-                int const env_range = 0x800;
+                int const ENV_RANGE = 0x800;
                int env_mode = voice->env_mode;
                int adsr0 = raw_voice->adsr [0];
                int env_timer;
@@ -482,14 +482,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                            
                            int envx = voice->envx;
                            
-                            int const step = env_range / 64;
+                            int const step = ENV_RANGE / 64;
                            envx += step;
                            if ( t == 15 )
-                                envx += env_range / 2 - step;
+                                envx += ENV_RANGE / 2 - step;
                            
-                            if ( envx >= env_range )
+                            if ( envx >= ENV_RANGE )
                            {
-                                envx = env_range - 1;
+                                envx = ENV_RANGE - 1;
                                voice->env_mode = state_decay;
                            }
                            voice->envx = envx;
@@ -516,7 +516,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                            int mode = t >> 5;
                            if ( mode <= 5 ) /* decay */
                            {
-                                int step = env_range / 64;
+                                int step = ENV_RANGE / 64;
                                if ( mode == 5 ) /* exponential */
                                {
                                    envx--; /* envx *= 255 / 256 */
@@ -531,14 +531,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                            }
                            else /* attack */
                            {
-                                int const step = env_range / 64;
+                                int const step = ENV_RANGE / 64;
                                envx += step;
                                if ( mode == 7 &&
-                                     envx >= env_range * 3 / 4 + step )
+                                     envx >= ENV_RANGE * 3 / 4 + step )
-                                    envx += env_range / 256 - step;
+                                    envx += ENV_RANGE / 256 - step;
                                
-                                if ( envx >= env_range )
+                                if ( envx >= ENV_RANGE )
-                                    envx = env_range - 1;
+                                    envx = ENV_RANGE - 1;
                            }
                            voice->envx = envx;
                            /* TODO: should this be 8? */
@@ -550,7 +550,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                else /* state_release */
                {
                    int envx = voice->envx;
-                    if ( (envx -= env_range / 256) > 0 )
+                    if ( (envx -= ENV_RANGE / 256) > 0 )
                    {
                        voice->envx = envx;
                        raw_voice->envx = envx >> 8;
@@ -683,7 +683,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                        smp2 = smp1;
                    }
                    
-                    CLAMP16( delta, delta );
+                    delta = CLAMP16( delta );
                    smp1 = (int16_t) (delta * 2); /* sign-extend */
                }
                while ( (offset += 4) != 0 );
@@ -778,7 +778,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                    output = (output + rev [1] * interp [2]) >> 12;
                    output = (int16_t) (output * 2);
                    output += ((rev [0] * interp [3]) >> 12) * 2;
-                    CLAMP16( output, output );
+                    output = CLAMP16( output );
                }
                output = (output * voice->envx) >> 11 & ~1;
                
@@ -788,7 +788,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                prev_outx = output;
                raw_voice->outx = (int8_t) (output >> 8);
            }
-        #else
+        #else /* SPCNOINTERP */
        /* two-point linear interpolation */
        #ifdef CPU_COLDFIRE
            int amp_0 = (int16_t)this->noise;
@@ -822,7 +822,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                /* output = y0 + (result >> 12)          */
                "asr.l      %[sh], %[y1]              \r\n"
                "add.l      %[y0], %[y1]              \r\n"
-                : [f]"+&d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
+                : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
                : [s]"a"(voice->samples), [sh]"d"(12)
                    );
            }
@@ -861,17 +861,49 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
            "movclr.l %%acc1, %[amp_1] \r\n"
            : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
            );
-        #else
+        #elif defined (CPU_ARM)
+            int amp_0, amp_1;
+            
+            if ( (this->r.g.noise_enables & vbit) != 0 ) {
+                amp_0 = *(int16_t *)&this->noise;
+            } else {
+                uint32_t f = voice->position;
+                amp_0 = (uint32_t)voice->samples;
-            /* Try this one out on ARM and see - similar to above but the asm
+                asm volatile(
-               on coldfire removes a redundant register load worth 1 or 2%;
+                "mov    %[y1], %[f], lsr #12        \r\n"
-               switching to loading two samples at once may help too. That's
+                "eor    %[f], %[f], %[y1], lsl #12  \r\n" 
-               done above and while 6 to 7% faster on cf over two 16 bit loads
+                "add    %[y1], %[y0], %[y1], lsl #1 \r\n"
-               it makes it endian dependant.
+                "ldrsh  %[y0], [%[y1], #2]          \r\n"
-               
+                "ldrsh  %[y1], [%[y1], #4]          \r\n"
-               measured small improvement (~1.5%) - hcs
+                "sub    %[y1], %[y1], %[y0]         \r\n"
-            */
+                "mul    %[f], %[y1], %[f]           \r\n"
+                "add    %[y0], %[y0], %[f], asr #12 \r\n"
+                : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
+                );
+            }
+            voice->position += rate;
+            asm volatile(
+            "mul    %[amp_1], %[amp_0], %[envx] \r\n"
+            "mov    %[amp_0], %[amp_1], asr #11 \r\n"
+            "mov    %[amp_1], %[amp_0], asr #8  \r\n"
+            : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
+            : [envx]"r"(voice->envx)
+            );
+            prev_outx = amp_0;
+            raw_voice->outx = (int8_t)amp_1;
+            asm volatile(
+            "mul    %[amp_1], %[amp_0], %[vol_1] \r\n"
+            "mul    %[amp_0], %[vol_0], %[amp_0] \r\n"
+            : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
+            : [vol_0]"r"((int)voice->volume[0]),
+              [vol_1]"r"((int)voice->volume[1])
+            );
+        #else /* Unoptimized CPU */
            int output;
            
            if ( (this->r.g.noise_enables & vbit) == 0 )
@@ -884,19 +916,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
            }
            voice->position += rate;
-            
-            /* old version */
-#if 0
-            int fraction = voice->position & 0xFFF;
-            short const* const pos = voice->samples + (voice->position >> 12);
-            voice->position += rate;
-            int output =
-                (pos [2] * fraction + pos [1] * (0x1000 - fraction)) >> 12;
-            /* no interpolation (hardly faster, and crappy sounding) */
-            /*int output = pos [0];*/
-            if ( this->r.g.noise_enables & vbit )
-                output = *(int16_t*) &this->noise;
-#endif
            output = (output * voice->envx) >> 11;
            /* duplicated here to give compiler more to run in parallel */
@@ -905,8 +925,8 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
            prev_outx = output;
            raw_voice->outx = (int8_t) (output >> 8);
-        #endif /* CPU_COLDFIRE */
+        #endif /* CPU_* */
-        #endif
+        #endif /* SPCNOINTERP */
        
        #if SPC_BRRCACHE
            if ( voice->position >= voice->wave_end )
@@ -1033,7 +1053,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
            "or.l       %[sh], %[e0]                \r\n"
            /* save final feedback into echo buffer    */
            "move.l     %[e0], (%[echo_ptr])        \r\n"
-            : [e0]"+&d"(echo_0), [e1]"+&d"(echo_1)
+            : [e0]"+d"(echo_0), [e1]"+d"(echo_1)
            : [out_0]"r"(out_0), [out_1]"r"(out_1),
              [ef]"r"((int)this->r.g.echo_feedback),
              [echo_ptr]"a"((int32_t *)echo_ptr),
@@ -1056,7 +1076,88 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
        out_buf [             0] = out_0;
        out_buf [WAV_CHUNK_SIZE] = out_1;
        out_buf ++;
-    #else /* !CPU_COLDFIRE */
+    #elif defined (CPU_ARM)
+        /* Read feedback from echo buffer */
+        int echo_pos = this->echo_pos;
+        uint8_t* const echo_ptr = RAM +
+                ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
+        echo_pos += 4;
+        if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
+            echo_pos = 0;
+        this->echo_pos = echo_pos;
+        int fb_0 = GET_LE16SA( echo_ptr     );
+        int fb_1 = GET_LE16SA( echo_ptr + 2 );
+        /* Keep last 8 samples */
+        int32_t *fir_ptr = this->fir_ptr;
+        /* Apply FIR */
+        asm volatile (
+        "str    %[fb_0], [%[fir_p]], #4  \r\n"
+        "str    %[fb_1], [%[fir_p]], #4  \r\n"
+        /* duplicate at +8 eliminates wrap checking below */
+        "str    %[fb_0], [%[fir_p], #56] \r\n"
+        "str    %[fb_1], [%[fir_p], #60] \r\n"
+        : [fir_p]"+r"(fir_ptr)
+        : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
+        );
+        this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
+        int32_t *fir_coeff = this->fir_coeff;
+        asm volatile (
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r4-r5 }     \r\n"
+        "mul    %[fb_0],     r0, %[fb_0] \r\n"
+        "mul    %[fb_1],     r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
+        "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
+        "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
+        "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
+        "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
+        "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
+        "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
+          [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
+        :
+        : "r0", "r1", "r2", "r3", "r4", "r5"
+        );
+        /* Generate output */
+        int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
+                    >> global_muting;
+        int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
+                    >> global_muting;
+        out_buf [             0] = amp_0;
+        out_buf [WAV_CHUNK_SIZE] = amp_1;
+        out_buf ++;
+        if ( !(this->r.g.flags & 0x20) )
+        {
+            /* Feedback into echo buffer */
+            int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
+            int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
+            e0 = CLAMP16( e0 );
+            SET_LE16A( echo_ptr    , e0 );
+            e1 = CLAMP16( e1 );
+            SET_LE16A( echo_ptr + 2, e1 );
+        }
+    #else /* Unoptimized CPU */
        /* Read feedback from echo buffer */
        int echo_pos = this->echo_pos;
        uint8_t* const echo_ptr = RAM +
@@ -1102,25 +1203,25 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
        out_buf [WAV_CHUNK_SIZE] = amp_1;
        out_buf ++;
        
-        /* Feedback into echo buffer */
-        int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
-        int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
        if ( !(this->r.g.flags & 0x20) )
        {
-            CLAMP16( e0, e0 );
+            /* Feedback into echo buffer */
+            int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
+            int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
+            e0 = CLAMP16( e0 );
            SET_LE16A( echo_ptr    , e0 );
-            CLAMP16( e1, e1 );
+            e1 = CLAMP16( e1 );
            SET_LE16A( echo_ptr + 2, e1 );
        }
-    #endif /* CPU_COLDFIRE */
+    #endif /* CPU_* */
-    #else
+    #else /* SPCNOECHO == 1*/
        /* Generate output  */
        int amp_0 = (chans_0 * global_vol_0) >> global_muting;
        int amp_1 = (chans_1 * global_vol_1) >> global_muting;
        out_buf [             0] = amp_0;
        out_buf [WAV_CHUNK_SIZE] = amp_1;
        out_buf ++;
-    #endif
+    #endif /* SPCNOECHO */
    }
    while ( --count );
 #if 0
@@ -1155,10 +1256,13 @@ void DSP_reset( struct Spc_Dsp* this )
            this->wave_entry [i].start_addr = -1;
    #endif
-#ifdef CPU_COLDFIRE
+#if defined(CPU_COLDFIRE)
-    this->fir_ptr      = fir_buf;
+    this->fir_ptr = fir_buf;
    this->last_fir_ptr = &fir_buf [7];
    ci->memset( fir_buf, 0, sizeof fir_buf );
+#elif defined (CPU_ARM)
+    this->fir_ptr = fir_buf;
+    ci->memset( fir_buf, 0, sizeof fir_buf );
 #else
    this->fir_pos = 0;
    ci->memset( this->fir_buf, 0, sizeof this->fir_buf );

diff --git a/apps/codecs/spc/spc_dsp.c b/apps/codecs/spc/spc_dsp.c index 8881788cf1..19986fd8a8 100644 --- a/apps/codecs/spc/spc_dsp.c +++ b/apps/codecs/spc/spc_dsp.c
@@ -25,14 +25,13 @@
25	#include "spc_codec.h"	25	#include "spc_codec.h"
26	#include "spc_profiler.h"	26	#include "spc_profiler.h"
27		27
28	#ifdef CPU_COLDFIRE	28	#if defined(CPU_COLDFIRE) || defined (CPU_ARM)
29	static int32_t fir_buf[FIR_BUF_HALF]	29	int32_t fir_buf[FIR_BUF_CNT]
30	__attribute__ ((aligned (FIR_BUF_SIZE*2))) IBSS_ATTR;	30	__attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR;
31	#endif	31	#endif
32
33	#if SPC_BRRCACHE	32	#if SPC_BRRCACHE
34	/* a little extra for samples that go past end */	33	/* a little extra for samples that go past end */
35	int16_t BRRcache [0x20000 + 32];	34	int16_t BRRcache [BRR_CACHE_SIZE];
36	#endif	35	#endif
37		36
38	void DSP_write( struct Spc_Dsp* this, int i, int data )	37	void DSP_write( struct Spc_Dsp* this, int i, int data )
@@ -58,11 +57,12 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
58		57
59	/* if ( n < -32768 ) out = -32768; */	58	/* if ( n < -32768 ) out = -32768; */
60	/* if ( n > 32767 ) out = 32767; */	59	/* if ( n > 32767 ) out = 32767; */
61	#define CLAMP16( n, out )\	60	#define CLAMP16( n ) \
62	{\	61	({ \
63	if ( (int16_t) n != n )\	62	if ( (int16_t) n != n ) \
64	out = 0x7FFF ^ (n >> 31);\	63	n = 0x7FFF ^ (n >> 31); \
65	}	64	n; \
		65	})
66		66
67	#if SPC_BRRCACHE	67	#if SPC_BRRCACHE
68	static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,	68	static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
@@ -181,7 +181,7 @@ static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
181	smp2 = smp1;	181	smp2 = smp1;
182	}	182	}
183		183
184	CLAMP16( delta, delta );	184	delta = CLAMP16( delta );
185	smp1 = (int16_t) (delta * 2); /* sign-extend */	185	smp1 = (int16_t) (delta * 2); /* sign-extend */
186	}	186	}
187	while ( (offset += 4) != 0 );	187	while ( (offset += 4) != 0 );
@@ -359,7 +359,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
359	#define VOICE_RATE(x) (INT16A(raw_voice->rate) & 0x3FFF)	359	#define VOICE_RATE(x) (INT16A(raw_voice->rate) & 0x3FFF)
360	#define IF_RBE(...)	360	#define IF_RBE(...)
361	#endif /* ROCKBOX_BIG_ENDIAN */	361	#endif /* ROCKBOX_BIG_ENDIAN */
362		362
363	#if !SPC_NOINTERP	363	#if !SPC_NOINTERP
364	int const slow_gaussian = (this->r.g.pitch_mods >> 1) |	364	int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
365	this->r.g.noise_enables;	365	this->r.g.noise_enables;
@@ -431,7 +431,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
431		431
432	/* Envelope */	432	/* Envelope */
433	{	433	{
434	int const env_range = 0x800;	434	int const ENV_RANGE = 0x800;
435	int env_mode = voice->env_mode;	435	int env_mode = voice->env_mode;
436	int adsr0 = raw_voice->adsr [0];	436	int adsr0 = raw_voice->adsr [0];
437	int env_timer;	437	int env_timer;
@@ -482,14 +482,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
482		482
483	int envx = voice->envx;	483	int envx = voice->envx;
484		484
485	int const step = env_range / 64;	485	int const step = ENV_RANGE / 64;
486	envx += step;	486	envx += step;
487	if ( t == 15 )	487	if ( t == 15 )
488	envx += env_range / 2 - step;	488	envx += ENV_RANGE / 2 - step;
489		489
490	if ( envx >= env_range )	490	if ( envx >= ENV_RANGE )
491	{	491	{
492	envx = env_range - 1;	492	envx = ENV_RANGE - 1;
493	voice->env_mode = state_decay;	493	voice->env_mode = state_decay;
494	}	494	}
495	voice->envx = envx;	495	voice->envx = envx;
@@ -516,7 +516,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
516	int mode = t >> 5;	516	int mode = t >> 5;
517	if ( mode <= 5 ) /* decay */	517	if ( mode <= 5 ) /* decay */
518	{	518	{
519	int step = env_range / 64;	519	int step = ENV_RANGE / 64;
520	if ( mode == 5 ) /* exponential */	520	if ( mode == 5 ) /* exponential */
521	{	521	{
522	envx--; /* envx *= 255 / 256 */	522	envx--; /* envx *= 255 / 256 */
@@ -531,14 +531,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
531	}	531	}
532	else /* attack */	532	else /* attack */
533	{	533	{
534	int const step = env_range / 64;	534	int const step = ENV_RANGE / 64;
535	envx += step;	535	envx += step;
536	if ( mode == 7 &&	536	if ( mode == 7 &&
537	envx >= env_range * 3 / 4 + step )	537	envx >= ENV_RANGE * 3 / 4 + step )
538	envx += env_range / 256 - step;	538	envx += ENV_RANGE / 256 - step;
539		539
540	if ( envx >= env_range )	540	if ( envx >= ENV_RANGE )
541	envx = env_range - 1;	541	envx = ENV_RANGE - 1;
542	}	542	}
543	voice->envx = envx;	543	voice->envx = envx;
544	/* TODO: should this be 8? */	544	/* TODO: should this be 8? */
@@ -550,7 +550,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
550	else /* state_release */	550	else /* state_release */
551	{	551	{
552	int envx = voice->envx;	552	int envx = voice->envx;
553	if ( (envx -= env_range / 256) > 0 )	553	if ( (envx -= ENV_RANGE / 256) > 0 )
554	{	554	{
555	voice->envx = envx;	555	voice->envx = envx;
556	raw_voice->envx = envx >> 8;	556	raw_voice->envx = envx >> 8;
@@ -683,7 +683,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
683	smp2 = smp1;	683	smp2 = smp1;
684	}	684	}
685		685
686	CLAMP16( delta, delta );	686	delta = CLAMP16( delta );
687	smp1 = (int16_t) (delta * 2); /* sign-extend */	687	smp1 = (int16_t) (delta * 2); /* sign-extend */
688	}	688	}
689	while ( (offset += 4) != 0 );	689	while ( (offset += 4) != 0 );
@@ -778,7 +778,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
778	output = (output + rev [1] * interp [2]) >> 12;	778	output = (output + rev [1] * interp [2]) >> 12;
779	output = (int16_t) (output * 2);	779	output = (int16_t) (output * 2);
780	output += ((rev [0] * interp [3]) >> 12) * 2;	780	output += ((rev [0] * interp [3]) >> 12) * 2;
781	CLAMP16( output, output );	781	output = CLAMP16( output );
782	}	782	}
783	output = (output * voice->envx) >> 11 & ~1;	783	output = (output * voice->envx) >> 11 & ~1;
784		784
@@ -788,7 +788,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
788	prev_outx = output;	788	prev_outx = output;
789	raw_voice->outx = (int8_t) (output >> 8);	789	raw_voice->outx = (int8_t) (output >> 8);
790	}	790	}
791	#else	791	#else /* SPCNOINTERP */
792	/* two-point linear interpolation */	792	/* two-point linear interpolation */
793	#ifdef CPU_COLDFIRE	793	#ifdef CPU_COLDFIRE
794	int amp_0 = (int16_t)this->noise;	794	int amp_0 = (int16_t)this->noise;
@@ -822,7 +822,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
822	/* output = y0 + (result >> 12) */	822	/* output = y0 + (result >> 12) */
823	"asr.l %[sh], %[y1] \r\n"	823	"asr.l %[sh], %[y1] \r\n"
824	"add.l %[y0], %[y1] \r\n"	824	"add.l %[y0], %[y1] \r\n"
825	: [f]"+&d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)	825	: [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
826	: [s]"a"(voice->samples), [sh]"d"(12)	826	: [s]"a"(voice->samples), [sh]"d"(12)
827	);	827	);
828	}	828	}
@@ -861,17 +861,49 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
861	"movclr.l %%acc1, %[amp_1] \r\n"	861	"movclr.l %%acc1, %[amp_1] \r\n"
862	: [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)	862	: [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
863	);	863	);
864	#else	864	#elif defined (CPU_ARM)
		865	int amp_0, amp_1;
		866
		867	if ( (this->r.g.noise_enables & vbit) != 0 ) {
		868	amp_0 = *(int16_t *)&this->noise;
		869	} else {
		870	uint32_t f = voice->position;
		871	amp_0 = (uint32_t)voice->samples;
865		872
866	/* Try this one out on ARM and see - similar to above but the asm	873	asm volatile(
867	on coldfire removes a redundant register load worth 1 or 2%;	874	"mov %[y1], %[f], lsr #12 \r\n"
868	switching to loading two samples at once may help too. That's	875	"eor %[f], %[f], %[y1], lsl #12 \r\n"
869	done above and while 6 to 7% faster on cf over two 16 bit loads	876	"add %[y1], %[y0], %[y1], lsl #1 \r\n"
870	it makes it endian dependant.	877	"ldrsh %[y0], [%[y1], #2] \r\n"
871		878	"ldrsh %[y1], [%[y1], #4] \r\n"
872	measured small improvement (~1.5%) - hcs	879	"sub %[y1], %[y1], %[y0] \r\n"
873	*/	880	"mul %[f], %[y1], %[f] \r\n"
		881	"add %[y0], %[y0], %[f], asr #12 \r\n"
		882	: [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
		883	);
		884	}
		885
		886	voice->position += rate;
		887
		888	asm volatile(
		889	"mul %[amp_1], %[amp_0], %[envx] \r\n"
		890	"mov %[amp_0], %[amp_1], asr #11 \r\n"
		891	"mov %[amp_1], %[amp_0], asr #8 \r\n"
		892	: [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
		893	: [envx]"r"(voice->envx)
		894	);
		895
		896	prev_outx = amp_0;
		897	raw_voice->outx = (int8_t)amp_1;
874		898
		899	asm volatile(
		900	"mul %[amp_1], %[amp_0], %[vol_1] \r\n"
		901	"mul %[amp_0], %[vol_0], %[amp_0] \r\n"
		902	: [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
		903	: [vol_0]"r"((int)voice->volume[0]),
		904	[vol_1]"r"((int)voice->volume[1])
		905	);
		906	#else /* Unoptimized CPU */
875	int output;	907	int output;
876		908
877	if ( (this->r.g.noise_enables & vbit) == 0 )	909	if ( (this->r.g.noise_enables & vbit) == 0 )
@@ -884,19 +916,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
884	}	916	}
885		917
886	voice->position += rate;	918	voice->position += rate;
887		919
888	/* old version */
889	#if 0
890	int fraction = voice->position & 0xFFF;
891	short const* const pos = voice->samples + (voice->position >> 12);
892	voice->position += rate;
893	int output =
894	(pos [2] * fraction + pos [1] * (0x1000 - fraction)) >> 12;
895	/* no interpolation (hardly faster, and crappy sounding) */
896	/*int output = pos [0];*/
897	if ( this->r.g.noise_enables & vbit )
898	output = *(int16_t*) &this->noise;
899	#endif
900	output = (output * voice->envx) >> 11;	920	output = (output * voice->envx) >> 11;
901		921
902	/* duplicated here to give compiler more to run in parallel */	922	/* duplicated here to give compiler more to run in parallel */
@@ -905,8 +925,8 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
905		925
906	prev_outx = output;	926	prev_outx = output;
907	raw_voice->outx = (int8_t) (output >> 8);	927	raw_voice->outx = (int8_t) (output >> 8);
908	#endif /* CPU_COLDFIRE */	928	#endif /* CPU_* */
909	#endif	929	#endif /* SPCNOINTERP */
910		930
911	#if SPC_BRRCACHE	931	#if SPC_BRRCACHE
912	if ( voice->position >= voice->wave_end )	932	if ( voice->position >= voice->wave_end )
@@ -1033,7 +1053,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
1033	"or.l %[sh], %[e0] \r\n"	1053	"or.l %[sh], %[e0] \r\n"
1034	/* save final feedback into echo buffer */	1054	/* save final feedback into echo buffer */
1035	"move.l %[e0], (%[echo_ptr]) \r\n"	1055	"move.l %[e0], (%[echo_ptr]) \r\n"
1036	: [e0]"+&d"(echo_0), [e1]"+&d"(echo_1)	1056	: [e0]"+d"(echo_0), [e1]"+d"(echo_1)
1037	: [out_0]"r"(out_0), [out_1]"r"(out_1),	1057	: [out_0]"r"(out_0), [out_1]"r"(out_1),
1038	[ef]"r"((int)this->r.g.echo_feedback),	1058	[ef]"r"((int)this->r.g.echo_feedback),
1039	[echo_ptr]"a"((int32_t *)echo_ptr),	1059	[echo_ptr]"a"((int32_t *)echo_ptr),
@@ -1056,7 +1076,88 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
1056	out_buf [ 0] = out_0;	1076	out_buf [ 0] = out_0;
1057	out_buf [WAV_CHUNK_SIZE] = out_1;	1077	out_buf [WAV_CHUNK_SIZE] = out_1;
1058	out_buf ++;	1078	out_buf ++;
1059	#else /* !CPU_COLDFIRE */	1079	#elif defined (CPU_ARM)
		1080	/* Read feedback from echo buffer */
		1081	int echo_pos = this->echo_pos;
		1082	uint8_t* const echo_ptr = RAM +
		1083	((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
		1084	echo_pos += 4;
		1085	if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
		1086	echo_pos = 0;
		1087	this->echo_pos = echo_pos;
		1088
		1089	int fb_0 = GET_LE16SA( echo_ptr );
		1090	int fb_1 = GET_LE16SA( echo_ptr + 2 );
		1091
		1092	/* Keep last 8 samples */
		1093	int32_t *fir_ptr = this->fir_ptr;
		1094
		1095	/* Apply FIR */
		1096	asm volatile (
		1097	"str %[fb_0], [%[fir_p]], #4 \r\n"
		1098	"str %[fb_1], [%[fir_p]], #4 \r\n"
		1099	/* duplicate at +8 eliminates wrap checking below */
		1100	"str %[fb_0], [%[fir_p], #56] \r\n"
		1101	"str %[fb_1], [%[fir_p], #60] \r\n"
		1102	: [fir_p]"+r"(fir_ptr)
		1103	: [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
		1104	);
		1105
		1106	this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
		1107	int32_t *fir_coeff = this->fir_coeff;
		1108
		1109	asm volatile (
		1110	"ldmia %[fir_c]!, { r0-r1 } \r\n"
		1111	"ldmia %[fir_p]!, { r4-r5 } \r\n"
		1112	"mul %[fb_0], r0, %[fb_0] \r\n"
		1113	"mul %[fb_1], r0, %[fb_1] \r\n"
		1114	"mla %[fb_0], r4, r1, %[fb_0] \r\n"
		1115	"mla %[fb_1], r5, r1, %[fb_1] \r\n"
		1116	"ldmia %[fir_c]!, { r0-r1 } \r\n"
		1117	"ldmia %[fir_p]!, { r2-r5 } \r\n"
		1118	"mla %[fb_0], r2, r0, %[fb_0] \r\n"
		1119	"mla %[fb_1], r3, r0, %[fb_1] \r\n"
		1120	"mla %[fb_0], r4, r1, %[fb_0] \r\n"
		1121	"mla %[fb_1], r5, r1, %[fb_1] \r\n"
		1122	"ldmia %[fir_c]!, { r0-r1 } \r\n"
		1123	"ldmia %[fir_p]!, { r2-r5 } \r\n"
		1124	"mla %[fb_0], r2, r0, %[fb_0] \r\n"
		1125	"mla %[fb_1], r3, r0, %[fb_1] \r\n"
		1126	"mla %[fb_0], r4, r1, %[fb_0] \r\n"
		1127	"mla %[fb_1], r5, r1, %[fb_1] \r\n"
		1128	"ldmia %[fir_c]!, { r0-r1 } \r\n"
		1129	"ldmia %[fir_p]!, { r2-r5 } \r\n"
		1130	"mla %[fb_0], r2, r0, %[fb_0] \r\n"
		1131	"mla %[fb_1], r3, r0, %[fb_1] \r\n"
		1132	"mla %[fb_0], r4, r1, %[fb_0] \r\n"
		1133	"mla %[fb_1], r5, r1, %[fb_1] \r\n"
		1134	: [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
		1135	[fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
		1136	:
		1137	: "r0", "r1", "r2", "r3", "r4", "r5"
		1138	);
		1139
		1140	/* Generate output */
		1141	int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
		1142	>> global_muting;
		1143	int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
		1144	>> global_muting;
		1145
		1146	out_buf [ 0] = amp_0;
		1147	out_buf [WAV_CHUNK_SIZE] = amp_1;
		1148	out_buf ++;
		1149
		1150	if ( !(this->r.g.flags & 0x20) )
		1151	{
		1152	/* Feedback into echo buffer */
		1153	int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
		1154	int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
		1155	e0 = CLAMP16( e0 );
		1156	SET_LE16A( echo_ptr , e0 );
		1157	e1 = CLAMP16( e1 );
		1158	SET_LE16A( echo_ptr + 2, e1 );
		1159	}
		1160	#else /* Unoptimized CPU */
1060	/* Read feedback from echo buffer */	1161	/* Read feedback from echo buffer */
1061	int echo_pos = this->echo_pos;	1162	int echo_pos = this->echo_pos;
1062	uint8_t* const echo_ptr = RAM +	1163	uint8_t* const echo_ptr = RAM +
@@ -1102,25 +1203,25 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
1102	out_buf [WAV_CHUNK_SIZE] = amp_1;	1203	out_buf [WAV_CHUNK_SIZE] = amp_1;
1103	out_buf ++;	1204	out_buf ++;
1104		1205
1105	/* Feedback into echo buffer */
1106	int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1107	int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1108	if ( !(this->r.g.flags & 0x20) )	1206	if ( !(this->r.g.flags & 0x20) )
1109	{	1207	{
1110	CLAMP16( e0, e0 );	1208	/* Feedback into echo buffer */
		1209	int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
		1210	int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
		1211	e0 = CLAMP16( e0 );
1111	SET_LE16A( echo_ptr , e0 );	1212	SET_LE16A( echo_ptr , e0 );
1112	CLAMP16( e1, e1 );	1213	e1 = CLAMP16( e1 );
1113	SET_LE16A( echo_ptr + 2, e1 );	1214	SET_LE16A( echo_ptr + 2, e1 );
1114	}	1215	}
1115	#endif /* CPU_COLDFIRE */	1216	#endif /* CPU_* */
1116	#else	1217	#else /* SPCNOECHO == 1*/
1117	/* Generate output */	1218	/* Generate output */
1118	int amp_0 = (chans_0 * global_vol_0) >> global_muting;	1219	int amp_0 = (chans_0 * global_vol_0) >> global_muting;
1119	int amp_1 = (chans_1 * global_vol_1) >> global_muting;	1220	int amp_1 = (chans_1 * global_vol_1) >> global_muting;
1120	out_buf [ 0] = amp_0;	1221	out_buf [ 0] = amp_0;
1121	out_buf [WAV_CHUNK_SIZE] = amp_1;	1222	out_buf [WAV_CHUNK_SIZE] = amp_1;
1122	out_buf ++;	1223	out_buf ++;
1123	#endif	1224	#endif /* SPCNOECHO */
1124	}	1225	}
1125	while ( --count );	1226	while ( --count );
1126	#if 0	1227	#if 0
@@ -1155,10 +1256,13 @@ void DSP_reset( struct Spc_Dsp* this )
1155	this->wave_entry [i].start_addr = -1;	1256	this->wave_entry [i].start_addr = -1;
1156	#endif	1257	#endif
1157		1258
1158	#ifdef CPU_COLDFIRE	1259	#if defined(CPU_COLDFIRE)
1159	this->fir_ptr = fir_buf;	1260	this->fir_ptr = fir_buf;
1160	this->last_fir_ptr = &fir_buf [7];	1261	this->last_fir_ptr = &fir_buf [7];
1161	ci->memset( fir_buf, 0, sizeof fir_buf );	1262	ci->memset( fir_buf, 0, sizeof fir_buf );
		1263	#elif defined (CPU_ARM)
		1264	this->fir_ptr = fir_buf;
		1265	ci->memset( fir_buf, 0, sizeof fir_buf );
1162	#else	1266	#else
1163	this->fir_pos = 0;	1267	this->fir_pos = 0;
1164	ci->memset( this->fir_buf, 0, sizeof this->fir_buf );	1268	ci->memset( this->fir_buf, 0, sizeof this->fir_buf );