From 1a41c8afeefd4884a5369c79808c68b3514eec8b Mon Sep 17 00:00:00 2001 From: Michael Sevakis Date: Sun, 18 Nov 2007 19:03:45 +0000 Subject: SPC Codec: Run SPC emulation on COP and audio sample processing on CPU on dual-core PortalPlayer targets. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15673 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs.c | 14 +- apps/codecs.h | 19 ++- apps/codecs/spc.c | 309 +++++++++++++++++++++++++++++++++++++++----- apps/codecs/spc/spc_codec.h | 100 ++++++++++---- apps/codecs/spc/spc_dsp.c | 232 ++++++++++++++++++++++++--------- 5 files changed, 549 insertions(+), 125 deletions(-) diff --git a/apps/codecs.c b/apps/codecs.c index f2539dc73e..9969b6f597 100644 --- a/apps/codecs.c +++ b/apps/codecs.c @@ -162,7 +162,19 @@ struct codec_api ci = { invalidate_icache, #endif - NULL, /* struct sp_data *dsp */ + NULL, /* struct dsp_config *dsp */ + +#if NUM_CORES > 1 + create_thread, + thread_thaw, + thread_wait, + semaphore_init, + semaphore_wait, + semaphore_release, + event_init, + event_wait, + event_set_state, +#endif }; void codec_get_full_path(char *path, const char *codec_root_fn) diff --git a/apps/codecs.h b/apps/codecs.h index d2ba00ca2a..29ed2d351b 100644 --- a/apps/codecs.h +++ b/apps/codecs.h @@ -80,7 +80,7 @@ #define CODEC_ENC_MAGIC 0x52454E43 /* RENC */ /* increase this every time the api struct changes */ -#define CODEC_API_VERSION 20 +#define CODEC_API_VERSION 21 /* update this to latest version if a change to the api struct breaks backwards compatibility (and please take the opportunity to sort in any @@ -236,6 +236,23 @@ struct codec_api { #endif struct dsp_config *dsp; + +#if NUM_CORES > 1 + struct thread_entry * + (*create_thread)(void (*function)(void), void* stack, + int stack_size, unsigned flags, const char *name + IF_PRIO(, int priority) + IF_COP(, unsigned int core)); + + void (*thread_thaw)(struct thread_entry *thread); + void (*thread_wait)(struct thread_entry *thread); + void (*semaphore_init)(struct semaphore 
*s, int max, int start); + void (*semaphore_wait)(struct semaphore *s); + void (*semaphore_release)(struct semaphore *s); + void (*event_init)(struct event *e, unsigned int flags); + void (*event_wait)(struct event *e, unsigned int for_state); + void (*event_set_state)(struct event *e, unsigned int state); +#endif /* NUM_CORES */ }; /* codec header */ diff --git a/apps/codecs/spc.c b/apps/codecs/spc.c index f2890cd4a4..ae3026354d 100644 --- a/apps/codecs/spc.c +++ b/apps/codecs/spc.c @@ -185,12 +185,253 @@ static int LoadID666(unsigned char *buf) { } /**************** Codec ****************/ +enum {SAMPLE_RATE = 32000}; +static struct Spc_Emu spc_emu IDATA_ATTR CACHEALIGN_ATTR; -static int32_t samples[WAV_CHUNK_SIZE*2] IBSS_ATTR; +#if SPC_DUAL_CORE +/** Implementations for pipelined dual-core operation **/ +static int spc_emu_thread_stack[DEFAULT_STACK_SIZE/sizeof(int)] + CACHEALIGN_ATTR; -static struct Spc_Emu spc_emu IDATA_ATTR; +static const unsigned char * const spc_emu_thread_name = "spc emu"; +static struct thread_entry *emu_thread_p; -enum {SAMPLE_RATE = 32000}; +enum +{ + SPC_EMU_AUDIO = 0, + SPC_EMU_LOAD, + SPC_EMU_QUIT, +}; + +struct spc_load +{ + uint8_t *buf; + size_t size; +}; + +/* sample queue */ +#define WAV_NUM_CHUNKS 2 +#define WAV_CHUNK_MASK (WAV_NUM_CHUNKS-1) +struct sample_queue_chunk +{ + long id; + union + { + intptr_t data; + int32_t audio[WAV_CHUNK_SIZE*2]; + }; +}; + +static struct +{ + int head, tail; + struct semaphore emu_sem_head; + struct semaphore emu_sem_tail; + struct event emu_evt_reply; + intptr_t retval; + struct sample_queue_chunk wav_chunk[WAV_NUM_CHUNKS]; +} sample_queue NOCACHEBSS_ATTR; + +static inline void samples_release_wrbuf(void) +{ + sample_queue.tail++; + ci->semaphore_release(&sample_queue.emu_sem_head); +} + +static inline struct sample_queue_chunk * samples_get_wrbuf(void) +{ + ci->semaphore_wait(&sample_queue.emu_sem_tail); + return &sample_queue.wav_chunk[sample_queue.tail & WAV_CHUNK_MASK]; +} + +static inline 
void samples_release_rdbuf(void) +{ + if (sample_queue.head != sample_queue.tail) { + sample_queue.head++; + } + + ci->semaphore_release(&sample_queue.emu_sem_tail); +} + +static inline int32_t * samples_get_rdbuf(void) +{ + ci->semaphore_wait(&sample_queue.emu_sem_head); + + if (ci->stop_codec || ci->new_track) + { + /* Told to stop. Buffer must be released. */ + samples_release_rdbuf(); + return NULL; + } + + return sample_queue.wav_chunk[sample_queue.head & WAV_CHUNK_MASK].audio; +} + +static intptr_t emu_thread_send_msg(long id, intptr_t data) +{ + struct sample_queue_chunk *chunk; + /* Grab an audio output buffer */ + ci->semaphore_wait(&sample_queue.emu_sem_head); + chunk = &sample_queue.wav_chunk[sample_queue.head & WAV_CHUNK_MASK]; + /* Place a message in it instead of audio */ + chunk->id = id; + chunk->data = data; + /* Release it to the emu thread */ + samples_release_rdbuf(); + /* Wait for a response */ + ci->event_wait(&sample_queue.emu_evt_reply, STATE_SIGNALED); + return sample_queue.retval; +} + +/* thread function */ +static bool emu_thread_process_msg(struct sample_queue_chunk *chunk) +{ + long id = chunk->id; + bool ret = id != SPC_EMU_QUIT; + + chunk->id = SPC_EMU_AUDIO; /* Reset chunk type to audio */ + sample_queue.retval = 0; + + if (id == SPC_EMU_LOAD) + { + struct spc_load *ld = (struct spc_load *)chunk->data; + invalidate_icache(); + SPC_Init(&spc_emu); + sample_queue.retval = SPC_load_spc(&spc_emu, ld->buf, ld->size); + } + + /* Empty the audio queue */ + /* This is a dirty hack a timeout based wait would make unnescessary but + still safe because the other thread is known to be waiting for a reply + and is not using the objects. 
*/ + ci->semaphore_init(&sample_queue.emu_sem_tail, 2, 2); + ci->semaphore_init(&sample_queue.emu_sem_head, 2, 0); + sample_queue.head = sample_queue.tail = 0; + ci->event_set_state(&sample_queue.emu_evt_reply, STATE_SIGNALED); + + return ret; +} + +static void spc_emu_thread(void) +{ + CPU_Init(&spc_emu); + + while (1) { + /* get a buffer for output */ + struct sample_queue_chunk *chunk = samples_get_wrbuf(); + + if (chunk->id != SPC_EMU_AUDIO) { + /* This chunk doesn't contain audio but a command */ + if (!emu_thread_process_msg(chunk)) + break; + /* Have to re-get this pointer to keep semaphore counts correct */ + continue; + } + + ENTER_TIMER(render); + /* fill samples buffer */ + if ( SPC_play(&spc_emu, WAV_CHUNK_SIZE*2, chunk->audio) ) + assert( false ); + EXIT_TIMER(render); + + /* done so release it to output */ + samples_release_wrbuf(); + ci->yield(); + } +} + +static bool spc_emu_start(void) +{ + emu_thread_p = ci->create_thread(spc_emu_thread, spc_emu_thread_stack, + sizeof(spc_emu_thread_stack), CREATE_THREAD_FROZEN, + spc_emu_thread_name IF_PRIO(, PRIORITY_PLAYBACK), COP); + + if (emu_thread_p == NULL) + return false; + + /* Initialize audio queue as full to prevent emu thread from trying to run the + emulator before loading something */ + ci->event_init(&sample_queue.emu_evt_reply, + EVENT_AUTOMATIC | STATE_NONSIGNALED); + ci->semaphore_init(&sample_queue.emu_sem_tail, 2, 0); + ci->semaphore_init(&sample_queue.emu_sem_head, 2, 2); + sample_queue.head = 0; + sample_queue.tail = 2; + + /* Start it running */ + ci->thread_thaw(emu_thread_p); + return true; +} + +/* load a new program on the emu thread */ +static inline int load_spc_buffer(uint8_t *buf, size_t size) +{ + struct spc_load ld = { buf, size }; + flush_icache(); + return emu_thread_send_msg(SPC_EMU_LOAD, (intptr_t)&ld); +} + +static inline void spc_emu_quit(void) +{ + emu_thread_send_msg(SPC_EMU_QUIT, 0); + /* Wait for emu thread to be killed */ + ci->thread_wait(emu_thread_p); +} + +static 
inline bool spc_play_get_samples(int32_t **samples) +{ + /* obtain filled samples buffer */ + *samples = samples_get_rdbuf(); + return *samples != NULL; +} + +static inline void spc_play_send_samples(int32_t *samples) +{ + ci->pcmbuf_insert(samples, samples+WAV_CHUNK_SIZE, WAV_CHUNK_SIZE); + /* done with chunk so release it to emu thread */ + samples_release_rdbuf(); +} + +#else /* !SPC_DUAL_CORE */ +/** Implementations for single-core operation **/ +int32_t wav_chunk[WAV_CHUNK_SIZE*2] IBSS_ATTR; + +/* load a new program into emu */ +static inline int load_spc_buffer(uint8_t *buf, size_t size) +{ + SPC_Init(&spc_emu); + return SPC_load_spc(&spc_emu, buf, size); +} + +static inline bool spc_emu_start(void) +{ +#ifdef CPU_COLDFIRE + /* signed integer mode with saturation */ + coldfire_set_macsr(EMAC_SATURATE); +#endif + CPU_Init(&spc_emu); + return true; +} + +static inline void spc_play_send_samples(int32_t *samples) +{ + ci->pcmbuf_insert(samples, samples+WAV_CHUNK_SIZE, WAV_CHUNK_SIZE); +} + +#define spc_emu_quit() +#define samples_release_rdbuf() + +static inline bool spc_play_get_samples(int32_t **samples) +{ + ENTER_TIMER(render); + /* fill samples buffer */ + if ( SPC_play(&spc_emu,WAV_CHUNK_SIZE*2,wav_chunk) ) + assert( false ); + EXIT_TIMER(render); + *samples = wav_chunk; + return true; +} +#endif /* SPC_DUAL_CORE */ /* The main decoder loop */ static int play_track( void ) @@ -206,7 +447,7 @@ static int play_track( void ) fadedec=0x7fffffffl/(fadeendsample-fadestartsample)+1; ENTER_TIMER(total); - + while ( 1 ) { ci->yield(); @@ -224,14 +465,12 @@ static int play_track( void ) } ci->seek_complete(); } - - ENTER_TIMER(render); - /* fill samples buffer */ - if ( SPC_play(&spc_emu,WAV_CHUNK_SIZE*2,samples) ) - assert( false ); - EXIT_TIMER(render); - - sampleswritten+=WAV_CHUNK_SIZE; + + int32_t *samples; + if (!spc_play_get_samples(&samples)) + break; + + sampleswritten += WAV_CHUNK_SIZE; /* is track timed? 
*/ if (ci->global_settings->repeat_mode!=REPEAT_ONE && ci->id3->length) { @@ -241,11 +480,11 @@ static int play_track( void ) /* fade? */ if (curtime>ID666.length) { -#ifdef CPU_COLDFIRE + #ifdef CPU_COLDFIRE /* Have to switch modes to do this */ long macsr = coldfire_get_macsr(); coldfire_set_macsr(EMAC_SATURATE | EMAC_FRACTIONAL | EMAC_ROUND); -#endif + #endif int i; for (i=0;ifadestartsample) { @@ -256,42 +495,43 @@ static int play_track( void ) fadevol-=fadedec; } } -#ifdef CPU_COLDFIRE + #ifdef CPU_COLDFIRE coldfire_set_macsr(macsr); -#endif + #endif } /* end? */ if (lasttimesample>=fadeendsample) + { + samples_release_rdbuf(); break; + } } - ci->pcmbuf_insert(samples, samples+WAV_CHUNK_SIZE, WAV_CHUNK_SIZE); + spc_play_send_samples(samples); if (ci->global_settings->repeat_mode!=REPEAT_ONE) - ci->set_elapsed(sampleswritten*1000LL/SAMPLE_RATE); + ci->set_elapsed(sampleswritten*1000LL/SAMPLE_RATE); else ci->set_elapsed(0); } EXIT_TIMER(total); - return 0; } /* this is the codec entry point */ enum codec_status codec_main(void) { -#ifdef CPU_COLDFIRE - /* signed integer mode with saturation */ - coldfire_set_macsr(EMAC_SATURATE); -#endif - CPU_Init(&spc_emu); + enum codec_status stat = CODEC_ERROR; + + if (!spc_emu_start()) + goto codec_quit; do { DEBUGF("SPC: next_track\n"); if (codec_init()) { - return CODEC_ERROR; + goto codec_quit; } DEBUGF("SPC: after init\n"); @@ -301,7 +541,7 @@ enum codec_status codec_main(void) /* wait for track info to load */ while (!*ci->taginfo_ready && !ci->stop_codec) - ci->sleep(1); + ci->yield(); codec_set_replaygain(ci->id3); @@ -313,20 +553,19 @@ enum codec_status codec_main(void) size_t buffersize; uint8_t* buffer = ci->request_buffer(&buffersize, ci->filesize); if (!buffer) { - return CODEC_ERROR; + goto codec_quit; } DEBUGF("SPC: read size = 0x%lx\n",(unsigned long)buffersize); do { - SPC_Init(&spc_emu); - if (SPC_load_spc(&spc_emu,buffer,buffersize)) { + if (load_spc_buffer(buffer, buffersize)) { DEBUGF("SPC load 
failure\n"); - return CODEC_ERROR; + goto codec_quit; } LoadID666(buffer+0x2e); - + if (ci->global_settings->repeat_mode!=REPEAT_ONE && ID666.length==0) { ID666.length=3*60*1000; /* 3 minutes */ ID666.fade=5*1000; /* 5 seconds */ @@ -340,12 +579,16 @@ enum codec_status codec_main(void) reset_profile_timers(); } - while ( play_track() ); print_timers(ci->id3->path); } while ( ci->request_next_track() ); + + stat = CODEC_OK; + +codec_quit: + spc_emu_quit(); - return CODEC_OK; + return stat; } diff --git a/apps/codecs/spc/spc_codec.h b/apps/codecs/spc/spc_codec.h index f2677df04a..c785acc468 100644 --- a/apps/codecs/spc/spc_codec.h +++ b/apps/codecs/spc/spc_codec.h @@ -32,38 +32,51 @@ /** Basic configuration options **/ -/* TGB is the only target fast enough for gaussian and realtime BRR decode */ -/* echo is almost fast enough but not quite */ -#ifndef TOSHIBA_GIGABEAT_F - /* Cache BRR waves */ - #define SPC_BRRCACHE 1 +#define SPC_DUAL_CORE 1 - /* Disable gaussian interpolation */ - #define SPC_NOINTERP 1 - -#ifndef CPU_COLDFIRE - /* Disable echo processing */ - #define SPC_NOECHO 1 -#else - /* Enable echo processing */ - #define SPC_NOECHO 0 +#if !defined(SPC_DUAL_CORE) || NUM_CORES == 1 +#undef SPC_DUAL_CORE +#define SPC_DUAL_CORE 0 #endif -#else + +/* TGB is the only target fast enough for gaussian and realtime BRR decode */ +/* echo is almost fast enough but not quite */ +#if defined(TOSHIBA_GIGABEAT_F) || defined(SIMULATOR) /* Don't cache BRR waves */ #define SPC_BRRCACHE 0 /* Allow gaussian interpolation */ #define SPC_NOINTERP 0 + + /* Allow echo processing */ + #define SPC_NOECHO 0 +#elif defined(CPU_COLDFIRE) + /* Cache BRR waves */ + #define SPC_BRRCACHE 1 + /* Disable gaussian interpolation */ + #define SPC_NOINTERP 1 + /* Allow echo processing */ #define SPC_NOECHO 0 -#endif +#elif defined (CPU_PP) && SPC_DUAL_CORE + /* Cache BRR waves */ + #define SPC_BRRCACHE 1 + + /* Disable gaussian interpolation */ + #define SPC_NOINTERP 1 -/* Samples per channel 
per iteration */ -#ifdef CPU_COLDFIRE -#define WAV_CHUNK_SIZE 1024 + /* Allow echo processing */ + #define SPC_NOECHO 0 #else -#define WAV_CHUNK_SIZE 2048 + /* Cache BRR waves */ + #define SPC_BRRCACHE 1 + + /* Disable gaussian interpolation */ + #define SPC_NOINTERP 1 + + /* Disable echo processing */ + #define SPC_NOECHO 1 #endif #ifdef CPU_ARM @@ -72,6 +85,26 @@ #undef IDATA_ATTR #define IDATA_ATTR + + #undef ICONST_ATTR + #define ICONST_ATTR + + #undef IBSS_ATTR + #define IBSS_ATTR + +#if SPC_DUAL_CORE + #undef NOCACHEBSS_ATTR + #define NOCACHEBSS_ATTR __attribute__ ((section(".ibss"))) + #undef NOCACHEDATA_ATTR + #define NOCACHEDATA_ATTR __attribute__((section(".idata"))) +#endif +#endif + +/* Samples per channel per iteration */ +#if defined(CPU_PP) && NUM_CORES == 1 +#define WAV_CHUNK_SIZE 2048 +#else +#define WAV_CHUNK_SIZE 1024 #endif /**************** Little-endian handling ****************/ @@ -231,16 +264,26 @@ extern int16_t BRRcache [BRR_CACHE_SIZE]; enum { FIR_BUF_HALF = 8 }; -#ifdef CPU_COLDFIRE +#if defined(CPU_COLDFIRE) /* global because of the large aligment requirement for hardware masking - * L-R interleaved 16-bit samples for easy loading and mac.w use. 
*/ enum { - FIR_BUF_SIZE = FIR_BUF_HALF * sizeof ( int32_t ), - FIR_BUF_MASK = ~FIR_BUF_SIZE + FIR_BUF_CNT = FIR_BUF_HALF, + FIR_BUF_SIZE = FIR_BUF_CNT * sizeof ( int32_t ), + FIR_BUF_ALIGN = FIR_BUF_SIZE * 2, + FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) +}; +#elif defined (CPU_ARM) +enum +{ + FIR_BUF_CNT = FIR_BUF_HALF * 2 * 2, + FIR_BUF_SIZE = FIR_BUF_CNT * sizeof ( int32_t ), + FIR_BUF_ALIGN = FIR_BUF_SIZE, + FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1)) }; -#endif /* CPU_COLDFIRE */ +#endif /* CPU_* */ struct Spc_Dsp { @@ -257,14 +300,19 @@ struct Spc_Dsp int noise_count; uint16_t noise; /* also read as int16_t */ -#ifdef CPU_COLDFIRE +#if defined(CPU_COLDFIRE) /* circularly hardware masked address */ int32_t *fir_ptr; /* wrapped address just behind current position - allows mac.w to increment and mask fir_ptr */ int32_t *last_fir_ptr; /* copy of echo FIR constants as int16_t for use with mac.w */ - int16_t fir_coeff[VOICE_COUNT]; + int16_t fir_coeff [VOICE_COUNT]; +#elif defined (CPU_ARM) + /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ + int32_t *fir_ptr; + /* copy of echo FIR constants as int32_t, for faster access */ + int32_t fir_coeff [VOICE_COUNT]; #else /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ int fir_pos; /* (0 to 7) */ diff --git a/apps/codecs/spc/spc_dsp.c b/apps/codecs/spc/spc_dsp.c index 8881788cf1..19986fd8a8 100644 --- a/apps/codecs/spc/spc_dsp.c +++ b/apps/codecs/spc/spc_dsp.c @@ -25,14 +25,13 @@ #include "spc_codec.h" #include "spc_profiler.h" -#ifdef CPU_COLDFIRE -static int32_t fir_buf[FIR_BUF_HALF] - __attribute__ ((aligned (FIR_BUF_SIZE*2))) IBSS_ATTR; +#if defined(CPU_COLDFIRE) || defined (CPU_ARM) +int32_t fir_buf[FIR_BUF_CNT] + __attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR; #endif - #if SPC_BRRCACHE /* a little extra for samples that go past end */ -int16_t BRRcache [0x20000 + 32]; +int16_t BRRcache [BRR_CACHE_SIZE]; #endif void 
/* Saturate a signed integer lvalue to the int16_t range, in place.
 *
 *   if ( n < -32768 ) n = -32768;
 *   if ( n >  32767 ) n =  32767;
 *
 * The expression also yields the clamped value, so both
 * "x = CLAMP16( x );" and a bare "CLAMP16( x );" work.
 *
 * n must be a modifiable lvalue and is evaluated more than once, so it
 * must not have side effects (no CLAMP16( *p++ )).
 *
 * (n >> 31) is 0 for non-negative and -1 for negative values (arithmetic
 * shift), so the XOR produces 0x7FFF or -0x8000 respectively.
 */
#define CLAMP16( n ) \
({ \
    if ( (int16_t) (n) != (n) ) \
        (n) = 0x7FFF ^ ((n) >> 31); \
    (n); \
})
Spc_Dsp* this, long count, int32_t* out_buf ) } else /* attack */ { - int const step = env_range / 64; + int const step = ENV_RANGE / 64; envx += step; if ( mode == 7 && - envx >= env_range * 3 / 4 + step ) - envx += env_range / 256 - step; + envx >= ENV_RANGE * 3 / 4 + step ) + envx += ENV_RANGE / 256 - step; - if ( envx >= env_range ) - envx = env_range - 1; + if ( envx >= ENV_RANGE ) + envx = ENV_RANGE - 1; } voice->envx = envx; /* TODO: should this be 8? */ @@ -550,7 +550,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) else /* state_release */ { int envx = voice->envx; - if ( (envx -= env_range / 256) > 0 ) + if ( (envx -= ENV_RANGE / 256) > 0 ) { voice->envx = envx; raw_voice->envx = envx >> 8; @@ -683,7 +683,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) smp2 = smp1; } - CLAMP16( delta, delta ); + delta = CLAMP16( delta ); smp1 = (int16_t) (delta * 2); /* sign-extend */ } while ( (offset += 4) != 0 ); @@ -778,7 +778,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) output = (output + rev [1] * interp [2]) >> 12; output = (int16_t) (output * 2); output += ((rev [0] * interp [3]) >> 12) * 2; - CLAMP16( output, output ); + output = CLAMP16( output ); } output = (output * voice->envx) >> 11 & ~1; @@ -788,7 +788,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) prev_outx = output; raw_voice->outx = (int8_t) (output >> 8); } - #else + #else /* SPCNOINTERP */ /* two-point linear interpolation */ #ifdef CPU_COLDFIRE int amp_0 = (int16_t)this->noise; @@ -822,7 +822,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) /* output = y0 + (result >> 12) */ "asr.l %[sh], %[y1] \r\n" "add.l %[y0], %[y1] \r\n" - : [f]"+&d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0) + : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0) : [s]"a"(voice->samples), [sh]"d"(12) ); } @@ -861,17 +861,49 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) "movclr.l %%acc1, 
%[amp_1] \r\n" : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) ); - #else + #elif defined (CPU_ARM) + int amp_0, amp_1; + + if ( (this->r.g.noise_enables & vbit) != 0 ) { + amp_0 = *(int16_t *)&this->noise; + } else { + uint32_t f = voice->position; + amp_0 = (uint32_t)voice->samples; - /* Try this one out on ARM and see - similar to above but the asm - on coldfire removes a redundant register load worth 1 or 2%; - switching to loading two samples at once may help too. That's - done above and while 6 to 7% faster on cf over two 16 bit loads - it makes it endian dependant. - - measured small improvement (~1.5%) - hcs - */ + asm volatile( + "mov %[y1], %[f], lsr #12 \r\n" + "eor %[f], %[f], %[y1], lsl #12 \r\n" + "add %[y1], %[y0], %[y1], lsl #1 \r\n" + "ldrsh %[y0], [%[y1], #2] \r\n" + "ldrsh %[y1], [%[y1], #4] \r\n" + "sub %[y1], %[y1], %[y0] \r\n" + "mul %[f], %[y1], %[f] \r\n" + "add %[y0], %[y0], %[f], asr #12 \r\n" + : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1) + ); + } + + voice->position += rate; + + asm volatile( + "mul %[amp_1], %[amp_0], %[envx] \r\n" + "mov %[amp_0], %[amp_1], asr #11 \r\n" + "mov %[amp_1], %[amp_0], asr #8 \r\n" + : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1) + : [envx]"r"(voice->envx) + ); + + prev_outx = amp_0; + raw_voice->outx = (int8_t)amp_1; + asm volatile( + "mul %[amp_1], %[amp_0], %[vol_1] \r\n" + "mul %[amp_0], %[vol_0], %[amp_0] \r\n" + : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) + : [vol_0]"r"((int)voice->volume[0]), + [vol_1]"r"((int)voice->volume[1]) + ); + #else /* Unoptimized CPU */ int output; if ( (this->r.g.noise_enables & vbit) == 0 ) @@ -884,19 +916,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) } voice->position += rate; - - /* old version */ -#if 0 - int fraction = voice->position & 0xFFF; - short const* const pos = voice->samples + (voice->position >> 12); - voice->position += rate; - int output = - (pos [2] * fraction + pos [1] * (0x1000 - fraction)) >> 12; - /* no interpolation (hardly faster, and 
crappy sounding) */ - /*int output = pos [0];*/ - if ( this->r.g.noise_enables & vbit ) - output = *(int16_t*) &this->noise; -#endif + output = (output * voice->envx) >> 11; /* duplicated here to give compiler more to run in parallel */ @@ -905,8 +925,8 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) prev_outx = output; raw_voice->outx = (int8_t) (output >> 8); - #endif /* CPU_COLDFIRE */ - #endif + #endif /* CPU_* */ + #endif /* SPCNOINTERP */ #if SPC_BRRCACHE if ( voice->position >= voice->wave_end ) @@ -1033,7 +1053,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) "or.l %[sh], %[e0] \r\n" /* save final feedback into echo buffer */ "move.l %[e0], (%[echo_ptr]) \r\n" - : [e0]"+&d"(echo_0), [e1]"+&d"(echo_1) + : [e0]"+d"(echo_0), [e1]"+d"(echo_1) : [out_0]"r"(out_0), [out_1]"r"(out_1), [ef]"r"((int)this->r.g.echo_feedback), [echo_ptr]"a"((int32_t *)echo_ptr), @@ -1056,7 +1076,88 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) out_buf [ 0] = out_0; out_buf [WAV_CHUNK_SIZE] = out_1; out_buf ++; - #else /* !CPU_COLDFIRE */ + #elif defined (CPU_ARM) + /* Read feedback from echo buffer */ + int echo_pos = this->echo_pos; + uint8_t* const echo_ptr = RAM + + ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF); + echo_pos += 4; + if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 ) + echo_pos = 0; + this->echo_pos = echo_pos; + + int fb_0 = GET_LE16SA( echo_ptr ); + int fb_1 = GET_LE16SA( echo_ptr + 2 ); + + /* Keep last 8 samples */ + int32_t *fir_ptr = this->fir_ptr; + + /* Apply FIR */ + asm volatile ( + "str %[fb_0], [%[fir_p]], #4 \r\n" + "str %[fb_1], [%[fir_p]], #4 \r\n" + /* duplicate at +8 eliminates wrap checking below */ + "str %[fb_0], [%[fir_p], #56] \r\n" + "str %[fb_1], [%[fir_p], #60] \r\n" + : [fir_p]"+r"(fir_ptr) + : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1) + ); + + this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK); + int32_t *fir_coeff = this->fir_coeff; + + asm volatile ( + 
"ldmia %[fir_c]!, { r0-r1 } \r\n" + "ldmia %[fir_p]!, { r4-r5 } \r\n" + "mul %[fb_0], r0, %[fb_0] \r\n" + "mul %[fb_1], r0, %[fb_1] \r\n" + "mla %[fb_0], r4, r1, %[fb_0] \r\n" + "mla %[fb_1], r5, r1, %[fb_1] \r\n" + "ldmia %[fir_c]!, { r0-r1 } \r\n" + "ldmia %[fir_p]!, { r2-r5 } \r\n" + "mla %[fb_0], r2, r0, %[fb_0] \r\n" + "mla %[fb_1], r3, r0, %[fb_1] \r\n" + "mla %[fb_0], r4, r1, %[fb_0] \r\n" + "mla %[fb_1], r5, r1, %[fb_1] \r\n" + "ldmia %[fir_c]!, { r0-r1 } \r\n" + "ldmia %[fir_p]!, { r2-r5 } \r\n" + "mla %[fb_0], r2, r0, %[fb_0] \r\n" + "mla %[fb_1], r3, r0, %[fb_1] \r\n" + "mla %[fb_0], r4, r1, %[fb_0] \r\n" + "mla %[fb_1], r5, r1, %[fb_1] \r\n" + "ldmia %[fir_c]!, { r0-r1 } \r\n" + "ldmia %[fir_p]!, { r2-r5 } \r\n" + "mla %[fb_0], r2, r0, %[fb_0] \r\n" + "mla %[fb_1], r3, r0, %[fb_1] \r\n" + "mla %[fb_0], r4, r1, %[fb_0] \r\n" + "mla %[fb_1], r5, r1, %[fb_1] \r\n" + : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1), + [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) + : + : "r0", "r1", "r2", "r3", "r4", "r5" + ); + + /* Generate output */ + int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0) + >> global_muting; + int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1) + >> global_muting; + + out_buf [ 0] = amp_0; + out_buf [WAV_CHUNK_SIZE] = amp_1; + out_buf ++; + + if ( !(this->r.g.flags & 0x20) ) + { + /* Feedback into echo buffer */ + int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14); + int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14); + e0 = CLAMP16( e0 ); + SET_LE16A( echo_ptr , e0 ); + e1 = CLAMP16( e1 ); + SET_LE16A( echo_ptr + 2, e1 ); + } + #else /* Unoptimized CPU */ /* Read feedback from echo buffer */ int echo_pos = this->echo_pos; uint8_t* const echo_ptr = RAM + @@ -1102,25 +1203,25 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) out_buf [WAV_CHUNK_SIZE] = amp_1; out_buf ++; - /* Feedback into echo buffer */ - int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 
14); - int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14); if ( !(this->r.g.flags & 0x20) ) { - CLAMP16( e0, e0 ); + /* Feedback into echo buffer */ + int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14); + int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14); + e0 = CLAMP16( e0 ); SET_LE16A( echo_ptr , e0 ); - CLAMP16( e1, e1 ); + e1 = CLAMP16( e1 ); SET_LE16A( echo_ptr + 2, e1 ); } - #endif /* CPU_COLDFIRE */ - #else + #endif /* CPU_* */ + #else /* SPCNOECHO == 1*/ /* Generate output */ int amp_0 = (chans_0 * global_vol_0) >> global_muting; int amp_1 = (chans_1 * global_vol_1) >> global_muting; out_buf [ 0] = amp_0; out_buf [WAV_CHUNK_SIZE] = amp_1; out_buf ++; - #endif + #endif /* SPCNOECHO */ } while ( --count ); #if 0 @@ -1155,10 +1256,13 @@ void DSP_reset( struct Spc_Dsp* this ) this->wave_entry [i].start_addr = -1; #endif -#ifdef CPU_COLDFIRE - this->fir_ptr = fir_buf; +#if defined(CPU_COLDFIRE) + this->fir_ptr = fir_buf; this->last_fir_ptr = &fir_buf [7]; ci->memset( fir_buf, 0, sizeof fir_buf ); +#elif defined (CPU_ARM) + this->fir_ptr = fir_buf; + ci->memset( fir_buf, 0, sizeof fir_buf ); #else this->fir_pos = 0; ci->memset( this->fir_buf, 0, sizeof this->fir_buf ); -- cgit v1.2.3