3 files changed, 468 insertions, 369 deletions
diff --git a/apps/dsp.c b/apps/dsp.c
index be851e2305..3b95145b39 100644
--- a/apps/dsp.c
+++ b/apps/dsp.c
@@ -38,9 +38,14 @@
 #define WORD_FRACBITS       27
 #define NATIVE_DEPTH        16
+/* If the buffer sizes change, check the assembly code! */
 #define SAMPLE_BUF_COUNT    256
 #define RESAMPLE_BUF_COUNT  (256 * 4)   /* Enough for 11,025 Hz -> 44,100 Hz*/
 #define DEFAULT_GAIN        0x01000000
+#define SAMPLE_BUF_LEFT_CHANNEL 0
+#define SAMPLE_BUF_RIGHT_CHANNEL (SAMPLE_BUF_COUNT/2)
+#define RESAMPLE_BUF_LEFT_CHANNEL 0
+#define RESAMPLE_BUF_RIGHT_CHANNEL (RESAMPLE_BUF_COUNT/2)
 /* enums to index conversion properly with stereo mode and other settings */
 enum
@@ -66,11 +71,10 @@ enum
 * NOTE: Any assembly routines that use these structures must be updated
 * if current data members are moved or changed.
 */
-                                        /* 32-bit achitecture offset */
 struct resample_data
 {
-    long delta;                         /* 00h */
+    uint32_t delta;                     /* 00h */
-    long phase;                         /* 04h */
+    uint32_t phase;                     /* 04h */
    int32_t last_sample[2];             /* 08h */
                                        /* 10h */
 };
@@ -93,9 +97,10 @@ struct dsp_data
    int output_scale;                   /* 00h */
    int num_channels;                   /* 04h */
    struct resample_data resample_data; /* 08h */
-    int clip_min;                       /* 18h */
+    int32_t clip_min;                   /* 18h */
-    int clip_max;                       /* 2ch */
+    int32_t clip_max;                   /* 1ch */
-                                        /* 30h */
+    int32_t gain;                       /* 20h - Note that this is in S8.23 format. */ 
+                                        /* 24h */
 };
 /* No asm...yet */
@@ -132,13 +137,18 @@ struct eq_state
 #include <dsp_asm.h>
 /* Typedefs keep things much neater in this case */
-typedef int (*sample_input_fn_type)(int count, const char *src[],
+typedef void (*sample_input_fn_type)(int count, const char *src[],
-                                    int32_t *dst[]);    
+                                     int32_t *dst[]);    
 typedef int (*resample_fn_type)(int count, struct dsp_data *data,
                                int32_t *src[], int32_t *dst[]);
 typedef void (*sample_output_fn_type)(int count, struct dsp_data *data,
                                      int32_t *src[], int16_t *dst);
+/* Single-DSP channel processing in place */
 typedef void (*channels_process_fn_type)(int count, int32_t *buf[]);
+/* DSP local channel processing in place */
+typedef void (*channels_process_dsp_fn_type)(int count, struct dsp_data *data,
+                                             int32_t *buf[]);
 /*
 ***************************************************************************/
@@ -152,16 +162,16 @@ struct dsp_config
    int  sample_bytes;
    int  stereo_mode;
    int  frac_bits;
-    long gain;          /* Note that this is in S8.23 format. */
    /* Functions that change depending upon settings - NULL if stage is
       disabled */
-    sample_input_fn_type        input_samples;
+    sample_input_fn_type         input_samples;
-    resample_fn_type            resample;
+    resample_fn_type             resample;
-    sample_output_fn_type       output_samples;
+    sample_output_fn_type        output_samples;
    /* These will be NULL for the voice codec and is more economical that
       way */
-    channels_process_fn_type    apply_crossfeed;
+    channels_process_dsp_fn_type apply_gain;
-    channels_process_fn_type    channels_process;
+    channels_process_fn_type     apply_crossfeed;
+    channels_process_fn_type     channels_process;
 };
 /* General DSP config */
@@ -211,7 +221,7 @@ static struct dsp_config *dsp IDATA_ATTR = audio_dsp;
 * of copying needed is minimized for that case.
 */
-static int32_t sample_buf[SAMPLE_BUF_COUNT] IBSS_ATTR;
+int32_t sample_buf[SAMPLE_BUF_COUNT] IBSS_ATTR;
 static int32_t resample_buf[RESAMPLE_BUF_COUNT] IBSS_ATTR;
 /* set a new dsp and return old one */
@@ -258,23 +268,20 @@ void sound_set_pitch(int permille)
    dsp_configure(DSP_SWITCH_FREQUENCY, dsp->codec_frequency);
 }
-/* Convert at most count samples to the internal format, if needed. Returns
+/* Convert count samples to the internal format, if needed.  Updates src
- * number of samples ready for further processing. Updates src to point
+ * to point past the samples "consumed" and dst is set to point to the
- * past the samples "consumed" and dst is set to point to the samples to
+ * samples to consume. Note that for mono, dst[0] equals dst[1], as there
- * consume. Note that for mono, dst[0] equals dst[1], as there is no point
+ * is no point in processing the same data twice.
- * in processing the same data twice.
 */
 /* convert count 16-bit mono to 32-bit mono */
-static int sample_input_lte_native_mono(
+static void sample_input_lte_native_mono(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
    const int16_t *s = (int16_t *) src[0];
    const int16_t * const send = s + count;
-    int32_t *d = dst[0] = dst[1] = sample_buf;
+    int32_t *d = dst[0] = dst[1] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
-    const int scale = WORD_SHIFT;
+    int scale = WORD_SHIFT;
    do
    {
@@ -283,21 +290,17 @@ static int sample_input_lte_native_mono(
    while (s < send);
    src[0] = (char *)s;
-    return count;
 }
 /* convert count 16-bit interleaved stereo to 32-bit noninterleaved */
-static int sample_input_lte_native_i_stereo(
+static void sample_input_lte_native_i_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
    const int32_t *s = (int32_t *) src[0];
    const int32_t * const send = s + count;
-    int32_t *dl = dst[0] = sample_buf;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
-    int32_t *dr = dst[1] = sample_buf + SAMPLE_BUF_COUNT/2;
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
-    const int scale = WORD_SHIFT;
+    int scale = WORD_SHIFT;
    do
    {
@@ -313,22 +316,18 @@ static int sample_input_lte_native_i_stereo(
    while (s < send);
    src[0] = (char *)s;
-    return count;
 }
 /* convert count 16-bit noninterleaved stereo to 32-bit noninterleaved */
-static int sample_input_lte_native_ni_stereo(
+static void sample_input_lte_native_ni_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
    const int16_t *sl = (int16_t *) src[0];
    const int16_t *sr = (int16_t *) src[1];
    const int16_t * const slend = sl + count;
-    int32_t *dl = dst[0] = sample_buf;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
-    int32_t *dr = dst[1] = sample_buf + SAMPLE_BUF_COUNT/2;
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
-    const int scale = WORD_SHIFT;
+    int scale = WORD_SHIFT;
    do
    {
@@ -339,35 +338,24 @@ static int sample_input_lte_native_ni_stereo(
    src[0] = (char *)sl;
    src[1] = (char *)sr;
-    return count;
 }
 /* convert count 32-bit mono to 32-bit mono */
-static int sample_input_gt_native_mono(
+static void sample_input_gt_native_mono(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
    dst[0] = dst[1] = (int32_t *)src[0];
    src[0] = (char *)(dst[0] + count);
-    return count;
 }
 /* convert count 32-bit interleaved stereo to 32-bit noninterleaved stereo */
-static int sample_input_gt_native_i_stereo(
+static void sample_input_gt_native_i_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
    const int32_t *s = (int32_t *)src[0];
    const int32_t * const send = s + 2*count;
-    int32_t *dl = sample_buf;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
-    int32_t *dr = sample_buf + SAMPLE_BUF_COUNT/2;
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
-    dst[0] = dl;
-    dst[1] = dr;
    do
    {
@@ -377,22 +365,16 @@ static int sample_input_gt_native_i_stereo(
    while (s < send);
    src[0] = (char *)send;
-    return count;
 }
 /* convert 32 bit-noninterleaved stereo to 32-bit noninterleaved stereo */
-static int sample_input_gt_native_ni_stereo(
+static void sample_input_gt_native_ni_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
    dst[0] = (int32_t *)src[0];
    dst[1] = (int32_t *)src[1];
    src[0] = (char *)(dst[0] + count);
    src[1] = (char *)(dst[1] + count);
-    return count;
 }
 /**
@@ -573,12 +555,6 @@ static void sample_output_new_format(void)
    dsp->output_samples = sample_output_functions[out];
 }
-static void resampler_set_delta(int frequency)
-{
-    dsp->data.resample_data.delta = (unsigned long) 
-        frequency * 65536LL / NATIVE_FREQUENCY;
-}
 /**
 * Linear interpolation resampling that introduces a one sample delay because
 * of our inability to look into the future at the end of a frame.
@@ -587,9 +563,9 @@ static void resampler_set_delta(int frequency)
 static int dsp_downsample(int count, struct dsp_data *data,
                          int32_t *src[], int32_t *dst[])
 {
-    int  ch = data->num_channels - 1;
+    int ch = data->num_channels - 1;
-    long delta = data->resample_data.delta;
+    uint32_t delta = data->resample_data.delta;
-    long phase, pos;
+    uint32_t phase, pos;
    int32_t *d;
    /* Rolled channel loop actually showed slightly faster. */
@@ -610,7 +586,7 @@ static int dsp_downsample(int count, struct dsp_data *data,
        if (pos > 0)
            last = s[pos - 1];
-        while (pos < count)
+        while (pos < (uint32_t)count)
        {
            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
            phase += delta;
@@ -625,12 +601,12 @@ static int dsp_downsample(int count, struct dsp_data *data,
    return d - dst[0];
 }
-static int dsp_upsample(int count,  struct dsp_data *data,
+static int dsp_upsample(int count, struct dsp_data *data,
                        int32_t *src[], int32_t *dst[])
 {
    int  ch = data->num_channels - 1;
-    long delta = data->resample_data.delta;
+    uint32_t delta = data->resample_data.delta;
-    long phase, pos;
+    uint32_t phase, pos;
    int32_t *d;
    /* Rolled channel loop actually showed slightly faster. */
@@ -653,7 +629,7 @@ static int dsp_upsample(int count,  struct dsp_data *data,
            pos = phase >> 16;
        }
-        while (pos < count)
+        while (pos < (uint32_t)count)
        {
            last = s[pos - 1];
            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
@@ -669,24 +645,43 @@ static int dsp_upsample(int count,  struct dsp_data *data,
 }
 #endif /* DSP_HAVE_ASM_RESAMPLING */
+static void resampler_new_delta(void)
+{
+    dsp->data.resample_data.delta = (unsigned long) 
+        dsp->frequency * 65536LL / NATIVE_FREQUENCY;
+    if (dsp->frequency == NATIVE_FREQUENCY)
+    {
+        /* NOTE: If fully glitch-free transitions from no resampling to
+           resampling are desired, last_sample history should be maintained
+           even when not resampling. */
+        dsp->resample = NULL;
+        dsp->data.resample_data.phase = 0;
+        dsp->data.resample_data.last_sample[0] = 0;
+        dsp->data.resample_data.last_sample[1] = 0;
+    }
+    else if (dsp->frequency < NATIVE_FREQUENCY)
+        dsp->resample = dsp_upsample;
+    else
+        dsp->resample = dsp_downsample;
+}
 /* Resample count stereo samples. Updates the src array, if resampling is
 * done, to refer to the resampled data. Returns number of stereo samples
 * for further processing.
 */
 static inline int resample(int count, int32_t *src[])
 {
-    if (dsp->resample)
+    int32_t *dst[2] =
    {
-        int32_t *dst[2] =
+        &resample_buf[RESAMPLE_BUF_LEFT_CHANNEL],
-        {
+        &resample_buf[RESAMPLE_BUF_RIGHT_CHANNEL],
-            resample_buf,
+    };
-            resample_buf + RESAMPLE_BUF_COUNT/2,
-        };
-        count = dsp->resample(count, &dsp->data, src, dst);
+    count = dsp->resample(count, &dsp->data, src, dst);
-        src[0] = dst[0];
-        src[1] = dst[dsp->data.num_channels - 1];
+    src[0] = dst[0];
-    }
+    src[1] = dst[dsp->data.num_channels - 1];
    return count;
 }
@@ -810,30 +805,59 @@ void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain, long cutoff)
    c[2] <<= 4;
 }
+/* Apply a constant gain to the samples (e.g., for ReplayGain).
+ * Note that this must be called before the resampler.
+ */
+#ifndef DSP_HAVE_ASM_APPLY_GAIN
+static void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+{
+    const int32_t gain = data->gain;
+    int ch = data->num_channels - 1;
+    do
+    {
+        int32_t *s = buf[ch];
+        int32_t *d = buf[ch];
+        int32_t  samp = *s++;
+        int i = 0;
+        do
+        {
+            FRACMUL_8_LOOP(samp, gain, s, d);
+        }
+        while (++i < count);
+    }
+    while (--ch >= 0);
+}
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
 /* Combine all gains to a global gain. */
 static void set_gain(struct dsp_config *dsp)
 {
-    dsp->gain = DEFAULT_GAIN;
+    dsp->data.gain = DEFAULT_GAIN;
    /* Replay gain not relevant to voice */
    if (dsp == audio_dsp && replaygain)
    {
-        dsp->gain = replaygain;
+        dsp->data.gain = replaygain;
    }
    
    if (eq_enabled && eq_precut)
    {
-        dsp->gain = (long) (((int64_t) dsp->gain * eq_precut) >> 24);
+        dsp->data.gain =
+            (long) (((int64_t) dsp->data.gain * eq_precut) >> 24);
    }
    
-    if (dsp->gain == DEFAULT_GAIN)
+    if (dsp->data.gain == DEFAULT_GAIN)
    {
-        dsp->gain = 0;
+        dsp->data.gain = 0;
    }
    else
    {
-        dsp->gain >>= 1;
+        dsp->data.gain >>= 1;
    }
+    dsp->apply_gain = dsp->data.gain != 0 ? dsp_apply_gain : NULL;
 }
 /**
@@ -927,50 +951,6 @@ static void eq_process(int count, int32_t *buf[])
    }
 }
-/* Apply a constant gain to the samples (e.g., for ReplayGain). May update
- * the src array if gain was applied.
- * Note that this must be called before the resampler.
- */
-static void apply_gain(int count, int32_t *buf[])
-{
-    int32_t *sl, *sr;
-    int32_t s, *d;
-    long gain;
-    int i;
-    if (new_gain)
-    {
-        /* Gain has changed */
-        dsp_set_replaygain();
-        if (dsp->gain == 0)
-            return; /* No gain to apply now */
-    }
-    sl = buf[0], sr = buf[1];
-    gain = dsp->gain;
-    if (sl != sr)
-    {
-        d = &sample_buf[SAMPLE_BUF_COUNT / 2];
-        buf[1] = d;
-        s = *sr++;
-        for (i = 0; i < count; i++)
-            FRACMUL_8_LOOP(s, gain, sr, d);
-    }
-    else
-    {
-        buf[1] = &sample_buf[0];
-    }
-    d = &sample_buf[0];
-    buf[0] = d;
-    s = *sl++;
-    for (i = 0; i < count; i++)
-        FRACMUL_8_LOOP(s, gain, sl, d);
-}
 void dsp_set_stereo_width(int value)
 {
    long width, straight, cross;
@@ -993,35 +973,6 @@ void dsp_set_stereo_width(int value)
    dsp_sw_cross = cross << 8;
 }
-/**
- * Implements the different channel configurations and stereo width.
- */
-/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
- * completeness. */
-#if 0
-static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
-{
-    /* The channels are each just themselves */
-    (void)count; (void)buf;
-}
-#endif
-#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
-static void channels_process_sound_chan_mono(int count, int32_t *buf[])
-{
-    int32_t *sl = buf[0], *sr = buf[1];
-    do
-    {
-        int32_t lr = *sl/2 + *sr/2;
-        *sl++ = lr;
-        *sr++ = lr;
-    }
-    while (--count > 0);
-}
-#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
 #if CONFIG_CODEC == SWCODEC
 #ifdef HAVE_SW_TONE_CONTROLS
@@ -1063,6 +1014,35 @@ int dsp_callback(int msg, intptr_t param)
 }
 #endif
+/**
+ * Implements the different channel configurations and stereo width.
+ */
+/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
+ * completeness. */
+#if 0
+static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
+{
+    /* The channels are each just themselves */
+    (void)count; (void)buf;
+}
+#endif
+#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
+static void channels_process_sound_chan_mono(int count, int32_t *buf[])
+{
+    int32_t *sl = buf[0], *sr = buf[1];
+    do
+    {
+        int32_t lr = *sl/2 + *sr/2;
+        *sl++ = lr;
+        *sr++ = lr;
+    }
+    while (--count > 0);
+}
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
 #ifndef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
 static void channels_process_sound_chan_custom(int count, int32_t *buf[])
 {
@@ -1151,30 +1131,47 @@ int dsp_process(char *dst, const char *src[], int count)
    coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
 #endif
+    if (new_gain)
+        dsp_set_replaygain(); /* Gain has changed */
+    /* Testing function pointers for NULL is preferred since the pointer
+       will be preloaded to be used for the call if not. */
    while (count > 0)
    {
-        samples = dsp->input_samples(count, src, tmp);
+        samples = MIN(SAMPLE_BUF_COUNT/2, count);
        count -= samples;
-        if (dsp->gain != 0)
-            apply_gain(samples, tmp);
+        dsp->input_samples(samples, src, tmp);
-        if ((samples = resample(samples, tmp)) <= 0)
+        if (dsp->apply_gain)
+            dsp->apply_gain(samples, &dsp->data, tmp);
+        if (dsp->resample && (samples = resample(samples, tmp)) <= 0)
            break; /* I'm pretty sure we're downsampling here */
        if (dsp->apply_crossfeed)
            dsp->apply_crossfeed(samples, tmp);
        /* TODO: EQ and tone controls need separate structs for audio and voice
         * DSP processing thanks to filter history. isn't really audible now, but
-         * might be the day we start handling voice more delicately.
+         * might be the day we start handling voice more delicately. Planned
+         * changes may well run all relevant channels through the same EQ so
+         * perhaps not.
         */
        if (eq_enabled)
            eq_process(samples, tmp);
 #ifdef HAVE_SW_TONE_CONTROLS
        if ((bass | treble) != 0)
            eq_filter(tmp, &tone_filter, samples, dsp->data.num_channels,
                      FILTER_BISHELF_SHIFT);
 #endif
        if (dsp->channels_process)
            dsp->channels_process(samples, tmp);
        dsp->output_samples(samples, &dsp->data, tmp, (int16_t *)dst);
        written += samples;
        dst += samples * sizeof (int16_t) * 2;
        yield();
@@ -1245,9 +1242,6 @@ bool dsp_configure(int setting, intptr_t value)
        if (dsp == audio_dsp)
        {
            *var = value;
-            /* In case current gain is zero, force at least one call
-               to apply_gain or apply_gain won't pick up on new_gain */
-            audio_dsp->gain = -1;
            new_gain = true;
        }
    }
@@ -1282,15 +1276,7 @@ bool dsp_configure(int setting, intptr_t value)
        else
            dsp->frequency = dsp->codec_frequency;
-        resampler_set_delta(dsp->frequency);
+        resampler_new_delta();
-        if (dsp->frequency == NATIVE_FREQUENCY)
-            dsp->resample = NULL;
-        else if (dsp->frequency < NATIVE_FREQUENCY)
-            dsp->resample = dsp_upsample;
-        else
-            dsp->resample = dsp_downsample;
        break;
    case DSP_SET_SAMPLE_DEPTH:
@@ -1348,7 +1334,7 @@ bool dsp_configure(int setting, intptr_t value)
    case DSP_FLUSH:
        memset(&dsp->data.resample_data, 0,
               sizeof (dsp->data.resample_data));
-        resampler_set_delta(dsp->frequency);
+        resampler_new_delta();
        dither_init();
        break;
diff --git a/apps/dsp_asm.h b/apps/dsp_asm.h
index f8df337b37..14875d21d8 100644
--- a/apps/dsp_asm.h
+++ b/apps/dsp_asm.h
@@ -22,32 +22,61 @@
 #ifndef _DSP_ASM_H
 #define _DSP_ASM_H
+/* Set the appropriate #defines based on CPU or whatever matters */
 #ifndef SIMULATOR
-#if defined(CPU_COLDFIRE) || defined(CPU_ARM)
+#if defined(CPU_ARM)
+#define DSP_HAVE_ASM_RESAMPLING
 #define DSP_HAVE_ASM_CROSSFEED
-void apply_crossfeed(int count, int32_t *buf[]);
+#elif defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_APPLY_GAIN
 #define DSP_HAVE_ASM_RESAMPLING
-int dsp_downsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
+#define DSP_HAVE_ASM_CROSSFEED
-int dsp_upsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
-#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */
-#if defined (CPU_COLDFIRE)
 #define DSP_HAVE_ASM_SOUND_CHAN_MONO
-void channels_process_sound_chan_mono(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
-void channels_process_sound_chan_custom(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
-void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
-void sample_output_mono(int count, struct dsp_data *data,
-                        int32_t *src[], int16_t *dst);
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
-void sample_output_stereo(int count, struct dsp_data *data,
-                          int32_t *src[], int16_t *dst);
 #endif /* CPU_COLDFIRE */
 #endif /* SIMULATOR */
+/* Declare prototypes based upon what's #defined above */
+#ifdef DSP_HAVE_ASM_CROSSFEED
+void apply_crossfeed(int count, int32_t *buf[]);
+#endif
+#ifdef DSP_HAVE_ASM_APPLY_GAIN
+void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+#ifdef DSP_HAVE_ASM_RESAMPLING
+int dsp_upsample(int count, struct dsp_data *data,
+                 int32_t *src[], int32_t *dst[]);
+int dsp_downsample(int count, struct dsp_data *data,
+                   int32_t *src[], int32_t *dst[]);
+#endif /* DSP_HAVE_ASM_RESAMPLING */
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_MONO
+void channels_process_sound_chan_mono(int count, int32_t *buf[]);
+#endif
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+void channels_process_sound_chan_custom(int count, int32_t *buf[]);
+#endif
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
+#endif
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+void sample_output_stereo(int count, struct dsp_data *data,
+                          int32_t *src[], int16_t *dst);
+#endif
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+void sample_output_mono(int count, struct dsp_data *data,
+                        int32_t *src[], int16_t *dst);
+#endif
 #endif /* _DSP_ASM_H */
diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S
index af9ac1fa4b..e5d3ee8c55 100644
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@@ -19,68 +19,117 @@
 ****************************************************************************/
 /****************************************************************************
- * void apply_crossfeed(int count, int32_t *src[])
+ * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
 */
    .section    .text
+        .align      2
+    .global     dsp_apply_gain
+dsp_apply_gain:
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+                                        | %a1 = buf
+        move.l      4(%a0), %d1             | %d1 = data->num_channels
+    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
+10: | channel loop                      |
+        move.l      24(%sp), %d0            | %d0 = count
+    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      %a2, %a3                | %a3 = d = s
+    move.l      (%a2)+, %d2             | %d2 = *s++,
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    ble.b       30f | loop done         | no? finish up
+20: | loop                              |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)+             |
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    bgt.b       20b | loop              | yes? do more samples
+30: | loop done                         |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)              |
+        subq.l      #1, %d1                 | next channel
+        bgt.b       10b | channel loop      |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       dsp_apply_gain,.-dsp_apply_gain
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t *buf[])
+ */
+    .section    .text
+        .align      2
    .global     apply_crossfeed 
 apply_crossfeed:
-    lea.l       -44(%sp), %sp
+    lea.l       -44(%sp), %sp           |
    movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
    movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
    movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
-    lea.l       crossfeed_data, %a1
+    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
-    move.l      (%a1)+, %a6             | a6 = direct gain
+    move.l      (%a1)+, %d6             | %d6 = direct gain
    movem.l     12(%a1), %d0-%d3        | fetch filter history samples
    move.l      132(%a1), %a0           | fetch delay line address
    movem.l     (%a1), %a1-%a3          | load filter coefs
+    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
+    bra.b       20f | loop start        | go to loop start point
    /* Register usage in loop:
     * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
-     * %a4 = src[0], %a5 = src[1], %a6 = direct gain,
+     * %a4 = buf[0], %a5 = buf[1],
+     * %a6 = delay line pointer wrap limit,
     * %d0..%d3 = history
-     * %d4..%d6 = temp.
+     * %d4..%d5 = temp.
+     * %d6 = direct gain,
     * %d7 = count
     */
-.cfloop:
+10: | loop                              |
-    mac.l       %a2, %d0, 4(%a0), %d0, %acc0 | acc  = b1*dr[n - 1] d0 = dr[n]
+    movclr.l    %acc0, %d4              | write outputs
-    mac.l       %a1, %d0             , %acc0 | acc += b0*dr[n]
+    move.l      %d4, (%a4)+             | .
-    mac.l       %a3, %d1,  (%a4), %d4, %acc0 | acc += a1*y_l[n - 1], load L
+    movclr.l    %acc1, %d5              | .
-    move.l      %acc0, %d1              | get filtered delayed sample
+    move.l      %d5, (%a5)+             | .
-    mac.l       %a6, %d4, %acc0         | acc += gain*x_l[n]
+20: | loop start                        |
-    movclr.l    %acc0, %d6              |
+    mac.l       %a2, %d0, (%a0)+, %d0, %acc0 | %acc0  = b1*dl[n - 1], %d0 = dl[n]
-    move.l      %d6, (%a4)+             | write result
+    mac.l       %a1, %d0             , %acc0 | %acc0 += b0*dl[n]
+    mac.l       %a3, %d1, (%a5),  %d5, %acc0 | %acc0 += a1*y_r[n - 1], load R
-    mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc  = b1*dl[n - 1], d2 = dl[n]
+    mac.l       %a2, %d2, (%a0)+, %d2, %acc1 | %acc1  = b1*dr[n - 1], %d2 = dr[n]
-    mac.l       %a1, %d2            , %acc0 | acc += b0*dl[n]
+    mac.l       %a1, %d2             , %acc1 | %acc1 += b0*dr[n]
-    mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n - 1], load R
+    mac.l       %a3, %d3, (%a4),  %d4, %acc1 | %acc1 += a1*y_l[n - 1], load L
-    movem.l     %d4-%d5, (%a0)          | save left & right inputs to delay line
+    movem.l     %d4-%d5, -8(%a0)        | save left & right inputs to delay line
-    move.l      %acc0, %d3              | get filtered delayed sample
+    move.l      %acc0, %d3              | get filtered delayed left sample (y_l[n])
-    mac.l       %a6, %d5, %acc0         | acc += gain*x_r[n]
+    move.l      %acc1, %d1              | get filtered delayed right sample (y_r[n])
-    lea.l       8(%a0), %a0             | increment delay pointer
+    mac.l       %d6, %d4, %acc0         | %acc0 += gain*x_l[n]
-    movclr.l    %acc0, %d6              |
+    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
-    move.l      %d6, (%a5)+             | write result
+    cmp.l       %a6, %a0                | wrap %a0 if passed end
+    bhs.b       30f | wrap buffer       |
-    cmpa.l      #crossfeed_data+136, %a0| wrap a0 if passed end
+    .word       0x51fb | tpf.l          | trap the buffer wrap
-    bge.b       .cfwrap                 |
+30: | wrap buffer                       | ...fwd taken branches more costly
-    .word       0x51fb                  | tpf.l - trap the buffer wrap
+    lea.l       -104(%a0), %a0          | wrap it up
-.cfwrap:
+    subq.l      #1, %d7                 | --count > 0 ?
-    lea.l       -104(%a0), %a0          | wrap
+    bgt.b       10b | loop              | yes? do more
-    subq.l      #1, %d7                 | --count < 0 ?
+    movclr.l    %acc0, %d4              | write last outputs
-    bgt.b       .cfloop                 |
+    move.l      %d4, (%a4)              | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)              | .
    lea.l       crossfeed_data+16, %a1  | save data back to struct
    movem.l     %d0-%d3, (%a1)          | ...history
    move.l      %a0, 120(%a1)           | ...delay_p
    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
-    lea.l       44(%sp), %sp
+    lea.l       44(%sp), %sp            |
-    rts
+    rts                                 |
-.cfend:
+    .size       apply_crossfeed,.-apply_crossfeed 
-    .size       apply_crossfeed,.cfend-apply_crossfeed
 /****************************************************************************
 * int dsp_downsample(int count, struct dsp_data *data,
 *                    in32_t *src[], int32_t *dst[])
 */
    .section    .text
+        .align      2
    .global     dsp_downsample
 dsp_downsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
@@ -92,7 +141,7 @@ dsp_downsample:
    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
                                        | %d4 = delta = data->resample_data.delta
    moveq.l     #16, %d7                | %d7 = shift
-.dschannel_loop:
+10: | channel loop                      |
    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
@@ -102,15 +151,15 @@ dsp_downsample:
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
    lsr.l       %d7, %d6                |
    cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .dsloop_skip            | yes? skip loop
+    bge.b       40f | skip resample loop| yes? skip loop
    tst.l       %d6                     | need last sample of prev. frame?
-    bne.b       .dsloop                 | no? start main loop
+    bne.b       20f | resample loop     | no? start main loop
    move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
-    bra.b       .dsuse_last_start       | start with last (last in %d0)
+    bra.b       30f | resample start last | start with last (last in %d0)
-.dsloop:
+20: | resample loop                     |
    lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
    movem.l     (%a5), %d0-%d1          |
-.dsuse_last_start:
+30: | resample start last               |
    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
    move.l      %d0, %acc0              | %acc0 = previous sample
    move.l      %d5, %d0                | frac = (phase << 16) >> 1
@@ -123,11 +172,11 @@ dsp_downsample:
    movclr.l    %acc0, %d0              |
    move.l      %d0, (%a4)+             | *d++ = %d0
    cmp.l       %d2, %d6                | pos < count?
-    blt.b       .dsloop                 | yes? continue resampling
+    blt.b       20b | resample loop     | yes? continue resampling
-.dsloop_skip:
+40: | skip resample loop                |
    subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .dschannel_loop         | yes? process next channel
+    bgt.b       10b | channel loop      | yes? process next channel
-    asl.l       %d7, %d2                | wrap phase to start of next frame
+    lsl.l       %d7, %d2                | wrap phase to start of next frame
    sub.l       %d2, %d5                | data->resample_data.phase =
    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
    move.l      %a4, %d0                | return d - d[0]
@@ -136,14 +185,14 @@ dsp_downsample:
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
    lea.l       40(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
-.dsend:
+    .size       dsp_downsample,.-dsp_downsample
-    .size       dsp_downsample,.dsend-dsp_downsample
 /****************************************************************************
 * int dsp_upsample(int count, struct dsp_data *dsp,
- *                  in32_t *src[], int32_t *dst[])
+ *                  int32_t *src[], int32_t *dst[])
 */
    .section    .text
+        .align      2
    .global     dsp_upsample
 dsp_upsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
@@ -154,47 +203,55 @@ dsp_upsample:
                                        | %a2 = dst
    movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
                                        | %d4 = delta = data->resample_data.delta
-    swap        %d4                     | swap delta to high word to use
+    swap        %d4                     | swap delta to high word to use...
-                                        | carries to increment position
+                                        | ...carries to increment position
-.uschannel_loop:
+10: | channel loop                      |
    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       (%a3, %d2.l*4), %a5     | %a5 = src_end = &src[count]
+    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &s[count-1]
    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
+                                        | ...away later but we'll be preincremented
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
    swap        %d5                     | swap phase to high word to use
                                        | carries to increment position
-    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    move.l      %d5, %d7                | %d7 = pos = phase >> 16
    clr.w       %d5                     |
-    eor.l       %d5, %d6                | pos == 0?
+    eor.l       %d5, %d7                | pos == 0?
-    beq.b       .usstart_0              | no? transistion from down
+    beq.b       40f | loop start        | yes? start loop
-    cmp.l       %d2, %d6                | past end of samples?
+    cmp.l       %d2, %d7                | past end of samples?
-    bge.b       .usloop_skip            | yes? skip loop
+    bge.b       50f | skip resample loop| yes? go to next channel and collect info
-    lea.l       -4(%a3, %d6.l*4), %a3   | %a3 = s = &s[pos-1] (previous)
+    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
-    move.l      (%a3)+, %d0             | %d0 = *s++
+        movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
-    .word       0x51fa                  | tpf.w - trap next instruction
+    move.l      %d1, %d6                | save sample value
-.usloop_1:
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+        bra.b       40f | loop start        |
+20: | next sample loop                  |
    move.l      %d6, %d0                | move previous sample to %d0
-.usstart_0:
    move.l      (%a3)+, %d1             | fetch next sample
    move.l      %d1, %d6                | save sample value
    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
-.usloop_0:
+30: | same sample loop                  |
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+40: | loop start                        |
    lsr.l       #1, %d5                 | make phase into frac
+    move.l      %d0, %acc0              | %acc0 = s[pos-1]
    mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
    lsl.l       #1, %d5                 | restore frac to phase
-    movclr.l    %acc0, %d7              | %d7 = product
-    add.l       %d0, %d7                | %d7 = last + product
-    move.l      %d7, (%a4)+             | *d++ = %d7
    add.l       %d4, %d5                | phase += delta
-    bcc.b       .usloop_0               | load next values?
+    bcc.b       30b | same sample loop  | load next values?
    cmp.l       %a5, %a3                | src <= src_end?
-    ble.b       .usloop_1               | yes? continue resampling
+    bls.b       20b | next sample loop  | yes? continue resampling
-.usloop_skip:
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+50: | skip resample loop                |
    subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .uschannel_loop         | yes? process next channel
+    bgt.b       10b | channel loop      | yes? process next channel
    swap        %d5                     | wrap phase to start of next frame
    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
    move.l      %a4, %d0                | return d - d[0]
@@ -203,12 +260,7 @@ dsp_upsample:
    asr.l       #2, %d0                 | convert bytes->samples
    lea.l       40(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
-.usend:
+    .size       dsp_upsample,.-dsp_upsample
-    .size       dsp_upsample,.usend-dsp_upsample
-/* These routines might benefit from burst transfers but we'll keep them
- * small for now since they're rather light weight
- */
 /****************************************************************************
 * void channels_process_sound_chan_mono(int count, int32_t *buf[])
@@ -216,31 +268,39 @@ dsp_upsample:
 * Mix left and right channels 50/50 into a center channel.
 */
    .section    .text
+        .align      2
    .global     channels_process_sound_chan_mono
 channels_process_sound_chan_mono:
    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -12(%sp), %sp           | save registers
+    lea.l       -20(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
-    movem.l     %d1-%d3, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
    move.l      #0x40000000, %d3        | %d3 = 0.5
-1:
+    move.l      (%a0)+, %d1             | prime the input registers
-    move.l     (%a0), %d1               | L = R = l/2 + r/2
+    move.l      (%a1)+, %d2             |
-    mac.l      %d1, %d3, (%a1), %d2, %acc0 |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 |
-    mac.l      %d2, %d3, %acc0          |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 |
-    movclr.l   %acc0, %d1               |
+    subq.l      #1, %d0                 |
-    move.l     %d1, (%a0)+              | output to original buffer
+    ble.s       20f | loop done         |
-    move.l     %d1, (%a1)+              |
+10: | loop                              |
-    subq.l     #1, %d0                  |
+    movclr.l    %acc0, %d4              | L = R = l/2 + r/2
-    bgt.s      1b                       |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 |
-    movem.l    (%sp), %d1-%d3           | restore registers
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 |
-    move.l     %d1, %macsr              |
+    move.l      %d4, (%a2)+             | output to original buffer
-    lea.l      12(%sp), %sp             | cleanup
+    move.l      %d4, (%a3)+             |
-    rts
+    subq.l      #1, %d0                 |
-.cpmono_end:
+    bgt.s       10b | loop              |
-    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
 /****************************************************************************
 * void channels_process_sound_chan_custom(int count, int32_t *buf[])
@@ -248,34 +308,47 @@ channels_process_sound_chan_mono:
 * Apply stereo width (narrowing/expanding) effect.
 */
    .section    .text
+        .align      2
    .global     channels_process_sound_chan_custom
 channels_process_sound_chan_custom:
    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
+    lea.l       -28(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
+    movem.l     %d2-%d6/%a2-%a3, (%sp)  |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
    move.l      dsp_sw_gain, %d3        | load straight (mid) gain
    move.l      dsp_sw_cross, %d4       | load cross (side) gain
-1:
+    move.l      (%a0)+, %d1             | prime the input registers
-    move.l      (%a0), %d1              |
+    move.l      (%a1)+, %d2             |
-    mac.l       %d1, %d3, (%a1), %d2, %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
-    mac.l       %d1, %d4            , %acc1 |  R = r*gain + l*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
-    mac.l       %d2, %d4            , %acc0 |
+    mac.l       %d2, %d4             , %acc0 |
-    mac.l       %d2, %d3            , %acc1 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
-    movclr.l    %acc0, %d1              |
-    movclr.l    %acc1, %d2              |
-    move.l      %d1, (%a0)+             |
-    move.l      %d2, (%a1)+             |
    subq.l      #1, %d0                 |
-    bgt.s       1b                      |
+    ble.b       20f | loop done         |
-    movem.l     (%sp), %d1-%d4          | restore registers
+10: | loop                              |
-    move.l      %d1, %macsr             |
+    movclr.l    %acc0, %d5              |
-    lea.l       16(%sp), %sp            | cleanup
+    movclr.l    %acc1, %d6              |
-    rts
+15: | loop start                        |
-.cpcustom_end:
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
-    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
+    move.l      %d5, (%a2)+             |
+    move.l      %d6, (%a3)+             |
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d5              | output last sample
+    movclr.l    %acc1, %d6              |
+    move.l      %d5, (%a2)              |
+    move.l      %d6, (%a3)              |
+    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
+    lea.l       28(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
 /****************************************************************************
 *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
@@ -283,31 +356,42 @@ channels_process_sound_chan_custom:
 *  Separate channels into side channels.
 */
    .section    .text
+        .align      2
    .global     channels_process_sound_chan_karaoke
 channels_process_sound_chan_karaoke:
    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
+    lea.l       -20(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
-    movem.l     %d1-%d4, (%sp)          |
+    movem.l     (%a0), %a0-%a1          | get channel src pointers
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    move.l      %a0, %a2                | use separate dst pointers since read
-    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a1, %a3                | pointers run one ahead of write
-    move.l      #0x40000000, %d4        | %d3 = 0.5
+    move.l      #0x40000000, %d3        | %d3 = 0.5
-1:
+    move.l      (%a0)+, %d1             | prime the input registers
-    move.l     (%a0), %d1               |
+    move.l      (%a1)+, %d2             |
-    msac.l     %d1, %d4, (%a1), %d2, %acc0 | R = r/2 - l/2
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
-    mac.l      %d2, %d4            , %acc0 |
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
-    movclr.l   %acc0, %d1               |
+    subq.l      #1, %d0                 |
-    move.l     %d1, (%a1)+              |
+    ble.b       20f | loop done         |
-    neg.l      %d1                      | L = -R = -(r/2 - l/2) = l/2 - r/2
+10: | loop                              |
-    move.l     %d1, (%a0)+              |
+    movclr.l    %acc0, %d4              |
-    subq.l     #1, %d0                  |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
-    bgt.s      1b                       |
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
-    movem.l    (%sp), %d1-%d4           | restore registers
+    move.l      %d4, (%a2)+             |
-    move.l     %d1, %macsr              |
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
-    lea.l      16(%sp), %sp             | cleanup
+    move.l      %d4, (%a3)+             |
-    rts
+    subq.l      #1, %d0                 |
-.cpkaraoke_end:
+    bgt.s       10b | loop              |
-    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
 /****************************************************************************
 * void sample_output_stereo(int count, struct dsp_data *data,
 *                               int32_t *src[], int16_t *dst)
@@ -329,6 +413,7 @@ channels_process_sound_chan_karaoke:
 *
 */
    .section   .text
+        .align      2
    .global    sample_output_stereo
 sample_output_stereo:
    lea.l       -44(%sp), %sp             | save registers
@@ -348,11 +433,11 @@ sample_output_stereo:
    add.l       %a4, %d0                  |
    and.l       #0xfffffff0, %d0          |
    cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .sos_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
    sub.l       #16, %d0                  | %d1 = first line bound
    cmp.l       %a4, %d0                  | any leading longwords?
-    bls.b       .sos_lineloop_start       | no? jump to line loop
+    bls.b       20f | line loop start     | no? start line loop
-.sos_longloop_0:
+10: | long loop 0                         |
    move.l      (%a2)+, %d1               | read longword from L and R
    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
    mac.l       %d2, %a1, %acc1           | shift R to high word
@@ -362,10 +447,10 @@ sample_output_stereo:
    move.w      %d2, %d1                  | interleave MS 16 bits of each 
    move.l      %d1, (%a4)+               | ...and write both
    cmp.l       %a4, %d0                  |
-    bhi.b       .sos_longloop_0           |
+    bhi.b       10b | long loop 0         |
-.sos_lineloop_start:
+20: | line loop start                     |
    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
-.sos_lineloop:
+30: | line loop                           |
    move.l      (%a3)+, %d4               | get next 4 R samples and scale
    mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
    mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
@@ -394,11 +479,11 @@ sample_output_stereo:
    move.w      %d7, %d3                  |
    movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
    cmp.l       %a4, %a5                  |
-    bhi.b       .sos_lineloop             |
+    bhi.b       30b | line loop           |
-.sos_longloop_1_start:
+40: | long loop 1 start                   |
    cmp.l       %a4, %a0                  | any longwords left?
-    bls.b       .sos_done                 | no? finished.
+    bls.b       60f | output end          | no? stop
-.sos_longloop_1:
+50: | long loop 1                         |
    move.l      (%a2)+, %d1               | handle trailing longwords
    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
    mac.l       %d2, %a1, %acc1           |
@@ -408,14 +493,13 @@ sample_output_stereo:
    move.w      %d2, %d1                  |
    move.l      %d1, (%a4)+               |
    cmp.l       %a4, %a0                  |
-    bhi.b       .sos_longloop_1           |
+    bhi.b       50b | long loop 1         |
-.sos_done:
+60: | output end                          |
    movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
    move.l      %d1, %macsr               |
    lea.l       44(%sp), %sp              | cleanup
    rts                                   |
-.sos_end:
+    .size      sample_output_stereo, .-sample_output_stereo
-    .size      sample_output_stereo, .sos_end-sample_output_stereo
 /****************************************************************************
 * void sample_output_mono(int count, struct dsp_data *data,
@@ -424,6 +508,7 @@ sample_output_stereo:
 * Same treatment as sample_output_stereo but for one channel.
 */
    .section   .text
+        .align      2
    .global    sample_output_mono
 sample_output_mono:
    lea.l       -28(%sp), %sp             | save registers
@@ -442,11 +527,11 @@ sample_output_mono:
    add.l       %a3, %d0                  |
    and.l       #0xfffffff0, %d0          |
    cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .som_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
    sub.l       #16, %d0                  | %d1 = first line bound
    cmp.l       %a3, %d0                  | any leading longwords?
-    bls.b       .som_lineloop_start       | no? jump to line loop
+    bls.b       20f | line loop start     | no? start line loop
-.som_longloop_0:
+10: | long loop 0                         |
    move.l      (%a2)+, %d1               | read longword from L and R
    mac.l       %d1, %d5, %acc0           | shift L to high word
    movclr.l    %acc0, %d1                | get possibly saturated results
@@ -455,10 +540,10 @@ sample_output_mono:
    move.w      %d2, %d1                  | duplicate single channel into
    move.l      %d1, (%a3)+               | L and R
    cmp.l       %a3, %d0                  |
-    bhi.b       .som_longloop_0           |
+    bhi.b       10b | long loop 0         |
-.som_lineloop_start:
+20: | line loop start                     |
    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
-.som_lineloop:
+30: | line loop                           |
    move.l      (%a2)+, %d0               | get next 4 L samples and scale
    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
@@ -483,11 +568,11 @@ sample_output_mono:
    move.w      %d4, %d3                  |
    movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
    cmp.l       %a3, %a1                  |
-    bhi.b       .som_lineloop             |
+    bhi.b       30b | line loop           |
-.som_longloop_1_start:
+40: | long loop 1 start                   |
    cmp.l       %a3, %a0                  | any longwords left?
-    bls.b       .som_done                 | no? finished.
+    bls.b       60f | output end          | no? stop
-.som_longloop_1:
+50: | long loop 1                         |
    move.l      (%a2)+, %d1               | handle trailing longwords
    mac.l       %d1, %d5, %acc0           | the same way as leading ones
    movclr.l    %acc0, %d1                |
@@ -496,11 +581,10 @@ sample_output_mono:
    move.w      %d2, %d1                  |
    move.l      %d1, (%a3)+               |
    cmp.l       %a3, %a0                  |
-    bhi.b       .som_longloop_1           |
+    bhi.b       50b | long loop 1         |
-.som_done:
+60: | output end                          |
    movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
    move.l      %d1, %macsr               |
    lea.l       28(%sp), %sp              | cleanup
    rts                                   |
-.som_end:
+    .size      sample_output_mono, .-sample_output_mono
-    .size      sample_output_mono, .som_end-sample_output_mono