1 files changed, 305 insertions, 197 deletions
diff --git a/lib/rbcodec/dsp/dsp_cf.S b/lib/rbcodec/dsp/dsp_cf.S
index 15ec7eb383..c710df5177 100644
--- a/lib/rbcodec/dsp/dsp_cf.S
+++ b/lib/rbcodec/dsp/dsp_cf.S
@@ -19,23 +19,27 @@
 * KIND, either express or implied.
 *
 ****************************************************************************/
+#include "config.h"
 /****************************************************************************
- * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+ * void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p)
 */
    .section    .text
    .align      2
-    .global     dsp_apply_gain
+    .global     pga_process
-dsp_apply_gain:
+pga_process:
+    | input: 4(sp) = this, 8(sp) = buf_p
+    movem.l     4(%sp), %a0-%a1         | %a0 = this, %a1 = buf_p
+    move.l      (%a0), %a0              | %a0 = this->data = &pga_data
+    move.l      (%a0), %a0              | %a0 = data->gain
+    move.l      (%a1), %a1              | %a1 = buf = *buf_p
    lea.l       -20(%sp), %sp           | save registers
    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
-    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+    clr.l       %d1                     | %d1 = buf->format.num_channels
-                                        | %a1 = buf
+    move.b      17(%a1), %d1            |
-    move.l      4(%a0), %d1             | %d1 = data->num_channels
-    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
 10: | channel loop                      |
-    move.l      24(%sp), %d0            | %d0 = count
+    move.l      (%a1), %d0              | %d0 = buf->remcount
-    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      (%a1, %d1.l*4), %a2     | %a2 = s = buf->p32[ch-1]
    move.l      %a2, %a3                | %a3 = d = s
    move.l      (%a2)+, %d2             | %d2 = *s++,
    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
@@ -61,25 +65,29 @@ dsp_apply_gain:
    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
    lea.l       20(%sp), %sp            | cleanup stack
    rts                                 |
-    .size       dsp_apply_gain,.-dsp_apply_gain
+    .size       pga_process, .-pga_process
 /****************************************************************************
- * void apply_crossfeed(int count, int32_t *buf[])
+ * void crossfeed_process(struct dsp_proc_entry *this,
+ *                        struct dsp_buffer **buf_p)
 */
    .section    .text
    .align      2
-    .global     apply_crossfeed
+    .global     crossfeed_process
-apply_crossfeed:
+crossfeed_process:
+    | input: 4(sp) = this, 8(sp) = buf_p
    lea.l       -44(%sp), %sp           |
    movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
-    movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
+    movem.l     48(%sp), %a1/%a4        | %a1 = this, %a4 = buf_p
-    movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
+    move.l      (%a4), %a4              | %a4 = buf = *buf_p
-    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
+    movem.l     (%a4), %d7/%a4-%a5      | %d7 = buf->remcount, %a4 = buf->p32[0],
+                                        | %a5 = buf->p32[1]
+    move.l      (%a1), %a1              | %a1 = &crossfeed_state
    move.l      (%a1)+, %d6             | %d6 = direct gain
    movem.l     12(%a1), %d0-%d3        | fetch filter history samples
-    move.l      132(%a1), %a0           | fetch delay line address
+    lea.l       132(%a1), %a6           | %a6 = delay line wrap limit
+    move.l      (%a6), %a0              | fetch delay line address
    movem.l     (%a1), %a1-%a3          | load filter coefs
-    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
    bra.b       20f | loop start        | go to loop start point
    /* Register usage in loop:
     * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
@@ -109,174 +117,181 @@ apply_crossfeed:
    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
    cmp.l       %a6, %a0                | wrap %a0 if passed end
    bhs.b       30f | wrap buffer       |
-    .word       0x51fb | tpf.l          | trap the buffer wrap
+    tpf.l                               | trap the buffer wrap
 30: | wrap buffer                       | ...fwd taken branches more costly
-    lea.l       -104(%a0), %a0          | wrap it up
+    lea.l       -104(%a6), %a0          | wrap it up
    subq.l      #1, %d7                 | --count > 0 ?
    bgt.b       10b | loop              | yes? do more
    movclr.l    %acc0, %d4              | write last outputs
    move.l      %d4, (%a4)              | .
    movclr.l    %acc1, %d5              | .
    move.l      %d5, (%a5)              | .
-    lea.l       crossfeed_data+16, %a1  | save data back to struct
+    movem.l     %d0-%d3, -120(%a6)      | ...history
-    movem.l     %d0-%d3, (%a1)          | ...history
+    move.l      %a0, (%a6)              | ...delay_p
-    move.l      %a0, 120(%a1)           | ...delay_p
    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
    lea.l       44(%sp), %sp            |
    rts                                 |
-    .size       apply_crossfeed,.-apply_crossfeed
+    .size       crossfeed_process,.-crossfeed_process
 /****************************************************************************
- * int dsp_downsample(int count, struct dsp_data *data,
+ * int lin_resample_resample(struct resample_data *data,
- *                    in32_t *src[], int32_t *dst[])
+ *                           struct dsp_buffer *src,
+ *                           struct dsp_buffer *dst)
 */
    .section    .text
    .align      2
-    .global     dsp_downsample
+    .global     lin_resample_resample
-dsp_downsample:
+lin_resample_resample:
-    lea.l       -40(%sp), %sp           | save non-clobberables
+    | input: 4(sp) = data, 8(sp) = src, 12(sp) = dst
-    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
+    lea.l       -44(%sp), %sp           | save non-volatiles
-    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
+    movem.l     %d2-%d7/%a2-%a6, (%sp)  |
-                                        | %a0 = data
+    movem.l     48(%sp), %a0-%a2        | %a0 = data
                                        | %a1 = src
                                        | %a2 = dst
-    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
+    clr.l       %d1                     | %d1 = ch = src->format.num_channels
-                                        | %d4 = delta = data->resample_data.delta
+    move.b      17(%a1), %d1            |
    moveq.l     #16, %d7                | %d7 = shift
-10: | channel loop                      |
+.lrs_channel_loop:                      |
-    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
+    movem.l     (%a0), %d2-%d3          | %d2 = delta = data->delta,
-    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
+                                        | %d3 = phase = data->phase
-    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a1), %d4              | %d4 = srcrem = src->remcount
-    lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &data->resample_data.ast_sample[ch-1]
+    move.l      12(%a2), %d5            | %d5 = dstrem = dst->bufcount
-    move.l      (%a5), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
+    cmp.l       #0x8000, %d4            | %d4 = MIN(srcrem, 0x8000)
-    move.l      -4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1]
+    ble.b       10f                     |
-    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    move.l      #0x8000, %d4            |
-    lsr.l       %d7, %d6                |
+10:                                     |
-    cmp.l       %d2, %d6                | past end of samples?
+    move.l      (%a1, %d1.l*4), %a3     | %a3 = s = src->p32[ch-1]
-    bge.b       40f | skip resample loop| yes? skip loop
+    move.l      (%a2, %d1.l*4), %a4     | %a4 = d = dst->p32[ch-1]
-    tst.l       %d6                     | need last sample of prev. frame?
+    move.l      %d3, %d0                | %d0 = pos
-    bne.b       20f | resample loop     | no? start main loop
+    lsr.l       %d7, %d0                | ...
-    move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
+    beq.b       11f                     | pos == 0?
-    bra.b       30f | resample start last | start with last (last in %d0)
+    cmp.l       %d4, %d0                | pos = MIN(pos, srcrem)
-20: | resample loop                     |
+    blt.b       12f                     |
-    lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
+    move.l      %d4, %d0                | pos = srcrem
-    movem.l     (%a5), %d0-%d1          |
+    move.l      -4(%a3, %d0.l*4), %d6   | %d6 = last = s[pos - 1]
-30: | resample start last               |
+    bra.w       .lrs_channel_complete   | at limit; nothing to do but next
-    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+11:                                     |
-    move.l      %d0, %acc0              | %acc0 = previous sample
+    move.l      4(%a0, %d1.l*4), %d6    | %d6 = last = last_sample[ch]
-    move.l      %d5, %d0                | frac = (phase << 16) >> 1
+    tpf.l                               | trap next move.l (last = s[pos - 1])
+12:                                     |
+    move.l      -4(%a3, %d0.l*4), %d6   | %d6 = last = s[pos - 1]
+    cmp.l       #0x10000, %d2           | delta >= 1.0?
+    bhs.b       .lrs_downsample         | yes? downsampling
+                                        |
+    /** Upsampling **/                  |
+    lea.l       (%a3, %d0.l*4), %a3     | %a3 = &s[pos]
+    sub.l       %d4, %d0                | %d0 = pos - srcrem = -dte
+    lsl.l       %d7, %d2                | move delta to bits 30..15
+    lsr.l       #1, %d2                 |
+    lsl.l       %d7, %d3                | move phase to bits 30..15
+    lsr.l       #1, %d3                 |
+    move.l      (%a3)+, %a5             | %a5 = s[pos]
+    move.l      %a5, %a6                | %a6 = diff = s[pos] - last
+    sub.l       %d6, %a6                |
+    bra.b       22f                     |
+    /* Funky loop structure is to avoid emac latency stalls */
+20:                                     |
+    move.l      (%a3)+, %a5             | %a5 = s[pos]
+    move.l      %a5, %a6                | %a6 = diff = s[pos] - last
+    sub.l       %d6, %a6                |
+21:                                     |
+    movclr.l    %acc0, %d7              | *d++ = %d7 = result
+    move.l      %d7, (%a4)+             |
+22:                                     |
+    move.l      %d6, %acc0              | %acc0 = last
+    mac.l       %d3, %a6, %acc0         | %acc0 += frac * diff
+    subq.l      #1, %d5                 | dstrem <= 0?
+    ble.b       23f                     | yes? stop
+    add.l       %d2, %d3                | phase += delta
+    bpl.b       21b                     | load next values?
+    move.l      %a5, %d6                |
+    bclr.l      #31, %d3                | clear sign bit
+    addq.l      #1, %d0                 | dte > 0?
+    bmi.b       20b                     | yes? continue resampling
+    tpf.w                               | trap next add.l (phase += delta)
+23:                                     |
+    add.l       %d2, %d3                | phase += delta
+    lsl.l       #1, %d3                 | frac -> phase
+    bcs.b       24f                     | was sign bit set?
+    tpf.l                               |
+24:                                     |
+    move.l      %a5, %d6                | yes? was going to move to new s[pos]
+    addq.l      #1, %d0                 |
+    movclr.l    %acc0, %d7              | *d = %d7 = result
+    move.l      %d7, (%a4)              |
+    add.l       %d4, %d0                | %d0 = -dte + srcrem = pos
+    or.l        %d0, %d3                | restore phase
+    swap.w      %d3                     |
+    moveq.l     #16, %d7                | %d7 = shift
+    bra.b       .lrs_channel_complete   |
+                                        |
+    /** Downsampling **/                |
+.lrs_downsample:                        |
+    move.l      (%a3, %d0.l*4), %a5     | %a5 = s[pos]
+    bra.b       31f                     |
+30:                                     |
+    lea.l       -4(%a3, %d0.l*4), %a5   | %d6 = s[pos - 1], %a5 = s[pos]
+    movem.l     (%a5), %d6/%a5          |
+31:                                     |
+    move.l      %d6, %acc0              | %acc0 = last
+    sub.l       %d6, %a5                | %a5 = diff = s[pos] - s[pos - 1]
+    move.l      %d3, %d0                | frac = (phase << 16) >> 1
    lsl.l       %d7, %d0                |
    lsr.l       #1, %d0                 |
-    mac.l       %d0, %d1, %acc0         | %acc0 += frac * diff
+    mac.l       %d0, %a5, %acc0         | %acc0 += frac * diff
-    add.l       %d4, %d5                | phase += delta
+    add.l       %d2, %d3                | phase += delta
-    move.l      %d5, %d6                | pos = phase >> 16
+    move.l      %d3, %d0                | pos = phase >> 16
-    lsr.l       %d7, %d6                |
+    lsr.l       %d7, %d0                |
-    movclr.l    %acc0, %d0              |
+    movclr.l    %acc0, %a5              |
-    move.l      %d0, (%a4)+             | *d++ = %d0
+    move.l      %a5, (%a4)+             | *d++ = %a5 (result)
-    cmp.l       %d2, %d6                | pos < count?
+    subq.l      #1, %d5                 | dst full?
-    blt.b       20b | resample loop     | yes? continue resampling
+    ble.b       32f                     | yes? stop
-40: | skip resample loop                |
+    cmp.l       %d4, %d0                | pos < srcrem?
-    subq.l      #1, %d3                 | ch > 0?
+    blt.b       30b                     | yes? continue resampling
-    bgt.b       10b | channel loop      | yes? process next channel
+    tpf.l                               | trap cmp.l and ble.b
-    lsl.l       %d7, %d2                | wrap phase to start of next frame
+32:                                     |
-    sub.l       %d2, %d5                | data->resample_data.phase =
+    cmp.l       %d4, %d0                | pos = MIN(pos, srcrem)
-    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
+    ble.b       33f                     |
-    move.l      %a4, %d0                | return d - d[0]
+    move.l      %d4, %d0                |
-    sub.l       (%a2), %d0              |
+33:                                     |
-    asr.l       #2, %d0                 | convert bytes->samples
+    move.l      -4(%a3, %d0.l*4), %d6   | %d6 = s[pos - 1]
-    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
+                                        |
-    lea.l       40(%sp), %sp            | cleanup stack
+.lrs_channel_complete:                  |
+    move.l      %d6, 4(%a0, %d1.l*4)    | last_sample[ch] = last
+    subq.l      #1, %d1                 | ch > 0?
+    bgt.w       .lrs_channel_loop       | yes? process next channel
+                                        |
+    move.l      12(%a2), %d1            | %d1 = dst->bufcount
+    sub.l       %d5, %d1                | written = dst->bufcount - dstrem
+    move.l      %d1, (%a2)              | dst->remcount = written
+    move.l      %d0, %d1                | wrap phase to position in next frame
+    lsl.l       %d7, %d1                | data->phase = phase - (pos << 16)
+    sub.l       %d1, %d3                | ...
+    move.l      %d3, 4(%a0)             | ...
+    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore non-volatiles
+    lea.l       44(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
-    .size       dsp_downsample,.-dsp_downsample
-/****************************************************************************
+    .size       lin_resample_resample, .-lin_resample_resample
- * int dsp_upsample(int count, struct dsp_data *dsp,
- *                  const int32_t *src[], int32_t *dst[])
- */
-    .section    .text
-    .align      2
-    .global     dsp_upsample
-dsp_upsample:
-    lea.l       -40(%sp), %sp           | save non-clobberables
-    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
-    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
-                                        | %a0 = data
-                                        | %a1 = src
-                                        | %a2 = dst
-    movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
-                                        | %d4 = delta = data->resample_data.delta
-    swap        %d4                     | swap delta to high word to use...
-                                        | ...carries to increment position
-10: | channel loop                      |
-    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
-    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
-    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &src[count-1]
-    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
-    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
-    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
-                                        | ...away later but we'll be preincremented
-    move.l      %d1, %d6                | save sample value
-    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
-    swap        %d5                     | swap phase to high word to use
-                                        | carries to increment position
-    move.l      %d5, %d7                | %d7 = pos = phase >> 16
-    clr.w       %d5                     |
-    eor.l       %d5, %d7                | pos == 0?
-    beq.b       40f | loop start        | yes? start loop
-    cmp.l       %d2, %d7                | past end of samples?
-    bge.b       50f | skip resample loop| yes? go to next channel and collect info
-    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
-    movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
-    move.l      %d1, %d6                | save sample value
-    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
-    bra.b       40f | loop start        |
-20: | next sample loop                  |
-    move.l      %d6, %d0                | move previous sample to %d0
-    move.l      (%a3)+, %d1             | fetch next sample
-    move.l      %d1, %d6                | save sample value
-    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
-30: | same sample loop                  |
-    movclr.l    %acc0, %d7              | %d7 = result
-    move.l      %d7, (%a4)+             | *d++ = %d7
-40: | loop start                        |
-    lsr.l       #1, %d5                 | make phase into frac
-    move.l      %d0, %acc0              | %acc0 = s[pos-1]
-    mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
-    lsl.l       #1, %d5                 | restore frac to phase
-    add.l       %d4, %d5                | phase += delta
-    bcc.b       30b | same sample loop  | load next values?
-    cmp.l       %a5, %a3                | src <= src_end?
-    bls.b       20b | next sample loop  | yes? continue resampling
-    movclr.l    %acc0, %d7              | %d7 = result
-    move.l      %d7, (%a4)+             | *d++ = %d7
-50: | skip resample loop                |
-    subq.l      #1, %d3                 | ch > 0?
-    bgt.b       10b | channel loop      | yes? process next channel
-    swap        %d5                     | wrap phase to start of next frame
-    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
-    move.l      %a4, %d0                | return d - d[0]
-    sub.l       (%a2), %d0              |
-    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
-    asr.l       #2, %d0                 | convert bytes->samples
-    lea.l       40(%sp), %sp            | cleanup stack
-    rts                                 | buh-bye
-    .size       dsp_upsample,.-dsp_upsample
 /****************************************************************************
- * void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ * void channel_mode_proc_mono(struct dsp_proc_entry *this,
+ *                             struct dsp_buffer **buf_p)
 *
 * Mix left and right channels 50/50 into a center channel.
 */
    .section    .text
    .align      2
-    .global     channels_process_sound_chan_mono
+    .global     channel_mode_proc_mono
-channels_process_sound_chan_mono:
+channel_mode_proc_mono:
-    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    | input: 4(sp) = this, 8(sp) = buf_p
+    move.l      8(%sp), %a0             | %a0 = buf_p
+    move.l      (%a0), %a0              | %a0 = buf = *buf_p
    lea.l       -20(%sp), %sp           | save registers
    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
-    movem.l     (%a0), %a0-%a1          | get channel pointers
+    movem.l     (%a0), %d0/%a0-%a1      | %d0 = buf->remcount, %a0 = buf->p32[0],
+                                        | %a1 = buf->p32[1]
    move.l      %a0, %a2                | use separate dst pointers since read
    move.l      %a1, %a3                | pointers run one ahead of write
    move.l      #0x40000000, %d3        | %d3 = 0.5
@@ -301,26 +316,29 @@ channels_process_sound_chan_mono:
    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
    lea.l       20(%sp), %sp            | cleanup
    rts                                 |
-    .size       channels_process_sound_chan_mono, \
+    .size       channel_mode_proc_mono, .-channel_mode_proc_mono
-                .-channels_process_sound_chan_mono
 /****************************************************************************
- * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ * void channel_mode_proc_custom(struct dsp_proc_entry *this,
+ *                               struct dsp_buffer **buf_p)
 *
 * Apply stereo width (narrowing/expanding) effect.
 */
    .section    .text
    .align      2
-    .global     channels_process_sound_chan_custom
+    .global     channel_mode_proc_custom
-channels_process_sound_chan_custom:
+channel_mode_proc_custom:
-    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    | input: 4(sp) = this, 8(sp) = buf_p
    lea.l       -28(%sp), %sp           | save registers
    movem.l     %d2-%d6/%a2-%a3, (%sp)  |
-    movem.l     (%a0), %a0-%a1          | get channel pointers
+    movem.l     32(%sp), %a0-%a1        | %a0 = this, %a1 = buf_p
+    move.l      (%a1), %a1              | %a1 = buf = *buf_p
+    move.l      (%a0), %a2              | %a2 = this->data = &channel_mode_data
+    movem.l     (%a1), %d0/%a0-%a1      | %d0 = buf->remcount, %a0 = buf->p32[0],
+                                        | %a1 = buf->p32[1]
+    movem.l     (%a2), %d3-%d4          | %d3 = sw_gain, %d4 = sw_cross
    move.l      %a0, %a2                | use separate dst pointers since read
    move.l      %a1, %a3                | pointers run one ahead of write
-    move.l      dsp_sw_gain, %d3        | load straight (mid) gain
-    move.l      dsp_sw_cross, %d4       | load cross (side) gain
    move.l      (%a0)+, %d1             | prime the input registers
    move.l      (%a1)+, %d2             |
    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
@@ -348,22 +366,25 @@ channels_process_sound_chan_custom:
    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
    lea.l       28(%sp), %sp            | cleanup
    rts                                 |
-    .size       channels_process_sound_chan_custom, \
+    .size       channel_mode_proc_custom, .-channel_mode_proc_custom
-                .-channels_process_sound_chan_custom
 /****************************************************************************
- *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ *  void channel_mode_proc_karaoke(struct dsp_proc_entry *this,
+ *                                 struct dsp_buffer **buf_p)
 *
 *  Separate channels into side channels.
 */
    .section    .text
    .align      2
-    .global     channels_process_sound_chan_karaoke
+    .global     channel_mode_proc_karaoke
-channels_process_sound_chan_karaoke:
+channel_mode_proc_karaoke:
-    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    | input: 4(sp) = this, 8(sp) = buf_p
+    move.l      8(%sp), %a0             | %a0 = buf_p
+    move.l      (%a0), %a0              | %a0 = buf = *buf_p
    lea.l       -20(%sp), %sp           | save registers
    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
-    movem.l     (%a0), %a0-%a1          | get channel src pointers
+    movem.l     (%a0), %d0/%a0-%a1      | %d0 = buf->remcount, %a0 = buf->p32[0],
+                                        | %a1 = buf->p32[1]
    move.l      %a0, %a2                | use separate dst pointers since read
    move.l      %a1, %a3                | pointers run one ahead of write
    move.l      #0x40000000, %d3        | %d3 = 0.5
@@ -390,12 +411,90 @@ channels_process_sound_chan_karaoke:
    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
    lea.l       20(%sp), %sp            | cleanup
    rts                                 |
-    .size       channels_process_sound_chan_karaoke, \
+    .size       channel_mode_proc_karaoke, .-channel_mode_proc_karaoke
-                .-channels_process_sound_chan_karaoke
+/****************************************************************************
+ * void filter_process(struct dsp_filter *f, int32_t *buf[], int count,
+ *                     unsigned int channels)
+ *
+ * Define HIGH_PRECISION as '1' to also fold the accumulator's low extension
+ * bits into each output; otherwise "shift" - 1 of the lower bits are lost.
+ */
+#define HIGH_PRECISION 0
+    .text
+    .global filter_process
+filter_process:
+    | input: 4(sp) = f, 8(sp) = buf, 12(sp) = count, 16(sp) = channels
+    lea.l       -44(%sp), %sp           | frame sized for the 11-register case
+#if HIGH_PRECISION
+    movem.l     %d2-%d7/%a2-%a6, (%sp)  | %d7 is only used (and saved) here
+#else
+    movem.l     %d2-%d6/%a2-%a6, (%sp)  | 10 regs; top 4 bytes of frame unused
+#endif
+    move.l      48(%sp), %a5            | fetch filter structure address
+    clr.l       %d6                     | load shift count (byte at f + 52)
+    move.b      52(%a5), %d6            | .
+    subq.l      #1, %d6                 | EMAC gives us one free shift
+#if HIGH_PRECISION
+    moveq.l     #8, %d7                 | %d7 = 8 - %d6: right shift that aligns
+    sub.l       %d6, %d7                | the low extension byte with the result
+#endif
+    movem.l     (%a5), %a0-%a4          | load coefs
+    lea.l       20(%a5), %a5            | point to filter history
+10: | channel loop
+    move.l      52(%sp), %a6            | load input channel pointer
+    addq.l      #4, 52(%sp)             | point x to next channel
+    move.l      (%a6), %a6              |
+    move.l      56(%sp), %d5            | number of samples
+    movem.l     (%a5), %d0-%d3          | load filter history
+    | d0-d3 = history, d4 = temp, d5 = sample count, d6 = upper shift amount,
+    | d7 = lower shift amount, a0-a4 = coefs, a5 = history pointer, a6 = buf[ch]
+20: | loop
+    | Direct form 1 filtering code. We assume DSP has put EMAC in frac mode.
+    | y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+    | where y[] is output and x[] is input. This is performed out of order
+    | to do parallel load of input value.
+    mac.l       %a2, %d1, %acc0         | acc = b2*x[i - 2]
+    move.l      %d0, %d1                | fix input history
+    mac.l       %a1, %d0, (%a6), %d0, %acc0 | acc += b1*x[i - 1], x[i] -> d0
+    mac.l       %a0, %d0, %acc0         | acc += b0*x[i]
+    mac.l       %a3, %d2, %acc0         | acc += a1*y[i - 1]
+    mac.l       %a4, %d3, %acc0         | acc += a2*y[i - 2]
+    move.l      %d2, %d3                | fix output history
+#if HIGH_PRECISION
+    move.l      %accext01, %d4          | fetch lower part of accumulator
+    and.l       #0xff, %d4              | really clear the upper three bytes
+    lsr.l       %d7, %d4                | shift lower bits
+#endif
+    movclr.l    %acc0, %d2              | fetch upper part of result
+    asl.l       %d6, %d2                | restore fixed point format
+#if HIGH_PRECISION
+    or.l        %d4, %d2                | merge low bits into the stored y[i]
+#endif
+    move.l      %d2, (%a6)+             | save result
+    subq.l      #1, %d5                 | are we done with this channel?
+    bgt         20b | loop
+    movem.l     %d0-%d3, (%a5)          | save history back to struct
+    lea.l       16(%a5), %a5            | point to next channel's history
+    subq.l      #1, 60(%sp)             | any channels left to process?
+    bhi         10b | channel loop
+#if HIGH_PRECISION
+    movem.l     (%sp), %d2-%d7/%a2-%a6
+#else
+    movem.l     (%sp), %d2-%d6/%a2-%a6
+#endif
+    lea.l       44(%sp), %sp
+    rts
+    .size       filter_process, .-filter_process
 /****************************************************************************
- * void sample_output_stereo(int count, struct dsp_data *data,
+ * void sample_output_stereo(struct sample_io_data *this,
- *                           const int32_t *src[], int16_t *dst)
+ *                           struct dsp_buffer *src,
+ *                           struct dsp_buffer *dst)
 *
 * Framework based on the ubiquitous Rockbox line transfer logic for
 * Coldfire CPUs.
@@ -417,20 +516,24 @@ channels_process_sound_chan_karaoke:
    .align      2
    .global    sample_output_stereo
 sample_output_stereo:
+    | input: 4(sp) = this, 8(sp) = src, 12(sp) = dst
    lea.l       -48(%sp), %sp             | save registers
    move.l      %macsr, %d1               | do it now as at many lines will
    movem.l     %d1-%d7/%a2-%a6, (%sp)    | be the far more common condition
    move.l      #0x80, %macsr             | put emac unit in signed int mode
-    movem.l     52(%sp), %a0-%a2/%a4      |
+    movem.l     52(%sp), %a0-%a2          | %a0 = this, %a1 = src, %a2 = dst
-    lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address
+    move.l      (%a0), %a0                | %a0 = this->outcount
-    move.l      (%a1), %d1                | %a1 = multiplier: (1 << (16 - scale))
+    move.l      4(%a2), %a4               | %a4 = dst->p16out
+    lea.l       (%a4, %a0.l*4), %a0       | %a0 = count -> end address
+    movem.l     4(%a1), %a2-%a3           | %a2 = src->p32[0], %a3 = src->p32[1]
+    clr.l       %d1                       | %a1 = multiplier: (1 << (16 - scale))
+    move.b      19(%a1), %d1              | %d1 = src->format.output_scale
    sub.l       #16, %d1                  |
    neg.l       %d1                       |
    moveq.l     #1, %d0                   |
    asl.l       %d1, %d0                  |
    move.l      %d0, %a1                  |
    move.l      #0x8000, %a6              | %a6 = rounding term
-    movem.l     (%a2), %a2-%a3            | get L/R channel pointers
    moveq.l     #28, %d0                  | %d0 = second line bound
    add.l       %a4, %d0                  |
    and.l       #0xfffffff0, %d0          |
@@ -447,7 +550,7 @@ sample_output_stereo:
    mac.l       %d2, %a1, %acc1           | shift R to high word
    movclr.l    %acc0, %d1                | get possibly saturated results
    movclr.l    %acc1, %d2                |
-    swap        %d2                       | move R to low word
+    swap.w      %d2                       | move R to low word
    move.w      %d2, %d1                  | interleave MS 16 bits of each
    move.l      %d1, (%a4)+               | ...and write both
    cmp.l       %a4, %d0                  |
@@ -477,10 +580,10 @@ sample_output_stereo:
    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 | with saturation
    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
    mac.l       %d3, %a1             , %acc3 |
-    swap        %d4                       | a) interleave most significant...
+    swap.w      %d4                       | a) interleave most significant...
-    swap        %d5                       |
+    swap.w      %d5                       |
-    swap        %d6                       |
+    swap.w      %d6                       |
-    swap        %d7                       |
+    swap.w      %d7                       |
    movclr.l    %acc0, %d0                | obtain L results
    movclr.l    %acc1, %d1                |
    movclr.l    %acc2, %d2                |
@@ -503,7 +606,7 @@ sample_output_stereo:
    mac.l       %d2, %a1, %acc1           |
    movclr.l    %acc0, %d1                |
    movclr.l    %acc1, %d2                |
-    swap        %d2                       |
+    swap.w      %d2                       |
    move.w      %d2, %d1                  |
    move.l      %d1, (%a4)+               |
    cmp.l       %a4, %a0                  |
@@ -516,8 +619,9 @@ sample_output_stereo:
    .size      sample_output_stereo, .-sample_output_stereo
 /****************************************************************************
- * void sample_output_mono(int count, struct dsp_data *data,
+ * void sample_output_mono(struct sample_io_data *this,
- *                         const int32_t *src[], int16_t *dst)
+ *                         struct dsp_buffer *src,
+ *                         struct dsp_buffer *dst)
 *
 * Same treatment as sample_output_stereo but for one channel.
 */
@@ -525,19 +629,23 @@ sample_output_stereo:
    .align      2
    .global    sample_output_mono
 sample_output_mono:
+    | input: 4(sp) = this, 8(sp) = src, 12(sp) = dst
    lea.l       -32(%sp), %sp             | save registers
    move.l      %macsr, %d1               | do it now as at many lines will
    movem.l     %d1-%d5/%a2-%a4, (%sp)    | be the far more common condition
    move.l      #0x80, %macsr             | put emac unit in signed int mode
-    movem.l     36(%sp), %a0-%a3          |
+    movem.l     36(%sp), %a0-%a2          | %a0 = this, %a1 = src, %a2 = dst
-    lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address
+    move.l      (%a0), %a0                | %a0 = this->outcount
-    move.l      (%a1), %d1                | %d5 = multiplier: (1 << (16 - scale))
+    move.l      4(%a2), %a3               | %a3 = dst->p16out
+    movem.l     4(%a1), %a2               | %a2 = src->p32[0]
+    lea.l       (%a3, %a0.l*4), %a0       | %a0 = count -> end address
+    clr.l       %d1                       | %d5 = multiplier: (1 << (16 - scale))
+    move.b      19(%a1), %d1              | %d1 = src->format.output_scale
    sub.l       #16, %d1                  |
    neg.l       %d1                       |
    moveq.l     #1, %d5                   |
    asl.l       %d1, %d5                  |
    move.l      #0x8000, %a4              | %a4 = rounding term
-    movem.l     (%a2), %a2                | get source channel pointer
    moveq.l     #28, %d0                  | %d0 = second line bound
    add.l       %a3, %d0                  |
    and.l       #0xfffffff0, %d0          |
@@ -552,7 +660,7 @@ sample_output_mono:
    mac.l       %d1, %d5, %acc0           | shift L to high word
    movclr.l    %acc0, %d1                | get possibly saturated results
    move.l      %d1, %d2                  |
-    swap        %d2                       | move R to low word
+    swap.w      %d2                       | move R to low word
    move.w      %d2, %d1                  | duplicate single channel into
    move.l      %d1, (%a3)+               | L and R
    cmp.l       %a3, %d0                  |
@@ -575,16 +683,16 @@ sample_output_mono:
    movclr.l    %acc2, %d2                |
    movclr.l    %acc3, %d3                |
    move.l      %d0, %d4                  | duplicate single channel
-    swap        %d4                       | into L and R
+    swap.w      %d4                       | into L and R
    move.w      %d4, %d0                  |
    move.l      %d1, %d4                  |
-    swap        %d4                       |
+    swap.w      %d4                       |
    move.w      %d4, %d1                  |
    move.l      %d2, %d4                  |
-    swap        %d4                       |
+    swap.w      %d4                       |
    move.w      %d4, %d2                  |
    move.l      %d3, %d4                  |
-    swap        %d4                       |
+    swap.w      %d4                       |
    move.w      %d4, %d3                  |
    movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
    cmp.l       %a3, %a1                  |
@@ -598,7 +706,7 @@ sample_output_mono:
    mac.l       %d1, %d5, %acc0           | the same way as leading ones
    movclr.l    %acc0, %d1                |
    move.l      %d1, %d2                  |
-    swap        %d2                       |
+    swap.w      %d2                       |
    move.w      %d2, %d1                  |
    move.l      %d1, (%a3)+               |
    cmp.l       %a3, %a0                  |