SWCODEC: Dsp speed optimizations. Changes for more modularity. Removal of some useless stuff. Some assembly routines for Coldfire with speed in mind over size for the outputs but the channel modes remain compact. Miscellaneous coldfire asm updates to accommodate the changes. Codec API structure version has to increase so do a full update.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12472 a1c6a512-1295-4272-9138-f99709370657
author: Michael Sevakis <jethead71@rockbox.org> 2007-02-24 17:06:36 +0000
committer: Michael Sevakis <jethead71@rockbox.org> 2007-02-24 17:06:36 +0000
commit: d4e904bf3557c63fb358d2d8e91bb103ca369e1a (patch)
tree: 2405fea04069c5d13286438d38ef7c246bb75075 /apps/dsp_cf.S
parent: dbf772bae969703972a672a866f07edc9a9031a5 (diff)
download: rockbox-d4e904bf3557c63fb358d2d8e91bb103ca369e1a.tar.gz
rockbox-d4e904bf3557c63fb358d2d8e91bb103ca369e1a.zip
1 files changed, 341 insertions, 39 deletions
diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S
index 295ef05fe0..1f8dd48cee 100644
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@@ -18,7 +18,7 @@
 ****************************************************************************/
 /****************************************************************************
- * apply_crossfeed(int32_t* src[], int count)
+ * void apply_crossfeed(int32_t *src[], int count)
 */
    .section    .text
    .global     apply_crossfeed 
@@ -88,32 +88,31 @@ apply_crossfeed:
    .size       apply_crossfeed,.cfend-apply_crossfeed
 /****************************************************************************
- * dsp_downsample(int channels, int count, struct resample_data *r,
+ * int dsp_downsample(int count, struct dsp_data *data,
- *                in32_t **src, int32_t **dst)
+ *                    int32_t *src[], int32_t *dst[])
 */
    .section    .text
    .global     dsp_downsample
 dsp_downsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
-    movem.l     44(%sp), %d2-%d3/%a0-%a2| %d2 = ch = channels
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
-                                        | %d3 = count
+                                        | %a0 = data
-                                        | %a0 = r
                                        | %a1 = src
                                        | %a2 = dst
-    move.l      4(%a0), %d4             | %d4 = delta = r->delta
+    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
-    move.l      #16, %d7                | %d7 = shift
+                                        | %d4 = delta = data->resample_data.delta
+    moveq.l     #16, %d7                | %d7 = shift
 .dschannel_loop:
-    move.l      (%a0), %d5              | %d5 = phase = r->phase
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
-    move.l      -4(%a1, %d2.l*4), %a3   | %a3 = s = src[ch-1]
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
-    move.l      -4(%a2, %d2.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
-    lea.l       4(%a0, %d2.l*4), %a5    | %a5 = &r->last_sample[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &data->resample_data.last_sample[ch-1]
-    move.l      (%a5), %d0              | %d0 = last = r->last_sample[ch-1]
+    move.l      (%a5), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -4(%a3, %d3.l*4), %d1   | r->last_sample[ch-1] = s[count-1]
+    move.l      -4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1]
-    move.l      %d1, (%a5)              |
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
    lsr.l       %d7, %d6                |
-    cmp.l       %d3, %d6                | past end of samples?
+    cmp.l       %d2, %d6                | past end of samples?
    bge.b       .dsloop_skip            | yes? skip loop
    tst.l       %d6                     | need last sample of prev. frame?
    bne.b       .dsloop                 | no? start main loop
@@ -134,14 +133,14 @@ dsp_downsample:
    move.l      %d5, %d6                | pos = phase >> 16
    lsr.l       %d7, %d6                |
    move.l      %d0, (%a4)+             | *d++ = %d0
-    cmp.l       %d3, %d6                | pos < count?
+    cmp.l       %d2, %d6                | pos < count?
    blt.b       .dsloop                 | yes? continue resampling
 .dsloop_skip:
-    subq.l      #1, %d2                 | ch > 0?
+    subq.l      #1, %d3                 | ch > 0?
    bgt.b       .dschannel_loop         | yes? process next channel
-    asl.l       %d7, %d3                | wrap phase to start of next frame
+    asl.l       %d7, %d2                | wrap phase to start of next frame
-    sub.l       %d3, %d5                | r->phase = phase - (count << 16)
+    sub.l       %d2, %d5                | data->resample_data.phase =
-    move.l      %d5, (%a0)              |
+    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
    move.l      %a4, %d0                | return d - d[0]
    sub.l       (%a2), %d0              |
    asr.l       #2, %d0                 | convert bytes->samples
@@ -153,31 +152,30 @@ dsp_downsample:
    .size       dsp_downsample,.dsend-dsp_downsample
 /****************************************************************************
- * dsp_upsample(int channels, int count, struct resample_data *r,
+ * int dsp_upsample(int count, struct dsp_data *data,
- *              in32_t **src, int32_t **dst)
+ *                  int32_t *src[], int32_t *dst[])
 */
    .section    .text
    .global     dsp_upsample
 dsp_upsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
-    movem.l     44(%sp), %d2-%d3/%a0-%a2| %d2 = ch = channels
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
-                                        | %d3 = count
+                                        | %a0 = data
-                                        | %a0 = r
                                        | %a1 = src
                                        | %a2 = dst
-    move.l      4(%a0), %d4             | %d4 = delta = r->delta
+    movem.l      4(%a0), %d3-%d4        | %d3 = ch = data->num_channels
+                                        | %d4 = delta = data->resample_data.delta
    swap        %d4                     | swap delta to high word to use
                                        | carries to increment position
 .uschannel_loop:
-    move.l      (%a0), %d5              | %d5 = phase = r->phase
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
-    move.l      -4(%a1, %d2.l*4), %a3   | %a3 = s = src[ch-1]
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
-    lea.l       4(%a0, %d2.l*4), %a4    | %a4 = &r->last_sample[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       (%a3, %d3.l*4), %a5     | %a5 = src_end = &src[count]
+    lea.l       (%a3, %d2.l*4), %a5     | %a5 = src_end = &src[count]
-    move.l      (%a4), %d0              | %d0 = last = r->last_sample[ch-1]
+    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -4(%a5), %d1            | r->last_sample[ch-1] = s[count-1]
+    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
-    move.l      %d1, (%a4)              |
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
-    move.l      -4(%a2, %d2.l*4), %a4   | %a4 = d = dst[ch-1]
    swap        %d5                     | swap phase to high word to use
                                        | carries to increment position
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
@@ -204,13 +202,13 @@ dsp_upsample:
    move.l      %d7, (%a4)+             | *d++ = %d7
    add.l       %d4, %d5                | phase += delta
    bcc.b       .usloop_0               | load next values?
-    cmp.l       %a5, %a3                | src < src_end?
+    cmp.l       %a5, %a3                | src <= src_end?
-    blt.b       .usloop_1               | yes? continue resampling
+    ble.b       .usloop_1               | yes? continue resampling
 .usloop_skip:
-    subq.l      #1, %d2                 | ch > 0?
+    subq.l      #1, %d3                 | ch > 0?
    bgt.b       .uschannel_loop         | yes? process next channel
    swap        %d5                     | wrap phase to start of next frame
-    move.l      %d5, (%a0)              | ...and save in r->phase
+    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
    move.l      %a4, %d0                | return d - d[0]
    sub.l       (%a2), %d0              |
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
@@ -219,3 +217,307 @@ dsp_upsample:
    rts                                 | buh-bye
 .usend:
    .size       dsp_upsample,.usend-dsp_upsample
+/* These routines might benefit from burst transfers but we'll keep them
+ * small for now since they're rather lightweight
+ */
+/****************************************************************************
+ * void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ *
+ * Mix left and right channels 50/50 into a center channel.
+ */
+    .section    .text
+    .global     channels_process_sound_chan_mono
+channels_process_sound_chan_mono:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -12(%sp), %sp           | save registers
+    move.l      %macsr, %d1             | %d1 = caller's macsr, saved with %d2-%d3
+    movem.l     %d1-%d3, (%sp)          |
+    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    movem.l     (%a0), %a0-%a1          | %a0 = buf[0], %a1 = buf[1]
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+1:
+    move.l     (%a0), %d1               | L = R = l/2 + r/2
+    mac.l      %d1, %d3, (%a1), %d2, %acc0 | acc0 += l*0.5, %d2 = r
+    mac.l      %d2, %d3, %acc0          | acc0 += r*0.5 (acc0 not cleared on entry)
+    movclr.l   %acc0, %d1               | %d1 = sum, acc0 cleared for next sample
+    move.l     %d1, (%a0)+              | output to original buffer
+    move.l     %d1, (%a1)+              | same value to both channels
+    subq.l     #1, %d0                  | count--
+    bgt.s      1b                       | loop while count > 0 (body runs once first)
+    movem.l    (%sp), %d1-%d3           | restore registers
+    move.l     %d1, %macsr              | restore caller's macsr
+    lea.l      12(%sp), %sp             | cleanup
+    rts
+.cpmono_end:
+    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
+/****************************************************************************
+ * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ *
+ * Apply stereo width (narrowing/expanding) effect.
+ */
+    .section    .text
+    .global     channels_process_sound_chan_custom
+channels_process_sound_chan_custom:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -16(%sp), %sp           | save registers
+    move.l      %macsr, %d1             | %d1 = caller's macsr, saved with %d2-%d4
+    movem.l     %d1-%d4, (%sp)          |
+    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    movem.l     (%a0), %a0-%a1          | %a0 = buf[0], %a1 = buf[1]
+    move.l      dsp_sw_gain, %d3        | load straight (mid) gain
+    move.l      dsp_sw_cross, %d4       | load cross (side) gain
+1:
+    move.l      (%a0), %d1              | %d1 = l
+    mac.l       %d1, %d3 , (%a1), %d2, %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4 , %acc1        |  R = r*gain + l*cross
+    mac.l       %d2, %d4 , %acc0        |  (%d2 = r, loaded by the first mac)
+    mac.l       %d2, %d3 , %acc1        |
+    movclr.l    %acc0, %d1              | %d1 = L, acc0 cleared for next sample
+    movclr.l    %acc1, %d2              | %d2 = R, acc1 cleared for next sample
+    move.l      %d1, (%a0)+             | write back in place
+    move.l      %d2, (%a1)+             |
+    subq.l      #1, %d0                 | count--
+    bgt.s       1b                      | loop while count > 0 (body runs once first)
+    movem.l     (%sp), %d1-%d4          | restore registers
+    move.l      %d1, %macsr             | restore caller's macsr
+    lea.l       16(%sp), %sp            | cleanup
+    rts
+.cpcustom_end:
+    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
+/****************************************************************************
+ *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ *
+ *  Separate channels into side channels.
+ */
+    .section    .text
+    .global     channels_process_sound_chan_karaoke
+channels_process_sound_chan_karaoke:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -16(%sp), %sp           | save registers
+    move.l      %macsr, %d1             | %d1 = caller's macsr, saved with %d2-%d4
+    movem.l     %d1-%d4, (%sp)          |
+    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    movem.l     (%a0), %a0-%a1          | %a0 = buf[0], %a1 = buf[1]
+    move.l      #0x40000000, %d4        | %d4 = 0.5
+1:
+    move.l     (%a0), %d1               | %d1 = l
+    mac.l      %d1, %d4, (%a1), %d2, %acc0 | L = l/2 - r/2
+    mac.l      %d2, %d4, %acc1          | R = r/2 - l/2
+    movclr.l   %acc0, %d1               | %d1 = l/2, acc0 cleared
+    movclr.l   %acc1, %d2               | %d2 = r/2, acc1 cleared
+    move.l     %d1, %d3                 | keep l/2 for the R result
+    sub.l      %d2, %d1                 | %d1 = l/2 - r/2
+    sub.l      %d3, %d2                 | %d2 = r/2 - l/2
+    move.l     %d1, (%a0)+              | write back in place
+    move.l     %d2, (%a1)+              |
+    subq.l     #1, %d0                  | count--
+    bgt.s      1b                       | loop while count > 0 (body runs once first)
+    movem.l    (%sp), %d1-%d4           | restore registers
+    move.l     %d1, %macsr              | restore caller's macsr
+    lea.l      16(%sp), %sp             | cleanup
+    rts
+.cpkaraoke_end:
+    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                               int32_t *src[], int16_t *dst)
+ *
+ * Framework based on the ubiquitous Rockbox line transfer logic for
+ * Coldfire CPUs.
+ *
+ * Does emac clamping and scaling (which proved faster than the usual
+ * checks and branches - even single test clamping) and writes using
+ * line burst transfers. Also better than writing a single L-R pair per
+ * loop but a good deal more code.
+ *
+ * Attempting bursting during reads is rather futile since the source and
+ * destination alignments rarely agree and too much complication will
+ * slow us up. The parallel loads seem to do a bit better at least until
+ * a pcm buffer can always give line aligned chunk and then aligning the
+ * dest can then imply the source is aligned if the source buffers are.
+ * For now longword alignment is assumed of both the source and dest.
+ * Output is one longword per L:R pair (L in high word): count*4 bytes.
+ */
+    .section   .text
+    .global    sample_output_stereo
+sample_output_stereo:
+    lea.l       -44(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | do it now as many lines will
+    movem.l     %d1-%d7/%a2-%a5, (%sp)    | be the far more common condition
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     48(%sp), %a0-%a2/%a4      | %a0=count %a1=data %a2=src %a4=dst
+    lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address = dst + count*4
+    move.l      (%a1), %d1                | %d1 = scale (first longword of *data)
+    sub.l       #16, %d1                  | %d1 = scale - 16
+    neg.l       %d1                       | %d1 = 16 - scale
+    moveq.l     #1, %d0                   | %a1 = multiplier: (1 << (16 - scale))
+    asl.l       %d1, %d0                  |
+    move.l      %d0, %a1                  |
+    movem.l     (%a2), %a2-%a3            | get L/R channel pointers
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a4, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | full line fits before end address?
+    bhi.w       .sos_longloop_1_start     | no? jump to trailing longword
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a4, %d0                  | any leading longwords?
+    bls.b       .sos_lineloop_start       | no? jump to line loop
+.sos_longloop_0:
+    move.l      (%a2)+, %d1               | read longword from L and R
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
+    mac.l       %d2, %a1, %acc1           | shift R to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       | move R to low word
+    move.w      %d2, %d1                  | interleave MS 16 bits of each 
+    move.l      %d1, (%a4)+               | ...and write both
+    cmp.l       %a4, %d0                  | reached first line bound?
+    bhi.b       .sos_longloop_0           | no? next leading longword
+.sos_lineloop_start:
+    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
+.sos_lineloop:
+    move.l      (%a2)+, %d0               | get next 4 L samples and scale
+    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | with saturation
+    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 |
+    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %a1, %acc3           |
+    movclr.l    %acc0, %d0                | obtain results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.l      (%a3)+, %d4               | get next 4 R samples and scale
+    mac.l       %d4, %a1, (%a3)+, %d5,  %acc0 | with saturation
+    mac.l       %d5, %a1, (%a3)+, %d6,  %acc1 |
+    mac.l       %d6, %a1, (%a3)+, %d7,  %acc2 |
+    mac.l       %d7, %a1, %acc3           |
+    movclr.l    %acc0, %d4                | obtain results
+    movclr.l    %acc1, %d5                |
+    movclr.l    %acc2, %d6                |
+    movclr.l    %acc3, %d7                |
+    swap        %d4                       | interleave most significant
+    move.w      %d4, %d0                  | 16 bits of L and R
+    swap        %d5                       |
+    move.w      %d5, %d1                  |
+    swap        %d6                       |
+    move.w      %d6, %d2                  |
+    swap        %d7                       |
+    move.w      %d7, %d3                  |
+    movem.l     %d0-%d3, (%a4)            | write four stereo samples
+    lea.l       16(%a4), %a4              | advance dst by one line
+    cmp.l       %a4, %a5                  | another full line before end?
+    bhi.b       .sos_lineloop             | yes? keep bursting
+.sos_longloop_1_start:
+    cmp.l       %a4, %a0                  | any longwords left?
+    bls.b       .sos_done                 | no? finished.
+.sos_longloop_1:
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
+    mac.l       %d2, %a1, %acc1           |
+    movclr.l    %acc0, %d1                |
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a4)+               |
+    cmp.l       %a4, %a0                  | reached end address?
+    bhi.b       .sos_longloop_1           | no? next trailing longword
+.sos_done:
+    movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
+    move.l      %d1, %macsr               |
+    lea.l       44(%sp), %sp              | cleanup
+    rts                                   |
+.sos_end:
+    .size      sample_output_stereo, .sos_end-sample_output_stereo
+/****************************************************************************
+ * void sample_output_mono(int count, struct dsp_data *data,
+ *                         int32_t *src[], int16_t *dst)
+ *
+ * Same treatment as sample_output_stereo but one channel feeds both L and R.
+ */
+    .section   .text
+    .global    sample_output_mono
+sample_output_mono:
+    lea.l       -28(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | do it now as many lines will
+    movem.l     %d1-%d5/%a2-%a3, (%sp)    | be the far more common condition
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     32(%sp), %a0-%a3          | %a0=count %a1=data %a2=src %a3=dst
+    lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address = dst + count*4
+    move.l      (%a1), %d1                | %d1 = scale (first longword of *data)
+    sub.l       #16, %d1                  | %d1 = scale - 16
+    neg.l       %d1                       | %d1 = 16 - scale
+    moveq.l     #1, %d5                   | %d5 = multiplier: (1 << (16 - scale))
+    asl.l       %d1, %d5                  |
+    movem.l     (%a2), %a2                | get source channel pointer
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a3, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | full line fits before end address?
+    bhi.w       .som_longloop_1_start     | no? jump to trailing longword
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a3, %d0                  | any leading longwords?
+    bls.b       .som_lineloop_start       | no? jump to line loop
+.som_longloop_0:
+    move.l      (%a2)+, %d1               | read longword from source channel
+    mac.l       %d1, %d5, %acc0           | shift sample to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    move.l      %d1, %d2                  |
+    swap        %d2                       | copy high word down to low word
+    move.w      %d2, %d1                  | duplicate single channel into
+    move.l      %d1, (%a3)+               | L and R
+    cmp.l       %a3, %d0                  | reached first line bound?
+    bhi.b       .som_longloop_0           | no? next leading longword
+.som_lineloop_start:
+    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
+.som_lineloop:
+    move.l      (%a2)+, %d0               | get next 4 samples and scale
+    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
+    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
+    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %d5, %acc3           |
+    movclr.l    %acc0, %d0                | obtain results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.l      %d0, %d4                  | duplicate single channel
+    swap        %d4                       | into L and R
+    move.w      %d4, %d0                  |
+    move.l      %d1, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d1                  |
+    move.l      %d2, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d2                  |
+    move.l      %d3, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d3                  |
+    movem.l     %d0-%d3, (%a3)            | write four stereo samples
+    lea.l       16(%a3), %a3              | advance dst by one line
+    cmp.l       %a3, %a1                  | another full line before end?
+    bhi.b       .som_lineloop             | yes? keep bursting
+.som_longloop_1_start:
+    cmp.l       %a3, %a0                  | any longwords left?
+    bls.b       .som_done                 | no? finished.
+.som_longloop_1:
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    mac.l       %d1, %d5, %acc0           | the same way as leading ones
+    movclr.l    %acc0, %d1                |
+    move.l      %d1, %d2                  |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a3)+               |
+    cmp.l       %a3, %a0                  | reached end address?
+    bhi.b       .som_longloop_1           | no? next trailing longword
+.som_done:
+    movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
+    move.l      %d1, %macsr               |
+    lea.l       28(%sp), %sp              | cleanup
+    rts                                   |
+.som_end:
+    .size      sample_output_mono, .som_end-sample_output_mono
author	Michael Sevakis <jethead71@rockbox.org>	2007-02-24 17:06:36 +0000
committer	Michael Sevakis <jethead71@rockbox.org>	2007-02-24 17:06:36 +0000
commit	d4e904bf3557c63fb358d2d8e91bb103ca369e1a (patch)
tree	2405fea04069c5d13286438d38ef7c246bb75075 /apps/dsp_cf.S
parent	dbf772bae969703972a672a866f07edc9a9031a5 (diff)
download	rockbox-d4e904bf3557c63fb358d2d8e91bb103ca369e1a.tar.gz rockbox-d4e904bf3557c63fb358d2d8e91bb103ca369e1a.zip

diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S index 295ef05fe0..1f8dd48cee 100644 --- a/apps/dsp_cf.S +++ b/apps/dsp_cf.S
@@ -18,7 +18,7 @@
18	****************************************************************************/	18	****************************************************************************/
19		19
20	/****************************************************************************	20	/****************************************************************************
21	* apply_crossfeed(int32_t* src[], int count)	21	* void apply_crossfeed(int32_t *src[], int count)
22	*/	22	*/
23	.section .text	23	.section .text
24	.global apply_crossfeed	24	.global apply_crossfeed
@@ -88,32 +88,31 @@ apply_crossfeed:
88	.size apply_crossfeed,.cfend-apply_crossfeed	88	.size apply_crossfeed,.cfend-apply_crossfeed
89		89
90	/****************************************************************************	90	/****************************************************************************
91	* dsp_downsample(int channels, int count, struct resample_data *r,	91	* int dsp_downsample(int count, struct dsp_data *data,
92	* in32_t src, int32_t dst)	92	* int32_t src[], int32_t dst[])
93	*/	93	*/
94	.section .text	94	.section .text
95	.global dsp_downsample	95	.global dsp_downsample
96	dsp_downsample:	96	dsp_downsample:
97	lea.l -40(%sp), %sp \| save non-clobberables	97	lea.l -40(%sp), %sp \| save non-clobberables
98	movem.l %d2-%d7/%a2-%a5, (%sp) \|	98	movem.l %d2-%d7/%a2-%a5, (%sp) \|
99	movem.l 44(%sp), %d2-%d3/%a0-%a2\| %d2 = ch = channels	99	movem.l 44(%sp), %d2/%a0-%a2 \| %d2 = count
100	\| %d3 = count	100	\| %a0 = data
101	\| %a0 = r
102	\| %a1 = src	101	\| %a1 = src
103	\| %a2 = dst	102	\| %a2 = dst
104	move.l 4(%a0), %d4 \| %d4 = delta = r->delta	103	movem.l 4(%a0), %d3-%d4 \| %d3 = ch = data->num_channels
105	move.l #16, %d7 \| %d7 = shift	104	\| %d4 = delta = data->resample_data.delta
		105	moveq.l #16, %d7 \| %d7 = shift
106	.dschannel_loop:	106	.dschannel_loop:
107	move.l (%a0), %d5 \| %d5 = phase = r->phase	107	move.l 12(%a0), %d5 \| %d5 = phase = data->resample_data.phase
108	move.l -4(%a1, %d2.l*4), %a3 \| %a3 = s = src[ch-1]	108	move.l -4(%a1, %d3.l*4), %a3 \| %a3 = s = src[ch-1]
109	move.l -4(%a2, %d2.l*4), %a4 \| %a4 = d = dst[ch-1]	109	move.l -4(%a2, %d3.l*4), %a4 \| %a4 = d = dst[ch-1]
110	lea.l 4(%a0, %d2.l*4), %a5 \| %a5 = &r->last_sample[ch-1]	110	lea.l 12(%a0, %d3.l*4), %a5 \| %a5 = &data->resample_data.last_sample[ch-1]
111	move.l (%a5), %d0 \| %d0 = last = r->last_sample[ch-1]	111	move.l (%a5), %d0 \| %d0 = last = data->resample_data.last_sample[ch-1]
112	move.l -4(%a3, %d3.l*4), %d1 \| r->last_sample[ch-1] = s[count-1]	112	move.l -4(%a3, %d2.l*4), (%a5) \| data->resample_data.last_sample[ch-1] = s[count-1]
113	move.l %d1, (%a5) \|
114	move.l %d5, %d6 \| %d6 = pos = phase >> 16	113	move.l %d5, %d6 \| %d6 = pos = phase >> 16
115	lsr.l %d7, %d6 \|	114	lsr.l %d7, %d6 \|
116	cmp.l %d3, %d6 \| past end of samples?	115	cmp.l %d2, %d6 \| past end of samples?
117	bge.b .dsloop_skip \| yes? skip loop	116	bge.b .dsloop_skip \| yes? skip loop
118	tst.l %d6 \| need last sample of prev. frame?	117	tst.l %d6 \| need last sample of prev. frame?
119	bne.b .dsloop \| no? start main loop	118	bne.b .dsloop \| no? start main loop
@@ -134,14 +133,14 @@ dsp_downsample:
134	move.l %d5, %d6 \| pos = phase >> 16	133	move.l %d5, %d6 \| pos = phase >> 16
135	lsr.l %d7, %d6 \|	134	lsr.l %d7, %d6 \|
136	move.l %d0, (%a4)+ \| *d++ = %d0	135	move.l %d0, (%a4)+ \| *d++ = %d0
137	cmp.l %d3, %d6 \| pos < count?	136	cmp.l %d2, %d6 \| pos < count?
138	blt.b .dsloop \| yes? continue resampling	137	blt.b .dsloop \| yes? continue resampling
139	.dsloop_skip:	138	.dsloop_skip:
140	subq.l #1, %d2 \| ch > 0?	139	subq.l #1, %d3 \| ch > 0?
141	bgt.b .dschannel_loop \| yes? process next channel	140	bgt.b .dschannel_loop \| yes? process next channel
142	asl.l %d7, %d3 \| wrap phase to start of next frame	141	asl.l %d7, %d2 \| wrap phase to start of next frame
143	sub.l %d3, %d5 \| r->phase = phase - (count << 16)	142	sub.l %d2, %d5 \| data->resample_data.phase =
144	move.l %d5, (%a0) \|	143	move.l %d5, 12(%a0) \| ... phase - (count << 16)
145	move.l %a4, %d0 \| return d - d[0]	144	move.l %a4, %d0 \| return d - d[0]
146	sub.l (%a2), %d0 \|	145	sub.l (%a2), %d0 \|
147	asr.l #2, %d0 \| convert bytes->samples	146	asr.l #2, %d0 \| convert bytes->samples
@@ -153,31 +152,30 @@ dsp_downsample:
153	.size dsp_downsample,.dsend-dsp_downsample	152	.size dsp_downsample,.dsend-dsp_downsample
154		153
155	/****************************************************************************	154	/****************************************************************************
156	* dsp_upsample(int channels, int count, struct resample_data *r,	155	* int dsp_upsample(int count, struct dsp_data *data,
157	* in32_t src, int32_t dst)	156	* int32_t src[], int32_t dst[])
158	*/	157	*/
159	.section .text	158	.section .text
160	.global dsp_upsample	159	.global dsp_upsample
161	dsp_upsample:	160	dsp_upsample:
162	lea.l -40(%sp), %sp \| save non-clobberables	161	lea.l -40(%sp), %sp \| save non-clobberables
163	movem.l %d2-%d7/%a2-%a5, (%sp) \|	162	movem.l %d2-%d7/%a2-%a5, (%sp) \|
164	movem.l 44(%sp), %d2-%d3/%a0-%a2\| %d2 = ch = channels	163	movem.l 44(%sp), %d2/%a0-%a2 \| %d2 = count
165	\| %d3 = count	164	\| %a0 = data
166	\| %a0 = r
167	\| %a1 = src	165	\| %a1 = src
168	\| %a2 = dst	166	\| %a2 = dst
169	move.l 4(%a0), %d4 \| %d4 = delta = r->delta	167	movem.l 4(%a0), %d3-%d4 \| %d3 = ch = data->num_channels
		168	\| %d4 = delta = data->resample_data.delta
170	swap %d4 \| swap delta to high word to use	169	swap %d4 \| swap delta to high word to use
171	\| carries to increment position	170	\| carries to increment position
172	.uschannel_loop:	171	.uschannel_loop:
173	move.l (%a0), %d5 \| %d5 = phase = r->phase	172	move.l 12(%a0), %d5 \| %d5 = phase = data->resample_data.phase
174	move.l -4(%a1, %d2.l*4), %a3 \| %a3 = s = src[ch-1]	173	move.l -4(%a1, %d3.l*4), %a3 \| %a3 = s = src[ch-1]
175	lea.l 4(%a0, %d2.l*4), %a4 \| %a4 = &r->last_sample[ch-1]	174	lea.l 12(%a0, %d3.l*4), %a4 \| %a4 = &data->resample_data.last_sample[ch-1]
176	lea.l (%a3, %d3.l*4), %a5 \| %a5 = src_end = &src[count]	175	lea.l (%a3, %d2.l*4), %a5 \| %a5 = src_end = &src[count]
177	move.l (%a4), %d0 \| %d0 = last = r->last_sample[ch-1]	176	move.l (%a4), %d0 \| %d0 = last = data->resample_data.last_sample[ch-1]
178	move.l -4(%a5), %d1 \| r->last_sample[ch-1] = s[count-1]	177	move.l -(%a5), (%a4) \| data->resample_data.last_sample[ch-1] = s[count-1]
179	move.l %d1, (%a4) \|	178	move.l -4(%a2, %d3.l*4), %a4 \| %a4 = d = dst[ch-1]
180	move.l -4(%a2, %d2.l*4), %a4 \| %a4 = d = dst[ch-1]
181	swap %d5 \| swap phase to high word to use	179	swap %d5 \| swap phase to high word to use
182	\| carries to increment position	180	\| carries to increment position
183	move.l %d5, %d6 \| %d6 = pos = phase >> 16	181	move.l %d5, %d6 \| %d6 = pos = phase >> 16
@@ -204,13 +202,13 @@ dsp_upsample:
204	move.l %d7, (%a4)+ \| *d++ = %d7	202	move.l %d7, (%a4)+ \| *d++ = %d7
205	add.l %d4, %d5 \| phase += delta	203	add.l %d4, %d5 \| phase += delta
206	bcc.b .usloop_0 \| load next values?	204	bcc.b .usloop_0 \| load next values?
207	cmp.l %a5, %a3 \| src < src_end?	205	cmp.l %a5, %a3 \| src <= src_end?
208	blt.b .usloop_1 \| yes? continue resampling	206	ble.b .usloop_1 \| yes? continue resampling
209	.usloop_skip:	207	.usloop_skip:
210	subq.l #1, %d2 \| ch > 0?	208	subq.l #1, %d3 \| ch > 0?
211	bgt.b .uschannel_loop \| yes? process next channel	209	bgt.b .uschannel_loop \| yes? process next channel
212	swap %d5 \| wrap phase to start of next frame	210	swap %d5 \| wrap phase to start of next frame
213	move.l %d5, (%a0) \| ...and save in r->phase	211	move.l %d5, 12(%a0) \| ...and save in data->resample_data.phase
214	move.l %a4, %d0 \| return d - d[0]	212	move.l %a4, %d0 \| return d - d[0]
215	sub.l (%a2), %d0 \|	213	sub.l (%a2), %d0 \|
216	movem.l (%sp), %d2-%d7/%a2-%a5 \| restore non-clobberables	214	movem.l (%sp), %d2-%d7/%a2-%a5 \| restore non-clobberables
@@ -219,3 +217,307 @@ dsp_upsample:
219	rts \| buh-bye	217	rts \| buh-bye
220	.usend:	218	.usend:
221	.size dsp_upsample,.usend-dsp_upsample	219	.size dsp_upsample,.usend-dsp_upsample
		220
		221	/* These routines might benefit from burst transfers but we'll keep them
		222	* small for now since they're rather lightweight.
		223	*/
		224
		225	/****************************************************************************
		226	* void channels_process_sound_chan_mono(int count, int32_t *buf[])
		227	*
		228	* Average left and right (l/2 + r/2) and store the result to both, in place.
		229	*/
		230	.section .text
		231	.global channels_process_sound_chan_mono
		232	channels_process_sound_chan_mono:
		233	lea.l -12(%sp), %sp \| reserve save area first
		234	movem.l 16(%sp), %d0/%a0 \| %d0 = count, %a0 = buf (args now 12 higher)
		235	move.l %macsr, %d1 \| keep caller's emac mode in %d1
		236	movem.l %d1-%d3, (%sp) \| ...and save it along with %d2-%d3
		237	move.l #0x40000000, %d3 \| %d3 = 0.5 as a fractional constant
		238	move.l #0xb0, %macsr \| emac: rounding fractional mode
		239	movem.l (%a0), %a0-%a1 \| %a0 = buf[0], %a1 = buf[1]
		240	.cpmono_loop:
		241	move.l (%a0), %d1 \| %d1 = l
		242	mac.l %d1, %d3, (%a1), %d2, %acc0 \| acc0 = l/2, %d2 = r
		243	mac.l %d2, %d3, %acc0 \| acc0 += r/2
		244	movclr.l %acc0, %d1 \| %d1 = mixed sample, acc0 cleared
		245	move.l %d1, (%a0)+ \| write the mix back over both inputs
		246	move.l %d1, (%a1)+ \|
		247	subq.l #1, %d0 \| one sample done
		248	bgt.s .cpmono_loop \| more left? go again
		249	movem.l (%sp), %d1-%d3 \| reload saved values
		250	move.l %d1, %macsr \| put back caller's emac mode
		251	lea.l 12(%sp), %sp \| release save area
		252	rts
		253	.cpmono_end:
		254	.size channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
		255
		256
		257	/****************************************************************************
		258	* void channels_process_sound_chan_custom(int count, int32_t *buf[])
		259	*
		260	* Stereo width (narrow/expand): L = l*gain + r*cross, R = r*gain + l*cross.
		261	*/
		262	.section .text
		263	.global channels_process_sound_chan_custom
		264	channels_process_sound_chan_custom:
		265	lea.l -16(%sp), %sp \| reserve save area first
		266	movem.l 20(%sp), %d0/%a0 \| %d0 = count, %a0 = buf (args now 16 higher)
		267	move.l %macsr, %d1 \| keep caller's emac mode in %d1
		268	movem.l %d1-%d4, (%sp) \| ...and save it along with %d2-%d4
		269	move.l dsp_sw_cross, %d4 \| %d4 = cross (side) gain
		270	move.l dsp_sw_gain, %d3 \| %d3 = straight (mid) gain
		271	move.l #0xb0, %macsr \| emac: rounding fractional mode
		272	movem.l (%a0), %a0-%a1 \| %a0 = buf[0], %a1 = buf[1]
		273	.cpcustom_loop:
		274	move.l (%a0), %d1 \| %d1 = l
		275	mac.l %d1, %d3 , (%a1), %d2, %acc0 \| acc0 = l*gain, %d2 = r
		276	mac.l %d1, %d4 , %acc1 \| acc1 = l*cross
		277	mac.l %d2, %d4 , %acc0 \| acc0 += r*cross -> new L
		278	mac.l %d2, %d3 , %acc1 \| acc1 += r*gain -> new R
		279	movclr.l %acc0, %d1 \| fetch results, clearing accumulators
		280	movclr.l %acc1, %d2 \|
		281	move.l %d1, (%a0)+ \| store new L
		282	move.l %d2, (%a1)+ \| store new R
		283	subq.l #1, %d0 \| one sample done
		284	bgt.s .cpcustom_loop \| more left? go again
		285	movem.l (%sp), %d1-%d4 \| reload saved values
		286	move.l %d1, %macsr \| put back caller's emac mode
		287	lea.l 16(%sp), %sp \| release save area
		288	rts
		289	.cpcustom_end:
		290	.size channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
		291
		292	/****************************************************************************
		293	* void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
		294	*
		295	* Turn both channels into side signals: L = l/2 - r/2, R = r/2 - l/2.
		296	*/
		297	.section .text
		298	.global channels_process_sound_chan_karaoke
		299	channels_process_sound_chan_karaoke:
		300	lea.l -16(%sp), %sp \| reserve save area first
		301	movem.l 20(%sp), %d0/%a0 \| %d0 = count, %a0 = buf (args now 16 higher)
		302	move.l %macsr, %d1 \| keep caller's emac mode in %d1
		303	movem.l %d1-%d4, (%sp) \| ...and save it along with %d2-%d4
		304	move.l #0x40000000, %d4 \| %d4 = 0.5 as a fractional constant
		305	move.l #0xb0, %macsr \| emac: rounding fractional mode
		306	movem.l (%a0), %a0-%a1 \| %a0 = buf[0], %a1 = buf[1]
		307	.cpkaraoke_loop:
		308	move.l (%a0), %d1 \| %d1 = l
		309	mac.l %d1, %d4, (%a1), %d2, %acc0 \| acc0 = l/2, %d2 = r
		310	mac.l %d2, %d4, %acc1 \| acc1 = r/2
		311	movclr.l %acc0, %d1 \| %d1 = l/2
		312	movclr.l %acc1, %d2 \| %d2 = r/2
		313	move.l %d1, %d3 \| %d3 = copy of l/2 for the R result
		314	sub.l %d2, %d1 \| %d1 = l/2 - r/2
		315	sub.l %d3, %d2 \| %d2 = r/2 - l/2
		316	move.l %d1, (%a0)+ \| store new L
		317	move.l %d2, (%a1)+ \| store new R
		318	subq.l #1, %d0 \| one sample done
		319	bgt.s .cpkaraoke_loop \| more left? go again
		320	movem.l (%sp), %d1-%d4 \| reload saved values
		321	move.l %d1, %macsr \| put back caller's emac mode
		322	lea.l 16(%sp), %sp \| release save area
		323	rts
		324	.cpkaraoke_end:
		325	.size channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
		326
		327	/****************************************************************************
		328	* void sample_output_stereo(int count, struct dsp_data *data,
		329	* int32_t *src[], int16_t *dst)
		330	*
		331	* Framework based on the ubiquitous Rockbox line transfer logic for
		332	* Coldfire CPUs.
		333	*
		334	* Does emac clamping and scaling (which proved faster than the usual
		335	* checks and branches - even single test clamping) and writes using
		336	* line burst transfers. Also better than writing a single L-R pair per
		337	* loop but a good deal more code.
		338	*
		339	* Attempting bursting during reads is rather futile since the source and
		340	* destination alignments rarely agree and too much complication will
		341	* slow us up. The parallel loads seem to do a bit better at least until
		342	* a pcm buffer can always give line aligned chunk and then aligning the
		343	* dest can then imply the source is aligned if the source buffers are.
		344	* For now longword alignment is assumed of both the source and dest.
		345	* Output too short to hold a full line is written one longword at a time.
		346	*/
		347	.section .text
		348	.global sample_output_stereo
		349	sample_output_stereo:
		350	lea.l -44(%sp), %sp \| save registers
		351	move.l %macsr, %d1 \| save everything up front: outputs of
		352	movem.l %d1-%d7/%a2-%a5, (%sp) \| many lines are by far the common case
		353	move.l #0x80, %macsr \| put emac unit in saturating signed int mode
		354	movem.l 48(%sp), %a0-%a2/%a4 \| %a0 = count, %a1 = data, %a2 = src, %a4 = dst
		355	lea.l (%a4, %a0.l*4), %a0 \| %a0 = end address = dst + count*4
		356	move.l (%a1), %d1 \| %d1 = scale (first longword of *data)
		357	sub.l #16, %d1 \| %d1 = 16 - scale
		358	neg.l %d1 \|
		359	moveq.l #1, %d0 \| %a1 = multiplier: (1 << (16 - scale))
		360	asl.l %d1, %d0 \|
		361	move.l %d0, %a1 \|
		362	movem.l (%a2), %a2-%a3 \| get L/R channel pointers
		363	moveq.l #28, %d0 \| %d0 = second line bound
		364	add.l %a4, %d0 \|
		365	and.l #0xfffffff0, %d0 \|
		366	cmp.l %a0, %d0 \| at least a full line before the end?
		367	bhi.w .sos_longloop_1_start \| no? do it all as trailing longwords
		368	sub.l #16, %d0 \| %d0 = first line bound
		369	cmp.l %a4, %d0 \| any leading longwords?
		370	bls.b .sos_lineloop_start \| no? jump to line loop
		371	.sos_longloop_0:
		372	move.l (%a2)+, %d1 \| read longword from L and R
		373	mac.l %d1, %a1, (%a3)+, %d2, %acc0 \| shift L to high word
		374	mac.l %d2, %a1, %acc1 \| shift R to high word
		375	movclr.l %acc0, %d1 \| get possibly saturated results
		376	movclr.l %acc1, %d2 \|
		377	swap %d2 \| move R to low word
		378	move.w %d2, %d1 \| interleave MS 16 bits of each
		379	move.l %d1, (%a4)+ \| ...and write both
		380	cmp.l %a4, %d0 \| reached the first line bound?
		381	bhi.b .sos_longloop_0 \| no? next leading longword
		382	.sos_lineloop_start:
		383	lea.l -12(%a0), %a5 \| %a5 = at or just before last line bound
		384	.sos_lineloop:
		385	move.l (%a2)+, %d0 \| get next 4 L samples and scale
		386	mac.l %d0, %a1, (%a2)+, %d1, %acc0 \| with saturation
		387	mac.l %d1, %a1, (%a2)+, %d2, %acc1 \|
		388	mac.l %d2, %a1, (%a2)+, %d3, %acc2 \|
		389	mac.l %d3, %a1, %acc3 \|
		390	movclr.l %acc0, %d0 \| obtain results
		391	movclr.l %acc1, %d1 \|
		392	movclr.l %acc2, %d2 \|
		393	movclr.l %acc3, %d3 \|
		394	move.l (%a3)+, %d4 \| get next 4 R samples and scale
		395	mac.l %d4, %a1, (%a3)+, %d5, %acc0 \| with saturation
		396	mac.l %d5, %a1, (%a3)+, %d6, %acc1 \|
		397	mac.l %d6, %a1, (%a3)+, %d7, %acc2 \|
		398	mac.l %d7, %a1, %acc3 \|
		399	movclr.l %acc0, %d4 \| obtain results
		400	movclr.l %acc1, %d5 \|
		401	movclr.l %acc2, %d6 \|
		402	movclr.l %acc3, %d7 \|
		403	swap %d4 \| interleave most significant
		404	move.w %d4, %d0 \| 16 bits of L and R
		405	swap %d5 \|
		406	move.w %d5, %d1 \|
		407	swap %d6 \|
		408	move.w %d6, %d2 \|
		409	swap %d7 \|
		410	move.w %d7, %d3 \|
		411	movem.l %d0-%d3, (%a4) \| write four stereo samples
		412	lea.l 16(%a4), %a4 \|
		413	cmp.l %a4, %a5 \| room for another full line?
		414	bhi.b .sos_lineloop \| yes? keep going
		415	.sos_longloop_1_start:
		416	cmp.l %a4, %a0 \| any longwords left?
		417	bls.b .sos_done \| no? finished.
		418	.sos_longloop_1:
		419	move.l (%a2)+, %d1 \| handle trailing longwords
		420	mac.l %d1, %a1, (%a3)+, %d2, %acc0 \| the same way as leading ones
		421	mac.l %d2, %a1, %acc1 \|
		422	movclr.l %acc0, %d1 \|
		423	movclr.l %acc1, %d2 \|
		424	swap %d2 \|
		425	move.w %d2, %d1 \|
		426	move.l %d1, (%a4)+ \|
		427	cmp.l %a4, %a0 \| reached the end address?
		428	bhi.b .sos_longloop_1 \| no? next trailing longword
		429	.sos_done:
		430	movem.l (%sp), %d1-%d7/%a2-%a5 \| restore registers
		431	move.l %d1, %macsr \|
		432	lea.l 44(%sp), %sp \| cleanup
		433	rts \|
		434	.sos_end:
		435	.size sample_output_stereo, .sos_end-sample_output_stereo
		436
		437	/****************************************************************************
		438	* void sample_output_mono(int count, struct dsp_data *data,
		439	* int32_t *src[], int16_t *dst)
		440	*
		441	* Same treatment as sample_output_stereo but src[0] alone feeds both L and R.
		442	*/
		443	.section .text
		444	.global sample_output_mono
		445	sample_output_mono:
		446	lea.l -28(%sp), %sp \| save registers
		447	move.l %macsr, %d1 \| save everything up front: outputs of
		448	movem.l %d1-%d5/%a2-%a3, (%sp) \| many lines are by far the common case
		449	move.l #0x80, %macsr \| put emac unit in saturating signed int mode
		450	movem.l 32(%sp), %a0-%a3 \| %a0 = count, %a1 = data, %a2 = src, %a3 = dst
		451	lea.l (%a3, %a0.l*4), %a0 \| %a0 = end address = dst + count*4
		452	move.l (%a1), %d1 \| %d5 = multiplier: (1 << (16 - scale))
		453	sub.l #16, %d1 \| %d1 = 16 - scale (scale = first longword of *data)
		454	neg.l %d1 \|
		455	moveq.l #1, %d5 \|
		456	asl.l %d1, %d5 \|
		457	movem.l (%a2), %a2 \| get source channel pointer
		458	moveq.l #28, %d0 \| %d0 = second line bound
		459	add.l %a3, %d0 \|
		460	and.l #0xfffffff0, %d0 \|
		461	cmp.l %a0, %d0 \| at least a full line before the end?
		462	bhi.w .som_longloop_1_start \| no? do it all as trailing longwords
		463	sub.l #16, %d0 \| %d0 = first line bound
		464	cmp.l %a3, %d0 \| any leading longwords?
		465	bls.b .som_lineloop_start \| no? jump to line loop
		466	.som_longloop_0:
		467	move.l (%a2)+, %d1 \| read longword from the source channel
		468	mac.l %d1, %d5, %acc0 \| shift sample to high word
		469	movclr.l %acc0, %d1 \| get possibly saturated result
		470	move.l %d1, %d2 \|
		471	swap %d2 \| copy high word into low word
		472	move.w %d2, %d1 \| duplicate single channel into
		473	move.l %d1, (%a3)+ \| L and R
		474	cmp.l %a3, %d0 \| reached the first line bound?
		475	bhi.b .som_longloop_0 \| no? next leading longword
		476	.som_lineloop_start:
		477	lea.l -12(%a0), %a1 \| %a1 = at or just before last line bound
		478	.som_lineloop:
		479	move.l (%a2)+, %d0 \| get next 4 samples and scale
		480	mac.l %d0, %d5, (%a2)+, %d1, %acc0 \| with saturation
		481	mac.l %d1, %d5, (%a2)+, %d2, %acc1 \|
		482	mac.l %d2, %d5, (%a2)+, %d3, %acc2 \|
		483	mac.l %d3, %d5, %acc3 \|
		484	movclr.l %acc0, %d0 \| obtain results
		485	movclr.l %acc1, %d1 \|
		486	movclr.l %acc2, %d2 \|
		487	movclr.l %acc3, %d3 \|
		488	move.l %d0, %d4 \| duplicate single channel
		489	swap %d4 \| into L and R
		490	move.w %d4, %d0 \|
		491	move.l %d1, %d4 \|
		492	swap %d4 \|
		493	move.w %d4, %d1 \|
		494	move.l %d2, %d4 \|
		495	swap %d4 \|
		496	move.w %d4, %d2 \|
		497	move.l %d3, %d4 \|
		498	swap %d4 \|
		499	move.w %d4, %d3 \|
		500	movem.l %d0-%d3, (%a3) \| write four stereo samples
		501	lea.l 16(%a3), %a3 \|
		502	cmp.l %a3, %a1 \| room for another full line?
		503	bhi.b .som_lineloop \| yes? keep going
		504	.som_longloop_1_start:
		505	cmp.l %a3, %a0 \| any longwords left?
		506	bls.b .som_done \| no? finished.
		507	.som_longloop_1:
		508	move.l (%a2)+, %d1 \| handle trailing longwords
		509	mac.l %d1, %d5, %acc0 \| the same way as leading ones
		510	movclr.l %acc0, %d1 \|
		511	move.l %d1, %d2 \|
		512	swap %d2 \|
		513	move.w %d2, %d1 \|
		514	move.l %d1, (%a3)+ \|
		515	cmp.l %a3, %a0 \| reached the end address?
		516	bhi.b .som_longloop_1 \| no? next trailing longword
		517	.som_done:
		518	movem.l (%sp), %d1-%d5/%a2-%a3 \| restore registers
		519	move.l %d1, %macsr \|
		520	lea.l 28(%sp), %sp \| cleanup
		521	rts \|
		522	.som_end:
		523	.size sample_output_mono, .som_end-sample_output_mono