1 files changed, 341 insertions, 39 deletions
diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S
index 295ef05fe0..1f8dd48cee 100644
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@@ -18,7 +18,7 @@
 ****************************************************************************/
 /****************************************************************************
- * apply_crossfeed(int32_t* src[], int count)
+ * void apply_crossfeed(int32_t *src[], int count)
 */
    .section    .text
    .global     apply_crossfeed 
@@ -88,32 +88,31 @@ apply_crossfeed:
    .size       apply_crossfeed,.cfend-apply_crossfeed
 /****************************************************************************
- * dsp_downsample(int channels, int count, struct resample_data *r,
+ * int dsp_downsample(int count, struct dsp_data *data,
- *                in32_t **src, int32_t **dst)
+ *                    int32_t *src[], int32_t *dst[])
 */
    .section    .text
    .global     dsp_downsample
 dsp_downsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
-    movem.l     44(%sp), %d2-%d3/%a0-%a2| %d2 = ch = channels
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
-                                        | %d3 = count
+                                        | %a0 = data
-                                        | %a0 = r
                                        | %a1 = src
                                        | %a2 = dst
-    move.l      4(%a0), %d4             | %d4 = delta = r->delta
+    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
-    move.l      #16, %d7                | %d7 = shift
+                                        | %d4 = delta = data->resample_data.delta
+    moveq.l     #16, %d7                | %d7 = shift
 .dschannel_loop:
-    move.l      (%a0), %d5              | %d5 = phase = r->phase
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
-    move.l      -4(%a1, %d2.l*4), %a3   | %a3 = s = src[ch-1]
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
-    move.l      -4(%a2, %d2.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
-    lea.l       4(%a0, %d2.l*4), %a5    | %a5 = &r->last_sample[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &data->resample_data.last_sample[ch-1]
-    move.l      (%a5), %d0              | %d0 = last = r->last_sample[ch-1]
+    move.l      (%a5), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -4(%a3, %d3.l*4), %d1   | r->last_sample[ch-1] = s[count-1]
+    move.l      -4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1]
-    move.l      %d1, (%a5)              |
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
    lsr.l       %d7, %d6                |
-    cmp.l       %d3, %d6                | past end of samples?
+    cmp.l       %d2, %d6                | past end of samples?
    bge.b       .dsloop_skip            | yes? skip loop
    tst.l       %d6                     | need last sample of prev. frame?
    bne.b       .dsloop                 | no? start main loop
@@ -134,14 +133,14 @@ dsp_downsample:
    move.l      %d5, %d6                | pos = phase >> 16
    lsr.l       %d7, %d6                |
    move.l      %d0, (%a4)+             | *d++ = %d0
-    cmp.l       %d3, %d6                | pos < count?
+    cmp.l       %d2, %d6                | pos < count?
    blt.b       .dsloop                 | yes? continue resampling
 .dsloop_skip:
-    subq.l      #1, %d2                 | ch > 0?
+    subq.l      #1, %d3                 | ch > 0?
    bgt.b       .dschannel_loop         | yes? process next channel
-    asl.l       %d7, %d3                | wrap phase to start of next frame
+    asl.l       %d7, %d2                | wrap phase to start of next frame
-    sub.l       %d3, %d5                | r->phase = phase - (count << 16)
+    sub.l       %d2, %d5                | data->resample_data.phase =
-    move.l      %d5, (%a0)              |
+    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
    move.l      %a4, %d0                | return d - d[0]
    sub.l       (%a2), %d0              |
    asr.l       #2, %d0                 | convert bytes->samples
@@ -153,31 +152,30 @@ dsp_downsample:
    .size       dsp_downsample,.dsend-dsp_downsample
 /****************************************************************************
- * dsp_upsample(int channels, int count, struct resample_data *r,
+ * int dsp_upsample(int count, struct dsp_data *data,
- *              in32_t **src, int32_t **dst)
+ *                  int32_t *src[], int32_t *dst[])
 */
    .section    .text
    .global     dsp_upsample
 dsp_upsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
-    movem.l     44(%sp), %d2-%d3/%a0-%a2| %d2 = ch = channels
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
-                                        | %d3 = count
+                                        | %a0 = data
-                                        | %a0 = r
                                        | %a1 = src
                                        | %a2 = dst
-    move.l      4(%a0), %d4             | %d4 = delta = r->delta
+    movem.l      4(%a0), %d3-%d4        | %d3 = ch = data->num_channels
+                                        | %d4 = delta = data->resample_data.delta
    swap        %d4                     | swap delta to high word to use
                                        | carries to increment position
 .uschannel_loop:
-    move.l      (%a0), %d5              | %d5 = phase = r->phase
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
-    move.l      -4(%a1, %d2.l*4), %a3   | %a3 = s = src[ch-1]
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
-    lea.l       4(%a0, %d2.l*4), %a4    | %a4 = &r->last_sample[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       (%a3, %d3.l*4), %a5     | %a5 = src_end = &src[count]
+    lea.l       (%a3, %d2.l*4), %a5     | %a5 = src_end = &src[count]
-    move.l      (%a4), %d0              | %d0 = last = r->last_sample[ch-1]
+    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -4(%a5), %d1            | r->last_sample[ch-1] = s[count-1]
+    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
-    move.l      %d1, (%a4)              |
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
-    move.l      -4(%a2, %d2.l*4), %a4   | %a4 = d = dst[ch-1]
    swap        %d5                     | swap phase to high word to use
                                        | carries to increment position
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
@@ -204,13 +202,13 @@ dsp_upsample:
    move.l      %d7, (%a4)+             | *d++ = %d7
    add.l       %d4, %d5                | phase += delta
    bcc.b       .usloop_0               | load next values?
-    cmp.l       %a5, %a3                | src < src_end?
+    cmp.l       %a5, %a3                | src <= src_end?
-    blt.b       .usloop_1               | yes? continue resampling
+    ble.b       .usloop_1               | yes? continue resampling
 .usloop_skip:
-    subq.l      #1, %d2                 | ch > 0?
+    subq.l      #1, %d3                 | ch > 0?
    bgt.b       .uschannel_loop         | yes? process next channel
    swap        %d5                     | wrap phase to start of next frame
-    move.l      %d5, (%a0)              | ...and save in r->phase
+    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
    move.l      %a4, %d0                | return d - d[0]
    sub.l       (%a2), %d0              |
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
@@ -219,3 +217,307 @@ dsp_upsample:
    rts                                 | buh-bye
 .usend:
    .size       dsp_upsample,.usend-dsp_upsample
+/* These routines might benefit from burst transfers but we'll keep them
+ * small for now since they're rather light weight
+ */
+/****************************************************************************
+ * void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ *
+ * Mix left and right 50/50 into a center channel, in place in both buffers.
+ */
+    .section    .text
+    .global     channels_process_sound_chan_mono
+channels_process_sound_chan_mono:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -12(%sp), %sp           | make room for %macsr and %d2-%d3
+    move.l      %macsr, %d1             | %d1 = caller's emac status
+    movem.l     %d1-%d3, (%sp)          | save them
+    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    movem.l     (%a0), %a0-%a1          | %a0 = buf[0] (L), %a1 = buf[1] (R)
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+1:
+    move.l     (%a0), %d1               | %d1 = l
+    mac.l      %d1, %d3, (%a1), %d2, %acc0 | %acc0 += l/2 (presumed 0 on entry), %d2 = r
+    mac.l      %d2, %d3, %acc0          | %acc0 += r/2
+    movclr.l   %acc0, %d1               | %d1 = l/2 + r/2, %acc0 cleared
+    move.l     %d1, (%a0)+              | L = R = mixed sample, written back
+    move.l     %d1, (%a1)+              | ...to the original buffers
+    subq.l     #1, %d0                  | --count > 0?
+    bgt.s      1b                       | yes? next sample (body runs >= once)
+    movem.l    (%sp), %d1-%d3           | restore registers
+    move.l     %d1, %macsr              | ...and the caller's emac status
+    lea.l      12(%sp), %sp             | cleanup
+    rts
+.cpmono_end:
+    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
+/****************************************************************************
+ * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ *
+ * Apply stereo width (narrowing/expanding) effect in place to both channels.
+ */
+    .section    .text
+    .global     channels_process_sound_chan_custom
+channels_process_sound_chan_custom:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -16(%sp), %sp           | make room for %macsr and %d2-%d4
+    move.l      %macsr, %d1             | %d1 = caller's emac status
+    movem.l     %d1-%d4, (%sp)          | save them
+    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    movem.l     (%a0), %a0-%a1          | %a0 = buf[0] (L), %a1 = buf[1] (R)
+    move.l      dsp_sw_gain, %d3        | %d3 = gain  = straight (mid) gain
+    move.l      dsp_sw_cross, %d4       | %d4 = cross = cross (side) gain
+1:
+    move.l      (%a0), %d1              | %d1 = l
+    mac.l       %d1, %d3 , (%a1), %d2, %acc0 |  L = l*gain + r*cross; %d2 = r
+    mac.l       %d1, %d4 , %acc1        |  R = r*gain + l*cross
+    mac.l       %d2, %d4 , %acc0        |  ...second term of L
+    mac.l       %d2, %d3 , %acc1        |  ...second term of R
+    movclr.l    %acc0, %d1              | %d1 = L, %acc0 cleared
+    movclr.l    %acc1, %d2              | %d2 = R, %acc1 cleared
+    move.l      %d1, (%a0)+             | write results back over the
+    move.l      %d2, (%a1)+             | ...original samples
+    subq.l      #1, %d0                 | --count > 0?
+    bgt.s       1b                      | yes? next sample (body runs >= once)
+    movem.l     (%sp), %d1-%d4          | restore registers
+    move.l      %d1, %macsr             | ...and the caller's emac status
+    lea.l       16(%sp), %sp            | cleanup
+    rts
+.cpcustom_end:
+    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
+/****************************************************************************
+ *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ *
+ *  Separate channels into side channels (each minus the other), in place.
+ */
+    .section    .text
+    .global     channels_process_sound_chan_karaoke
+channels_process_sound_chan_karaoke:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -16(%sp), %sp           | make room for %macsr and %d2-%d4
+    move.l      %macsr, %d1             | %d1 = caller's emac status
+    movem.l     %d1-%d4, (%sp)          | save them
+    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    movem.l     (%a0), %a0-%a1          | %a0 = buf[0] (L), %a1 = buf[1] (R)
+    move.l      #0x40000000, %d4        | %d4 = 0.5
+1:
+    move.l     (%a0), %d1               | %d1 = l
+    mac.l      %d1, %d4, (%a1), %d2, %acc0 | %acc0 = l/2, %d2 = r
+    mac.l      %d2, %d4, %acc1          | %acc1 = r/2
+    movclr.l   %acc0, %d1               | %d1 = l/2, %acc0 cleared
+    movclr.l   %acc1, %d2               | %d2 = r/2, %acc1 cleared
+    move.l     %d1, %d3                 | keep l/2 for the R result
+    sub.l      %d2, %d1                 | L = l/2 - r/2
+    sub.l      %d3, %d2                 | R = r/2 - l/2
+    move.l     %d1, (%a0)+              | write results back over the
+    move.l     %d2, (%a1)+              | ...original samples
+    subq.l     #1, %d0                  | --count > 0?
+    bgt.s      1b                       | yes? next sample (body runs >= once)
+    movem.l    (%sp), %d1-%d4           | restore registers
+    move.l     %d1, %macsr              | ...and the caller's emac status
+    lea.l      16(%sp), %sp             | cleanup
+    rts
+.cpkaraoke_end:
+    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                               int32_t *src[], int16_t *dst)
+ *
+ * Framework based on the ubiquitous Rockbox line transfer logic for
+ * Coldfire CPUs.
+ *
+ * Does emac clamping and scaling (which proved faster than the usual
+ * checks and branches - even single test clamping) and writes using
+ * line burst transfers. Also better than writing a single L-R pair per
+ * loop but a good deal more code.
+ *
+ * Attempting bursting during reads is rather futile since the source and
+ * destination alignments rarely agree and too much complication will
+ * slow us up. The parallel loads seem to do a bit better at least until
+ * a pcm buffer can always give line aligned chunk and then aligning the
+ * dest can then imply the source is aligned if the source buffers are.
+ * For now longword alignment is assumed of both the source and dest.
+ * Output too short to hold a full line is done by the trailing loop alone.
+ */
+    .section   .text
+    .global    sample_output_stereo
+sample_output_stereo:
+    lea.l       -44(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | save everything up front since many
+    movem.l     %d1-%d7/%a2-%a5, (%sp)    | lines is by far the common case
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     48(%sp), %a0-%a2/%a4      | %a0 = count, %a1 = data, %a2 = src, %a4 = dst
+    lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address
+    move.l      (%a1), %d1                | %a1 = multiplier: (1 << (16 - scale))
+    sub.l       #16, %d1                  | ...scale is the first longword of *data
+    neg.l       %d1                       | %d1 = 16 - scale
+    moveq.l     #1, %d0                   |
+    asl.l       %d1, %d0                  | %d0 = 1 << (16 - scale)
+    move.l      %d0, %a1                  |
+    movem.l     (%a2), %a2-%a3            | get L/R channel pointers
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a4, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | at least a full line before the end?
+    bhi.w       .sos_longloop_1_start     | no? do it all as trailing longwords
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a4, %d0                  | any leading longwords?
+    bls.b       .sos_lineloop_start       | no? jump to line loop
+.sos_longloop_0:
+    move.l      (%a2)+, %d1               | read longword from L and R
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
+    mac.l       %d2, %a1, %acc1           | shift R to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       | move R to low word
+    move.w      %d2, %d1                  | interleave MS 16 bits of each
+    move.l      %d1, (%a4)+               | ...and write both
+    cmp.l       %a4, %d0                  | reached the first line bound?
+    bhi.b       .sos_longloop_0           | no? do another leading longword
+.sos_lineloop_start:
+    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
+.sos_lineloop:
+    move.l      (%a2)+, %d0               | get next 4 L samples and scale
+    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | with saturation
+    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 |
+    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %a1, %acc3           |
+    movclr.l    %acc0, %d0                | obtain results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.l      (%a3)+, %d4               | get next 4 R samples and scale
+    mac.l       %d4, %a1, (%a3)+, %d5,  %acc0 | with saturation
+    mac.l       %d5, %a1, (%a3)+, %d6,  %acc1 |
+    mac.l       %d6, %a1, (%a3)+, %d7,  %acc2 |
+    mac.l       %d7, %a1, %acc3           |
+    movclr.l    %acc0, %d4                | obtain results
+    movclr.l    %acc1, %d5                |
+    movclr.l    %acc2, %d6                |
+    movclr.l    %acc3, %d7                |
+    swap        %d4                       | interleave most significant
+    move.w      %d4, %d0                  | 16 bits of L and R
+    swap        %d5                       |
+    move.w      %d5, %d1                  |
+    swap        %d6                       |
+    move.w      %d6, %d2                  |
+    swap        %d7                       |
+    move.w      %d7, %d3                  |
+    movem.l     %d0-%d3, (%a4)            | write four stereo samples
+    lea.l       16(%a4), %a4              | advance dst by one line
+    cmp.l       %a4, %a5                  | room for another full line?
+    bhi.b       .sos_lineloop             | yes? keep going
+.sos_longloop_1_start:
+    cmp.l       %a4, %a0                  | any longwords left?
+    bls.b       .sos_done                 | no? finished.
+.sos_longloop_1:
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
+    mac.l       %d2, %a1, %acc1           |
+    movclr.l    %acc0, %d1                |
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a4)+               |
+    cmp.l       %a4, %a0                  | reached the end address?
+    bhi.b       .sos_longloop_1           | no? do another trailing longword
+.sos_done:
+    movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
+    move.l      %d1, %macsr               | ...and the caller's emac status
+    lea.l       44(%sp), %sp              | cleanup
+    rts                                   |
+.sos_end:
+    .size      sample_output_stereo, .sos_end-sample_output_stereo
+/****************************************************************************
+ * void sample_output_mono(int count, struct dsp_data *data,
+ *                         int32_t *src[], int16_t *dst)
+ *
+ * Same treatment as sample_output_stereo but for one channel.
+ */
+    .section   .text
+    .global    sample_output_mono
+sample_output_mono:
+    lea.l       -28(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | save everything up front since many
+    movem.l     %d1-%d5/%a2-%a3, (%sp)    | lines is by far the common case
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     32(%sp), %a0-%a3          | %a0 = count, %a1 = data, %a2 = src, %a3 = dst
+    lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address
+    move.l      (%a1), %d1                | %d5 = multiplier: (1 << (16 - scale))
+    sub.l       #16, %d1                  | ...scale is the first longword of *data
+    neg.l       %d1                       | %d1 = 16 - scale
+    moveq.l     #1, %d5                   |
+    asl.l       %d1, %d5                  | %d5 = 1 << (16 - scale)
+    movem.l     (%a2), %a2                | get source channel pointer
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a3, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | at least a full line before the end?
+    bhi.w       .som_longloop_1_start     | no? do it all as trailing longwords
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a3, %d0                  | any leading longwords?
+    bls.b       .som_lineloop_start       | no? jump to line loop
+.som_longloop_0:
+    move.l      (%a2)+, %d1               | read one source sample
+    mac.l       %d1, %d5, %acc0           | shift sample to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    move.l      %d1, %d2                  |
+    swap        %d2                       | copy high word into low word
+    move.w      %d2, %d1                  | duplicate single channel into
+    move.l      %d1, (%a3)+               | L and R
+    cmp.l       %a3, %d0                  | reached the first line bound?
+    bhi.b       .som_longloop_0           | no? do another leading longword
+.som_lineloop_start:
+    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
+.som_lineloop:
+    move.l      (%a2)+, %d0               | get next 4 samples and scale
+    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
+    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
+    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %d5, %acc3           |
+    movclr.l    %acc0, %d0                | obtain results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.l      %d0, %d4                  | duplicate single channel
+    swap        %d4                       | into L and R
+    move.w      %d4, %d0                  |
+    move.l      %d1, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d1                  |
+    move.l      %d2, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d2                  |
+    move.l      %d3, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d3                  |
+    movem.l     %d0-%d3, (%a3)            | write four stereo samples
+    lea.l       16(%a3), %a3              | advance dst by one line
+    cmp.l       %a3, %a1                  | room for another full line?
+    bhi.b       .som_lineloop             | yes? keep going
+.som_longloop_1_start:
+    cmp.l       %a3, %a0                  | any longwords left?
+    bls.b       .som_done                 | no? finished.
+.som_longloop_1:
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    mac.l       %d1, %d5, %acc0           | the same way as leading ones
+    movclr.l    %acc0, %d1                |
+    move.l      %d1, %d2                  |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a3)+               |
+    cmp.l       %a3, %a0                  | reached the end address?
+    bhi.b       .som_longloop_1           | no? do another trailing longword
+.som_done:
+    movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
+    move.l      %d1, %macsr               | ...and the caller's emac status
+    lea.l       28(%sp), %sp              | cleanup
+    rts                                   |
+.som_end:
+    .size      sample_output_mono, .som_end-sample_output_mono

diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S index 295ef05fe0..1f8dd48cee 100644 --- a/apps/dsp_cf.S +++ b/apps/dsp_cf.S
@@ -18,7 +18,7 @@
18	****************************************************************************/	18	****************************************************************************/
19		19
20	/****************************************************************************	20	/****************************************************************************
21	* apply_crossfeed(int32_t* src[], int count)	21	* void apply_crossfeed(int32_t *src[], int count)
22	*/	22	*/
23	.section .text	23	.section .text
24	.global apply_crossfeed	24	.global apply_crossfeed
@@ -88,32 +88,31 @@ apply_crossfeed:
88	.size apply_crossfeed,.cfend-apply_crossfeed	88	.size apply_crossfeed,.cfend-apply_crossfeed
89		89
90	/****************************************************************************	90	/****************************************************************************
91	* dsp_downsample(int channels, int count, struct resample_data *r,	91	* int dsp_downsample(int count, struct dsp_data *data,
92	* in32_t src, int32_t dst)	92	* int32_t src[], int32_t dst[])
93	*/	93	*/
94	.section .text	94	.section .text
95	.global dsp_downsample	95	.global dsp_downsample
96	dsp_downsample:	96	dsp_downsample:
97	lea.l -40(%sp), %sp \| save non-clobberables	97	lea.l -40(%sp), %sp \| save non-clobberables
98	movem.l %d2-%d7/%a2-%a5, (%sp) \|	98	movem.l %d2-%d7/%a2-%a5, (%sp) \|
99	movem.l 44(%sp), %d2-%d3/%a0-%a2\| %d2 = ch = channels	99	movem.l 44(%sp), %d2/%a0-%a2 \| %d2 = count
100	\| %d3 = count	100	\| %a0 = data
101	\| %a0 = r
102	\| %a1 = src	101	\| %a1 = src
103	\| %a2 = dst	102	\| %a2 = dst
104	move.l 4(%a0), %d4 \| %d4 = delta = r->delta	103	movem.l 4(%a0), %d3-%d4 \| %d3 = ch = data->num_channels
105	move.l #16, %d7 \| %d7 = shift	104	\| %d4 = delta = data->resample_data.delta
		105	moveq.l #16, %d7 \| %d7 = shift
106	.dschannel_loop:	106	.dschannel_loop:
107	move.l (%a0), %d5 \| %d5 = phase = r->phase	107	move.l 12(%a0), %d5 \| %d5 = phase = data->resample_data.phase
108	move.l -4(%a1, %d2.l*4), %a3 \| %a3 = s = src[ch-1]	108	move.l -4(%a1, %d3.l*4), %a3 \| %a3 = s = src[ch-1]
109	move.l -4(%a2, %d2.l*4), %a4 \| %a4 = d = dst[ch-1]	109	move.l -4(%a2, %d3.l*4), %a4 \| %a4 = d = dst[ch-1]
110	lea.l 4(%a0, %d2.l*4), %a5 \| %a5 = &r->last_sample[ch-1]	110	lea.l 12(%a0, %d3.l*4), %a5 \| %a5 = &data->resample_data.last_sample[ch-1]
111	move.l (%a5), %d0 \| %d0 = last = r->last_sample[ch-1]	111	move.l (%a5), %d0 \| %d0 = last = data->resample_data.last_sample[ch-1]
112	move.l -4(%a3, %d3.l*4), %d1 \| r->last_sample[ch-1] = s[count-1]	112	move.l -4(%a3, %d2.l*4), (%a5) \| data->resample_data.last_sample[ch-1] = s[count-1]
113	move.l %d1, (%a5) \|
114	move.l %d5, %d6 \| %d6 = pos = phase >> 16	113	move.l %d5, %d6 \| %d6 = pos = phase >> 16
115	lsr.l %d7, %d6 \|	114	lsr.l %d7, %d6 \|
116	cmp.l %d3, %d6 \| past end of samples?	115	cmp.l %d2, %d6 \| past end of samples?
117	bge.b .dsloop_skip \| yes? skip loop	116	bge.b .dsloop_skip \| yes? skip loop
118	tst.l %d6 \| need last sample of prev. frame?	117	tst.l %d6 \| need last sample of prev. frame?
119	bne.b .dsloop \| no? start main loop	118	bne.b .dsloop \| no? start main loop
@@ -134,14 +133,14 @@ dsp_downsample:
134	move.l %d5, %d6 \| pos = phase >> 16	133	move.l %d5, %d6 \| pos = phase >> 16
135	lsr.l %d7, %d6 \|	134	lsr.l %d7, %d6 \|
136	move.l %d0, (%a4)+ \| *d++ = %d0	135	move.l %d0, (%a4)+ \| *d++ = %d0
137	cmp.l %d3, %d6 \| pos < count?	136	cmp.l %d2, %d6 \| pos < count?
138	blt.b .dsloop \| yes? continue resampling	137	blt.b .dsloop \| yes? continue resampling
139	.dsloop_skip:	138	.dsloop_skip:
140	subq.l #1, %d2 \| ch > 0?	139	subq.l #1, %d3 \| ch > 0?
141	bgt.b .dschannel_loop \| yes? process next channel	140	bgt.b .dschannel_loop \| yes? process next channel
142	asl.l %d7, %d3 \| wrap phase to start of next frame	141	asl.l %d7, %d2 \| wrap phase to start of next frame
143	sub.l %d3, %d5 \| r->phase = phase - (count << 16)	142	sub.l %d2, %d5 \| data->resample_data.phase =
144	move.l %d5, (%a0) \|	143	move.l %d5, 12(%a0) \| ... phase - (count << 16)
145	move.l %a4, %d0 \| return d - d[0]	144	move.l %a4, %d0 \| return d - d[0]
146	sub.l (%a2), %d0 \|	145	sub.l (%a2), %d0 \|
147	asr.l #2, %d0 \| convert bytes->samples	146	asr.l #2, %d0 \| convert bytes->samples
@@ -153,31 +152,30 @@ dsp_downsample:
153	.size dsp_downsample,.dsend-dsp_downsample	152	.size dsp_downsample,.dsend-dsp_downsample
154		153
155	/****************************************************************************	154	/****************************************************************************
156	* dsp_upsample(int channels, int count, struct resample_data *r,	155	* int dsp_upsample(int count, struct dsp_data *dsp,
157	* in32_t src, int32_t dst)	156	* int32_t src[], int32_t dst[])
158	*/	157	*/
159	.section .text	158	.section .text
160	.global dsp_upsample	159	.global dsp_upsample
161	dsp_upsample:	160	dsp_upsample:
162	lea.l -40(%sp), %sp \| save non-clobberables	161	lea.l -40(%sp), %sp \| save non-clobberables
163	movem.l %d2-%d7/%a2-%a5, (%sp) \|	162	movem.l %d2-%d7/%a2-%a5, (%sp) \|
164	movem.l 44(%sp), %d2-%d3/%a0-%a2\| %d2 = ch = channels	163	movem.l 44(%sp), %d2/%a0-%a2 \| %d2 = count
165	\| %d3 = count	164	\| %a0 = data
166	\| %a0 = r
167	\| %a1 = src	165	\| %a1 = src
168	\| %a2 = dst	166	\| %a2 = dst
169	move.l 4(%a0), %d4 \| %d4 = delta = r->delta	167	movem.l 4(%a0), %d3-%d4 \| %d3 = ch = channels
		168	\| %d4 = delta = data->resample_data.delta
170	swap %d4 \| swap delta to high word to use	169	swap %d4 \| swap delta to high word to use
171	\| carries to increment position	170	\| carries to increment position
172	.uschannel_loop:	171	.uschannel_loop:
173	move.l (%a0), %d5 \| %d5 = phase = r->phase	172	move.l 12(%a0), %d5 \| %d5 = phase = data->resample_data.phase
174	move.l -4(%a1, %d2.l*4), %a3 \| %a3 = s = src[ch-1]	173	move.l -4(%a1, %d3.l*4), %a3 \| %a3 = s = src[ch-1]
175	lea.l 4(%a0, %d2.l*4), %a4 \| %a4 = &r->last_sample[ch-1]	174	lea.l 12(%a0, %d3.l*4), %a4 \| %a4 = &data->resample_data.last_sample[ch-1]
176	lea.l (%a3, %d3.l*4), %a5 \| %a5 = src_end = &src[count]	175	lea.l (%a3, %d2.l*4), %a5 \| %a5 = src_end = &src[count]
177	move.l (%a4), %d0 \| %d0 = last = r->last_sample[ch-1]	176	move.l (%a4), %d0 \| %d0 = last = data->resample_data.last_sample[ch-1]
178	move.l -4(%a5), %d1 \| r->last_sample[ch-1] = s[count-1]	177	move.l -(%a5), (%a4) \| data->resample_data.last_sample[ch-1] = s[count-1]
179	move.l %d1, (%a4) \|	178	move.l -4(%a2, %d3.l*4), %a4 \| %a4 = d = dst[ch-1]
180	move.l -4(%a2, %d2.l*4), %a4 \| %a4 = d = dst[ch-1]
181	swap %d5 \| swap phase to high word to use	179	swap %d5 \| swap phase to high word to use
182	\| carries to increment position	180	\| carries to increment position
183	move.l %d5, %d6 \| %d6 = pos = phase >> 16	181	move.l %d5, %d6 \| %d6 = pos = phase >> 16
@@ -204,13 +202,13 @@ dsp_upsample:
204	move.l %d7, (%a4)+ \| *d++ = %d7	202	move.l %d7, (%a4)+ \| *d++ = %d7
205	add.l %d4, %d5 \| phase += delta	203	add.l %d4, %d5 \| phase += delta
206	bcc.b .usloop_0 \| load next values?	204	bcc.b .usloop_0 \| load next values?
207	cmp.l %a5, %a3 \| src < src_end?	205	cmp.l %a5, %a3 \| src <= src_end?
208	blt.b .usloop_1 \| yes? continue resampling	206	ble.b .usloop_1 \| yes? continue resampling
209	.usloop_skip:	207	.usloop_skip:
210	subq.l #1, %d2 \| ch > 0?	208	subq.l #1, %d3 \| ch > 0?
211	bgt.b .uschannel_loop \| yes? process next channel	209	bgt.b .uschannel_loop \| yes? process next channel
212	swap %d5 \| wrap phase to start of next frame	210	swap %d5 \| wrap phase to start of next frame
213	move.l %d5, (%a0) \| ...and save in r->phase	211	move.l %d5, 12(%a0) \| ...and save in data->resample_data.phase
214	move.l %a4, %d0 \| return d - d[0]	212	move.l %a4, %d0 \| return d - d[0]
215	sub.l (%a2), %d0 \|	213	sub.l (%a2), %d0 \|
216	movem.l (%sp), %d2-%d7/%a2-%a5 \| restore non-clobberables	214	movem.l (%sp), %d2-%d7/%a2-%a5 \| restore non-clobberables
@@ -219,3 +217,307 @@ dsp_upsample:
219	rts \| buh-bye	217	rts \| buh-bye
220	.usend:	218	.usend:
221	.size dsp_upsample,.usend-dsp_upsample	219	.size dsp_upsample,.usend-dsp_upsample
		220
		221	/* These routines might benefit from burst transfers but we'll keep them
		222	* small for now since they're rather light weight
		223	*/
		224
		/****************************************************************************
		 * void channels_process_sound_chan_mono(int count, int32_t *buf[])
		 *
		 * Mix left and right channels 50/50 into a center channel. Both channel
		 * buffers are overwritten in place with l/2 + r/2.
		 *
		 * In:   4(%sp) = count  - samples per channel. The count is tested after
		 *                         each sample, so one sample is always processed.
		 *       8(%sp) = buf    - the two channel pointers are read from buf[0]
		 *                         and buf[1]
		 * Regs: %d0-%d1/%a0-%a1 are used as scratch. %macsr and %d2-%d3 are saved
		 *       on the stack and restored before returning.
		 */
		    .section    .text
		    .global     channels_process_sound_chan_mono
		channels_process_sound_chan_mono:
		    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
		    lea.l       -12(%sp), %sp           | room for %macsr and %d2-%d3
		    move.l      %macsr, %d1             | %macsr travels through %d1 so a
		    movem.l     %d1-%d3, (%sp)          | single movem stores all three
		    move.l      #0xb0, %macsr           | emac: rounding fractional mode
		    movem.l     (%a0), %a0-%a1          | %a0 = buf[0], %a1 = buf[1]
		    move.l      #0x40000000, %d3        | %d3 = 0.5
		.cpmono_loop:
		    move.l      (%a0), %d1              | %d1 = l
		    mac.l       %d1, %d3, (%a1), %d2, %acc0 | %acc0 += l/2, %d2 = r
		    mac.l       %d2, %d3, %acc0         | %acc0 += r/2
		    movclr.l    %acc0, %d1              | %d1 = mix, %acc0 cleared
		    move.l      %d1, (%a0)+             | same value goes back to both
		    move.l      %d1, (%a1)+             | channel buffers
		    subq.l      #1, %d0                 | --count > 0?
		    bgt.s       .cpmono_loop            | yes? next sample
		    movem.l     (%sp), %d1-%d3          | reload saved %macsr, %d2-%d3
		    move.l      %d1, %macsr             |
		    lea.l       12(%sp), %sp            | release the save area
		    rts
		.cpmono_end:
		    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
		255
		256
		/****************************************************************************
		 * void channels_process_sound_chan_custom(int count, int32_t *buf[])
		 *
		 * Apply stereo width (narrowing/expanding) effect. Each output channel is
		 * its own input scaled by dsp_sw_gain plus the opposite input scaled by
		 * dsp_sw_cross; the results replace the inputs in place.
		 *
		 * In:   4(%sp) = count  - samples per channel. The count is tested after
		 *                         each sample, so one sample is always processed.
		 *       8(%sp) = buf    - the two channel pointers are read from buf[0]
		 *                         and buf[1]
		 * Reads the globals dsp_sw_gain and dsp_sw_cross once, before the loop.
		 * Regs: %d0-%d1/%a0-%a1 are used as scratch. %macsr and %d2-%d4 are saved
		 *       on the stack and restored before returning.
		 */
		    .section    .text
		    .global     channels_process_sound_chan_custom
		channels_process_sound_chan_custom:
		    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
		    lea.l       -16(%sp), %sp           | room for %macsr and %d2-%d4
		    move.l      %macsr, %d1             | %macsr travels through %d1 so a
		    movem.l     %d1-%d4, (%sp)          | single movem stores all four
		    move.l      #0xb0, %macsr           | emac: rounding fractional mode
		    movem.l     (%a0), %a0-%a1          | %a0 = buf[0], %a1 = buf[1]
		    move.l      dsp_sw_gain, %d3        | %d3 = straight (mid) gain
		    move.l      dsp_sw_cross, %d4       | %d4 = cross (side) gain
		.cpcustom_loop:
		    move.l      (%a0), %d1              | %d1 = l
		    mac.l       %d1, %d3, (%a1), %d2, %acc0 | %acc0 += l*gain, %d2 = r
		    mac.l       %d1, %d4, %acc1         | %acc1 += l*cross
		    mac.l       %d2, %d4, %acc0         | %acc0 += r*cross
		    mac.l       %d2, %d3, %acc1         | %acc1 += r*gain
		    movclr.l    %acc0, %d1              | %d1 = new L, %acc0 cleared
		    movclr.l    %acc1, %d2              | %d2 = new R, %acc1 cleared
		    move.l      %d1, (%a0)+             | write both results back over
		    move.l      %d2, (%a1)+             | the inputs
		    subq.l      #1, %d0                 | --count > 0?
		    bgt.s       .cpcustom_loop          | yes? next sample
		    movem.l     (%sp), %d1-%d4          | reload saved %macsr, %d2-%d4
		    move.l      %d1, %macsr             |
		    lea.l       16(%sp), %sp            | release the save area
		    rts
		.cpcustom_end:
		    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
		291
		/****************************************************************************
		 * void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
		 *
		 * Separate channels into side channels. The channels are overwritten in
		 * place with L = l/2 - r/2 and R = r/2 - l/2.
		 *
		 * In:   4(%sp) = count  - samples per channel. The count is tested after
		 *                         each sample, so one sample is always processed.
		 *       8(%sp) = buf    - the two channel pointers are read from buf[0]
		 *                         and buf[1]
		 * Regs: %d0-%d1/%a0-%a1 are used as scratch. %macsr and %d2-%d4 are saved
		 *       on the stack and restored before returning.
		 */
		    .section    .text
		    .global     channels_process_sound_chan_karaoke
		channels_process_sound_chan_karaoke:
		    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
		    lea.l       -16(%sp), %sp           | room for %macsr and %d2-%d4
		    move.l      %macsr, %d1             | %macsr travels through %d1 so a
		    movem.l     %d1-%d4, (%sp)          | single movem stores all four
		    move.l      #0xb0, %macsr           | emac: rounding fractional mode
		    movem.l     (%a0), %a0-%a1          | %a0 = buf[0], %a1 = buf[1]
		    move.l      #0x40000000, %d4        | %d4 = 0.5
		.cpkaraoke_loop:
		    move.l      (%a0), %d1              | %d1 = l
		    mac.l       %d1, %d4, (%a1), %d2, %acc0 | %acc0 += l/2, %d2 = r
		    mac.l       %d2, %d4, %acc1         | %acc1 += r/2
		    movclr.l    %acc0, %d1              | %d1 = l/2, %acc0 cleared
		    movclr.l    %acc1, %d2              | %d2 = r/2, %acc1 cleared
		    move.l      %d1, %d3                | keep l/2 for the R result
		    sub.l       %d2, %d1                | %d1 = L = l/2 - r/2
		    sub.l       %d3, %d2                | %d2 = R = r/2 - l/2
		    move.l      %d1, (%a0)+             | write both results back over
		    move.l      %d2, (%a1)+             | the inputs
		    subq.l      #1, %d0                 | --count > 0?
		    bgt.s       .cpkaraoke_loop         | yes? next sample
		    movem.l     (%sp), %d1-%d4          | reload saved %macsr, %d2-%d4
		    move.l      %d1, %macsr             |
		    lea.l       16(%sp), %sp            | release the save area
		    rts
		.cpkaraoke_end:
		    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
		326
		/****************************************************************************
		 * void sample_output_stereo(int count, struct dsp_data *data,
		 *                           int32_t *src[], int16_t *dst)
		 *
		 * Framework based on the ubiquitous Rockbox line transfer logic for
		 * Coldfire CPUs.
		 *
		 * Does emac clamping and scaling (which proved faster than the usual
		 * checks and branches - even single test clamping) and writes using
		 * line burst transfers. Also better than writing a single L-R pair per
		 * loop but a good deal more code.
		 *
		 * Attempting bursting during reads is rather futile since the source and
		 * destination alignments rarely agree and too much complication will
		 * slow us up. The parallel loads seem to do a bit better at least until
		 * a pcm buffer can always give line aligned chunk and then aligning the
		 * dest can then imply the source is aligned if the source buffers are.
		 * For now longword alignment is assumed of both the source and dest.
		 *
		 * In:   4(%sp)  = count - number of L-R pairs to write
		 *       8(%sp)  = data  - only the longword at offset 0 is read; it is
		 *                         used as `scale` in (1 << (16 - scale))
		 *       12(%sp) = src   - src[0] = L samples, src[1] = R samples
		 *       16(%sp) = dst   - interleaved 16-bit output, one longword per pair
		 *
		 * The output is split into three spans: leading longwords up to the first
		 * 16-byte line bound, whole lines, and trailing longwords. When no whole
		 * line fits between the first line bound and the end of dst, everything
		 * is handled by the trailing longword loop so that nothing is written
		 * past dst + count*4 (this includes count == 0).
		 *
		 * Regs: %a0 = end address, %a1 = multiplier, %a2/%a3 = L/R read pointers,
		 *       %a4 = write pointer, %a5 = line loop limit, %d0-%d7 = samples.
		 *       %macsr, %d2-%d7 and %a2-%a5 are saved and restored.
		 */
		    .section    .text
		    .global     sample_output_stereo
		sample_output_stereo:
		    lea.l       -44(%sp), %sp           | save registers
		    move.l      %macsr, %d1             | save everything up front since
		    movem.l     %d1-%d7/%a2-%a5, (%sp)  | the line loop is the common case
		    move.l      #0x80, %macsr           | put emac unit in signed int mode
		    movem.l     48(%sp), %a0-%a2/%a4    | %a0 = count, %a1 = data
		                                        | %a2 = src, %a4 = dst
		    lea.l       (%a4, %a0.l*4), %a0     | %a0 = end address
		    move.l      (%a1), %d1              | %a1 = multiplier: (1 << (16 - scale))
		    sub.l       #16, %d1                |
		    neg.l       %d1                     |
		    moveq.l     #1, %d0                 |
		    asl.l       %d1, %d0                |
		    move.l      %d0, %a1                |
		    movem.l     (%a2), %a2-%a3          | get L/R channel pointers
		    moveq.l     #28, %d0                | %d0 = second line bound
		    add.l       %a4, %d0                |
		    and.l       #0xfffffff0, %d0        |
		    cmp.l       %a0, %d0                | at least a full line before end?
		    bhi.w       .sos_longloop_1_start   | no? do all as trailing longwords
		    sub.l       #16, %d0                | %d0 = first line bound
		    cmp.l       %a4, %d0                | any leading longwords?
		    bls.b       .sos_lineloop_start     | no? jump to line loop
		.sos_longloop_0:
		    move.l      (%a2)+, %d1             | read longword from L and R
		    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
		    mac.l       %d2, %a1, %acc1         | shift R to high word
		    movclr.l    %acc0, %d1              | get possibly saturated results
		    movclr.l    %acc1, %d2              |
		    swap        %d2                     | move R to low word
		    move.w      %d2, %d1                | interleave MS 16 bits of each
		    move.l      %d1, (%a4)+             | ...and write both
		    cmp.l       %a4, %d0                | first line bound reached?
		    bhi.b       .sos_longloop_0         | no? next leading longword
		.sos_lineloop_start:
		    lea.l       -12(%a0), %a5           | %a5 = at or just before last line bound
		.sos_lineloop:
		    move.l      (%a2)+, %d0             | get next 4 L samples and scale
		    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | with saturation
		    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 |
		    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
		    mac.l       %d3, %a1, %acc3         |
		    movclr.l    %acc0, %d0              | obtain results
		    movclr.l    %acc1, %d1              |
		    movclr.l    %acc2, %d2              |
		    movclr.l    %acc3, %d3              |
		    move.l      (%a3)+, %d4             | get next 4 R samples and scale
		    mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
		    mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
		    mac.l       %d6, %a1, (%a3)+, %d7, %acc2 |
		    mac.l       %d7, %a1, %acc3         |
		    movclr.l    %acc0, %d4              | obtain results
		    movclr.l    %acc1, %d5              |
		    movclr.l    %acc2, %d6              |
		    movclr.l    %acc3, %d7              |
		    swap        %d4                     | interleave most significant
		    move.w      %d4, %d0                | 16 bits of L and R
		    swap        %d5                     |
		    move.w      %d5, %d1                |
		    swap        %d6                     |
		    move.w      %d6, %d2                |
		    swap        %d7                     |
		    move.w      %d7, %d3                |
		    movem.l     %d0-%d3, (%a4)          | write four stereo samples
		    lea.l       16(%a4), %a4            |
		    cmp.l       %a4, %a5                | room for another full line?
		    bhi.b       .sos_lineloop           | yes? keep bursting
		.sos_longloop_1_start:
		    cmp.l       %a4, %a0                | any longwords left?
		    bls.b       .sos_done               | no? finished.
		.sos_longloop_1:
		    move.l      (%a2)+, %d1             | handle trailing longwords
		    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
		    mac.l       %d2, %a1, %acc1         |
		    movclr.l    %acc0, %d1              |
		    movclr.l    %acc1, %d2              |
		    swap        %d2                     |
		    move.w      %d2, %d1                |
		    move.l      %d1, (%a4)+             |
		    cmp.l       %a4, %a0                | end address reached?
		    bhi.b       .sos_longloop_1         | no? next trailing longword
		.sos_done:
		    movem.l     (%sp), %d1-%d7/%a2-%a5  | restore registers
		    move.l      %d1, %macsr             |
		    lea.l       44(%sp), %sp            | cleanup
		    rts                                 |
		.sos_end:
		    .size       sample_output_stereo, .sos_end-sample_output_stereo
		436
		/****************************************************************************
		 * void sample_output_mono(int count, struct dsp_data *data,
		 *                         int32_t *src[], int16_t *dst)
		 *
		 * Same treatment as sample_output_stereo but for one channel: each
		 * sample read from src[0] is scaled with emac saturation and its most
		 * significant 16 bits are written to both halves of an output longword.
		 *
		 * In:   4(%sp)  = count - number of output longwords (sample pairs)
		 *       8(%sp)  = data  - only the longword at offset 0 is read; it is
		 *                         used as `scale` in (1 << (16 - scale))
		 *       12(%sp) = src   - src[0] = source samples
		 *       16(%sp) = dst   - 16-bit output, one longword per source sample
		 *
		 * The output is split into leading longwords up to the first 16-byte
		 * line bound, whole lines, and trailing longwords. When no whole line
		 * fits between the first line bound and the end of dst, everything is
		 * handled by the trailing longword loop so that nothing is written past
		 * dst + count*4 (this includes count == 0).
		 *
		 * Regs: %a0 = end address, %a1 = line loop limit, %a2 = read pointer,
		 *       %a3 = write pointer, %d5 = multiplier, %d0-%d4 = samples.
		 *       %macsr, %d2-%d5 and %a2-%a3 are saved and restored.
		 */
		    .section    .text
		    .global     sample_output_mono
		sample_output_mono:
		    lea.l       -28(%sp), %sp           | save registers
		    move.l      %macsr, %d1             | save everything up front since
		    movem.l     %d1-%d5/%a2-%a3, (%sp)  | the line loop is the common case
		    move.l      #0x80, %macsr           | put emac unit in signed int mode
		    movem.l     32(%sp), %a0-%a3        | %a0 = count, %a1 = data
		                                        | %a2 = src, %a3 = dst
		    lea.l       (%a3, %a0.l*4), %a0     | %a0 = end address
		    move.l      (%a1), %d1              | %d5 = multiplier: (1 << (16 - scale))
		    sub.l       #16, %d1                |
		    neg.l       %d1                     |
		    moveq.l     #1, %d5                 |
		    asl.l       %d1, %d5                |
		    movem.l     (%a2), %a2              | get source channel pointer
		    moveq.l     #28, %d0                | %d0 = second line bound
		    add.l       %a3, %d0                |
		    and.l       #0xfffffff0, %d0        |
		    cmp.l       %a0, %d0                | at least a full line before end?
		    bhi.w       .som_longloop_1_start   | no? do all as trailing longwords
		    sub.l       #16, %d0                | %d0 = first line bound
		    cmp.l       %a3, %d0                | any leading longwords?
		    bls.b       .som_lineloop_start     | no? jump to line loop
		.som_longloop_0:
		    move.l      (%a2)+, %d1             | read longword from source
		    mac.l       %d1, %d5, %acc0         | shift sample to high word
		    movclr.l    %acc0, %d1              | get possibly saturated result
		    move.l      %d1, %d2                |
		    swap        %d2                     | copy high word to low word
		    move.w      %d2, %d1                | duplicate single channel into
		    move.l      %d1, (%a3)+             | L and R
		    cmp.l       %a3, %d0                | first line bound reached?
		    bhi.b       .som_longloop_0         | no? next leading longword
		.som_lineloop_start:
		    lea.l       -12(%a0), %a1           | %a1 = at or just before last line bound
		.som_lineloop:
		    move.l      (%a2)+, %d0             | get next 4 samples and scale
		    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
		    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
		    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
		    mac.l       %d3, %d5, %acc3         |
		    movclr.l    %acc0, %d0              | obtain results
		    movclr.l    %acc1, %d1              |
		    movclr.l    %acc2, %d2              |
		    movclr.l    %acc3, %d3              |
		    move.l      %d0, %d4                | duplicate single channel
		    swap        %d4                     | into L and R
		    move.w      %d4, %d0                |
		    move.l      %d1, %d4                |
		    swap        %d4                     |
		    move.w      %d4, %d1                |
		    move.l      %d2, %d4                |
		    swap        %d4                     |
		    move.w      %d4, %d2                |
		    move.l      %d3, %d4                |
		    swap        %d4                     |
		    move.w      %d4, %d3                |
		    movem.l     %d0-%d3, (%a3)          | write four stereo samples
		    lea.l       16(%a3), %a3            |
		    cmp.l       %a3, %a1                | room for another full line?
		    bhi.b       .som_lineloop           | yes? keep bursting
		.som_longloop_1_start:
		    cmp.l       %a3, %a0                | any longwords left?
		    bls.b       .som_done               | no? finished.
		.som_longloop_1:
		    move.l      (%a2)+, %d1             | handle trailing longwords
		    mac.l       %d1, %d5, %acc0         | the same way as leading ones
		    movclr.l    %acc0, %d1              |
		    move.l      %d1, %d2                |
		    swap        %d2                     |
		    move.w      %d2, %d1                |
		    move.l      %d1, (%a3)+             |
		    cmp.l       %a3, %a0                | end address reached?
		    bhi.b       .som_longloop_1         | no? next trailing longword
		.som_done:
		    movem.l     (%sp), %d1-%d5/%a2-%a3  | restore registers
		    move.l      %d1, %macsr             |
		    lea.l       28(%sp), %sp            | cleanup
		    rts                                 |
		.som_end:
		    .size       sample_output_mono, .som_end-sample_output_mono