1 files changed, 574 insertions, 0 deletions
diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S
new file mode 100644
index 0000000000..007c1a3e98
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id $
+ *
+ * Copyright (C) 2007 Jens Arnold
+ * Based on the work of Karim Boucher and Rani Hod
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ 
+    .global     mpeg2_idct_copy_coldfire
+    .type       mpeg2_idct_copy_coldfire, @function
+    .global     mpeg2_idct_add_coldfire
+    .type       mpeg2_idct_add_coldfire, @function
+    /* The IDCT itself: an 8x8 inverse DCT computed as two 1-D passes
+     * (rows first, then columns), with the result left in the block.
+     *
+     * Input: %a0: block pointer (8 rows of 8 16-bit coefficients; each
+     *             row is read in the order f0, f2, f4, f6, f1, f3, f5, f7).
+     *
+     * The row pass stores its results transposed into a temp buffer that
+     * starts 128 bytes after the block pointer; the column pass reads that
+     * buffer and stores the final values, transposed again, into the block.
+     * The 128 bytes following the block are therefore overwritten.
+     * NOTE(review): callers must provide that extra space - confirm.
+     *
+     * %d0-%d7/%a0-%a6 are preserved. %macsr is set to 0 and not restored;
+     * %acc0-%acc3 are used as scratch and are left cleared on return.
+     * NOTE(review): the accumulators are not cleared before the first
+     * mac.w, so they are presumably expected to be zero on entry - confirm. */
+    .align  2
+.idct:
+    lea.l   (-15*4,%sp), %sp        | make room for 15 registers
+    movem.l %d0-%d7/%a0-%a6, (%sp)  | save all registers
+    move.l  %a0, %a6                | %a6 = read pointer, starts at the block
+    move.l  #0, %macsr              | signed integer mode
+    | Weights W0..W7, two 16-bit values per register (upper half, lower half)
+    move.l  #((2048<<16)+2841), %a0 | W0,  W1
+    move.l  #((2676<<16)+2408), %a1 | W2,  W3
+    move.l  #((2048<<16)+1609), %a2 | W4,  W5
+    move.l  #((1108<<16)+ 565), %a3 | W6,  W7
+    
+    lea.l   (128,%a6), %a4      | secondary, transposed temp buffer
+    moveq.l #8, %d3             | loop counter
+    
+    | Pass 1: one input row per iteration, results go to one temp column.
+.row_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+                                | %d0 = f0:f2, %d1 = f4:f6, %d2 = f1:f3, %a5 = f5:f7
+    
+    | Odd part: b0..b3 from f1, f3, f5, f7 and the lower-half weights
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+    
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(1<<16), %d0       | f0 += 1, so W0 * f0 gains 2048 = 0.5 << 12 (rounding)
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+    | Even part: a0..a3 from f0, f2, f4, f6 and the upper-half weights.
+    | The accumulators were zeroed by the movclr instructions above.
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3        | seed %acc3 with W0 * f0 + W4 * f4
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2        | seed %acc2 with W0 * f0 - W4 * f4
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #12, %d1            | shift amount
+    | Outputs 7..4 use a plain move so a0..a3 stay in the accumulators.
+    | The block[n] in the comments is element n of this row; it is stored
+    | at temp row n (16 bytes per row), current column.
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (7*16,%a4)
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (6*16,%a4)
+   
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (5*16,%a4)
+   
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (4*16,%a4)
+    | Outputs 3..0 use movclr: last read of each accumulator, which also
+    | zeroes it for the next iteration.
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (3*16,%a4)
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (2*16,%a4)
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (1*16,%a4)
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (%a4)+         | advance to next temp column
+    
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .row_loop
+    
+    | %a6 now points to the temp buffer, where we need it.
+    | (%a6 advanced 8 * 16 = 128 bytes; %a4 advanced 8 * 2 = 16 bytes.)
+    lea.l   (-16-128,%a4), %a4  | point %a4 back to the input block
+    moveq.l #8, %d3             | loop counter
+    
+    | Pass 2: same arithmetic on the temp rows, with a different rounding
+    | constant and shift; results go to one column of the block per iteration.
+.col_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+  
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+    
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(32<<16), %d0      | DC offset: 0.5 (32 * W0 = 65536 = 0.5 << 17)
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3        | seed %acc3 with W0 * f0 + W4 * f4
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2        | seed %acc2 with W0 * f0 - W4 * f4
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #17, %d1            | shift amount
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (7*16,%a4)
+   
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (6*16,%a4)
+   
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (5*16,%a4)
+   
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (4*16,%a4)
+   
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (3*16,%a4)
+   
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (2*16,%a4)
+   
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (1*16,%a4)
+   
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (%a4)+         | advance to next column
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .col_loop
+    
+    movem.l (%sp), %d0-%d7/%a0-%a6  | restore all registers
+    lea.l   (15*4,%sp), %sp         | release the save area
+    rts
+    
+    /* mpeg2_idct_copy_coldfire
+     * Stack arguments, in order: block pointer, destination pointer, stride.
+     *
+     * Runs .idct on the block, then for each of the 8 rows: clips the eight
+     * 16-bit results to 0..255, stores them as 8 bytes at the destination,
+     * clears the row in the block, and advances the destination by stride.
+     *
+     * %d2-%d4/%a2 are saved and restored; %d0-%d1/%a0-%a1 are clobbered.
+     *
+     * Clipping idiom used throughout: after cmp.l against 255, bls (unsigned
+     * lower or same) skips values already in 0..255. Negative values compare
+     * as large unsigned numbers, so they fall through together with values
+     * above 255; spl then writes 0xff if the compare result was non-negative
+     * (value > 255) and 0x00 if it was negative (value < 0). */
+    .align  2
+mpeg2_idct_copy_coldfire:
+    lea.l   (-4*4,%sp), %sp     | make room for 4 registers
+    movem.l %d2-%d4/%a2, (%sp)  | save some registers
+    movem.l (4*4+4,%sp), %a0-%a2| arguments start above the 4 saved registers
+                                | and the return address:
+                                | %a0 - block pointer
+                                | %a1 - destination pointer
+                                | %a2 - stride
+    bsr.w   .idct               | apply idct to block
+    move.l  #255, %d1           | preload constant for clipping
+    moveq.l #8, %d4             | loop counter
+    
+.copy_clip_loop:
+    move.w  (%a0), %d0          | load block[0]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2             | make room for the next byte
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 | yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    clr.l   (%a0)+              | clear block[0] and block[1],
+                                | %a0 now pointing to block[2]
+    move.w  (%a0), %d0          | do b2 and b3
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2            | %d2 now holds output bytes 0..3
+    clr.l   (%a0)+              | clear block[2] and block[3]
+    move.w  (%a0), %d0          | do b4 and b5
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3            | collect output bytes 4..7 in %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    clr.l   (%a0)+              | clear block[4] and block[5]
+    move.w  (%a0), %d0          | do b6 and b7
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3            | %d3 now holds output bytes 4..7
+    clr.l   (%a0)+              | clear block[6] and block[7]; %a0 at next row
+    
+    movem.l %d2-%d3, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer by stride
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .copy_clip_loop
+    movem.l (%sp), %d2-%d4/%a2  | restore registers
+    lea.l   (4*4,%sp), %sp      | release the save area
+    rts
+    /* mpeg2_idct_add_coldfire
+     * Stack arguments, in order: last, block pointer, destination pointer,
+     * stride.
+     *
+     * If last != 129, or ((block[0] >> 4) & 7) == 4, runs .idct on the block
+     * and then, for each of the 8 rows, adds the eight 16-bit results to the
+     * 8 destination bytes, clips each sum to 0..255, writes the bytes back
+     * and clears the row in the block.
+     * Otherwise (DC-only path) computes DC = (block[0] + 64) >> 7, clears
+     * block[0] and block[63], and adds DC to all 8x8 destination bytes with
+     * the same clipping.
+     *
+     * %d2-%d7/%a2 are saved and restored; %d0-%d1/%a0-%a1 are clobbered.
+     *
+     * Clipping idiom: after cmp.l against 255, bls (unsigned lower or same)
+     * skips sums already in 0..255; otherwise spl writes 0xff if the compare
+     * result was non-negative (sum > 255) and 0x00 if negative (sum < 0). */
+    .align  2
+mpeg2_idct_add_coldfire:
+    lea.l   (-7*4,%sp), %sp         | make room for 7 registers
+    movem.l %d2-%d7/%a2, (%sp)      | save some registers
+    movem.l (7*4+4,%sp), %d0/%a0-%a2| arguments start above the 7 saved
+                                    | registers and the return address:
+                                    | %d0 - last value
+                                    | %a0 - block pointer
+                                    | %a1 - destination pointer
+                                    | %a2 - stride
+    cmp.l   #129, %d0           | last == 129 ?
+    bne.b   .idct_add           |   no: perform idct + addition
+    move.w  (%a0), %d0          | load block[0]
+    ext.l   %d0                 | ((block[0]
+    asr.l   #4, %d0             |      >> 4)
+    and.l   #7, %d0             |      & 7)
+    subq.l  #4, %d0             |      - 4 == 0 ?
+    bne.w   .dc_add             |   no: just perform addition
+.idct_add:
+    bsr.w   .idct               | apply idct
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | used for splitting input words into bytes
+                                | (upper 24 bits stay zero, move.b fills the rest)
+    moveq.l #8, %d4             | loop counter
+    
+.add_clip_loop:
+    movem.l (%a1), %d6-%d7      | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1)
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.w  (%a0), %d1          | load block[0]
+    ext.l   %d1                 | sign extend
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    clr.l   (%a0)+              | clear block[0] and block[1]
+                                |   %a0 now pointing to block[2]
+    move.w  (2,%a0), %d0        | do b3 and b2
+    ext.l   %d0
+    move.b  %d6, %d3
+    lsr.l   #8, %d6             | only b2 is left in %d6 now
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d6, %d1            | add b2 directly, no byte split needed
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5            | %d5 now holds output bytes 0..3
+    clr.l   (%a0)+              | clear block[2] and block[3]
+    move.w  (2,%a0), %d0        | do b5 and b4
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6            | collect output bytes 4..7 in %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    clr.l   (%a0)+              | clear block[4] and block[5]
+    move.w  (2,%a0), %d0        | do b7 and b6
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7             | only b6 is left in %d7 now
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d7, %d1            | add b6 directly, no byte split needed
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6            | %d6 now holds output bytes 4..7
+    clr.l   (%a0)+              | clear block[6] and block[7]; %a0 at next row
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer by stride
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .add_clip_loop
+    bra.w   .idct_add_end
+    
+    | DC-only path: the same value is added to every destination byte.
+.dc_add:
+    move.w  (%a0), %d0
+    ext.l   %d0                 | %d0 = (block[0]
+    add.l   #64, %d0            |       + 64)
+    asr.l   #7, %d0             |       >> 7
+    clr.w   (%a0)               | clear block[0]
+    clr.w   (63*2,%a0)          |   and block[63]
+    move.l  %d0, %a0            | DC value in %a0 (block pointer no longer needed)
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+.dc_clip_loop:
+    movem.l (%a1), %d6-%d7      | (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1)
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.l  %a0, %d0            | copy DC
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.l  %a0, %d1            | copy DC
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    move.l  %a0, %d0            | do b3 and b2
+    move.b  %d6, %d3
+    lsr.l   #8, %d6             | only b2 is left in %d6 now
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d6, %d1            | add b2 directly, no byte split needed
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5            | %d5 now holds output bytes 0..3
+    move.l  %a0, %d0            | do b5 and b4
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6            | collect output bytes 4..7 in %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    move.l  %a0, %d0            | do b7 and b6
+    move.b  %d7, %d3
+    lsr.l   #8, %d7             | only b6 is left in %d7 now
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d7, %d1            | add b6 directly, no byte split needed
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6            | %d6 now holds output bytes 4..7
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer by stride
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .dc_clip_loop
+.idct_add_end:
+    movem.l (%sp), %d2-%d7/%a2  | restore registers
+    lea.l   (7*4,%sp), %sp      | release the save area
+    rts

diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S new file mode 100644 index 0000000000..007c1a3e98 --- /dev/null +++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
	1	/***************************************************************************
	2	* __________ __ ___.
	3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
	4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
	5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
	6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
	7	* \/ \/ \/ \/ \/
	8	* $Id $
	9	*
	10	* Copyright (C) 2007 Jens Arnold
	11	* Based on the work of Karim Boucher and Rani Hod
	12	*
	13	* All files in this archive are subject to the GNU General Public License.
	14	* See the file COPYING in the source tree root for full license agreement.
	15	*
	16	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	17	* KIND, either express or implied.
	18	*
	19	****************************************************************************/
	20
	21	.global mpeg2_idct_copy_coldfire
	22	.type mpeg2_idct_copy_coldfire, @function
	23	.global mpeg2_idct_add_coldfire
	24	.type mpeg2_idct_add_coldfire, @function
	25
	26	/* The IDCT itself.
	27	* Input: %a0: block pointer
	28	* All registers are preserved. */
	29	.align 2
	30	.idct:
	31	lea.l (-15*4,%sp), %sp
	32	movem.l %d0-%d7/%a0-%a6, (%sp) \| save all registers
	33	move.l %a0, %a6
	34
	35	move.l #0, %macsr \| signed integer mode
	36
	37	move.l #((2048<<16)+2841), %a0 \| W0, W1
	38	move.l #((2676<<16)+2408), %a1 \| W2, W3
	39	move.l #((2048<<16)+1609), %a2 \| W4, W5
	40	move.l #((1108<<16)+ 565), %a3 \| W6, W7
	41
	42	lea.l (128,%a6), %a4 \| secondary, transposed temp buffer
	43	moveq.l #8, %d3 \| loop counter
	44
	45	.row_loop:
	46	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
	47
	48	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
	49	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
	50	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
	51	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
	52
	53	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
	54	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
	55	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
	56	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
	57
	58	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
	59	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
	60	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
	61	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
	62
	63	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
	64	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
	65	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
	66	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
	67
	68	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
	69	add.l #(1<<16), %d0 \| f0 += 1;
	70
	71	movclr.l %acc0, %d4 \| b0
	72	movclr.l %acc1, %d5 \| b1
	73	movclr.l %acc2, %d6 \| b2
	74	movclr.l %acc3, %d7 \| b3
	75
	76	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
	77	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
	78	move.l %acc0, %acc3
	79	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
	80	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
	81
	82	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
	83	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
	84	move.l %acc1, %acc2
	85	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
	86	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
	87
	88	\| ^ move.l %acc0, %acc3 %acc2 = W0 * f0 - W4 * f4
	89	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
	90	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
	91
	92	\| ^ move.l %acc1, %acc2 %acc3 = W0 * f0 + W4 * f4
	93	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
	94	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
	95
	96	moveq.l #12, %d1 \| shift amount
	97
	98	move.l %acc0, %d0 \| block[7] = (a0
	99	sub.l %d4,%d0 \| - b0)
	100	asr.l %d1, %d0 \| >> 12
	101	move.w %d0, (7*16,%a4)
	102
	103	move.l %acc1, %d0 \| block[6] = (a1
	104	sub.l %d5,%d0 \| - b1)
	105	asr.l %d1, %d0 \| >> 12
	106	move.w %d0, (6*16,%a4)
	107
	108	move.l %acc2, %d0 \| block[5] = (a2
	109	sub.l %d6,%d0 \| - b2)
	110	asr.l %d1, %d0 \| >> 12
	111	move.w %d0, (5*16,%a4)
	112
	113	move.l %acc3, %d0 \| block[4] = (a3
	114	sub.l %d7,%d0 \| - b3)
	115	asr.l %d1, %d0 \| >> 12
	116	move.w %d0, (4*16,%a4)
	117
	118	movclr.l %acc3, %d0 \| block[3] = (a3
	119	add.l %d7, %d0 \| + b3)
	120	asr.l %d1, %d0 \| >> 12
	121	move.w %d0, (3*16,%a4)
	122
	123	movclr.l %acc2, %d0 \| block[2] = (a2
	124	add.l %d6, %d0 \| + b2)
	125	asr.l %d1, %d0 \| >> 12
	126	move.w %d0, (2*16,%a4)
	127
	128	movclr.l %acc1, %d0 \| block[1] = (a1
	129	add.l %d5, %d0 \| + b1)
	130	asr.l %d1, %d0 \| >> 12
	131	move.w %d0, (1*16,%a4)
	132
	133	movclr.l %acc0, %d0 \| block[0] = (a0
	134	add.l %d4, %d0 \| + b0)
	135	asr.l %d1, %d0 \| >> 12
	136	move.w %d0, (%a4)+ \| advance to next temp column
	137
	138	subq.l #1, %d3 \| loop 8 times
	139	bne.w .row_loop
	140
	141	\| %a6 now points to the temp buffer, where we need it.
	142	lea.l (-16-128,%a4), %a4 \| point %a4 back to the input block
	143	moveq.l #8, %d3 \| loop counter
	144
	145	.col_loop:
	146	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
	147
	148	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
	149	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
	150	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
	151	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
	152
	153	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
	154	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
	155	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
	156	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
	157
	158	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
	159	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
	160	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
	161	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
	162
	163	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
	164	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
	165	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
	166	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
	167
	168	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
	169	add.l #(32<<16), %d0 \| DC offset: 0.5
	170
	171	movclr.l %acc0, %d4 \| b0
	172	movclr.l %acc1, %d5 \| b1
	173	movclr.l %acc2, %d6 \| b2
	174	movclr.l %acc3, %d7 \| b3
	175
	176	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
	177	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
	178	move.l %acc0, %acc3
	179	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
	180	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
	181
	182	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
	183	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
	184	move.l %acc1, %acc2
	185	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
	186	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
	187
	188	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
	189	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
	190	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
	191
	192	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
	193	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
	194	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
	195
	196	moveq.l #17, %d1 \| shift amount
	197
	198	move.l %acc0, %d0 \| block[7] = (a0
	199	sub.l %d4,%d0 \| - b0)
	200	asr.l %d1, %d0 \| >> 17
	201	move.w %d0, (7*16,%a4)
	202
	203	move.l %acc1, %d0 \| block[6] = (a1
	204	sub.l %d5,%d0 \| - b1)
	205	asr.l %d1, %d0 \| >> 17
	206	move.w %d0, (6*16,%a4)
	207
	208	move.l %acc2, %d0 \| block[5] = (a2
	209	sub.l %d6,%d0 \| - b2)
	210	asr.l %d1, %d0 \| >> 17
	211	move.w %d0, (5*16,%a4)
	212
	213	move.l %acc3, %d0 \| block[4] = (a3
	214	sub.l %d7,%d0 \| - b3)
	215	asr.l %d1, %d0 \| >> 17
	216	move.w %d0, (4*16,%a4)
	217
	218	movclr.l %acc3, %d0 \| block[3] = (a3
	219	add.l %d7, %d0 \| + b3)
	220	asr.l %d1, %d0 \| >> 17
	221	move.w %d0, (3*16,%a4)
	222
	223	movclr.l %acc2, %d0 \| block[2] = (a2
	224	add.l %d6, %d0 \| + b2)
	225	asr.l %d1, %d0 \| >> 17
	226	move.w %d0, (2*16,%a4)
	227
	228	movclr.l %acc1, %d0 \| block[1] = (a1
	229	add.l %d5, %d0 \| + b1)
	230	asr.l %d1, %d0 \| >> 17
	231	move.w %d0, (1*16,%a4)
	232
	233	movclr.l %acc0, %d0 \| block[0] = (a0
	234	add.l %d4, %d0 \| + b0)
	235	asr.l %d1, %d0 \| >> 17
	236	move.w %d0, (%a4)+ \| advance to next column
	237
	238	subq.l #1, %d3 \| loop 8 times
	239	bne.w .col_loop
	240
	241	movem.l (%sp), %d0-%d7/%a0-%a6 \| restore all registers
	242	lea.l (15*4,%sp), %sp
	243	rts
	244
	245	.align 2
	246
	247	mpeg2_idct_copy_coldfire:
	248	lea.l (-4*4,%sp), %sp
	249	movem.l %d2-%d4/%a2, (%sp) \| save some registers
	250	movem.l (4*4+4,%sp), %a0-%a2\| %a0 - block pointer
	251	\| %a1 - destination pointer
	252	\| %a2 - stride
	253
	254	bsr.w .idct \| apply idct to block
	255
	256	move.l #255, %d1 \| preload constant for clipping
	257	moveq.l #8, %d4 \| loop counter
	258
	259	.copy_clip_loop:
	260	move.w (%a0), %d0 \| load block[0]
	261	ext.l %d0 \| sign extend
	262	cmp.l %d1, %d0 \| overflow?
	263	bls.b 1f
	264	spl.b %d0 \| yes: set appropriate limit value in low byte
	265	1:
	266	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
	267	lsl.l #8, %d2
	268
	269	move.w (2,%a0), %d0 \| load block[1]
	270	ext.l %d0 \| sign extend
	271	cmp.l %d1, %d0 \| overflow?
	272	bls.b 1f
	273	spl.b %d0 \| yes: set appropriate limit value in low byte
	274	1:
	275	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
	276	lsl.l #8, %d2
	277	clr.l (%a0)+ \| clear block[0] and block[1],
	278	\| %a0 now pointing to block[2]
	279	move.w (%a0), %d0 \| do b2 and b3
	280	ext.l %d0
	281	cmp.l %d1, %d0
	282	bls.b 1f
	283	spl.b %d0
	284	1:
	285	move.b %d0, %d2
	286	lsl.l #8, %d2
	287
	288	move.w (2,%a0), %d0
	289	ext.l %d0
	290	cmp.l %d1, %d0
	291	bls.b 1f
	292	spl.b %d0
	293	1:
	294	move.b %d0, %d2
	295	clr.l (%a0)+
	296
	297	move.w (%a0), %d0 \| do b4 and b5
	298	ext.l %d0
	299	cmp.l %d1, %d0
	300	bls.b 1f
	301	spl.b %d0
	302	1:
	303	move.b %d0, %d3
	304	lsl.l #8, %d3
	305
	306	move.w (2,%a0), %d0
	307	ext.l %d0
	308	cmp.l %d1, %d0
	309	bls.b 1f
	310	spl.b %d0
	311	1:
	312	move.b %d0, %d3
	313	lsl.l #8, %d3
	314	clr.l (%a0)+
	315
	316	move.w (%a0), %d0 \| do b6 and b7
	317	ext.l %d0
	318	cmp.l %d1, %d0
	319	bls.b 1f
	320	spl.b %d0
	321	1:
	322	move.b %d0, %d3
	323	lsl.l #8, %d3
	324
	325	move.w (2,%a0), %d0
	326	ext.l %d0
	327	cmp.l %d1, %d0
	328	bls.b 1f
	329	spl.b %d0
	330	1:
	331	move.b %d0, %d3
	332	clr.l (%a0)+
	333
	334	movem.l %d2-%d3, (%a1) \| write all 8 output bytes at once
	335	lea.l (%a2,%a1), %a1 \| advance output pointer
	336	subq.l #1, %d4 \| loop 8 times
	337	bne.w .copy_clip_loop
	338
	339	movem.l (%sp), %d2-%d4/%a2 \| restore registers
	340	lea.l (4*4,%sp), %sp
	341	rts
	342
	343	.align 2
	344
	345	mpeg2_idct_add_coldfire:
	346	lea.l (-7*4,%sp), %sp
	347	movem.l %d2-%d7/%a2, (%sp) \| save some registers
	348	movem.l (7*4+4,%sp), %d0/%a0-%a2\| %d0 - last value
	349	\| %a0 - block pointer
	350	\| %a1 - destination pointer
	351	\| %a2 - stride
	352	cmp.l #129, %d0 \| last == 129 ?
	353	bne.b .idct_add \| no: perform idct + addition
	354	move.w (%a0), %d0
	355	ext.l %d0 \| ((block[0]
	356	asr.l #4, %d0 \| >> 4)
	357	and.l #7, %d0 \| & 7)
	358	subq.l #4, %d0 \| - 4 == 0 ?
	359	bne.w .dc_add \| no: just perform addition
	360
	361	.idct_add:
	362	bsr.w .idct \| apply idct
	363
	364	move.l #255, %d2 \| preload constant for clipping
	365	clr.l %d3 \| used for splitting input words into bytes
	366	moveq.l #8, %d4 \| loop counter
	367
	368	.add_clip_loop:
	369	movem.l (%a1), %d6-%d7 \| fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
	370	swap %d6 \| (b2 b3 b0 b1)
	371	swap %d7 \| (b6 b7 b4 b5)
	372
	373	move.w (2,%a0), %d0 \| load block[1]
	374	ext.l %d0 \| sign extend
	375	move.b %d6, %d3 \| copy b1
	376	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	377	add.l %d3, %d0 \| add b1
	378	cmp.l %d2, %d0 \| overflow ?
	379	bls.b 1f
	380	spl.b %d0 \| yes: set appropriate limit value in low byte
	381	1:
	382	move.w (%a0), %d1 \| load block[0]
	383	ext.l %d1 \| sign extend
	384	move.b %d6, %d3 \| copy b0
	385	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	386	add.l %d3, %d1 \| add b0
	387	cmp.l %d2, %d1 \| overflow ?
	388	bls.b 1f
	389	spl.b %d1 \| yes: set appropriate limit value in low byte
	390	1:
	391	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
	392	lsl.l #8, %d5
	393	move.b %d0, %d5
	394	lsl.l #8, %d5
	395	clr.l (%a0)+ \| clear block[0] and block[1]
	396	\| %a0 now pointing to block[2]
	397	move.w (2,%a0), %d0 \| do b3 and b2
	398	ext.l %d0
	399	move.b %d6, %d3
	400	lsr.l #8, %d6
	401	add.l %d3, %d0
	402	cmp.l %d2, %d0
	403	bls.b 1f
	404	spl.b %d0
	405	1:
	406	move.w (%a0), %d1
	407	ext.l %d1
	408	add.l %d6, %d1
	409	cmp.l %d2, %d1
	410	bls.b 1f
	411	spl.b %d1
	412	1:
	413	move.b %d1, %d5
	414	lsl.l #8, %d5
	415	move.b %d0, %d5
	416	clr.l (%a0)+
	417
	418	move.w (2,%a0), %d0 \| do b5 and b4
	419	ext.l %d0
	420	move.b %d7, %d3
	421	lsr.l #8, %d7
	422	add.l %d3, %d0
	423	cmp.l %d2, %d0
	424	bls.b 1f
	425	spl.b %d0
	426	1:
	427	move.w (%a0), %d1
	428	ext.l %d1
	429	move.b %d7, %d3
	430	lsr.l #8, %d7
	431	add.l %d3, %d1
	432	cmp.l %d2, %d1
	433	bls.b 1f
	434	spl.b %d1
	435	1:
	436	move.b %d1, %d6
	437	lsl.l #8, %d6
	438	move.b %d0, %d6
	439	lsl.l #8, %d6
	440	clr.l (%a0)+
	441
	442	move.w (2,%a0), %d0 \| do b7 and b6
	443	ext.l %d0
	444	move.b %d7, %d3
	445	lsr.l #8, %d7
	446	add.l %d3, %d0
	447	cmp.l %d2, %d0
	448	bls.b 1f
	449	spl.b %d0
	450	1:
	451	move.w (%a0), %d1
	452	ext.l %d1
	453	add.l %d7, %d1
	454	cmp.l %d2, %d1
	455	bls.b 1f
	456	spl.b %d1
	457	1:
	458	move.b %d1, %d6
	459	lsl.l #8, %d6
	460	move.b %d0, %d6
	461	clr.l (%a0)+
	462
	463	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
	464	lea.l (%a2,%a1), %a1 \| advance output pointer
	465	subq.l #1, %d4 \| loop 8 times
	466	bne.w .add_clip_loop
	467
	468	bra.w .idct_add_end
	469
	470	.dc_add:
	471	move.w (%a0), %d0
	472	ext.l %d0 \| %d0 = (block[0]
	473	add.l #64, %d0 \| + 64)
	474	asr.l #7, %d0 \| >> 7
	475	clr.w (%a0) \| clear block[0]
	476	clr.w (63*2,%a0) \| and block[63]
	477	move.l %d0, %a0 \| DC value in %a0
	478
	479	move.l #255, %d2 \| preload constant for clipping
	480	clr.l %d3 \| for splitting input words into bytes
	481	moveq.l #8, %d4 \| loop counter
	482
	483	.dc_clip_loop:
	484	movem.l (%a1), %d6-%d7 \| (b0 b1 b2 b3) (b4 b5 b6 b7)
	485	swap %d6 \| (b2 b3 b0 b1)
	486	swap %d7 \| (b6 b7 b4 b5)
	487
	488	move.l %a0, %d0 \| copy DC
	489	move.b %d6, %d3 \| copy b1
	490	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	491	add.l %d3, %d0 \| add b1
	492	cmp.l %d2, %d0 \| overflow ?
	493	bls.b 1f
	494	spl.b %d0 \| yes: set appropriate limit value in low byte
	495	1:
	496	move.l %a0, %d1 \| copy DC
	497	move.b %d6, %d3 \| copy b0
	498	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	499	add.l %d3, %d1 \| add b0
	500	cmp.l %d2, %d1 \| overflow ?
	501	bls.b 1f
	502	spl.b %d1 \| yes: set appropriate limit value in low byte
	503	1:
	504	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
	505	lsl.l #8, %d5
	506	move.b %d0, %d5
	507	lsl.l #8, %d5
	508
	509	move.l %a0, %d0 \| do b3 and b2
	510	move.b %d6, %d3
	511	lsr.l #8, %d6
	512	add.l %d3, %d0
	513	cmp.l %d2, %d0
	514	bls.b 1f
	515	spl.b %d0
	516	1:
	517	move.l %a0, %d1
	518	add.l %d6, %d1
	519	cmp.l %d2, %d1
	520	bls.b 1f
	521	spl.b %d1
	522	1:
	523	move.b %d1, %d5
	524	lsl.l #8, %d5
	525	move.b %d0, %d5
	526
	527	move.l %a0, %d0 \| do b5 and b4
	528	move.b %d7, %d3
	529	lsr.l #8, %d7
	530	add.l %d3, %d0
	531	cmp.l %d2, %d0
	532	bls.b 1f
	533	spl.b %d0
	534	1:
	535	move.l %a0, %d1
	536	move.b %d7, %d3
	537	lsr.l #8, %d7
	538	add.l %d3, %d1
	539	cmp.l %d2, %d1
	540	bls.b 1f
	541	spl.b %d1
	542	1:
	543	move.b %d1, %d6 \| do b7 and b6
	544	lsl.l #8, %d6
	545	move.b %d0, %d6
	546	lsl.l #8, %d6
	547
	548	move.l %a0, %d0
	549	move.b %d7, %d3
	550	lsr.l #8, %d7
	551	add.l %d3, %d0
	552	cmp.l %d2, %d0
	553	bls.b 1f
	554	spl.b %d0
	555	1:
	556	move.l %a0, %d1
	557	add.l %d7, %d1
	558	cmp.l %d2, %d1
	559	bls.b 1f
	560	spl.b %d1
	561	1:
	562	move.b %d1, %d6
	563	lsl.l #8, %d6
	564	move.b %d0, %d6
	565
	566	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
	567	lea.l (%a2,%a1), %a1 \| advance output pointer
	568	subq.l #1, %d4 \| loop 8 times
	569	bne.w .dc_clip_loop
	570
	571	.idct_add_end:
	572	movem.l (%sp), %d2-%d7/%a2 \| restore registers
	573	lea.l (7*4,%sp), %sp
	574	rts