5 files changed, 611 insertions, 2 deletions
diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES
index 6629cf7a4c..004c6395a2 100644
--- a/apps/plugins/mpegplayer/SOURCES
+++ b/apps/plugins/mpegplayer/SOURCES
@@ -13,6 +13,10 @@ idct.c
 motion_comp_c.c
 #endif /* CPU_* */
+#ifdef CPU_COLDFIRE
+idct_coldfire.S
+#endif
 slice.c
 video_out_rockbox.c
 mpeg_settings.c
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c
index 299abc9663..ca3d29a952 100644
--- a/apps/plugins/mpegplayer/decode.c
+++ b/apps/plugins/mpegplayer/decode.c
@@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
 }
+#ifdef CPU_COLDFIRE
+/* twice as large as on other targets because coldfire uses
+ * a secondary, transposed buffer for optimisation */
+static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
+#endif
 mpeg2dec_t * mpeg2_init (void)
 {
    mpeg2dec_t * mpeg2dec;
@@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
    mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
                                            MPEG2_ALLOC_MPEG2DEC);
    if (mpeg2dec == NULL)
-        return NULL;
+            return NULL;
+#ifdef CPU_COLDFIRE
+    mpeg2dec->decoder.DCTblock = static_dct_block;
+#endif
    rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
    rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));
diff --git a/apps/plugins/mpegplayer/idct.c b/apps/plugins/mpegplayer/idct.c
index bf705c6a2f..bf7097401e 100644
--- a/apps/plugins/mpegplayer/idct.c
+++ b/apps/plugins/mpegplayer/idct.c
@@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
 #define CLIP(i) ((mpeg2_clip + 3840)[i])
 #endif
+#ifdef CPU_COLDFIRE
+/* assembler functions */
+extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
+                                     const int stride);
+extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
+                                    uint8_t * dest, const int stride);
+#else /* !CPU_COLDFIRE */
 #if 0
 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
    do {                             \
@@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
    }
 }
+#endif /* !CPU_COLDFIRE */
 void mpeg2_idct_init (void)
 {
    extern uint8_t default_mpeg2_scan_norm[64];
@@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
    extern uint8_t mpeg2_scan_alt[64];
    int i, j;
+#ifdef CPU_COLDFIRE
+    mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
+    mpeg2_idct_add  = mpeg2_idct_add_coldfire;
+#else
    mpeg2_idct_copy = mpeg2_idct_copy_c;
-    mpeg2_idct_add = mpeg2_idct_add_c;
+    mpeg2_idct_add  = mpeg2_idct_add_c;
+#endif
 #if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
    for (i = -3840; i < 3840 + 256; i++)
diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S
new file mode 100644
index 0000000000..007c1a3e98
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id $
+ *
+ * Copyright (C) 2007 Jens Arnold
+ * Based on the work of Karim Boucher and Rani Hod
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ 
+    .global     mpeg2_idct_copy_coldfire
+    .type       mpeg2_idct_copy_coldfire, @function
+    .global     mpeg2_idct_add_coldfire
+    .type       mpeg2_idct_add_coldfire, @function
+    /* The IDCT itself.
+     * Input: %a0: block pointer
+     * All registers are preserved. */
+    .align  2
+.idct:
+    lea.l   (-15*4,%sp), %sp
+    movem.l %d0-%d7/%a0-%a6, (%sp)  | save all registers
+    move.l  %a0, %a6
+    move.l  #0, %macsr              | signed integer mode
+    move.l  #((2048<<16)+2841), %a0 | W0,  W1
+    move.l  #((2676<<16)+2408), %a1 | W2,  W3
+    move.l  #((2048<<16)+1609), %a2 | W4,  W5
+    move.l  #((1108<<16)+ 565), %a3 | W6,  W7
+    
+    lea.l   (128,%a6), %a4      | secondary, transposed temp buffer
+    moveq.l #8, %d3             | loop counter
+    
+.row_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+    
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+    
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(1<<16), %d0       | f0 += 1; 
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #12, %d1            | shift amount
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (7*16,%a4)
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (6*16,%a4)
+   
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (5*16,%a4)
+   
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (4*16,%a4)
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (3*16,%a4)
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (2*16,%a4)
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (1*16,%a4)
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (%a4)+         | advance to next temp column
+    
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .row_loop
+    
+    | %a6 now points to the temp buffer, where we need it.
+    lea.l   (-16-128,%a4), %a4  | point %a4 back to the input block
+    moveq.l #8, %d3             | loop counter
+    
+.col_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+  
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+    
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(32<<16), %d0      | DC offset: 0.5
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #17, %d1            | shift amount
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (7*16,%a4)
+   
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (6*16,%a4)
+   
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (5*16,%a4)
+   
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (4*16,%a4)
+   
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (3*16,%a4)
+   
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (2*16,%a4)
+   
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (1*16,%a4)
+   
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (%a4)+         | advance to next column
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .col_loop
+    
+    movem.l (%sp), %d0-%d7/%a0-%a6  | restore all registers
+    lea.l   (15*4,%sp), %sp
+    rts
+    
+    .align  2
+mpeg2_idct_copy_coldfire:
+    lea.l   (-4*4,%sp), %sp
+    movem.l %d2-%d4/%a2, (%sp)  | save some registers
+    movem.l (4*4+4,%sp), %a0-%a2| %a0 - block pointer
+                                | %a1 - destination pointer
+                                | %a2 - stride
+    bsr.w   .idct               | apply idct to block
+    move.l  #255, %d1           | preload constant for clipping
+    moveq.l #8, %d4             | loop counter
+    
+.copy_clip_loop:
+    move.w  (%a0), %d0          | load block[0]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 | yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    clr.l   (%a0)+              | clear block[0] and block[1],
+                                | %a0 now pointing to block[2]
+    move.w  (%a0), %d0          | do b2 and b3
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2
+    clr.l   (%a0)+
+    move.w  (%a0), %d0          | do b4 and b5
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    clr.l   (%a0)+
+    move.w  (%a0), %d0          | do b6 and b7
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3            
+    clr.l   (%a0)+
+    
+    movem.l %d2-%d3, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .copy_clip_loop
+    movem.l (%sp), %d2-%d4/%a2  | restore registers
+    lea.l   (4*4,%sp), %sp
+    rts
+    .align  2
+mpeg2_idct_add_coldfire:
+    lea.l   (-7*4,%sp), %sp
+    movem.l %d2-%d7/%a2, (%sp)      | save some registers
+    movem.l (7*4+4,%sp), %d0/%a0-%a2| %d0 - last value
+                                    | %a0 - block pointer
+                                    | %a1 - destination pointer
+                                    | %a2 - stride
+    cmp.l   #129, %d0           | last == 129 ?
+    bne.b   .idct_add           |   no: perform idct + addition
+    move.w  (%a0), %d0          
+    ext.l   %d0                 | ((block[0]
+    asr.l   #4, %d0             |      >> 4)
+    and.l   #7, %d0             |      & 7)
+    subq.l  #4, %d0             |      - 4 == 0 ?
+    bne.w   .dc_add             |   no: just perform addition
+.idct_add:
+    bsr.w   .idct               | apply idct
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | used for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+.add_clip_loop:
+    movem.l (%a1), %d6-%d7      | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1) 
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.w  (%a0), %d1          | load block[0]
+    ext.l   %d1                 | sign extend
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    clr.l   (%a0)+              | clear block[0] and block[1]
+                                |   %a0 now pointing to block[2]
+    move.w  (2,%a0), %d0        | do b3 and b2
+    ext.l   %d0
+    move.b  %d6, %d3
+    lsr.l   #8, %d6
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d6, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    clr.l   (%a0)+
+    move.w  (2,%a0), %d0        | do b5 and b4
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    clr.l   (%a0)+
+    move.w  (2,%a0), %d0        | do b7 and b6
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d7, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    clr.l   (%a0)+
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .add_clip_loop
+    bra.w   .idct_add_end
+    
+.dc_add:
+    move.w  (%a0), %d0
+    ext.l   %d0                 | %d0 = (block[0]
+    add.l   #64, %d0            |       + 64)
+    asr.l   #7, %d0             |       >> 7
+    clr.w   (%a0)               | clear block[0]
+    clr.w   (63*2,%a0)          |   and block[63]
+    move.l  %d0, %a0            | DC value in %a0
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+.dc_clip_loop:
+    movem.l (%a1), %d6-%d7      | (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1)
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.l  %a0, %d0            | copy DC
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.l  %a0, %d1            | copy DC
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    move.l  %a0, %d0            | do b3 and b2
+    move.b  %d6, %d3
+    lsr.l   #8, %d6
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d6, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    move.l  %a0, %d0            | do b5 and b4
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6            | collect output bytes 4..7 in %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    move.l  %a0, %d0            | do b7 and b6
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d7, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .dc_clip_loop
+.idct_add_end:
+    movem.l (%sp), %d2-%d7/%a2  | restore registers
+    lea.l   (7*4,%sp), %sp
+    rts
diff --git a/apps/plugins/mpegplayer/mpeg2_internal.h b/apps/plugins/mpegplayer/mpeg2_internal.h
index 0c552b766f..1ec85c60f1 100644
--- a/apps/plugins/mpegplayer/mpeg2_internal.h
+++ b/apps/plugins/mpegplayer/mpeg2_internal.h
@@ -20,6 +20,8 @@
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+ 
+#include "config.h" /* for Rockbox CPU_ #defines */
 /* macroblock modes */
 #define MACROBLOCK_INTRA 1
@@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
    int16_t dc_dct_pred[3];
    /* DCT coefficients */
+#ifdef CPU_COLDFIRE
+    int16_t *DCTblock;  /* put buffer separately to have it in IRAM */
+#else
    int16_t DCTblock[64] ATTR_ALIGN(64);
+#endif
    uint8_t * picture_dest[3];
    void (* convert) (void * convert_id, uint8_t * const * src,

diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES index 6629cf7a4c..004c6395a2 100644 --- a/apps/plugins/mpegplayer/SOURCES +++ b/apps/plugins/mpegplayer/SOURCES
@@ -13,6 +13,10 @@ idct.c
13	motion_comp_c.c	13	motion_comp_c.c
14	#endif /* CPU_* */	14	#endif /* CPU_* */
15		15
		16	#ifdef CPU_COLDFIRE
		17	idct_coldfire.S
		18	#endif
		19
16	slice.c	20	slice.c
17	video_out_rockbox.c	21	video_out_rockbox.c
18	mpeg_settings.c	22	mpeg_settings.c


diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c index 299abc9663..ca3d29a952 100644 --- a/apps/plugins/mpegplayer/decode.c +++ b/apps/plugins/mpegplayer/decode.c
@@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
401		401
402	}	402	}
403		403
		404	#ifdef CPU_COLDFIRE
		405	/* twice as large as on other targets because coldfire uses
		406	* a secondary, transposed buffer for optimisation */
		407	static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
		408	#endif
		409
404	mpeg2dec_t * mpeg2_init (void)	410	mpeg2dec_t * mpeg2_init (void)
405	{	411	{
406	mpeg2dec_t * mpeg2dec;	412	mpeg2dec_t * mpeg2dec;
@@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
410	mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),	416	mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
411	MPEG2_ALLOC_MPEG2DEC);	417	MPEG2_ALLOC_MPEG2DEC);
412	if (mpeg2dec == NULL)	418	if (mpeg2dec == NULL)
413	return NULL;	419	return NULL;
		420
		421	#ifdef CPU_COLDFIRE
		422	mpeg2dec->decoder.DCTblock = static_dct_block;
		423	#endif
414		424
415	rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));	425	rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
416	rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));	426	rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));


diff --git a/apps/plugins/mpegplayer/idct.c b/apps/plugins/mpegplayer/idct.c index bf705c6a2f..bf7097401e 100644 --- a/apps/plugins/mpegplayer/idct.c +++ b/apps/plugins/mpegplayer/idct.c
@@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
76	#define CLIP(i) ((mpeg2_clip + 3840)[i])	76	#define CLIP(i) ((mpeg2_clip + 3840)[i])
77	#endif	77	#endif
78		78
		79	#ifdef CPU_COLDFIRE
		80	/* assembler functions */
		81	extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
		82	const int stride);
		83	extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
		84	uint8_t * dest, const int stride);
		85	#else /* !CPU_COLDFIRE */
		86
79	#if 0	87	#if 0
80	#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \	88	#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
81	do { \	89	do { \
@@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
258	}	266	}
259	}	267	}
260		268
		269	#endif /* !CPU_COLDFIRE */
		270
261	void mpeg2_idct_init (void)	271	void mpeg2_idct_init (void)
262	{	272	{
263	extern uint8_t default_mpeg2_scan_norm[64];	273	extern uint8_t default_mpeg2_scan_norm[64];
@@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
266	extern uint8_t mpeg2_scan_alt[64];	276	extern uint8_t mpeg2_scan_alt[64];
267	int i, j;	277	int i, j;
268		278
		279	#ifdef CPU_COLDFIRE
		280	mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
		281	mpeg2_idct_add = mpeg2_idct_add_coldfire;
		282	#else
269	mpeg2_idct_copy = mpeg2_idct_copy_c;	283	mpeg2_idct_copy = mpeg2_idct_copy_c;
270	mpeg2_idct_add = mpeg2_idct_add_c;	284	mpeg2_idct_add = mpeg2_idct_add_c;
		285	#endif
271		286
272	#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)	287	#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
273	for (i = -3840; i < 3840 + 256; i++)	288	for (i = -3840; i < 3840 + 256; i++)


diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S new file mode 100644 index 0000000000..007c1a3e98 --- /dev/null +++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
		1	/***************************************************************************
		2	* __________ __ ___.
		3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
		4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
		5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
		6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
		7	* \/ \/ \/ \/ \/
		8	* $Id $
		9	*
		10	* Copyright (C) 2007 Jens Arnold
		11	* Based on the work of Karim Boucher and Rani Hod
		12	*
		13	* All files in this archive are subject to the GNU General Public License.
		14	* See the file COPYING in the source tree root for full license agreement.
		15	*
		16	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
		17	* KIND, either express or implied.
		18	*
		19	****************************************************************************/
		20
		21	.global mpeg2_idct_copy_coldfire
		22	.type mpeg2_idct_copy_coldfire, @function
		23	.global mpeg2_idct_add_coldfire
		24	.type mpeg2_idct_add_coldfire, @function
		25
		26	/* The IDCT itself.
		27	* Input: %a0: block pointer
		28	* All registers are preserved. */
		29	.align 2
		30	.idct:
		31	lea.l (-15*4,%sp), %sp
		32	movem.l %d0-%d7/%a0-%a6, (%sp) \| save all registers
		33	move.l %a0, %a6
		34
		35	move.l #0, %macsr \| signed integer mode
		36
		37	move.l #((2048<<16)+2841), %a0 \| W0, W1
		38	move.l #((2676<<16)+2408), %a1 \| W2, W3
		39	move.l #((2048<<16)+1609), %a2 \| W4, W5
		40	move.l #((1108<<16)+ 565), %a3 \| W6, W7
		41
		42	lea.l (128,%a6), %a4 \| secondary, transposed temp buffer
		43	moveq.l #8, %d3 \| loop counter
		44
		45	.row_loop:
		46	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
		47
		48	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
		49	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
		50	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
		51	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
		52
		53	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
		54	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
		55	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
		56	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
		57
		58	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
		59	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
		60	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
		61	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
		62
		63	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
		64	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
		65	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
		66	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
		67
		68	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
		69	add.l #(1<<16), %d0 \| f0 += 1;
		70
		71	movclr.l %acc0, %d4 \| b0
		72	movclr.l %acc1, %d5 \| b1
		73	movclr.l %acc2, %d6 \| b2
		74	movclr.l %acc3, %d7 \| b3
		75
		76	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
		77	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
		78	move.l %acc0, %acc3
		79	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
		80	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
		81
		82	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
		83	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
		84	move.l %acc1, %acc2
		85	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
		86	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
		87
		88	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
		89	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
		90	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
		91
		92	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
		93	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
		94	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
		95
		96	moveq.l #12, %d1 \| shift amount
		97
		98	move.l %acc0, %d0 \| block[7] = (a0
		99	sub.l %d4,%d0 \| - b0)
		100	asr.l %d1, %d0 \| >> 12
		101	move.w %d0, (7*16,%a4)
		102
		103	move.l %acc1, %d0 \| block[6] = (a1
		104	sub.l %d5,%d0 \| - b1)
		105	asr.l %d1, %d0 \| >> 12
		106	move.w %d0, (6*16,%a4)
		107
		108	move.l %acc2, %d0 \| block[5] = (a2
		109	sub.l %d6,%d0 \| - b2)
		110	asr.l %d1, %d0 \| >> 12
		111	move.w %d0, (5*16,%a4)
		112
		113	move.l %acc3, %d0 \| block[4] = (a3
		114	sub.l %d7,%d0 \| - b3)
		115	asr.l %d1, %d0 \| >> 12
		116	move.w %d0, (4*16,%a4)
		117
		118	movclr.l %acc3, %d0 \| block[3] = (a3
		119	add.l %d7, %d0 \| + b3)
		120	asr.l %d1, %d0 \| >> 12
		121	move.w %d0, (3*16,%a4)
		122
		123	movclr.l %acc2, %d0 \| block[2] = (a2
		124	add.l %d6, %d0 \| + b2)
		125	asr.l %d1, %d0 \| >> 12
		126	move.w %d0, (2*16,%a4)
		127
		128	movclr.l %acc1, %d0 \| block[1] = (a1
		129	add.l %d5, %d0 \| + b1)
		130	asr.l %d1, %d0 \| >> 12
		131	move.w %d0, (1*16,%a4)
		132
		133	movclr.l %acc0, %d0 \| block[0] = (a0
		134	add.l %d4, %d0 \| + b0)
		135	asr.l %d1, %d0 \| >> 12
		136	move.w %d0, (%a4)+ \| advance to next temp column
		137
		138	subq.l #1, %d3 \| loop 8 times
		139	bne.w .row_loop
		140
		141	\| %a6 now points to the temp buffer, where we need it.
		142	lea.l (-16-128,%a4), %a4 \| point %a4 back to the input block
		143	moveq.l #8, %d3 \| loop counter
		144
		145	.col_loop:
		146	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
		147
		148	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
		149	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
		150	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
		151	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
		152
		153	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
		154	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
		155	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
		156	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
		157
		158	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
		159	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
		160	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
		161	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
		162
		163	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
		164	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
		165	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
		166	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
		167
		168	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
		169	add.l #(32<<16), %d0 \| DC offset: 0.5
		170
		171	movclr.l %acc0, %d4 \| b0
		172	movclr.l %acc1, %d5 \| b1
		173	movclr.l %acc2, %d6 \| b2
		174	movclr.l %acc3, %d7 \| b3
		175
		176	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
		177	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
		178	move.l %acc0, %acc3
		179	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
		180	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
		181
		182	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
		183	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
		184	move.l %acc1, %acc2
		185	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
		186	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
		187
		188	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
		189	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
		190	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
		191
		192	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
		193	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
		194	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
		195
		196	moveq.l #17, %d1 \| shift amount
		197
		198	move.l %acc0, %d0 \| block[7] = (a0
		199	sub.l %d4,%d0 \| - b0)
		200	asr.l %d1, %d0 \| >> 17
		201	move.w %d0, (7*16,%a4)
		202
		203	move.l %acc1, %d0 \| block[6] = (a1
		204	sub.l %d5,%d0 \| - b1)
		205	asr.l %d1, %d0 \| >> 17
		206	move.w %d0, (6*16,%a4)
		207
		208	move.l %acc2, %d0 \| block[5] = (a2
		209	sub.l %d6,%d0 \| - b2)
		210	asr.l %d1, %d0 \| >> 17
		211	move.w %d0, (5*16,%a4)
		212
		213	move.l %acc3, %d0 \| block[4] = (a3
		214	sub.l %d7,%d0 \| - b3)
		215	asr.l %d1, %d0 \| >> 17
		216	move.w %d0, (4*16,%a4)
		217
		218	movclr.l %acc3, %d0 \| block[3] = (a3
		219	add.l %d7, %d0 \| + b3)
		220	asr.l %d1, %d0 \| >> 17
		221	move.w %d0, (3*16,%a4)
		222
		223	movclr.l %acc2, %d0 \| block[2] = (a2
		224	add.l %d6, %d0 \| + b2)
		225	asr.l %d1, %d0 \| >> 17
		226	move.w %d0, (2*16,%a4)
		227
		228	movclr.l %acc1, %d0 \| block[1] = (a1
		229	add.l %d5, %d0 \| + b1)
		230	asr.l %d1, %d0 \| >> 17
		231	move.w %d0, (1*16,%a4)
		232
		233	movclr.l %acc0, %d0 \| block[0] = (a0
		234	add.l %d4, %d0 \| + b0)
		235	asr.l %d1, %d0 \| >> 17
		236	move.w %d0, (%a4)+ \| advance to next column
		237
		238	subq.l #1, %d3 \| loop 8 times
		239	bne.w .col_loop
		240
		241	movem.l (%sp), %d0-%d7/%a0-%a6 \| restore all registers
		242	lea.l (15*4,%sp), %sp
		243	rts
		244
		245	.align 2
		246
		247	mpeg2_idct_copy_coldfire:
		248	lea.l (-4*4,%sp), %sp
		249	movem.l %d2-%d4/%a2, (%sp) \| save some registers
		250	movem.l (4*4+4,%sp), %a0-%a2\| %a0 - block pointer
		251	\| %a1 - destination pointer
		252	\| %a2 - stride
		253
		254	bsr.w .idct \| apply idct to block
		255
		256	move.l #255, %d1 \| preload constant for clipping
		257	moveq.l #8, %d4 \| loop counter
		258
		259	.copy_clip_loop:
		260	move.w (%a0), %d0 \| load block[0]
		261	ext.l %d0 \| sign extend
		262	cmp.l %d1, %d0 \| overflow?
		263	bls.b 1f
		264	spl.b %d0 \| yes: set appropriate limit value in low byte
		265	1:
		266	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
		267	lsl.l #8, %d2
		268
		269	move.w (2,%a0), %d0 \| load block[1]
		270	ext.l %d0 \| sign extend
		271	cmp.l %d1, %d0 \| overflow?
		272	bls.b 1f
		273	spl.b %d0 \| yes: set appropriate limit value in low byte
		274	1:
		275	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
		276	lsl.l #8, %d2
		277	clr.l (%a0)+ \| clear block[0] and block[1],
		278	\| %a0 now pointing to block[2]
		279	move.w (%a0), %d0 \| do b2 and b3
		280	ext.l %d0
		281	cmp.l %d1, %d0
		282	bls.b 1f
		283	spl.b %d0
		284	1:
		285	move.b %d0, %d2
		286	lsl.l #8, %d2
		287
		288	move.w (2,%a0), %d0
		289	ext.l %d0
		290	cmp.l %d1, %d0
		291	bls.b 1f
		292	spl.b %d0
		293	1:
		294	move.b %d0, %d2
		295	clr.l (%a0)+
		296
		297	move.w (%a0), %d0 \| do b4 and b5
		298	ext.l %d0
		299	cmp.l %d1, %d0
		300	bls.b 1f
		301	spl.b %d0
		302	1:
		303	move.b %d0, %d3
		304	lsl.l #8, %d3
		305
		306	move.w (2,%a0), %d0
		307	ext.l %d0
		308	cmp.l %d1, %d0
		309	bls.b 1f
		310	spl.b %d0
		311	1:
		312	move.b %d0, %d3
		313	lsl.l #8, %d3
		314	clr.l (%a0)+
		315
		316	move.w (%a0), %d0 \| do b6 and b7
		317	ext.l %d0
		318	cmp.l %d1, %d0
		319	bls.b 1f
		320	spl.b %d0
		321	1:
		322	move.b %d0, %d3
		323	lsl.l #8, %d3
		324
		325	move.w (2,%a0), %d0
		326	ext.l %d0
		327	cmp.l %d1, %d0
		328	bls.b 1f
		329	spl.b %d0
		330	1:
		331	move.b %d0, %d3
		332	clr.l (%a0)+
		333
		334	movem.l %d2-%d3, (%a1) \| write all 8 output bytes at once
		335	lea.l (%a2,%a1), %a1 \| advance output pointer
		336	subq.l #1, %d4 \| loop 8 times
		337	bne.w .copy_clip_loop
		338
		339	movem.l (%sp), %d2-%d4/%a2 \| restore registers
		340	lea.l (4*4,%sp), %sp
		341	rts
		342
		343	.align 2
		344
		345	mpeg2_idct_add_coldfire:
		346	lea.l (-7*4,%sp), %sp
		347	movem.l %d2-%d7/%a2, (%sp) \| save some registers
		348	movem.l (7*4+4,%sp), %d0/%a0-%a2\| %d0 - last value
		349	\| %a0 - block pointer
		350	\| %a1 - destination pointer
		351	\| %a2 - stride
		352	cmp.l #129, %d0 \| last == 129 ?
		353	bne.b .idct_add \| no: perform idct + addition
		354	move.w (%a0), %d0
		355	ext.l %d0 \| ((block[0]
		356	asr.l #4, %d0 \| >> 4)
		357	and.l #7, %d0 \| & 7)
		358	subq.l #4, %d0 \| - 4 == 0 ?
		359	bne.w .dc_add \| no: just perform addition
		360
		361	.idct_add:
		362	bsr.w .idct \| apply idct
		363
		364	move.l #255, %d2 \| preload constant for clipping
		365	clr.l %d3 \| used for splitting input words into bytes
		366	moveq.l #8, %d4 \| loop counter
		367
		368	.add_clip_loop:
		369	movem.l (%a1), %d6-%d7 \| fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
		370	swap %d6 \| (b2 b3 b0 b1)
		371	swap %d7 \| (b6 b7 b4 b5)
		372
		373	move.w (2,%a0), %d0 \| load block[1]
		374	ext.l %d0 \| sign extend
		375	move.b %d6, %d3 \| copy b1
		376	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		377	add.l %d3, %d0 \| add b1
		378	cmp.l %d2, %d0 \| overflow ?
		379	bls.b 1f
		380	spl.b %d0 \| yes: set appropriate limit value in low byte
		381	1:
		382	move.w (%a0), %d1 \| load block[0]
		383	ext.l %d1 \| sign extend
		384	move.b %d6, %d3 \| copy b0
		385	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		386	add.l %d3, %d1 \| add b0
		387	cmp.l %d2, %d1 \| overflow ?
		388	bls.b 1f
		389	spl.b %d1 \| yes: set appropriate limit value in low byte
		390	1:
		391	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
		392	lsl.l #8, %d5
		393	move.b %d0, %d5
		394	lsl.l #8, %d5
		395	clr.l (%a0)+ \| clear block[0] and block[1]
		396	\| %a0 now pointing to block[2]
		397	move.w (2,%a0), %d0 \| do b3 and b2
		398	ext.l %d0
		399	move.b %d6, %d3
		400	lsr.l #8, %d6
		401	add.l %d3, %d0
		402	cmp.l %d2, %d0
		403	bls.b 1f
		404	spl.b %d0
		405	1:
		406	move.w (%a0), %d1
		407	ext.l %d1
		408	add.l %d6, %d1
		409	cmp.l %d2, %d1
		410	bls.b 1f
		411	spl.b %d1
		412	1:
		413	move.b %d1, %d5
		414	lsl.l #8, %d5
		415	move.b %d0, %d5
		416	clr.l (%a0)+
		417
		418	move.w (2,%a0), %d0 \| do b5 and b4
		419	ext.l %d0
		420	move.b %d7, %d3
		421	lsr.l #8, %d7
		422	add.l %d3, %d0
		423	cmp.l %d2, %d0
		424	bls.b 1f
		425	spl.b %d0
		426	1:
		427	move.w (%a0), %d1
		428	ext.l %d1
		429	move.b %d7, %d3
		430	lsr.l #8, %d7
		431	add.l %d3, %d1
		432	cmp.l %d2, %d1
		433	bls.b 1f
		434	spl.b %d1
		435	1:
		436	move.b %d1, %d6
		437	lsl.l #8, %d6
		438	move.b %d0, %d6
		439	lsl.l #8, %d6
		440	clr.l (%a0)+
		441
		442	move.w (2,%a0), %d0 \| do b7 and b6
		443	ext.l %d0
		444	move.b %d7, %d3
		445	lsr.l #8, %d7
		446	add.l %d3, %d0
		447	cmp.l %d2, %d0
		448	bls.b 1f
		449	spl.b %d0
		450	1:
		451	move.w (%a0), %d1
		452	ext.l %d1
		453	add.l %d7, %d1
		454	cmp.l %d2, %d1
		455	bls.b 1f
		456	spl.b %d1
		457	1:
		458	move.b %d1, %d6
		459	lsl.l #8, %d6
		460	move.b %d0, %d6
		461	clr.l (%a0)+
		462
		463	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
		464	lea.l (%a2,%a1), %a1 \| advance output pointer
		465	subq.l #1, %d4 \| loop 8 times
		466	bne.w .add_clip_loop
		467
		468	bra.w .idct_add_end
		469
		470	.dc_add:
		471	move.w (%a0), %d0
		472	ext.l %d0 \| %d0 = (block[0]
		473	add.l #64, %d0 \| + 64)
		474	asr.l #7, %d0 \| >> 7
		475	clr.w (%a0) \| clear block[0]
		476	clr.w (63*2,%a0) \| and block[63]
		477	move.l %d0, %a0 \| DC value in %a0
		478
		479	move.l #255, %d2 \| preload constant for clipping
		480	clr.l %d3 \| for splitting input words into bytes
		481	moveq.l #8, %d4 \| loop counter
		482
		483	.dc_clip_loop:
		484	movem.l (%a1), %d6-%d7 \| (b0 b1 b2 b3) (b4 b5 b6 b7)
		485	swap %d6 \| (b2 b3 b0 b1)
		486	swap %d7 \| (b6 b7 b4 b5)
		487
		488	move.l %a0, %d0 \| copy DC
		489	move.b %d6, %d3 \| copy b1
		490	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		491	add.l %d3, %d0 \| add b1
		492	cmp.l %d2, %d0 \| overflow ?
		493	bls.b 1f
		494	spl.b %d0 \| yes: set appropriate limit value in low byte
		495	1:
		496	move.l %a0, %d1 \| copy DC
		497	move.b %d6, %d3 \| copy b0
		498	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		499	add.l %d3, %d1 \| add b0
		500	cmp.l %d2, %d1 \| overflow ?
		501	bls.b 1f
		502	spl.b %d1 \| yes: set appropriate limit value in low byte
		503	1:
		504	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
		505	lsl.l #8, %d5
		506	move.b %d0, %d5
		507	lsl.l #8, %d5
		508
		509	move.l %a0, %d0 \| do b3 and b2
		510	move.b %d6, %d3
		511	lsr.l #8, %d6
		512	add.l %d3, %d0
		513	cmp.l %d2, %d0
		514	bls.b 1f
		515	spl.b %d0
		516	1:
		517	move.l %a0, %d1
		518	add.l %d6, %d1
		519	cmp.l %d2, %d1
		520	bls.b 1f
		521	spl.b %d1
		522	1:
		523	move.b %d1, %d5
		524	lsl.l #8, %d5
		525	move.b %d0, %d5
		526
		527	move.l %a0, %d0 \| do b5 and b4
		528	move.b %d7, %d3
		529	lsr.l #8, %d7
		530	add.l %d3, %d0
		531	cmp.l %d2, %d0
		532	bls.b 1f
		533	spl.b %d0
		534	1:
		535	move.l %a0, %d1
		536	move.b %d7, %d3
		537	lsr.l #8, %d7
		538	add.l %d3, %d1
		539	cmp.l %d2, %d1
		540	bls.b 1f
		541	spl.b %d1
		542	1:
		543	move.b %d1, %d6 \| collect output bytes 4..7 in %d6
		544	lsl.l #8, %d6
		545	move.b %d0, %d6
		546	lsl.l #8, %d6
		547
		548	move.l %a0, %d0
		549	move.b %d7, %d3
		550	lsr.l #8, %d7
		551	add.l %d3, %d0
		552	cmp.l %d2, %d0
		553	bls.b 1f
		554	spl.b %d0
		555	1:
		556	move.l %a0, %d1
		557	add.l %d7, %d1
		558	cmp.l %d2, %d1
		559	bls.b 1f
		560	spl.b %d1
		561	1:
		562	move.b %d1, %d6
		563	lsl.l #8, %d6
		564	move.b %d0, %d6
		565
		566	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
		567	lea.l (%a2,%a1), %a1 \| advance output pointer
		568	subq.l #1, %d4 \| loop 8 times
		569	bne.w .dc_clip_loop
		570
		571	.idct_add_end:
		572	movem.l (%sp), %d2-%d7/%a2 \| restore registers
		573	lea.l (7*4,%sp), %sp
		574	rts


diff --git a/apps/plugins/mpegplayer/mpeg2_internal.h b/apps/plugins/mpegplayer/mpeg2_internal.h index 0c552b766f..1ec85c60f1 100644 --- a/apps/plugins/mpegplayer/mpeg2_internal.h +++ b/apps/plugins/mpegplayer/mpeg2_internal.h
@@ -20,6 +20,8 @@
20	* along with this program; if not, write to the Free Software	20	* along with this program; if not, write to the Free Software
21	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA	21	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22	*/	22	*/
		23
		24	#include "config.h" /* for Rockbox CPU_ #defines */
23		25
24	/* macroblock modes */	26	/* macroblock modes */
25	#define MACROBLOCK_INTRA 1	27	#define MACROBLOCK_INTRA 1
@@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
92	int16_t dc_dct_pred[3];	94	int16_t dc_dct_pred[3];
93		95
94	/* DCT coefficients */	96	/* DCT coefficients */
		97	#ifdef CPU_COLDFIRE
		98	int16_t * DCTblock; /* put buffer separately to have it in IRAM */
		99	#else
95	int16_t DCTblock[64] ATTR_ALIGN(64);	100	int16_t DCTblock[64] ATTR_ALIGN(64);
		101	#endif
96		102
97	uint8_t * picture_dest[3];	103	uint8_t * picture_dest[3];
98	void (* convert) (void * convert_id, uint8_t * const * src,	104	void (* convert) (void * convert_id, uint8_t * const * src,