Mpegplayer: Assembler optimised IDCT for coldfire, based on FS #5995 by Karim Boucher. Put the IDCT block buffer in IRAM for better performance. The whole libmpeg2 decoder struct doesn't fit without throwing some libmad buffers out of IRAM, but that doesn't change performance significantly. Mpegplayer is quite usable now on X5; H300 is sort-of usable for widescreen.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15156 a1c6a512-1295-4272-9138-f99709370657
author: Jens Arnold <amiconn@rockbox.org> 2007-10-16 22:55:40 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2007-10-16 22:55:40 +0000
commit: fc43b9df823af80dd1c9cf7dc1b5de6703944043 (patch)
tree: bb17b985d00d13bc6fce61823acbe50ed1e003b0
parent: 84f5c5c3e3590cb993f4cf2a7eba5979e3bc825b (diff)
download: rockbox-fc43b9df823af80dd1c9cf7dc1b5de6703944043.tar.gz
rockbox-fc43b9df823af80dd1c9cf7dc1b5de6703944043.zip
5 files changed, 611 insertions, 2 deletions
diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES
index 6629cf7a4c..004c6395a2 100644
--- a/apps/plugins/mpegplayer/SOURCES
+++ b/apps/plugins/mpegplayer/SOURCES
@@ -13,6 +13,10 @@ idct.c
 motion_comp_c.c
 #endif /* CPU_* */
+#ifdef CPU_COLDFIRE
+idct_coldfire.S
+#endif
 slice.c
 video_out_rockbox.c
 mpeg_settings.c
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c
index 299abc9663..ca3d29a952 100644
--- a/apps/plugins/mpegplayer/decode.c
+++ b/apps/plugins/mpegplayer/decode.c
@@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
 }
+#ifdef CPU_COLDFIRE
+/* twice as large as on other targets because coldfire uses
+ * a secondary, transposed buffer for optimisation */
+static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
+#endif
 mpeg2dec_t * mpeg2_init (void)
 {
    mpeg2dec_t * mpeg2dec;
@@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
    mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
                                            MPEG2_ALLOC_MPEG2DEC);
    if (mpeg2dec == NULL)
-        return NULL;
+            return NULL;
+#ifdef CPU_COLDFIRE
+    mpeg2dec->decoder.DCTblock = static_dct_block;
+#endif
    rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
    rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));
diff --git a/apps/plugins/mpegplayer/idct.c b/apps/plugins/mpegplayer/idct.c
index bf705c6a2f..bf7097401e 100644
--- a/apps/plugins/mpegplayer/idct.c
+++ b/apps/plugins/mpegplayer/idct.c
@@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
 #define CLIP(i) ((mpeg2_clip + 3840)[i])
 #endif
+#ifdef CPU_COLDFIRE
+/* assembler functions */
+extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
+                                     const int stride);
+extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
+                                    uint8_t * dest, const int stride);
+#else /* !CPU_COLDFIRE */
 #if 0
 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
    do {                             \
@@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
    }
 }
+#endif /* !CPU_COLDFIRE */
 void mpeg2_idct_init (void)
 {
    extern uint8_t default_mpeg2_scan_norm[64];
@@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
    extern uint8_t mpeg2_scan_alt[64];
    int i, j;
+#ifdef CPU_COLDFIRE
+    mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
+    mpeg2_idct_add  = mpeg2_idct_add_coldfire;
+#else
    mpeg2_idct_copy = mpeg2_idct_copy_c;
-    mpeg2_idct_add = mpeg2_idct_add_c;
+    mpeg2_idct_add  = mpeg2_idct_add_c;
+#endif
 #if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
    for (i = -3840; i < 3840 + 256; i++)
diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S
new file mode 100644
index 0000000000..007c1a3e98
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id $
+ *
+ * Copyright (C) 2007 Jens Arnold
+ * Based on the work of Karim Boucher and Rani Hod
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ 
+    .global     mpeg2_idct_copy_coldfire
+    .type       mpeg2_idct_copy_coldfire, @function
+    .global     mpeg2_idct_add_coldfire
+    .type       mpeg2_idct_add_coldfire, @function
+    /* The IDCT itself.
+     * Input: %a0: block pointer
+     * All registers are preserved. */
+    .align  2
+.idct:
+    lea.l   (-15*4,%sp), %sp
+    movem.l %d0-%d7/%a0-%a6, (%sp)  | save all registers
+    move.l  %a0, %a6
+    move.l  #0, %macsr              | signed integer mode
+    move.l  #((2048<<16)+2841), %a0 | W0,  W1
+    move.l  #((2676<<16)+2408), %a1 | W2,  W3
+    move.l  #((2048<<16)+1609), %a2 | W4,  W5
+    move.l  #((1108<<16)+ 565), %a3 | W6,  W7
+    
+    lea.l   (128,%a6), %a4      | secondary, transposed temp buffer
+    moveq.l #8, %d3             | loop counter
+    
+.row_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+    
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+    
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(1<<16), %d0       | f0 += 1, so W0 * f0 gains 2048 (rounding for >> 12)
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #12, %d1            | shift amount
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (7*16,%a4)
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (6*16,%a4)
+   
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (5*16,%a4)
+   
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (4*16,%a4)
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (3*16,%a4)
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (2*16,%a4)
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (1*16,%a4)
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (%a4)+         | advance to next temp column
+    
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .row_loop
+    
+    | %a6 now points to the temp buffer, where we need it.
+    lea.l   (-16-128,%a4), %a4  | point %a4 back to the input block
+    moveq.l #8, %d3             | loop counter
+    
+.col_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+  
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+    
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(32<<16), %d0      | DC offset: 0.5
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #17, %d1            | shift amount
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (7*16,%a4)
+   
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (6*16,%a4)
+   
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (5*16,%a4)
+   
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (4*16,%a4)
+   
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (3*16,%a4)
+   
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (2*16,%a4)
+   
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (1*16,%a4)
+   
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (%a4)+         | advance to next column
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .col_loop
+    
+    movem.l (%sp), %d0-%d7/%a0-%a6  | restore all registers
+    lea.l   (15*4,%sp), %sp
+    rts
+    
+    .align  2
+mpeg2_idct_copy_coldfire:
+    lea.l   (-4*4,%sp), %sp
+    movem.l %d2-%d4/%a2, (%sp)  | save some registers
+    movem.l (4*4+4,%sp), %a0-%a2| %a0 - block pointer
+                                | %a1 - destination pointer
+                                | %a2 - stride
+    bsr.w   .idct               | apply idct to block
+    move.l  #255, %d1           | preload constant for clipping
+    moveq.l #8, %d4             | loop counter
+    
+.copy_clip_loop:
+    move.w  (%a0), %d0          | load block[0]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 | yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    clr.l   (%a0)+              | clear block[0] and block[1],
+                                | %a0 now pointing to block[2]
+    move.w  (%a0), %d0          | do b2 and b3
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2
+    clr.l   (%a0)+
+    move.w  (%a0), %d0          | do b4 and b5
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    clr.l   (%a0)+
+    move.w  (%a0), %d0          | do b6 and b7
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3            
+    clr.l   (%a0)+
+    
+    movem.l %d2-%d3, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .copy_clip_loop
+    movem.l (%sp), %d2-%d4/%a2  | restore registers
+    lea.l   (4*4,%sp), %sp
+    rts
+    .align  2
+mpeg2_idct_add_coldfire:
+    lea.l   (-7*4,%sp), %sp
+    movem.l %d2-%d7/%a2, (%sp)      | save some registers
+    movem.l (7*4+4,%sp), %d0/%a0-%a2| %d0 - last value
+                                    | %a0 - block pointer
+                                    | %a1 - destination pointer
+                                    | %a2 - stride
+    cmp.l   #129, %d0           | last == 129 ?
+    bne.b   .idct_add           |   no: perform idct + addition
+    move.w  (%a0), %d0          
+    ext.l   %d0                 | ((block[0]
+    asr.l   #4, %d0             |      >> 4)
+    and.l   #7, %d0             |      & 7)
+    subq.l  #4, %d0             |      - 4 == 0 ?
+    bne.w   .dc_add             |   no: just perform addition
+.idct_add:
+    bsr.w   .idct               | apply idct
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | used for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+.add_clip_loop:
+    movem.l (%a1), %d6-%d7      | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1) 
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.w  (%a0), %d1          | load block[0]
+    ext.l   %d1                 | sign extend
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    clr.l   (%a0)+              | clear block[0] and block[1]
+                                |   %a0 now pointing to block[2]
+    move.w  (2,%a0), %d0        | do b3 and b2
+    ext.l   %d0
+    move.b  %d6, %d3
+    lsr.l   #8, %d6
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d6, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    clr.l   (%a0)+
+    move.w  (2,%a0), %d0        | do b5 and b4
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    clr.l   (%a0)+
+    move.w  (2,%a0), %d0        | do b7 and b6
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d7, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    clr.l   (%a0)+
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .add_clip_loop
+    bra.w   .idct_add_end
+    
+.dc_add:
+    move.w  (%a0), %d0
+    ext.l   %d0                 | %d0 = (block[0]
+    add.l   #64, %d0            |       + 64)
+    asr.l   #7, %d0             |       >> 7
+    clr.w   (%a0)               | clear block[0]
+    clr.w   (63*2,%a0)          |   and block[63]
+    move.l  %d0, %a0            | DC value in %a0
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+.dc_clip_loop:
+    movem.l (%a1), %d6-%d7      | (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1)
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.l  %a0, %d0            | copy DC
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.l  %a0, %d1            | copy DC
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    move.l  %a0, %d0            | do b3 and b2
+    move.b  %d6, %d3
+    lsr.l   #8, %d6
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d6, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    move.l  %a0, %d0            | do b5 and b4
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6            | collect output bytes 4..7 in %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    move.l  %a0, %d0            | do b7 and b6
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d7, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    lea.l   (%a2,%a1), %a1      | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .dc_clip_loop
+.idct_add_end:
+    movem.l (%sp), %d2-%d7/%a2  | restore registers
+    lea.l   (7*4,%sp), %sp
+    rts
diff --git a/apps/plugins/mpegplayer/mpeg2_internal.h b/apps/plugins/mpegplayer/mpeg2_internal.h
index 0c552b766f..1ec85c60f1 100644
--- a/apps/plugins/mpegplayer/mpeg2_internal.h
+++ b/apps/plugins/mpegplayer/mpeg2_internal.h
@@ -20,6 +20,8 @@
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+ 
+#include "config.h" /* for Rockbox CPU_ #defines */
 /* macroblock modes */
 #define MACROBLOCK_INTRA 1
@@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
    int16_t dc_dct_pred[3];
    /* DCT coefficients */
+#ifdef CPU_COLDFIRE
+    int16_t *DCTblock;  /* put buffer separately to have it in IRAM */
+#else
    int16_t DCTblock[64] ATTR_ALIGN(64);
+#endif
    uint8_t * picture_dest[3];
    void (* convert) (void * convert_id, uint8_t * const * src,
author	Jens Arnold <amiconn@rockbox.org>	2007-10-16 22:55:40 +0000
committer	Jens Arnold <amiconn@rockbox.org>	2007-10-16 22:55:40 +0000
commit	fc43b9df823af80dd1c9cf7dc1b5de6703944043 (patch)
tree	bb17b985d00d13bc6fce61823acbe50ed1e003b0
parent	84f5c5c3e3590cb993f4cf2a7eba5979e3bc825b (diff)
download	rockbox-fc43b9df823af80dd1c9cf7dc1b5de6703944043.tar.gz rockbox-fc43b9df823af80dd1c9cf7dc1b5de6703944043.zip

diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES index 6629cf7a4c..004c6395a2 100644 --- a/apps/plugins/mpegplayer/SOURCES +++ b/apps/plugins/mpegplayer/SOURCES
@@ -13,6 +13,10 @@ idct.c
13	motion_comp_c.c	13	motion_comp_c.c
14	#endif /* CPU_* */	14	#endif /* CPU_* */
15		15
		16	#ifdef CPU_COLDFIRE
		17	idct_coldfire.S
		18	#endif
		19
16	slice.c	20	slice.c
17	video_out_rockbox.c	21	video_out_rockbox.c
18	mpeg_settings.c	22	mpeg_settings.c


diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c index 299abc9663..ca3d29a952 100644 --- a/apps/plugins/mpegplayer/decode.c +++ b/apps/plugins/mpegplayer/decode.c
@@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
401		401
402	}	402	}
403		403
		404	#ifdef CPU_COLDFIRE
		405	/* twice as large as on other targets because coldfire uses
		406	* a secondary, transposed buffer for optimisation */
		407	static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
		408	#endif
		409
404	mpeg2dec_t * mpeg2_init (void)	410	mpeg2dec_t * mpeg2_init (void)
405	{	411	{
406	mpeg2dec_t * mpeg2dec;	412	mpeg2dec_t * mpeg2dec;
@@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
410	mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),	416	mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
411	MPEG2_ALLOC_MPEG2DEC);	417	MPEG2_ALLOC_MPEG2DEC);
412	if (mpeg2dec == NULL)	418	if (mpeg2dec == NULL)
413	return NULL;	419	return NULL;
		420
		421	#ifdef CPU_COLDFIRE
		422	mpeg2dec->decoder.DCTblock = static_dct_block;
		423	#endif
414		424
415	rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));	425	rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
416	rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));	426	rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));


diff --git a/apps/plugins/mpegplayer/idct.c b/apps/plugins/mpegplayer/idct.c index bf705c6a2f..bf7097401e 100644 --- a/apps/plugins/mpegplayer/idct.c +++ b/apps/plugins/mpegplayer/idct.c
@@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
76	#define CLIP(i) ((mpeg2_clip + 3840)[i])	76	#define CLIP(i) ((mpeg2_clip + 3840)[i])
77	#endif	77	#endif
78		78
		79	#ifdef CPU_COLDFIRE
		80	/* assembler functions */
		81	extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
		82	const int stride);
		83	extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
		84	uint8_t * dest, const int stride);
		85	#else /* !CPU_COLDFIRE */
		86
79	#if 0	87	#if 0
80	#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \	88	#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
81	do { \	89	do { \
@@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
258	}	266	}
259	}	267	}
260		268
		269	#endif /* !CPU_COLDFIRE */
		270
261	void mpeg2_idct_init (void)	271	void mpeg2_idct_init (void)
262	{	272	{
263	extern uint8_t default_mpeg2_scan_norm[64];	273	extern uint8_t default_mpeg2_scan_norm[64];
@@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
266	extern uint8_t mpeg2_scan_alt[64];	276	extern uint8_t mpeg2_scan_alt[64];
267	int i, j;	277	int i, j;
268		278
		279	#ifdef CPU_COLDFIRE
		280	mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
		281	mpeg2_idct_add = mpeg2_idct_add_coldfire;
		282	#else
269	mpeg2_idct_copy = mpeg2_idct_copy_c;	283	mpeg2_idct_copy = mpeg2_idct_copy_c;
270	mpeg2_idct_add = mpeg2_idct_add_c;	284	mpeg2_idct_add = mpeg2_idct_add_c;
		285	#endif
271		286
272	#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)	287	#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
273	for (i = -3840; i < 3840 + 256; i++)	288	for (i = -3840; i < 3840 + 256; i++)


diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S new file mode 100644 index 0000000000..007c1a3e98 --- /dev/null +++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
		1	/***************************************************************************
		2	* __________ __ ___.
		3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
		4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
		5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
		6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
		7	* \/ \/ \/ \/ \/
		8	* $Id $
		9	*
		10	* Copyright (C) 2007 Jens Arnold
		11	* Based on the work of Karim Boucher and Rani Hod
		12	*
		13	* All files in this archive are subject to the GNU General Public License.
		14	* See the file COPYING in the source tree root for full license agreement.
		15	*
		16	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
		17	* KIND, either express or implied.
		18	*
		19	****************************************************************************/
		20
		21	.global mpeg2_idct_copy_coldfire
		22	.type mpeg2_idct_copy_coldfire, @function
		23	.global mpeg2_idct_add_coldfire
		24	.type mpeg2_idct_add_coldfire, @function
		25
		26	/* The IDCT itself.
		27	* Input: %a0: block pointer
		28	* All registers are preserved. */
		29	.align 2
		30	.idct:
		31	lea.l (-15*4,%sp), %sp
		32	movem.l %d0-%d7/%a0-%a6, (%sp) \| save all registers
		33	move.l %a0, %a6
		34
		35	move.l #0, %macsr \| signed integer mode
		36
		37	move.l #((2048<<16)+2841), %a0 \| W0, W1
		38	move.l #((2676<<16)+2408), %a1 \| W2, W3
		39	move.l #((2048<<16)+1609), %a2 \| W4, W5
		40	move.l #((1108<<16)+ 565), %a3 \| W6, W7
		41
		42	lea.l (128,%a6), %a4 \| secondary, transposed temp buffer
		43	moveq.l #8, %d3 \| loop counter
		44
		45	.row_loop:
		46	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
		47
		48	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
		49	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
		50	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
		51	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
		52
		53	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
		54	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
		55	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
		56	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
		57
		58	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
		59	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
		60	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
		61	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
		62
		63	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
		64	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
		65	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
		66	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
		67
		68	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
		69	add.l #(1<<16), %d0 \| f0 += 1;
		70
		71	movclr.l %acc0, %d4 \| b0
		72	movclr.l %acc1, %d5 \| b1
		73	movclr.l %acc2, %d6 \| b2
		74	movclr.l %acc3, %d7 \| b3
		75
		76	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
		77	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
		78	move.l %acc0, %acc3
		79	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
		80	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
		81
		82	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
		83	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
		84	move.l %acc1, %acc2
		85	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
		86	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
		87
		88	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
		89	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
		90	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
		91
		92	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
		93	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
		94	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
		95
		96	moveq.l #12, %d1 \| shift amount
		97
		98	move.l %acc0, %d0 \| block[7] = (a0
		99	sub.l %d4,%d0 \| - b0)
		100	asr.l %d1, %d0 \| >> 12
		101	move.w %d0, (7*16,%a4)
		102
		103	move.l %acc1, %d0 \| block[6] = (a1
		104	sub.l %d5,%d0 \| - b1)
		105	asr.l %d1, %d0 \| >> 12
		106	move.w %d0, (6*16,%a4)
		107
		108	move.l %acc2, %d0 \| block[5] = (a2
		109	sub.l %d6,%d0 \| - b2)
		110	asr.l %d1, %d0 \| >> 12
		111	move.w %d0, (5*16,%a4)
		112
		113	move.l %acc3, %d0 \| block[4] = (a3
		114	sub.l %d7,%d0 \| - b3)
		115	asr.l %d1, %d0 \| >> 12
		116	move.w %d0, (4*16,%a4)
		117
		118	movclr.l %acc3, %d0 \| block[3] = (a3
		119	add.l %d7, %d0 \| + b3)
		120	asr.l %d1, %d0 \| >> 12
		121	move.w %d0, (3*16,%a4)
		122
		123	movclr.l %acc2, %d0 \| block[2] = (a2
		124	add.l %d6, %d0 \| + b2)
		125	asr.l %d1, %d0 \| >> 12
		126	move.w %d0, (2*16,%a4)
		127
		128	movclr.l %acc1, %d0 \| block[1] = (a1
		129	add.l %d5, %d0 \| + b1)
		130	asr.l %d1, %d0 \| >> 12
		131	move.w %d0, (1*16,%a4)
		132
		133	movclr.l %acc0, %d0 \| block[0] = (a0
		134	add.l %d4, %d0 \| + b0)
		135	asr.l %d1, %d0 \| >> 12
		136	move.w %d0, (%a4)+ \| advance to next temp column
		137
		138	subq.l #1, %d3 \| loop 8 times
		139	bne.w .row_loop
		140
		141	\| %a6 now points to the temp buffer, where we need it.
		142	lea.l (-16-128,%a4), %a4 \| point %a4 back to the input block
		143	moveq.l #8, %d3 \| loop counter
		144
		145	.col_loop:
		146	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
		147
		148	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
		149	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
		150	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
		151	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
		152
		153	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
		154	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
		155	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
		156	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
		157
		158	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
		159	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
		160	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
		161	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
		162
		163	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
		164	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
		165	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
		166	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
		167
		168	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
		169	add.l #(32<<16), %d0 \| DC offset: 0.5
		170
		171	movclr.l %acc0, %d4 \| b0
		172	movclr.l %acc1, %d5 \| b1
		173	movclr.l %acc2, %d6 \| b2
		174	movclr.l %acc3, %d7 \| b3
		175
		176	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
		177	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
		178	move.l %acc0, %acc3
		179	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
		180	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
		181
		182	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
		183	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
		184	move.l %acc1, %acc2
		185	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
		186	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
		187
		188	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
		189	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
		190	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
		191
		192	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
		193	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
		194	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
		195
		196	moveq.l #17, %d1 \| shift amount
		197
		198	move.l %acc0, %d0 \| block[7] = (a0
		199	sub.l %d4,%d0 \| - b0)
		200	asr.l %d1, %d0 \| >> 17
		201	move.w %d0, (7*16,%a4)
		202
		203	move.l %acc1, %d0 \| block[6] = (a1
		204	sub.l %d5,%d0 \| - b1)
		205	asr.l %d1, %d0 \| >> 17
		206	move.w %d0, (6*16,%a4)
		207
		208	move.l %acc2, %d0 \| block[5] = (a2
		209	sub.l %d6,%d0 \| - b2)
		210	asr.l %d1, %d0 \| >> 17
		211	move.w %d0, (5*16,%a4)
		212
		213	move.l %acc3, %d0 \| block[4] = (a3
		214	sub.l %d7,%d0 \| - b3)
		215	asr.l %d1, %d0 \| >> 17
		216	move.w %d0, (4*16,%a4)
		217
		218	movclr.l %acc3, %d0 \| block[3] = (a3
		219	add.l %d7, %d0 \| + b3)
		220	asr.l %d1, %d0 \| >> 17
		221	move.w %d0, (3*16,%a4)
		222
		223	movclr.l %acc2, %d0 \| block[2] = (a2
		224	add.l %d6, %d0 \| + b2)
		225	asr.l %d1, %d0 \| >> 17
		226	move.w %d0, (2*16,%a4)
		227
		228	movclr.l %acc1, %d0 \| block[1] = (a1
		229	add.l %d5, %d0 \| + b1)
		230	asr.l %d1, %d0 \| >> 17
		231	move.w %d0, (1*16,%a4)
		232
		233	movclr.l %acc0, %d0 \| block[0] = (a0
		234	add.l %d4, %d0 \| + b0)
		235	asr.l %d1, %d0 \| >> 17
		236	move.w %d0, (%a4)+ \| advance to next column
		237
		238	subq.l #1, %d3 \| loop 8 times
		239	bne.w .col_loop
		240
		241	movem.l (%sp), %d0-%d7/%a0-%a6 \| restore all registers
		242	lea.l (15*4,%sp), %sp
		243	rts
		244
		245	.align 2
		246
		247	mpeg2_idct_copy_coldfire:
		248	lea.l (-4*4,%sp), %sp
		249	movem.l %d2-%d4/%a2, (%sp) \| save some registers
		250	movem.l (4*4+4,%sp), %a0-%a2\| %a0 - block pointer
		251	\| %a1 - destination pointer
		252	\| %a2 - stride
		253
		254	bsr.w .idct \| apply idct to block
		255
		256	move.l #255, %d1 \| preload constant for clipping
		257	moveq.l #8, %d4 \| loop counter
		258
		259	.copy_clip_loop:
		260	move.w (%a0), %d0 \| load block[0]
		261	ext.l %d0 \| sign extend
		262	cmp.l %d1, %d0 \| overflow?
		263	bls.b 1f
		264	spl.b %d0 \| yes: set appropriate limit value in low byte
		265	1:
		266	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
		267	lsl.l #8, %d2
		268
		269	move.w (2,%a0), %d0 \| load block[1]
		270	ext.l %d0 \| sign extend
		271	cmp.l %d1, %d0 \| overflow?
		272	bls.b 1f
		273	spl.b %d0 \| yes: set appropriate limit value in low byte
		274	1:
		275	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
		276	lsl.l #8, %d2
		277	clr.l (%a0)+ \| clear block[0] and block[1],
		278	\| %a0 now pointing to block[2]
		279	move.w (%a0), %d0 \| do b2 and b3
		280	ext.l %d0
		281	cmp.l %d1, %d0
		282	bls.b 1f
		283	spl.b %d0
		284	1:
		285	move.b %d0, %d2
		286	lsl.l #8, %d2
		287
		288	move.w (2,%a0), %d0
		289	ext.l %d0
		290	cmp.l %d1, %d0
		291	bls.b 1f
		292	spl.b %d0
		293	1:
		294	move.b %d0, %d2
		295	clr.l (%a0)+
		296
		297	move.w (%a0), %d0 \| do b4 and b5
		298	ext.l %d0
		299	cmp.l %d1, %d0
		300	bls.b 1f
		301	spl.b %d0
		302	1:
		303	move.b %d0, %d3
		304	lsl.l #8, %d3
		305
		306	move.w (2,%a0), %d0
		307	ext.l %d0
		308	cmp.l %d1, %d0
		309	bls.b 1f
		310	spl.b %d0
		311	1:
		312	move.b %d0, %d3
		313	lsl.l #8, %d3
		314	clr.l (%a0)+
		315
		316	move.w (%a0), %d0 \| do b6 and b7
		317	ext.l %d0
		318	cmp.l %d1, %d0
		319	bls.b 1f
		320	spl.b %d0
		321	1:
		322	move.b %d0, %d3
		323	lsl.l #8, %d3
		324
		325	move.w (2,%a0), %d0
		326	ext.l %d0
		327	cmp.l %d1, %d0
		328	bls.b 1f
		329	spl.b %d0
		330	1:
		331	move.b %d0, %d3
		332	clr.l (%a0)+
		333
		334	movem.l %d2-%d3, (%a1) \| write all 8 output bytes at once
		335	lea.l (%a2,%a1), %a1 \| advance output pointer
		336	subq.l #1, %d4 \| loop 8 times
		337	bne.w .copy_clip_loop
		338
		339	movem.l (%sp), %d2-%d4/%a2 \| restore registers
		340	lea.l (4*4,%sp), %sp
		341	rts
		342
		343	.align 2
		344
		345	mpeg2_idct_add_coldfire:
		346	lea.l (-7*4,%sp), %sp
		347	movem.l %d2-%d7/%a2, (%sp) \| save some registers
		348	movem.l (7*4+4,%sp), %d0/%a0-%a2\| %d0 - last value
		349	\| %a0 - block pointer
		350	\| %a1 - destination pointer
		351	\| %a2 - stride
		352	cmp.l #129, %d0 \| last == 129 ?
		353	bne.b .idct_add \| no: perform idct + addition
		354	move.w (%a0), %d0
		355	ext.l %d0 \| ((block[0]
		356	asr.l #4, %d0 \| >> 4)
		357	and.l #7, %d0 \| & 7)
		358	subq.l #4, %d0 \| - 4 == 0 ?
		359	bne.w .dc_add \| no: just perform addition
		360
		361	.idct_add:
		362	bsr.w .idct \| apply idct
		363
		364	move.l #255, %d2 \| preload constant for clipping
		365	clr.l %d3 \| used for splitting input words into bytes
		366	moveq.l #8, %d4 \| loop counter
		367
		368	.add_clip_loop:
		369	movem.l (%a1), %d6-%d7 \| fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
		370	swap %d6 \| (b2 b3 b0 b1)
		371	swap %d7 \| (b6 b7 b4 b5)
		372
		373	move.w (2,%a0), %d0 \| load block[1]
		374	ext.l %d0 \| sign extend
		375	move.b %d6, %d3 \| copy b1
		376	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		377	add.l %d3, %d0 \| add b1
		378	cmp.l %d2, %d0 \| overflow ?
		379	bls.b 1f
		380	spl.b %d0 \| yes: set appropriate limit value in low byte
		381	1:
		382	move.w (%a0), %d1 \| load block[0]
		383	ext.l %d1 \| sign extend
		384	move.b %d6, %d3 \| copy b0
		385	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		386	add.l %d3, %d1 \| add b0
		387	cmp.l %d2, %d1 \| overflow ?
		388	bls.b 1f
		389	spl.b %d1 \| yes: set appropriate limit value in low byte
		390	1:
		391	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
		392	lsl.l #8, %d5
		393	move.b %d0, %d5
		394	lsl.l #8, %d5
		395	clr.l (%a0)+ \| clear block[0] and block[1]
		396	\| %a0 now pointing to block[2]
		397	move.w (2,%a0), %d0 \| do b3 and b2
		398	ext.l %d0
		399	move.b %d6, %d3
		400	lsr.l #8, %d6
		401	add.l %d3, %d0
		402	cmp.l %d2, %d0
		403	bls.b 1f
		404	spl.b %d0
		405	1:
		406	move.w (%a0), %d1
		407	ext.l %d1
		408	add.l %d6, %d1
		409	cmp.l %d2, %d1
		410	bls.b 1f
		411	spl.b %d1
		412	1:
		413	move.b %d1, %d5
		414	lsl.l #8, %d5
		415	move.b %d0, %d5
		416	clr.l (%a0)+
		417
		418	move.w (2,%a0), %d0 \| do b5 and b4
		419	ext.l %d0
		420	move.b %d7, %d3
		421	lsr.l #8, %d7
		422	add.l %d3, %d0
		423	cmp.l %d2, %d0
		424	bls.b 1f
		425	spl.b %d0
		426	1:
		427	move.w (%a0), %d1
		428	ext.l %d1
		429	move.b %d7, %d3
		430	lsr.l #8, %d7
		431	add.l %d3, %d1
		432	cmp.l %d2, %d1
		433	bls.b 1f
		434	spl.b %d1
		435	1:
		436	move.b %d1, %d6
		437	lsl.l #8, %d6
		438	move.b %d0, %d6
		439	lsl.l #8, %d6
		440	clr.l (%a0)+
		441
		442	move.w (2,%a0), %d0 \| do b7 and b6
		443	ext.l %d0
		444	move.b %d7, %d3
		445	lsr.l #8, %d7
		446	add.l %d3, %d0
		447	cmp.l %d2, %d0
		448	bls.b 1f
		449	spl.b %d0
		450	1:
		451	move.w (%a0), %d1
		452	ext.l %d1
		453	add.l %d7, %d1
		454	cmp.l %d2, %d1
		455	bls.b 1f
		456	spl.b %d1
		457	1:
		458	move.b %d1, %d6
		459	lsl.l #8, %d6
		460	move.b %d0, %d6
		461	clr.l (%a0)+
		462
		463	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
		464	lea.l (%a2,%a1), %a1 \| advance output pointer
		465	subq.l #1, %d4 \| loop 8 times
		466	bne.w .add_clip_loop
		467
		468	bra.w .idct_add_end
		469
		470	.dc_add:
		471	move.w (%a0), %d0
		472	ext.l %d0 \| %d0 = (block[0]
		473	add.l #64, %d0 \| + 64)
		474	asr.l #7, %d0 \| >> 7
		475	clr.w (%a0) \| clear block[0]
		476	clr.w (63*2,%a0) \| and block[63]
		477	move.l %d0, %a0 \| DC value in %a0
		478
		479	move.l #255, %d2 \| preload constant for clipping
		480	clr.l %d3 \| for splitting input words into bytes
		481	moveq.l #8, %d4 \| loop counter
		482
		483	.dc_clip_loop:
		484	movem.l (%a1), %d6-%d7 \| (b0 b1 b2 b3) (b4 b5 b6 b7)
		485	swap %d6 \| (b2 b3 b0 b1)
		486	swap %d7 \| (b6 b7 b4 b5)
		487
		488	move.l %a0, %d0 \| copy DC
		489	move.b %d6, %d3 \| copy b1
		490	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		491	add.l %d3, %d0 \| add b1
		492	cmp.l %d2, %d0 \| overflow ?
		493	bls.b 1f
		494	spl.b %d0 \| yes: set appropriate limit value in low byte
		495	1:
		496	move.l %a0, %d1 \| copy DC
		497	move.b %d6, %d3 \| copy b0
		498	lsr.l #8, %d6 \| prepare 1st buffer for next byte
		499	add.l %d3, %d1 \| add b0
		500	cmp.l %d2, %d1 \| overflow ?
		501	bls.b 1f
		502	spl.b %d1 \| yes: set appropriate limit value in low byte
		503	1:
		504	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
		505	lsl.l #8, %d5
		506	move.b %d0, %d5
		507	lsl.l #8, %d5
		508
		509	move.l %a0, %d0 \| do b3 and b2
		510	move.b %d6, %d3
		511	lsr.l #8, %d6
		512	add.l %d3, %d0
		513	cmp.l %d2, %d0
		514	bls.b 1f
		515	spl.b %d0
		516	1:
		517	move.l %a0, %d1
		518	add.l %d6, %d1
		519	cmp.l %d2, %d1
		520	bls.b 1f
		521	spl.b %d1
		522	1:
		523	move.b %d1, %d5
		524	lsl.l #8, %d5
		525	move.b %d0, %d5
		526
		527	move.l %a0, %d0 \| do b5 and b4
		528	move.b %d7, %d3
		529	lsr.l #8, %d7
		530	add.l %d3, %d0
		531	cmp.l %d2, %d0
		532	bls.b 1f
		533	spl.b %d0
		534	1:
		535	move.l %a0, %d1
		536	move.b %d7, %d3
		537	lsr.l #8, %d7
		538	add.l %d3, %d1
		539	cmp.l %d2, %d1
		540	bls.b 1f
		541	spl.b %d1
		542	1:
		543	move.b %d1, %d6 \| collect output bytes 4..7 in %d6
		544	lsl.l #8, %d6
		545	move.b %d0, %d6
		546	lsl.l #8, %d6
		547
		548	move.l %a0, %d0 \| do b7 and b6
		549	move.b %d7, %d3
		550	lsr.l #8, %d7
		551	add.l %d3, %d0
		552	cmp.l %d2, %d0
		553	bls.b 1f
		554	spl.b %d0
		555	1:
		556	move.l %a0, %d1
		557	add.l %d7, %d1
		558	cmp.l %d2, %d1
		559	bls.b 1f
		560	spl.b %d1
		561	1:
		562	move.b %d1, %d6
		563	lsl.l #8, %d6
		564	move.b %d0, %d6
		565
		566	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
		567	lea.l (%a2,%a1), %a1 \| advance output pointer
		568	subq.l #1, %d4 \| loop 8 times
		569	bne.w .dc_clip_loop
		570
		571	.idct_add_end:
		572	movem.l (%sp), %d2-%d7/%a2 \| restore registers
		573	lea.l (7*4,%sp), %sp
		574	rts


diff --git a/apps/plugins/mpegplayer/mpeg2_internal.h b/apps/plugins/mpegplayer/mpeg2_internal.h index 0c552b766f..1ec85c60f1 100644 --- a/apps/plugins/mpegplayer/mpeg2_internal.h +++ b/apps/plugins/mpegplayer/mpeg2_internal.h
@@ -20,6 +20,8 @@
20	* along with this program; if not, write to the Free Software	20	* along with this program; if not, write to the Free Software
21	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA	21	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22	*/	22	*/
		23
		24	#include "config.h" /* for Rockbox CPU_ #defines */
23		25
24	/* macroblock modes */	26	/* macroblock modes */
25	#define MACROBLOCK_INTRA 1	27	#define MACROBLOCK_INTRA 1
@@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
92	int16_t dc_dct_pred[3];	94	int16_t dc_dct_pred[3];
93		95
94	/* DCT coefficients */	96	/* DCT coefficients */
		97	#ifdef CPU_COLDFIRE
		98	int16_t *DCTblock; /* put buffer separately to have it in IRAM */
		99	#else
95	int16_t DCTblock[64] ATTR_ALIGN(64);	100	int16_t DCTblock[64] ATTR_ALIGN(64);
		101	#endif
96		102
97	uint8_t * picture_dest[3];	103	uint8_t * picture_dest[3];
98	void (* convert) (void * convert_id, uint8_t * const * src,	104	void (* convert) (void * convert_id, uint8_t * const * src,