1 files changed, 575 insertions, 0 deletions
diff --git a/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S
new file mode 100644
index 0000000000..abc54b16cb
--- /dev/null
+++ b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S
@@ -0,0 +1,575 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2007 Jens Arnold
+ * Based on the work of Karim Boucher and Rani Hod
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ 
+    .global     mpeg2_idct_copy
+    .type       mpeg2_idct_copy, @function
+    .global     mpeg2_idct_add
+    .type       mpeg2_idct_add, @function
+    /* The IDCT itself: a two-pass 8x8 transform using the EMAC unit.
+     *
+     * Pass 1 (.row_loop) transforms the eight 16-byte rows of the block and
+     * stores the results transposed into a temp buffer. Pass 2 (.col_loop)
+     * applies the same transform to the eight rows of the temp buffer and
+     * stores the results, transposed again, back into the block.
+     *
+     * Input:    %a0: block pointer. Each 16-byte row is read as the words
+     *           (f0, f2, f4, f6, f1, f3, f5, f7). The 128 bytes directly
+     *           following the block are used as the temp buffer.
+     *           NOTE(review): pass 2 reads the transposed data with the same
+     *           even/odd split, so the rows of the input block are presumably
+     *           stored in the order 0, 2, 4, 6, 1, 3, 5, 7 as well - confirm
+     *           against the scan tables.
+     * Output:   the 64 words at the block pointer hold the result.
+     * Clobbers: %d0-%d7, %a0-%a6, %acc0-%acc3, %macsr.
+     * Caller must save all registers. */
+    .align  2
+.idct:
+    move.l  %a0, %a6
+    move.l  #0, %macsr              | signed integer mode
+    movclr.l %acc0, %d0             | Start from clean accumulators: the first
+    movclr.l %acc1, %d0             |   mac.w below adds onto whatever they
+    movclr.l %acc2, %d0             |   hold, so values left behind by earlier
+    movclr.l %acc3, %d0             |   EMAC users are discarded (%d0 is scratch)
+    move.l  #((2048<<16)+2841), %a0 | W0,  W1
+    move.l  #((2676<<16)+2408), %a1 | W2,  W3
+    move.l  #((2048<<16)+1609), %a2 | W4,  W5
+    move.l  #((1108<<16)+ 565), %a3 | W6,  W7
+
+    lea.l   (128,%a6), %a4      | secondary, transposed temp buffer
+    moveq.l #8, %d3             | loop counter
+
+.row_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+
+    | Odd part: b0..b3 are built in %acc0..%acc3 (all four are zero here).
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(1<<16), %d0       | f0 += 1: adds W0 = 2048 to every a-term,
+                                |   i.e. 0.5 in units of the >> 12 below
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+
+    | Even part: a0..a3 are built in %acc0..%acc3 (cleared by movclr above).
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #12, %d1            | shift amount
+
+    | Output k of this row is stored at (k*16,%a4), i.e. into temp row k at
+    | the position of the current input row: this transposes the data.
+    | The a-b results only read the accumulators; the a+b results fetch them
+    | with movclr, leaving all four cleared for the next iteration.
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (7*16,%a4)
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (6*16,%a4)
+
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (5*16,%a4)
+
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (4*16,%a4)
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (3*16,%a4)
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (2*16,%a4)
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (1*16,%a4)
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 12
+    move.w  %d0, (%a4)+         | advance to next temp column
+
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .row_loop
+
+    | %a6 now points to the temp buffer, where we need it.
+    lea.l   (-16-128,%a4), %a4  | point %a4 back to the input block
+    moveq.l #8, %d3             | loop counter
+
+.col_loop:
+    movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
+
+    | Odd part: b0..b3, same scheme as in .row_loop.
+    mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
+    mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
+    mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
+    mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
+    mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
+    msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
+    msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
+    msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
+
+    mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
+    msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
+    mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
+    mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
+    mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
+    msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
+    mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
+    msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
+
+    lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
+    add.l   #(32<<16), %d0      | DC offset: 0.5 (f0 += 32 adds 32 * W0 = 1 << 16,
+                                |   half of the >> 17 below)
+    movclr.l %acc0, %d4         | b0
+    movclr.l %acc1, %d5         | b1
+    movclr.l %acc2, %d6         | b2
+    movclr.l %acc3, %d7         | b3
+
+    | Even part: a0..a3, same scheme as in .row_loop.
+    mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
+    mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
+    move.l  %acc0, %acc3
+    mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
+    mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
+    mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
+    msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
+    move.l  %acc1, %acc2
+    mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
+    msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
+    | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
+    msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
+    mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
+    | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
+    msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
+    msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
+    moveq.l #17, %d1            | shift amount
+
+    | Output k goes to (k*16,%a4), i.e. row k of the block, at the column
+    | selected by %a4; this undoes the transposition of the first pass.
+    move.l  %acc0, %d0          | block[7] = (a0
+    sub.l   %d4,%d0             |     - b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (7*16,%a4)
+
+    move.l  %acc1, %d0          | block[6] = (a1
+    sub.l   %d5,%d0             |     - b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (6*16,%a4)
+
+    move.l  %acc2, %d0          | block[5] = (a2
+    sub.l   %d6,%d0             |     - b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (5*16,%a4)
+
+    move.l  %acc3, %d0          | block[4] = (a3
+    sub.l   %d7,%d0             |     - b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (4*16,%a4)
+
+    movclr.l %acc3, %d0         | block[3] = (a3
+    add.l   %d7, %d0            |     + b3)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (3*16,%a4)
+
+    movclr.l %acc2, %d0         | block[2] = (a2
+    add.l   %d6, %d0            |     + b2)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (2*16,%a4)
+
+    movclr.l %acc1, %d0         | block[1] = (a1
+    add.l   %d5, %d0            |     + b1)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (1*16,%a4)
+
+    movclr.l %acc0, %d0         | block[0] = (a0
+    add.l   %d4, %d0            |     + b0)
+    asr.l   %d1, %d0            |     >> 17
+    move.w  %d0, (%a4)+         | advance to next column
+    subq.l  #1, %d3             | loop 8 times
+    bne.w   .col_loop
+
+    rts
+    
+    /* mpeg2_idct_copy
+     *
+     * Runs .idct on the block, then writes the 8x8 result to the destination
+     * as bytes clamped to 0..255, clearing the block words as they are read.
+     *
+     * Stack arguments (after the return address):
+     *   (4,%sp)  block pointer
+     *   (8,%sp)  destination pointer
+     *   (12,%sp) stride, added to the destination pointer after each row
+     * NOTE(review): presumably called from C as
+     *   mpeg2_idct_copy(int16_t *block, uint8_t *dest, int stride)
+     * - confirm against the header that declares it.
+     *
+     * %d2-%d7/%a2-%a6 are saved and restored; %d0, %d1 and %a0, %a1 are
+     * clobbered, as are %macsr and the accumulators used by .idct. */
+    .align  2
+mpeg2_idct_copy:
+    lea.l   (-11*4,%sp), %sp
+    movem.l %d2-%d7/%a2-%a6, (%sp)  | save some registers
+    move.l  (11*4+4,%sp), %a0       | %a0 - block pointer for idct
+    bsr.w   .idct                   | apply idct to block
+    movem.l (11*4+4,%sp), %a0-%a2   | %a0 - block pointer
+                                    | %a1 - destination pointer
+                                    | %a2 - stride
+    move.l  #255, %d1           | preload constant for clipping
+    moveq.l #8, %d4             | loop counter
+    
+    /* Each iteration handles one row of 8 words. Clamp idiom used for every
+     * value: cmp.l sets the flags from (value - 255); bls (unsigned <=) is
+     * taken when the value is already within 0..255. Otherwise spl writes
+     * 0xff into the low byte if the difference is positive (value > 255)
+     * and 0x00 if it is negative (value < 0). */
+.copy_clip_loop:
+    move.w  (%a0), %d0          | load block[0]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    cmp.l   %d1, %d0            | overflow?
+    bls.b   1f
+    spl.b   %d0                 | yes: set appropriate limit value in low byte
+1:
+    move.b  %d0, %d2            | collect output bytes 0..3 in %d2
+    lsl.l   #8, %d2
+    clr.l   (%a0)+              | clear block[0] and block[1],
+                                | %a0 now pointing to block[2]
+    move.w  (%a0), %d0          | do b2 and b3
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2
+    lsl.l   #8, %d2
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d2            | %d2 now holds output bytes (b0 b1 b2 b3)
+    clr.l   (%a0)+              | clear block[2] and block[3]
+    move.w  (%a0), %d0          | do b4 and b5
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3            | collect output bytes 4..7 in %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    clr.l   (%a0)+              | clear block[4] and block[5]
+    move.w  (%a0), %d0          | do b6 and b7
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3
+    lsl.l   #8, %d3
+    move.w  (2,%a0), %d0
+    ext.l   %d0
+    cmp.l   %d1, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.b  %d0, %d3            | %d3 now holds output bytes (b4 b5 b6 b7)
+    clr.l   (%a0)+              | clear block[6] and block[7]; %a0 at next row
+    
+    movem.l %d2-%d3, (%a1)      | write all 8 output bytes at once
+    add.l   %a2, %a1            | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .copy_clip_loop
+    movem.l (%sp), %d2-%d7/%a2-%a6  | restore the saved registers
+    lea.l   (11*4,%sp), %sp
+    rts
+    /* mpeg2_idct_add
+     *
+     * Adds the block to the 8x8 destination bytes, clamping each sum to
+     * 0..255. Two paths:
+     *   - last != 129, or ((block[0] >> 4) & 7) == 4: run .idct on the block,
+     *     add every result word to its destination byte and clear the block
+     *     words as they are read.
+     *   - otherwise: add the single value (block[0] + 64) >> 7 to all 64
+     *     destination bytes and clear only block[0] and block[63].
+     *
+     * Stack arguments (after the return address):
+     *   (4,%sp)  last value
+     *   (8,%sp)  block pointer
+     *   (12,%sp) destination pointer
+     *   (16,%sp) stride, added to the destination pointer after each row
+     * NOTE(review): presumably called from C as
+     *   mpeg2_idct_add(int last, int16_t *block, uint8_t *dest, int stride)
+     * - confirm against the header that declares it.
+     *
+     * %d2-%d7/%a2-%a6 are saved and restored; %d0, %d1 and %a0, %a1 are
+     * clobbered, as are %macsr and the accumulators if .idct is run. */
+    .align  2
+mpeg2_idct_add:
+    lea.l   (-11*4,%sp), %sp
+    movem.l %d2-%d7/%a2-%a6, (%sp)      | save the registers restored at exit
+    movem.l (11*4+4,%sp), %d0/%a0-%a2   | %d0 - last value
+                                        | %a0 - block pointer
+                                        | %a1 - destination pointer
+                                        | %a2 - stride
+    cmp.l   #129, %d0           | last == 129 ?
+    bne.b   .idct_add           |   no: perform idct + addition
+    move.w  (%a0), %d0
+    ext.l   %d0                 | ((block[0]
+    asr.l   #4, %d0             |      >> 4)
+    and.l   #7, %d0             |      & 7)
+    subq.l  #4, %d0             |      - 4 == 0 ?
+    bne.w   .dc_add             |   no: just perform addition
+.idct_add:
+    bsr.w   .idct                   | apply idct
+    movem.l (11*4+8,%sp), %a0-%a2   | reload arguments %a0..%a2
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | used for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+    /* Each iteration handles one row: 8 block words plus 8 destination bytes.
+     * The destination bytes are peeled off the low end of %d6/%d7 into %d3,
+     * whose upper 24 bits stay zero, so every pair is processed odd index
+     * first. Clamp idiom: cmp.l sets the flags from (sum - 255); bls is taken
+     * when the sum is within 0..255, otherwise spl writes 0xff (sum > 255)
+     * or 0x00 (sum < 0) into the low byte. */
+.add_clip_loop:
+    movem.l (%a1), %d6-%d7      | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1) 
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.w  (2,%a0), %d0        | load block[1]
+    ext.l   %d0                 | sign extend
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.w  (%a0), %d1          | load block[0]
+    ext.l   %d1                 | sign extend
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    clr.l   (%a0)+              | clear block[0] and block[1]
+                                |   %a0 now pointing to block[2]
+    move.w  (2,%a0), %d0        | do b3 and b2
+    ext.l   %d0
+    move.b  %d6, %d3
+    lsr.l   #8, %d6             | %d6 is now just b2
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d6, %d1            | only b2 is left in %d6, add it directly
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5            | %d5 now holds output bytes 0..3
+    clr.l   (%a0)+              | clear block[2] and block[3]
+    move.w  (2,%a0), %d0        | do b5 and b4
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6            | collect output bytes 4..7 in %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    clr.l   (%a0)+              | clear block[4] and block[5]
+    move.w  (2,%a0), %d0        | do b7 and b6
+    ext.l   %d0
+    move.b  %d7, %d3
+    lsr.l   #8, %d7             | %d7 is now just b6
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.w  (%a0), %d1
+    ext.l   %d1
+    add.l   %d7, %d1            | only b6 is left in %d7, add it directly
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6            | %d6 now holds output bytes 4..7
+    clr.l   (%a0)+              | clear block[6] and block[7]; %a0 at next row
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    add.l   %a2, %a1            | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .add_clip_loop
+    bra.w   .idct_add_end
+    
+    /* DC-only path: one value is added to every destination byte. */
+.dc_add:
+    move.w  (%a0), %d0
+    ext.l   %d0                 | %d0 = (block[0]
+    add.l   #64, %d0            |       + 64)
+    asr.l   #7, %d0             |       >> 7
+    clr.w   (%a0)               | clear block[0]
+    clr.w   (63*2,%a0)          |   and block[63]
+    move.l  %d0, %a0            | DC value in %a0
+    
+    move.l  #255, %d2           | preload constant for clipping
+    clr.l   %d3                 | for splitting input words into bytes
+    moveq.l #8, %d4             | loop counter
+    
+    /* Same byte splitting, clamping and packing as in .add_clip_loop, with
+     * the DC value from %a0 taking the place of the block words. */
+.dc_clip_loop:
+    movem.l (%a1), %d6-%d7      | (b0 b1 b2 b3) (b4 b5 b6 b7)
+    swap    %d6                 | (b2 b3 b0 b1)
+    swap    %d7                 | (b6 b7 b4 b5)
+    
+    move.l  %a0, %d0            | copy DC
+    move.b  %d6, %d3            | copy b1
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d0            | add b1
+    cmp.l   %d2, %d0            | overflow ?
+    bls.b   1f
+    spl.b   %d0                 |   yes: set appropriate limit value in low byte
+1:
+    move.l  %a0, %d1            | copy DC
+    move.b  %d6, %d3            | copy b0
+    lsr.l   #8, %d6             | prepare 1st buffer for next byte
+    add.l   %d3, %d1            | add b0
+    cmp.l   %d2, %d1            | overflow ?
+    bls.b   1f
+    spl.b   %d1                 |   yes: set appropriate limit value in low byte
+1:
+    move.b  %d1, %d5            | collect output bytes 0..3 in %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5
+    lsl.l   #8, %d5
+    move.l  %a0, %d0            | do b3 and b2
+    move.b  %d6, %d3
+    lsr.l   #8, %d6             | %d6 is now just b2
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d6, %d1            | only b2 is left in %d6, add it directly
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d5
+    lsl.l   #8, %d5
+    move.b  %d0, %d5            | %d5 now holds output bytes 0..3
+    move.l  %a0, %d0            | do b5 and b4
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    move.b  %d7, %d3
+    lsr.l   #8, %d7
+    add.l   %d3, %d1
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6            | collect output bytes 4..7 in %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6
+    lsl.l   #8, %d6
+    move.l  %a0, %d0            | do b7 and b6
+    move.b  %d7, %d3
+    lsr.l   #8, %d7             | %d7 is now just b6
+    add.l   %d3, %d0
+    cmp.l   %d2, %d0
+    bls.b   1f
+    spl.b   %d0
+1:
+    move.l  %a0, %d1
+    add.l   %d7, %d1            | only b6 is left in %d7, add it directly
+    cmp.l   %d2, %d1
+    bls.b   1f
+    spl.b   %d1
+1:
+    move.b  %d1, %d6
+    lsl.l   #8, %d6
+    move.b  %d0, %d6            | %d6 now holds output bytes 4..7
+    movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
+    add.l   %a2, %a1            | advance output pointer
+    subq.l  #1, %d4             | loop 8 times
+    bne.w   .dc_clip_loop
+.idct_add_end:
+    movem.l (%sp), %d2-%d7/%a2-%a6  | common exit: restore the saved registers
+    lea.l   (11*4,%sp), %sp
+    rts

diff --git a/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S new file mode 100644 index 0000000000..abc54b16cb --- /dev/null +++ b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S
@@ -0,0 +1,575 @@
	1	/***************************************************************************
	2	* __________ __ ___.
	3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
	4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
	5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
	6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
	7	* \/ \/ \/ \/ \/
	8	* $Id$
	9	*
	10	* Copyright (C) 2007 Jens Arnold
	11	* Based on the work of Karim Boucher and Rani Hod
	12	*
	13	* This program is free software; you can redistribute it and/or
	14	* modify it under the terms of the GNU General Public License
	15	* as published by the Free Software Foundation; either version 2
	16	* of the License, or (at your option) any later version.
	17	*
	18	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	19	* KIND, either express or implied.
	20	*
	21	****************************************************************************/
	22
	23	.global mpeg2_idct_copy
	24	.type mpeg2_idct_copy, @function
	25	.global mpeg2_idct_add
	26	.type mpeg2_idct_add, @function
	27
	28	/* The IDCT itself.
	29	* Input: %a0: block pointer
	30	* Caller must save all registers. */
	31	.align 2
	32	.idct:
	33	move.l %a0, %a6
	34
	35	move.l #0, %macsr \| signed integer mode
	36
	37	move.l #((2048<<16)+2841), %a0 \| W0, W1
	38	move.l #((2676<<16)+2408), %a1 \| W2, W3
	39	move.l #((2048<<16)+1609), %a2 \| W4, W5
	40	move.l #((1108<<16)+ 565), %a3 \| W6, W7
	41
	42	lea.l (128,%a6), %a4 \| secondary, transposed temp buffer
	43	moveq.l #8, %d3 \| loop counter
	44
	45	.row_loop:
	46	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
	47
	48	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
	49	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
	50	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
	51	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
	52
	53	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
	54	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
	55	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
	56	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
	57
	58	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
	59	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
	60	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
	61	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
	62
	63	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
	64	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
	65	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
	66	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
	67
	68	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
	69	add.l #(1<<16), %d0 \| f0 += 1;
	70
	71	movclr.l %acc0, %d4 \| b0
	72	movclr.l %acc1, %d5 \| b1
	73	movclr.l %acc2, %d6 \| b2
	74	movclr.l %acc3, %d7 \| b3
	75
	76	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
	77	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
	78	move.l %acc0, %acc3
	79	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
	80	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
	81
	82	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
	83	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
	84	move.l %acc1, %acc2
	85	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
	86	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
	87
	88	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
	89	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
	90	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
	91
	92	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
	93	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
	94	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
	95
	96	moveq.l #12, %d1 \| shift amount
	97
	98	move.l %acc0, %d0 \| block[7] = (a0
	99	sub.l %d4,%d0 \| - b0)
	100	asr.l %d1, %d0 \| >> 12
	101	move.w %d0, (7*16,%a4)
	102
	103	move.l %acc1, %d0 \| block[6] = (a1
	104	sub.l %d5,%d0 \| - b1)
	105	asr.l %d1, %d0 \| >> 12
	106	move.w %d0, (6*16,%a4)
	107
	108	move.l %acc2, %d0 \| block[5] = (a2
	109	sub.l %d6,%d0 \| - b2)
	110	asr.l %d1, %d0 \| >> 12
	111	move.w %d0, (5*16,%a4)
	112
	113	move.l %acc3, %d0 \| block[4] = (a3
	114	sub.l %d7,%d0 \| - b3)
	115	asr.l %d1, %d0 \| >> 12
	116	move.w %d0, (4*16,%a4)
	117
	118	movclr.l %acc3, %d0 \| block[3] = (a3
	119	add.l %d7, %d0 \| + b3)
	120	asr.l %d1, %d0 \| >> 12
	121	move.w %d0, (3*16,%a4)
	122
	123	movclr.l %acc2, %d0 \| block[2] = (a2
	124	add.l %d6, %d0 \| + b2)
	125	asr.l %d1, %d0 \| >> 12
	126	move.w %d0, (2*16,%a4)
	127
	128	movclr.l %acc1, %d0 \| block[1] = (a1
	129	add.l %d5, %d0 \| + b1)
	130	asr.l %d1, %d0 \| >> 12
	131	move.w %d0, (1*16,%a4)
	132
	133	movclr.l %acc0, %d0 \| block[0] = (a0
	134	add.l %d4, %d0 \| + b0)
	135	asr.l %d1, %d0 \| >> 12
	136	move.w %d0, (%a4)+ \| advance to next temp column
	137
	138	subq.l #1, %d3 \| loop 8 times
	139	bne.w .row_loop
	140
	141	\| %a6 now points to the temp buffer, where we need it.
	142	lea.l (-16-128,%a4), %a4 \| point %a4 back to the input block
	143	moveq.l #8, %d3 \| loop counter
	144
	145	.col_loop:
	146	movem.l (%a6), %d0-%d2/%a5 \| fetch (f0, f2, f4, f6, f1, f3, f5, f7)
	147
	148	mac.w %a0l, %d2u, %acc0 \| %acc0 = W1 * f1
	149	mac.w %a1l, %d2l, %acc0 \| + W3 * f3
	150	mac.w %a2l, %a5u, %acc0 \| + W5 * f5
	151	mac.w %a3l, %a5l, %acc0 \| + W7 * f7
	152
	153	mac.w %a1l, %d2u, %acc1 \| %acc1 = W3 * f1
	154	msac.w %a3l, %d2l, %acc1 \| - W7 * f3
	155	msac.w %a0l, %a5u, %acc1 \| - W1 * f5
	156	msac.w %a2l, %a5l, %acc1 \| - W5 * f7
	157
	158	mac.w %a2l, %d2u, %acc2 \| %acc2 = W5 * f1
	159	msac.w %a0l, %d2l, %acc2 \| - W1 * f3
	160	mac.w %a3l, %a5u, %acc2 \| + W7 * f5
	161	mac.w %a1l, %a5l, %acc2 \| + W3 * f7
	162
	163	mac.w %a3l, %d2u, %acc3 \| %acc3 = W7 * f1
	164	msac.w %a2l, %d2l, %acc3 \| - W5 * f3
	165	mac.w %a1l, %a5u, %acc3 \| + W3 * f5
	166	msac.w %a0l, %a5l, %acc3 \| - W1 * f7
	167
	168	lea.l (16,%a6), %a6 \| Advance to next row; put here to fill EMAC latency
	169	add.l #(32<<16), %d0 \| DC offset: 0.5
	170
	171	movclr.l %acc0, %d4 \| b0
	172	movclr.l %acc1, %d5 \| b1
	173	movclr.l %acc2, %d6 \| b2
	174	movclr.l %acc3, %d7 \| b3
	175
	176	mac.w %a0u, %d0u, %acc0 \| %acc0 = W0 * f0
	177	mac.w %a2u, %d1u, %acc0 \| + W4 * f4
	178	move.l %acc0, %acc3
	179	mac.w %a1u, %d0l, %acc0 \| + W2 * f2
	180	mac.w %a3u, %d1l, %acc0 \| + W6 * f6
	181
	182	mac.w %a0u, %d0u, %acc1 \| %acc1 = W0 * f0
	183	msac.w %a2u, %d1u, %acc1 \| - W4 * f4
	184	move.l %acc1, %acc2
	185	mac.w %a3u, %d0l, %acc1 \| + W6 * f2
	186	msac.w %a1u, %d1l, %acc1 \| - W2 * f6
	187
	188	\| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
	189	msac.w %a3u, %d0l, %acc2 \| - W6 * f2
	190	mac.w %a1u, %d1l, %acc2 \| + W2 * f6
	191
	192	\| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
	193	msac.w %a1u, %d0l, %acc3 \| - W2 * f2
	194	msac.w %a3u, %d1l, %acc3 \| - W6 * f6
	195
	196	moveq.l #17, %d1 \| shift amount
	197
	198	move.l %acc0, %d0 \| block[7] = (a0
	199	sub.l %d4,%d0 \| - b0)
	200	asr.l %d1, %d0 \| >> 17
	201	move.w %d0, (7*16,%a4)
	202
	203	move.l %acc1, %d0 \| block[6] = (a1
	204	sub.l %d5,%d0 \| - b1)
	205	asr.l %d1, %d0 \| >> 17
	206	move.w %d0, (6*16,%a4)
	207
	208	move.l %acc2, %d0 \| block[5] = (a2
	209	sub.l %d6,%d0 \| - b2)
	210	asr.l %d1, %d0 \| >> 17
	211	move.w %d0, (5*16,%a4)
	212
	213	move.l %acc3, %d0 \| block[4] = (a3
	214	sub.l %d7,%d0 \| - b3)
	215	asr.l %d1, %d0 \| >> 17
	216	move.w %d0, (4*16,%a4)
	217
	218	movclr.l %acc3, %d0 \| block[3] = (a3
	219	add.l %d7, %d0 \| + b3)
	220	asr.l %d1, %d0 \| >> 17
	221	move.w %d0, (3*16,%a4)
	222
	223	movclr.l %acc2, %d0 \| block[2] = (a2
	224	add.l %d6, %d0 \| + b2)
	225	asr.l %d1, %d0 \| >> 17
	226	move.w %d0, (2*16,%a4)
	227
	228	movclr.l %acc1, %d0 \| block[1] = (a1
	229	add.l %d5, %d0 \| + b1)
	230	asr.l %d1, %d0 \| >> 17
	231	move.w %d0, (1*16,%a4)
	232
	233	movclr.l %acc0, %d0 \| block[0] = (a0
	234	add.l %d4, %d0 \| + b0)
	235	asr.l %d1, %d0 \| >> 17
	236	move.w %d0, (%a4)+ \| advance to next column
	237
	238	subq.l #1, %d3 \| loop 8 times
	239	bne.w .col_loop
	240
	241	rts
	242
	243	.align 2
	244
	245	mpeg2_idct_copy:
	246	lea.l (-11*4,%sp), %sp
	247	movem.l %d2-%d7/%a2-%a6, (%sp) \| save some registers
	248	move.l (11*4+4,%sp), %a0 \| %a0 - block pointer for idct
	249
	250	bsr.w .idct \| apply idct to block
	251	movem.l (11*4+4,%sp), %a0-%a2 \| %a0 - block pointer
	252	\| %a1 - destination pointer
	253	\| %a2 - stride
	254
	255	move.l #255, %d1 \| preload constant for clipping
	256	moveq.l #8, %d4 \| loop counter
	257
	258	.copy_clip_loop:
	259	move.w (%a0), %d0 \| load block[0]
	260	ext.l %d0 \| sign extend
	261	cmp.l %d1, %d0 \| overflow?
	262	bls.b 1f
	263	spl.b %d0 \| yes: set appropriate limit value in low byte
	264	1:
	265	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
	266	lsl.l #8, %d2
	267
	268	move.w (2,%a0), %d0 \| load block[1]
	269	ext.l %d0 \| sign extend
	270	cmp.l %d1, %d0 \| overflow?
	271	bls.b 1f
	272	spl.b %d0 \| yes: set appropriate limit value in low byte
	273	1:
	274	move.b %d0, %d2 \| collect output bytes 0..3 in %d2
	275	lsl.l #8, %d2
	276	clr.l (%a0)+ \| clear block[0] and block[1],
	277	\| %a0 now pointing to block[2]
	278	move.w (%a0), %d0 \| do b2 and b3
	279	ext.l %d0
	280	cmp.l %d1, %d0
	281	bls.b 1f
	282	spl.b %d0
	283	1:
	284	move.b %d0, %d2
	285	lsl.l #8, %d2
	286
	287	move.w (2,%a0), %d0
	288	ext.l %d0
	289	cmp.l %d1, %d0
	290	bls.b 1f
	291	spl.b %d0
	292	1:
	293	move.b %d0, %d2
	294	clr.l (%a0)+
	295
	296	move.w (%a0), %d0 \| do b4 and b5
	297	ext.l %d0
	298	cmp.l %d1, %d0
	299	bls.b 1f
	300	spl.b %d0
	301	1:
	302	move.b %d0, %d3
	303	lsl.l #8, %d3
	304
	305	move.w (2,%a0), %d0
	306	ext.l %d0
	307	cmp.l %d1, %d0
	308	bls.b 1f
	309	spl.b %d0
	310	1:
	311	move.b %d0, %d3
	312	lsl.l #8, %d3
	313	clr.l (%a0)+
	314
	315	move.w (%a0), %d0 \| do b6 and b7
	316	ext.l %d0
	317	cmp.l %d1, %d0
	318	bls.b 1f
	319	spl.b %d0
	320	1:
	321	move.b %d0, %d3
	322	lsl.l #8, %d3
	323
	324	move.w (2,%a0), %d0
	325	ext.l %d0
	326	cmp.l %d1, %d0
	327	bls.b 1f
	328	spl.b %d0
	329	1:
	330	move.b %d0, %d3
	331	clr.l (%a0)+
	332
	333	movem.l %d2-%d3, (%a1) \| write all 8 output bytes at once
	334	add.l %a2, %a1 \| advance output pointer
	335	subq.l #1, %d4 \| loop 8 times
	336	bne.w .copy_clip_loop
	337
	338	movem.l (%sp), %d2-%d7/%a2-%a6
	339	lea.l (11*4,%sp), %sp
	340	rts
	341
	342	.align 2
	343
	344	mpeg2_idct_add:
	345	lea.l (-11*4,%sp), %sp
	346	movem.l %d2-%d7/%a2-%a6, (%sp)
	347	movem.l (11*4+4,%sp), %d0/%a0-%a2 \| %d0 - last value
	348	\| %a0 - block pointer
	349	\| %a1 - destination pointer
	350	\| %a2 - stride
	351
	352	cmp.l #129, %d0 \| last == 129 ?
	353	bne.b .idct_add \| no: perform idct + addition
	354	move.w (%a0), %d0
	355	ext.l %d0 \| ((block[0]
	356	asr.l #4, %d0 \| >> 4)
	357	and.l #7, %d0 \| & 7)
	358	subq.l #4, %d0 \| - 4 == 0 ?
	359	bne.w .dc_add \| no: just perform addition
	360
	361	.idct_add:
	362	bsr.w .idct \| apply idct
	363	movem.l (11*4+8,%sp), %a0-%a2 \| reload arguments %a0..%a2
	364
	365	move.l #255, %d2 \| preload constant for clipping
	366	clr.l %d3 \| used for splitting input words into bytes
	367	moveq.l #8, %d4 \| loop counter
	368
	369	.add_clip_loop:
	370	movem.l (%a1), %d6-%d7 \| fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
	371	swap %d6 \| (b2 b3 b0 b1)
	372	swap %d7 \| (b6 b7 b4 b5)
	373
	374	move.w (2,%a0), %d0 \| load block[1]
	375	ext.l %d0 \| sign extend
	376	move.b %d6, %d3 \| copy b1
	377	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	378	add.l %d3, %d0 \| add b1
	379	cmp.l %d2, %d0 \| overflow ?
	380	bls.b 1f
	381	spl.b %d0 \| yes: set appropriate limit value in low byte
	382	1:
	383	move.w (%a0), %d1 \| load block[0]
	384	ext.l %d1 \| sign extend
	385	move.b %d6, %d3 \| copy b0
	386	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	387	add.l %d3, %d1 \| add b0
	388	cmp.l %d2, %d1 \| overflow ?
	389	bls.b 1f
	390	spl.b %d1 \| yes: set appropriate limit value in low byte
	391	1:
	392	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
	393	lsl.l #8, %d5
	394	move.b %d0, %d5
	395	lsl.l #8, %d5
	396	clr.l (%a0)+ \| clear block[0] and block[1]
	397	\| %a0 now pointing to block[2]
	398	move.w (2,%a0), %d0 \| do b3 and b2
	399	ext.l %d0
	400	move.b %d6, %d3
	401	lsr.l #8, %d6
	402	add.l %d3, %d0
	403	cmp.l %d2, %d0
	404	bls.b 1f
	405	spl.b %d0
	406	1:
	407	move.w (%a0), %d1
	408	ext.l %d1
	409	add.l %d6, %d1
	410	cmp.l %d2, %d1
	411	bls.b 1f
	412	spl.b %d1
	413	1:
	414	move.b %d1, %d5
	415	lsl.l #8, %d5
	416	move.b %d0, %d5
	417	clr.l (%a0)+
	418
	419	move.w (2,%a0), %d0 \| do b5 and b4
	420	ext.l %d0
	421	move.b %d7, %d3
	422	lsr.l #8, %d7
	423	add.l %d3, %d0
	424	cmp.l %d2, %d0
	425	bls.b 1f
	426	spl.b %d0
	427	1:
	428	move.w (%a0), %d1
	429	ext.l %d1
	430	move.b %d7, %d3
	431	lsr.l #8, %d7
	432	add.l %d3, %d1
	433	cmp.l %d2, %d1
	434	bls.b 1f
	435	spl.b %d1
	436	1:
	437	move.b %d1, %d6
	438	lsl.l #8, %d6
	439	move.b %d0, %d6
	440	lsl.l #8, %d6
	441	clr.l (%a0)+
	442
	443	move.w (2,%a0), %d0 \| do b7 and b6
	444	ext.l %d0
	445	move.b %d7, %d3
	446	lsr.l #8, %d7
	447	add.l %d3, %d0
	448	cmp.l %d2, %d0
	449	bls.b 1f
	450	spl.b %d0
	451	1:
	452	move.w (%a0), %d1
	453	ext.l %d1
	454	add.l %d7, %d1
	455	cmp.l %d2, %d1
	456	bls.b 1f
	457	spl.b %d1
	458	1:
	459	move.b %d1, %d6
	460	lsl.l #8, %d6
	461	move.b %d0, %d6
	462	clr.l (%a0)+
	463
	464	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
	465	add.l %a2, %a1 \| advance output pointer
	466	subq.l #1, %d4 \| loop 8 times
	467	bne.w .add_clip_loop
	468
	469	bra.w .idct_add_end
	470
	471	.dc_add:
	472	move.w (%a0), %d0
	473	ext.l %d0 \| %d0 = (block[0]
	474	add.l #64, %d0 \| + 64)
	475	asr.l #7, %d0 \| >> 7
	476	clr.w (%a0) \| clear block[0]
	477	clr.w (63*2,%a0) \| and block[63]
	478	move.l %d0, %a0 \| DC value in %a0
	479
	480	move.l #255, %d2 \| preload constant for clipping
	481	clr.l %d3 \| for splitting input words into bytes
	482	moveq.l #8, %d4 \| loop counter
	483
	484	.dc_clip_loop:
	485	movem.l (%a1), %d6-%d7 \| (b0 b1 b2 b3) (b4 b5 b6 b7)
	486	swap %d6 \| (b2 b3 b0 b1)
	487	swap %d7 \| (b6 b7 b4 b5)
	488
	489	move.l %a0, %d0 \| copy DC
	490	move.b %d6, %d3 \| copy b1
	491	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	492	add.l %d3, %d0 \| add b1
	493	cmp.l %d2, %d0 \| overflow ?
	494	bls.b 1f
	495	spl.b %d0 \| yes: set appropriate limit value in low byte
	496	1:
	497	move.l %a0, %d1 \| copy DC
	498	move.b %d6, %d3 \| copy b0
	499	lsr.l #8, %d6 \| prepare 1st buffer for next byte
	500	add.l %d3, %d1 \| add b0
	501	cmp.l %d2, %d1 \| overflow ?
	502	bls.b 1f
	503	spl.b %d1 \| yes: set appropriate limit value in low byte
	504	1:
	505	move.b %d1, %d5 \| collect output bytes 0..3 in %d5
	506	lsl.l #8, %d5
	507	move.b %d0, %d5
	508	lsl.l #8, %d5
	509
	510	move.l %a0, %d0 \| do b3 and b2
	511	move.b %d6, %d3
	512	lsr.l #8, %d6
	513	add.l %d3, %d0
	514	cmp.l %d2, %d0
	515	bls.b 1f
	516	spl.b %d0
	517	1:
	518	move.l %a0, %d1
	519	add.l %d6, %d1
	520	cmp.l %d2, %d1
	521	bls.b 1f
	522	spl.b %d1
	523	1:
	524	move.b %d1, %d5
	525	lsl.l #8, %d5
	526	move.b %d0, %d5
	527
	528	move.l %a0, %d0 \| do b5 and b4
	529	move.b %d7, %d3
	530	lsr.l #8, %d7
	531	add.l %d3, %d0
	532	cmp.l %d2, %d0
	533	bls.b 1f
	534	spl.b %d0
	535	1:
	536	move.l %a0, %d1
	537	move.b %d7, %d3
	538	lsr.l #8, %d7
	539	add.l %d3, %d1
	540	cmp.l %d2, %d1
	541	bls.b 1f
	542	spl.b %d1
	543	1:
	544	move.b %d1, %d6 \| do b7 and b6
	545	lsl.l #8, %d6
	546	move.b %d0, %d6
	547	lsl.l #8, %d6
	548
	549	move.l %a0, %d0
	550	move.b %d7, %d3
	551	lsr.l #8, %d7
	552	add.l %d3, %d0
	553	cmp.l %d2, %d0
	554	bls.b 1f
	555	spl.b %d0
	556	1:
	557	move.l %a0, %d1
	558	add.l %d7, %d1
	559	cmp.l %d2, %d1
	560	bls.b 1f
	561	spl.b %d1
	562	1:
	563	move.b %d1, %d6
	564	lsl.l #8, %d6
	565	move.b %d0, %d6
	566
	567	movem.l %d5-%d6, (%a1) \| write all 8 output bytes at once
	568	add.l %a2, %a1 \| advance output pointer
	569	subq.l #1, %d4 \| loop 8 times
	570	bne.w .dc_clip_loop
	571
	572	.idct_add_end:
	573	movem.l (%sp), %d2-%d7/%a2-%a6
	574	lea.l (11*4,%sp), %sp
	575	rts