summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2007-10-16 22:55:40 +0000
committerJens Arnold <amiconn@rockbox.org>2007-10-16 22:55:40 +0000
commitfc43b9df823af80dd1c9cf7dc1b5de6703944043 (patch)
treebb17b985d00d13bc6fce61823acbe50ed1e003b0
parent84f5c5c3e3590cb993f4cf2a7eba5979e3bc825b (diff)
downloadrockbox-fc43b9df823af80dd1c9cf7dc1b5de6703944043.tar.gz
rockbox-fc43b9df823af80dd1c9cf7dc1b5de6703944043.zip
Mpegplayer: Assembler optimised IDCT for coldfire, based on FS #5995 by Karim Boucher. Put the IDCT block buffer in IRAM for better performance. The whole libmpeg2 decoder struct doesn't fit without throwing some libmad buffers out of IRAM, but then doesn't change performance significantly. Mpegplayer is quite usable now on X5; H300 is sort-of usable for widescreen.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15156 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/plugins/mpegplayer/SOURCES4
-rw-r--r--apps/plugins/mpegplayer/decode.c12
-rw-r--r--apps/plugins/mpegplayer/idct.c17
-rw-r--r--apps/plugins/mpegplayer/idct_coldfire.S574
-rw-r--r--apps/plugins/mpegplayer/mpeg2_internal.h6
5 files changed, 611 insertions, 2 deletions
diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES
index 6629cf7a4c..004c6395a2 100644
--- a/apps/plugins/mpegplayer/SOURCES
+++ b/apps/plugins/mpegplayer/SOURCES
@@ -13,6 +13,10 @@ idct.c
13motion_comp_c.c 13motion_comp_c.c
14#endif /* CPU_* */ 14#endif /* CPU_* */
15 15
16#ifdef CPU_COLDFIRE
17idct_coldfire.S
18#endif
19
16slice.c 20slice.c
17video_out_rockbox.c 21video_out_rockbox.c
18mpeg_settings.c 22mpeg_settings.c
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c
index 299abc9663..ca3d29a952 100644
--- a/apps/plugins/mpegplayer/decode.c
+++ b/apps/plugins/mpegplayer/decode.c
@@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
401 401
402} 402}
403 403
404#ifdef CPU_COLDFIRE
405/* twice as large as on other targets because coldfire uses
406 * a secondary, transposed buffer for optimisation */
407static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
408#endif
409
404mpeg2dec_t * mpeg2_init (void) 410mpeg2dec_t * mpeg2_init (void)
405{ 411{
406 mpeg2dec_t * mpeg2dec; 412 mpeg2dec_t * mpeg2dec;
@@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
410 mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t), 416 mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
411 MPEG2_ALLOC_MPEG2DEC); 417 MPEG2_ALLOC_MPEG2DEC);
412 if (mpeg2dec == NULL) 418 if (mpeg2dec == NULL)
413 return NULL; 419 return NULL;
420
421#ifdef CPU_COLDFIRE
422 mpeg2dec->decoder.DCTblock = static_dct_block;
423#endif
414 424
415 rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t)); 425 rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
416 rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t)); 426 rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));
diff --git a/apps/plugins/mpegplayer/idct.c b/apps/plugins/mpegplayer/idct.c
index bf705c6a2f..bf7097401e 100644
--- a/apps/plugins/mpegplayer/idct.c
+++ b/apps/plugins/mpegplayer/idct.c
@@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
76#define CLIP(i) ((mpeg2_clip + 3840)[i]) 76#define CLIP(i) ((mpeg2_clip + 3840)[i])
77#endif 77#endif
78 78
79#ifdef CPU_COLDFIRE
80/* assembler functions */
81extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
82 const int stride);
83extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
84 uint8_t * dest, const int stride);
85#else /* !CPU_COLDFIRE */
86
79#if 0 87#if 0
80#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \ 88#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
81 do { \ 89 do { \
@@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
258 } 266 }
259} 267}
260 268
269#endif /* !CPU_COLDFIRE */
270
261void mpeg2_idct_init (void) 271void mpeg2_idct_init (void)
262{ 272{
263 extern uint8_t default_mpeg2_scan_norm[64]; 273 extern uint8_t default_mpeg2_scan_norm[64];
@@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
266 extern uint8_t mpeg2_scan_alt[64]; 276 extern uint8_t mpeg2_scan_alt[64];
267 int i, j; 277 int i, j;
268 278
279#ifdef CPU_COLDFIRE
280 mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
281 mpeg2_idct_add = mpeg2_idct_add_coldfire;
282#else
269 mpeg2_idct_copy = mpeg2_idct_copy_c; 283 mpeg2_idct_copy = mpeg2_idct_copy_c;
270 mpeg2_idct_add = mpeg2_idct_add_c; 284 mpeg2_idct_add = mpeg2_idct_add_c;
285#endif
271 286
272#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM) 287#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
273 for (i = -3840; i < 3840 + 256; i++) 288 for (i = -3840; i < 3840 + 256; i++)
diff --git a/apps/plugins/mpegplayer/idct_coldfire.S b/apps/plugins/mpegplayer/idct_coldfire.S
new file mode 100644
index 0000000000..007c1a3e98
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_coldfire.S
@@ -0,0 +1,574 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2007 Jens Arnold
11 * Based on the work of Karim Boucher and Rani Hod
12 *
13 * All files in this archive are subject to the GNU General Public License.
14 * See the file COPYING in the source tree root for full license agreement.
15 *
16 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
17 * KIND, either express or implied.
18 *
19 ****************************************************************************/
20
21 .global mpeg2_idct_copy_coldfire
22 .type mpeg2_idct_copy_coldfire, @function
23 .global mpeg2_idct_add_coldfire
24 .type mpeg2_idct_add_coldfire, @function
25
26 /* The IDCT itself.
27 * Input: %a0: block pointer
28 * All registers are preserved. */
29 .align 2
30.idct:
31 lea.l (-15*4,%sp), %sp
32 movem.l %d0-%d7/%a0-%a6, (%sp) | save all registers
33 move.l %a0, %a6
34
35 move.l #0, %macsr | signed integer mode
36
37 move.l #((2048<<16)+2841), %a0 | W0, W1
38 move.l #((2676<<16)+2408), %a1 | W2, W3
39 move.l #((2048<<16)+1609), %a2 | W4, W5
40 move.l #((1108<<16)+ 565), %a3 | W6, W7
41
42 lea.l (128,%a6), %a4 | secondary, transposed temp buffer
43 moveq.l #8, %d3 | loop counter
44
45.row_loop:
46 movem.l (%a6), %d0-%d2/%a5 | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
47
48 mac.w %a0l, %d2u, %acc0 | %acc0 = W1 * f1
49 mac.w %a1l, %d2l, %acc0 | + W3 * f3
50 mac.w %a2l, %a5u, %acc0 | + W5 * f5
51 mac.w %a3l, %a5l, %acc0 | + W7 * f7
52
53 mac.w %a1l, %d2u, %acc1 | %acc1 = W3 * f1
54 msac.w %a3l, %d2l, %acc1 | - W7 * f3
55 msac.w %a0l, %a5u, %acc1 | - W1 * f5
56 msac.w %a2l, %a5l, %acc1 | - W5 * f7
57
58 mac.w %a2l, %d2u, %acc2 | %acc2 = W5 * f1
59 msac.w %a0l, %d2l, %acc2 | - W1 * f3
60 mac.w %a3l, %a5u, %acc2 | + W7 * f5
61 mac.w %a1l, %a5l, %acc2 | + W3 * f7
62
63 mac.w %a3l, %d2u, %acc3 | %acc3 = W7 * f1
64 msac.w %a2l, %d2l, %acc3 | - W5 * f3
65 mac.w %a1l, %a5u, %acc3 | + W3 * f5
66 msac.w %a0l, %a5l, %acc3 | - W1 * f7
67
68 lea.l (16,%a6), %a6 | Advance to next row; put here to fill EMAC latency
69 add.l #(1<<16), %d0 | f0 += 1;
70
71 movclr.l %acc0, %d4 | b0
72 movclr.l %acc1, %d5 | b1
73 movclr.l %acc2, %d6 | b2
74 movclr.l %acc3, %d7 | b3
75
76 mac.w %a0u, %d0u, %acc0 | %acc0 = W0 * f0
77 mac.w %a2u, %d1u, %acc0 | + W4 * f4
78 move.l %acc0, %acc3
79 mac.w %a1u, %d0l, %acc0 | + W2 * f2
80 mac.w %a3u, %d1l, %acc0 | + W6 * f6
81
82 mac.w %a0u, %d0u, %acc1 | %acc1 = W0 * f0
83 msac.w %a2u, %d1u, %acc1 | - W4 * f4
84 move.l %acc1, %acc2
85 mac.w %a3u, %d0l, %acc1 | + W6 * f2
86 msac.w %a1u, %d1l, %acc1 | - W2 * f6
87
88 | ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
89 msac.w %a3u, %d0l, %acc2 | - W6 * f2
90 mac.w %a1u, %d1l, %acc2 | + W2 * f6
91
92 | ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
93 msac.w %a1u, %d0l, %acc3 | - W2 * f2
94 msac.w %a3u, %d1l, %acc3 | - W6 * f6
95
96 moveq.l #12, %d1 | shift amount
97
98 move.l %acc0, %d0 | block[7] = (a0
99 sub.l %d4,%d0 | - b0)
100 asr.l %d1, %d0 | >> 12
101 move.w %d0, (7*16,%a4)
102
103 move.l %acc1, %d0 | block[6] = (a1
104 sub.l %d5,%d0 | - b1)
105 asr.l %d1, %d0 | >> 12
106 move.w %d0, (6*16,%a4)
107
108 move.l %acc2, %d0 | block[5] = (a2
109 sub.l %d6,%d0 | - b2)
110 asr.l %d1, %d0 | >> 12
111 move.w %d0, (5*16,%a4)
112
113 move.l %acc3, %d0 | block[4] = (a3
114 sub.l %d7,%d0 | - b3)
115 asr.l %d1, %d0 | >> 12
116 move.w %d0, (4*16,%a4)
117
118 movclr.l %acc3, %d0 | block[3] = (a3
119 add.l %d7, %d0 | + b3)
120 asr.l %d1, %d0 | >> 12
121 move.w %d0, (3*16,%a4)
122
123 movclr.l %acc2, %d0 | block[2] = (a2
124 add.l %d6, %d0 | + b2)
125 asr.l %d1, %d0 | >> 12
126 move.w %d0, (2*16,%a4)
127
128 movclr.l %acc1, %d0 | block[1] = (a1
129 add.l %d5, %d0 | + b1)
130 asr.l %d1, %d0 | >> 12
131 move.w %d0, (1*16,%a4)
132
133 movclr.l %acc0, %d0 | block[0] = (a0
134 add.l %d4, %d0 | + b0)
135 asr.l %d1, %d0 | >> 12
136 move.w %d0, (%a4)+ | advance to next temp column
137
138 subq.l #1, %d3 | loop 8 times
139 bne.w .row_loop
140
141 | %a6 now points to the temp buffer, where we need it.
142 lea.l (-16-128,%a4), %a4 | point %a4 back to the input block
143 moveq.l #8, %d3 | loop counter
144
145.col_loop:
146 movem.l (%a6), %d0-%d2/%a5 | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
147
148 mac.w %a0l, %d2u, %acc0 | %acc0 = W1 * f1
149 mac.w %a1l, %d2l, %acc0 | + W3 * f3
150 mac.w %a2l, %a5u, %acc0 | + W5 * f5
151 mac.w %a3l, %a5l, %acc0 | + W7 * f7
152
153 mac.w %a1l, %d2u, %acc1 | %acc1 = W3 * f1
154 msac.w %a3l, %d2l, %acc1 | - W7 * f3
155 msac.w %a0l, %a5u, %acc1 | - W1 * f5
156 msac.w %a2l, %a5l, %acc1 | - W5 * f7
157
158 mac.w %a2l, %d2u, %acc2 | %acc2 = W5 * f1
159 msac.w %a0l, %d2l, %acc2 | - W1 * f3
160 mac.w %a3l, %a5u, %acc2 | + W7 * f5
161 mac.w %a1l, %a5l, %acc2 | + W3 * f7
162
163 mac.w %a3l, %d2u, %acc3 | %acc3 = W7 * f1
164 msac.w %a2l, %d2l, %acc3 | - W5 * f3
165 mac.w %a1l, %a5u, %acc3 | + W3 * f5
166 msac.w %a0l, %a5l, %acc3 | - W1 * f7
167
168 lea.l (16,%a6), %a6 | Advance to next row; put here to fill EMAC latency
169 add.l #(32<<16), %d0 | DC offset: 0.5
170
171 movclr.l %acc0, %d4 | b0
172 movclr.l %acc1, %d5 | b1
173 movclr.l %acc2, %d6 | b2
174 movclr.l %acc3, %d7 | b3
175
176 mac.w %a0u, %d0u, %acc0 | %acc0 = W0 * f0
177 mac.w %a2u, %d1u, %acc0 | + W4 * f4
178 move.l %acc0, %acc3
179 mac.w %a1u, %d0l, %acc0 | + W2 * f2
180 mac.w %a3u, %d1l, %acc0 | + W6 * f6
181
182 mac.w %a0u, %d0u, %acc1 | %acc1 = W0 * f0
183 msac.w %a2u, %d1u, %acc1 | - W4 * f4
184 move.l %acc1, %acc2
185 mac.w %a3u, %d0l, %acc1 | + W6 * f2
186 msac.w %a1u, %d1l, %acc1 | - W2 * f6
187
188 | ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
189 msac.w %a3u, %d0l, %acc2 | - W6 * f2
190 mac.w %a1u, %d1l, %acc2 | + W2 * f6
191
192 | ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
193 msac.w %a1u, %d0l, %acc3 | - W2 * f2
194 msac.w %a3u, %d1l, %acc3 | - W6 * f6
195
196 moveq.l #17, %d1 | shift amount
197
198 move.l %acc0, %d0 | block[7] = (a0
199 sub.l %d4,%d0 | - b0)
200 asr.l %d1, %d0 | >> 17
201 move.w %d0, (7*16,%a4)
202
203 move.l %acc1, %d0 | block[6] = (a1
204 sub.l %d5,%d0 | - b1)
205 asr.l %d1, %d0 | >> 17
206 move.w %d0, (6*16,%a4)
207
208 move.l %acc2, %d0 | block[5] = (a2
209 sub.l %d6,%d0 | - b2)
210 asr.l %d1, %d0 | >> 17
211 move.w %d0, (5*16,%a4)
212
213 move.l %acc3, %d0 | block[4] = (a3
214 sub.l %d7,%d0 | - b3)
215 asr.l %d1, %d0 | >> 17
216 move.w %d0, (4*16,%a4)
217
218 movclr.l %acc3, %d0 | block[3] = (a3
219 add.l %d7, %d0 | + b3)
220 asr.l %d1, %d0 | >> 17
221 move.w %d0, (3*16,%a4)
222
223 movclr.l %acc2, %d0 | block[2] = (a2
224 add.l %d6, %d0 | + b2)
225 asr.l %d1, %d0 | >> 17
226 move.w %d0, (2*16,%a4)
227
228 movclr.l %acc1, %d0 | block[1] = (a1
229 add.l %d5, %d0 | + b1)
230 asr.l %d1, %d0 | >> 17
231 move.w %d0, (1*16,%a4)
232
233 movclr.l %acc0, %d0 | block[0] = (a0
234 add.l %d4, %d0 | + b0)
235 asr.l %d1, %d0 | >> 17
236 move.w %d0, (%a4)+ | advance to next column
237
238 subq.l #1, %d3 | loop 8 times
239 bne.w .col_loop
240
241 movem.l (%sp), %d0-%d7/%a0-%a6 | restore all registers
242 lea.l (15*4,%sp), %sp
243 rts
244
245 .align 2
246
247mpeg2_idct_copy_coldfire:
248 lea.l (-4*4,%sp), %sp
249 movem.l %d2-%d4/%a2, (%sp) | save some registers
250 movem.l (4*4+4,%sp), %a0-%a2| %a0 - block pointer
251 | %a1 - destination pointer
252 | %a2 - stride
253
254 bsr.w .idct | apply idct to block
255
256 move.l #255, %d1 | preload constant for clipping
257 moveq.l #8, %d4 | loop counter
258
259.copy_clip_loop:
260 move.w (%a0), %d0 | load block[0]
261 ext.l %d0 | sign extend
262 cmp.l %d1, %d0 | overflow?
263 bls.b 1f
264 spl.b %d0 | yes: set appropriate limit value in low byte
2651:
266 move.b %d0, %d2 | collect output bytes 0..3 in %d2
267 lsl.l #8, %d2
268
269 move.w (2,%a0), %d0 | load block[1]
270 ext.l %d0 | sign extend
271 cmp.l %d1, %d0 | overflow?
272 bls.b 1f
273 spl.b %d0 | yes: set appropriate limit value in low byte
2741:
275 move.b %d0, %d2 | collect output bytes 0..3 in %d2
276 lsl.l #8, %d2
277 clr.l (%a0)+ | clear block[0] and block[1],
278 | %a0 now pointing to block[2]
279 move.w (%a0), %d0 | do b2 and b3
280 ext.l %d0
281 cmp.l %d1, %d0
282 bls.b 1f
283 spl.b %d0
2841:
285 move.b %d0, %d2
286 lsl.l #8, %d2
287
288 move.w (2,%a0), %d0
289 ext.l %d0
290 cmp.l %d1, %d0
291 bls.b 1f
292 spl.b %d0
2931:
294 move.b %d0, %d2
295 clr.l (%a0)+
296
297 move.w (%a0), %d0 | do b4 and b5
298 ext.l %d0
299 cmp.l %d1, %d0
300 bls.b 1f
301 spl.b %d0
3021:
303 move.b %d0, %d3
304 lsl.l #8, %d3
305
306 move.w (2,%a0), %d0
307 ext.l %d0
308 cmp.l %d1, %d0
309 bls.b 1f
310 spl.b %d0
3111:
312 move.b %d0, %d3
313 lsl.l #8, %d3
314 clr.l (%a0)+
315
316 move.w (%a0), %d0 | do b6 and b7
317 ext.l %d0
318 cmp.l %d1, %d0
319 bls.b 1f
320 spl.b %d0
3211:
322 move.b %d0, %d3
323 lsl.l #8, %d3
324
325 move.w (2,%a0), %d0
326 ext.l %d0
327 cmp.l %d1, %d0
328 bls.b 1f
329 spl.b %d0
3301:
331 move.b %d0, %d3
332 clr.l (%a0)+
333
334 movem.l %d2-%d3, (%a1) | write all 8 output bytes at once
335 lea.l (%a2,%a1), %a1 | advance output pointer
336 subq.l #1, %d4 | loop 8 times
337 bne.w .copy_clip_loop
338
339 movem.l (%sp), %d2-%d4/%a2 | restore registers
340 lea.l (4*4,%sp), %sp
341 rts
342
343 .align 2
344
345mpeg2_idct_add_coldfire:
346 lea.l (-7*4,%sp), %sp
347 movem.l %d2-%d7/%a2, (%sp) | save some registers
348 movem.l (7*4+4,%sp), %d0/%a0-%a2| %d0 - last value
349 | %a0 - block pointer
350 | %a1 - destination pointer
351 | %a2 - stride
352 cmp.l #129, %d0 | last == 129 ?
353 bne.b .idct_add | no: perform idct + addition
354 move.w (%a0), %d0
355 ext.l %d0 | ((block[0]
356 asr.l #4, %d0 | >> 4)
357 and.l #7, %d0 | & 7)
358 subq.l #4, %d0 | - 4 == 0 ?
359 bne.w .dc_add | no: just perform addition
360
361.idct_add:
362 bsr.w .idct | apply idct
363
364 move.l #255, %d2 | preload constant for clipping
365 clr.l %d3 | used for splitting input words into bytes
366 moveq.l #8, %d4 | loop counter
367
368.add_clip_loop:
369 movem.l (%a1), %d6-%d7 | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
370 swap %d6 | (b2 b3 b0 b1)
371 swap %d7 | (b6 b7 b4 b5)
372
373 move.w (2,%a0), %d0 | load block[1]
374 ext.l %d0 | sign extend
375 move.b %d6, %d3 | copy b1
376 lsr.l #8, %d6 | prepare 1st buffer for next byte
377 add.l %d3, %d0 | add b1
378 cmp.l %d2, %d0 | overflow ?
379 bls.b 1f
380 spl.b %d0 | yes: set appropriate limit value in low byte
3811:
382 move.w (%a0), %d1 | load block[0]
383 ext.l %d1 | sign extend
384 move.b %d6, %d3 | copy b0
385 lsr.l #8, %d6 | prepare 1st buffer for next byte
386 add.l %d3, %d1 | add b0
387 cmp.l %d2, %d1 | overflow ?
388 bls.b 1f
389 spl.b %d1 | yes: set appropriate limit value in low byte
3901:
391 move.b %d1, %d5 | collect output bytes 0..3 in %d5
392 lsl.l #8, %d5
393 move.b %d0, %d5
394 lsl.l #8, %d5
395 clr.l (%a0)+ | clear block[0] and block[1]
396 | %a0 now pointing to block[2]
397 move.w (2,%a0), %d0 | do b3 and b2
398 ext.l %d0
399 move.b %d6, %d3
400 lsr.l #8, %d6
401 add.l %d3, %d0
402 cmp.l %d2, %d0
403 bls.b 1f
404 spl.b %d0
4051:
406 move.w (%a0), %d1
407 ext.l %d1
408 add.l %d6, %d1
409 cmp.l %d2, %d1
410 bls.b 1f
411 spl.b %d1
4121:
413 move.b %d1, %d5
414 lsl.l #8, %d5
415 move.b %d0, %d5
416 clr.l (%a0)+
417
418 move.w (2,%a0), %d0 | do b5 and b4
419 ext.l %d0
420 move.b %d7, %d3
421 lsr.l #8, %d7
422 add.l %d3, %d0
423 cmp.l %d2, %d0
424 bls.b 1f
425 spl.b %d0
4261:
427 move.w (%a0), %d1
428 ext.l %d1
429 move.b %d7, %d3
430 lsr.l #8, %d7
431 add.l %d3, %d1
432 cmp.l %d2, %d1
433 bls.b 1f
434 spl.b %d1
4351:
436 move.b %d1, %d6
437 lsl.l #8, %d6
438 move.b %d0, %d6
439 lsl.l #8, %d6
440 clr.l (%a0)+
441
442 move.w (2,%a0), %d0 | do b7 and b6
443 ext.l %d0
444 move.b %d7, %d3
445 lsr.l #8, %d7
446 add.l %d3, %d0
447 cmp.l %d2, %d0
448 bls.b 1f
449 spl.b %d0
4501:
451 move.w (%a0), %d1
452 ext.l %d1
453 add.l %d7, %d1
454 cmp.l %d2, %d1
455 bls.b 1f
456 spl.b %d1
4571:
458 move.b %d1, %d6
459 lsl.l #8, %d6
460 move.b %d0, %d6
461 clr.l (%a0)+
462
463 movem.l %d5-%d6, (%a1) | write all 8 output bytes at once
464 lea.l (%a2,%a1), %a1 | advance output pointer
465 subq.l #1, %d4 | loop 8 times
466 bne.w .add_clip_loop
467
468 bra.w .idct_add_end
469
470.dc_add:
471 move.w (%a0), %d0
472 ext.l %d0 | %d0 = (block[0]
473 add.l #64, %d0 | + 64)
474 asr.l #7, %d0 | >> 7
475 clr.w (%a0) | clear block[0]
476 clr.w (63*2,%a0) | and block[63]
477 move.l %d0, %a0 | DC value in %a0
478
479 move.l #255, %d2 | preload constant for clipping
480 clr.l %d3 | for splitting input words into bytes
481 moveq.l #8, %d4 | loop counter
482
483.dc_clip_loop:
484 movem.l (%a1), %d6-%d7 | (b0 b1 b2 b3) (b4 b5 b6 b7)
485 swap %d6 | (b2 b3 b0 b1)
486 swap %d7 | (b6 b7 b4 b5)
487
488 move.l %a0, %d0 | copy DC
489 move.b %d6, %d3 | copy b1
490 lsr.l #8, %d6 | prepare 1st buffer for next byte
491 add.l %d3, %d0 | add b1
492 cmp.l %d2, %d0 | overflow ?
493 bls.b 1f
494 spl.b %d0 | yes: set appropriate limit value in low byte
4951:
496 move.l %a0, %d1 | copy DC
497 move.b %d6, %d3 | copy b0
498 lsr.l #8, %d6 | prepare 1st buffer for next byte
499 add.l %d3, %d1 | add b0
500 cmp.l %d2, %d1 | overflow ?
501 bls.b 1f
502 spl.b %d1 | yes: set appropriate limit value in low byte
5031:
504 move.b %d1, %d5 | collect output bytes 0..3 in %d5
505 lsl.l #8, %d5
506 move.b %d0, %d5
507 lsl.l #8, %d5
508
509 move.l %a0, %d0 | do b3 and b2
510 move.b %d6, %d3
511 lsr.l #8, %d6
512 add.l %d3, %d0
513 cmp.l %d2, %d0
514 bls.b 1f
515 spl.b %d0
5161:
517 move.l %a0, %d1
518 add.l %d6, %d1
519 cmp.l %d2, %d1
520 bls.b 1f
521 spl.b %d1
5221:
523 move.b %d1, %d5
524 lsl.l #8, %d5
525 move.b %d0, %d5
526
527 move.l %a0, %d0 | do b5 and b4
528 move.b %d7, %d3
529 lsr.l #8, %d7
530 add.l %d3, %d0
531 cmp.l %d2, %d0
532 bls.b 1f
533 spl.b %d0
5341:
535 move.l %a0, %d1
536 move.b %d7, %d3
537 lsr.l #8, %d7
538 add.l %d3, %d1
539 cmp.l %d2, %d1
540 bls.b 1f
541 spl.b %d1
5421:
543 move.b %d1, %d6 | collect output bytes 4..7 in %d6
544 lsl.l #8, %d6
545 move.b %d0, %d6
546 lsl.l #8, %d6
547
548 move.l %a0, %d0 | do b7 and b6
549 move.b %d7, %d3
550 lsr.l #8, %d7
551 add.l %d3, %d0
552 cmp.l %d2, %d0
553 bls.b 1f
554 spl.b %d0
5551:
556 move.l %a0, %d1
557 add.l %d7, %d1
558 cmp.l %d2, %d1
559 bls.b 1f
560 spl.b %d1
5611:
562 move.b %d1, %d6
563 lsl.l #8, %d6
564 move.b %d0, %d6
565
566 movem.l %d5-%d6, (%a1) | write all 8 output bytes at once
567 lea.l (%a2,%a1), %a1 | advance output pointer
568 subq.l #1, %d4 | loop 8 times
569 bne.w .dc_clip_loop
570
571.idct_add_end:
572 movem.l (%sp), %d2-%d7/%a2 | restore registers
573 lea.l (7*4,%sp), %sp
574 rts
diff --git a/apps/plugins/mpegplayer/mpeg2_internal.h b/apps/plugins/mpegplayer/mpeg2_internal.h
index 0c552b766f..1ec85c60f1 100644
--- a/apps/plugins/mpegplayer/mpeg2_internal.h
+++ b/apps/plugins/mpegplayer/mpeg2_internal.h
@@ -20,6 +20,8 @@
20 * along with this program; if not, write to the Free Software 20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */ 22 */
23
24#include "config.h" /* for Rockbox CPU_ #defines */
23 25
24/* macroblock modes */ 26/* macroblock modes */
25#define MACROBLOCK_INTRA 1 27#define MACROBLOCK_INTRA 1
@@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
92 int16_t dc_dct_pred[3]; 94 int16_t dc_dct_pred[3];
93 95
94 /* DCT coefficients */ 96 /* DCT coefficients */
97#ifdef CPU_COLDFIRE
98 int16_t *DCTblock; /* put buffer separately to have it in IRAM */
99#else
95 int16_t DCTblock[64] ATTR_ALIGN(64); 100 int16_t DCTblock[64] ATTR_ALIGN(64);
101#endif
96 102
97 uint8_t * picture_dest[3]; 103 uint8_t * picture_dest[3];
98 void (* convert) (void * convert_id, uint8_t * const * src, 104 void (* convert) (void * convert_id, uint8_t * const * src,