summaryrefslogtreecommitdiff
path: root/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S
diff options
context:
space:
mode:
Diffstat (limited to 'apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S')
-rw-r--r--apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S575
1 files changed, 575 insertions, 0 deletions
diff --git a/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S
new file mode 100644
index 0000000000..abc54b16cb
--- /dev/null
+++ b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S
@@ -0,0 +1,575 @@
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2007 Jens Arnold
 * Based on the work of Karim Boucher and Rani Hod
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
22
/* Exported entry points. Both are marked as ELF function symbols. */
    .global     mpeg2_idct_copy
    .type       mpeg2_idct_copy, @function
    .global     mpeg2_idct_add
    .type       mpeg2_idct_add, @function

/* The IDCT itself.
 * Input: %a0: block pointer
 * Caller must save all registers.
 *
 * Two passes of the same 8-point transform, executed on the EMAC unit:
 *
 *   row pass:    reads 8 rows of 16 bytes starting at the block pointer and
 *                stores each result row as a column (transposed) into a
 *                temporary buffer located at block + 128.
 *   column pass: reads the 8 rows of the temporary buffer and stores them,
 *                transposed again, back into the block.
 *
 * The routine therefore writes bytes 128..255 relative to the block pointer;
 * the caller has to provide that scratch space directly behind the block.
 *
 * Within each 16-byte input row the coefficients are taken in the order
 * (f0, f2, f4, f6, f1, f3, f5, f7).
 *
 * Rounding: W0 is 2048, so adding 1 to f0 adds 2048 (= 0.5 before >> 12) in
 * the row pass, and adding 32 to f0 adds 65536 (= 0.5 before >> 17) in the
 * column pass.
 *
 * Clobbers: %d0-%d7, %a0-%a6, %macsr (left at 0), %acc0-%acc3 (left cleared).
 */
    .align  2
.idct:
    move.l  %a0, %a6                | %a6 = read pointer, starts at the block

    move.l  #0, %macsr              | signed integer mode

    | Coefficient pairs, one 16-bit constant per register half.
    move.l  #((2048<<16)+2841), %a0 | W0, W1
    move.l  #((2676<<16)+2408), %a1 | W2, W3
    move.l  #((2048<<16)+1609), %a2 | W4, W5
    move.l  #((1108<<16)+ 565), %a3 | W6, W7

    lea.l   (128,%a6), %a4          | secondary, transposed temp buffer
    moveq.l #8, %d3                 | loop counter

.row_loop:
    movem.l (%a6), %d0-%d2/%a5      | fetch (f0, f2, f4, f6, f1, f3, f5, f7)

    | Odd part: b0..b3 are built in %acc0..%acc3.
    mac.w   %a0l, %d2u, %acc0       | %acc0 = W1 * f1
    mac.w   %a1l, %d2l, %acc0       |       + W3 * f3
    mac.w   %a2l, %a5u, %acc0       |       + W5 * f5
    mac.w   %a3l, %a5l, %acc0       |       + W7 * f7

    mac.w   %a1l, %d2u, %acc1       | %acc1 = W3 * f1
    msac.w  %a3l, %d2l, %acc1       |       - W7 * f3
    msac.w  %a0l, %a5u, %acc1       |       - W1 * f5
    msac.w  %a2l, %a5l, %acc1       |       - W5 * f7

    mac.w   %a2l, %d2u, %acc2       | %acc2 = W5 * f1
    msac.w  %a0l, %d2l, %acc2       |       - W1 * f3
    mac.w   %a3l, %a5u, %acc2       |       + W7 * f5
    mac.w   %a1l, %a5l, %acc2       |       + W3 * f7

    mac.w   %a3l, %d2u, %acc3       | %acc3 = W7 * f1
    msac.w  %a2l, %d2l, %acc3       |       - W5 * f3
    mac.w   %a1l, %a5u, %acc3       |       + W3 * f5
    msac.w  %a0l, %a5l, %acc3       |       - W1 * f7

    lea.l   (16,%a6), %a6           | Advance to next row; put here to fill EMAC latency
    add.l   #(1<<16), %d0           | f0 += 1; (rounding, see header)

    movclr.l %acc0, %d4             | b0
    movclr.l %acc1, %d5             | b1
    movclr.l %acc2, %d6             | b2
    movclr.l %acc3, %d7             | b3

    | Even part: a0..a3 are built in %acc0..%acc3. The partial sums shared by
    | two outputs are copied between accumulators instead of recomputed.
    mac.w   %a0u, %d0u, %acc0       | %acc0 = W0 * f0
    mac.w   %a2u, %d1u, %acc0       |       + W4 * f4
    move.l  %acc0, %acc3
    mac.w   %a1u, %d0l, %acc0       |       + W2 * f2
    mac.w   %a3u, %d1l, %acc0       |       + W6 * f6

    mac.w   %a0u, %d0u, %acc1       | %acc1 = W0 * f0
    msac.w  %a2u, %d1u, %acc1       |       - W4 * f4
    move.l  %acc1, %acc2
    mac.w   %a3u, %d0l, %acc1       |       + W6 * f2
    msac.w  %a1u, %d1l, %acc1       |       - W2 * f6

    | ^ move.l %acc1, %acc2           %acc2 = W0 * f0 - W4 * f4
    msac.w  %a3u, %d0l, %acc2       |       - W6 * f2
    mac.w   %a1u, %d1l, %acc2       |       + W2 * f6

    | ^ move.l %acc0, %acc3           %acc3 = W0 * f0 + W4 * f4
    msac.w  %a1u, %d0l, %acc3       |       - W2 * f2
    msac.w  %a3u, %d1l, %acc3       |       - W6 * f6

    moveq.l #12, %d1                | shift amount

    | Results go to the temp buffer one column per input row: element n of
    | this row is stored at offset n*16 from %a4.
    move.l  %acc0, %d0              | block[7] = (a0
    sub.l   %d4, %d0                |           - b0)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (7*16,%a4)

    move.l  %acc1, %d0              | block[6] = (a1
    sub.l   %d5, %d0                |           - b1)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (6*16,%a4)

    move.l  %acc2, %d0              | block[5] = (a2
    sub.l   %d6, %d0                |           - b2)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (5*16,%a4)

    move.l  %acc3, %d0              | block[4] = (a3
    sub.l   %d7, %d0                |           - b3)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (4*16,%a4)

    | From here on the accumulators are read with movclr, leaving them
    | cleared for the next iteration.
    movclr.l %acc3, %d0             | block[3] = (a3
    add.l   %d7, %d0                |           + b3)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (3*16,%a4)

    movclr.l %acc2, %d0             | block[2] = (a2
    add.l   %d6, %d0                |           + b2)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (2*16,%a4)

    movclr.l %acc1, %d0             | block[1] = (a1
    add.l   %d5, %d0                |           + b1)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (1*16,%a4)

    movclr.l %acc0, %d0             | block[0] = (a0
    add.l   %d4, %d0                |           + b0)
    asr.l   %d1, %d0                |           >> 12
    move.w  %d0, (%a4)+             | advance to next temp column

    subq.l  #1, %d3                 | loop 8 times
    bne.w   .row_loop

    | %a6 now points to the temp buffer, where we need it.
    lea.l   (-16-128,%a4), %a4      | point %a4 back to the input block
    moveq.l #8, %d3                 | loop counter

.col_loop:
    movem.l (%a6), %d0-%d2/%a5      | fetch (f0, f2, f4, f6, f1, f3, f5, f7)

    mac.w   %a0l, %d2u, %acc0       | %acc0 = W1 * f1
    mac.w   %a1l, %d2l, %acc0       |       + W3 * f3
    mac.w   %a2l, %a5u, %acc0       |       + W5 * f5
    mac.w   %a3l, %a5l, %acc0       |       + W7 * f7

    mac.w   %a1l, %d2u, %acc1       | %acc1 = W3 * f1
    msac.w  %a3l, %d2l, %acc1       |       - W7 * f3
    msac.w  %a0l, %a5u, %acc1       |       - W1 * f5
    msac.w  %a2l, %a5l, %acc1       |       - W5 * f7

    mac.w   %a2l, %d2u, %acc2       | %acc2 = W5 * f1
    msac.w  %a0l, %d2l, %acc2       |       - W1 * f3
    mac.w   %a3l, %a5u, %acc2       |       + W7 * f5
    mac.w   %a1l, %a5l, %acc2       |       + W3 * f7

    mac.w   %a3l, %d2u, %acc3       | %acc3 = W7 * f1
    msac.w  %a2l, %d2l, %acc3       |       - W5 * f3
    mac.w   %a1l, %a5u, %acc3       |       + W3 * f5
    msac.w  %a0l, %a5l, %acc3       |       - W1 * f7

    lea.l   (16,%a6), %a6           | Advance to next row; put here to fill EMAC latency
    add.l   #(32<<16), %d0          | DC offset: 0.5

    movclr.l %acc0, %d4             | b0
    movclr.l %acc1, %d5             | b1
    movclr.l %acc2, %d6             | b2
    movclr.l %acc3, %d7             | b3

    mac.w   %a0u, %d0u, %acc0       | %acc0 = W0 * f0
    mac.w   %a2u, %d1u, %acc0       |       + W4 * f4
    move.l  %acc0, %acc3
    mac.w   %a1u, %d0l, %acc0       |       + W2 * f2
    mac.w   %a3u, %d1l, %acc0       |       + W6 * f6

    mac.w   %a0u, %d0u, %acc1       | %acc1 = W0 * f0
    msac.w  %a2u, %d1u, %acc1       |       - W4 * f4
    move.l  %acc1, %acc2
    mac.w   %a3u, %d0l, %acc1       |       + W6 * f2
    msac.w  %a1u, %d1l, %acc1       |       - W2 * f6

    | ^ move.l %acc1, %acc2           %acc2 = W0 * f0 - W4 * f4
    msac.w  %a3u, %d0l, %acc2       |       - W6 * f2
    mac.w   %a1u, %d1l, %acc2       |       + W2 * f6

    | ^ move.l %acc0, %acc3           %acc3 = W0 * f0 + W4 * f4
    msac.w  %a1u, %d0l, %acc3       |       - W2 * f2
    msac.w  %a3u, %d1l, %acc3       |       - W6 * f6

    moveq.l #17, %d1                | shift amount

    move.l  %acc0, %d0              | block[7] = (a0
    sub.l   %d4, %d0                |           - b0)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (7*16,%a4)

    move.l  %acc1, %d0              | block[6] = (a1
    sub.l   %d5, %d0                |           - b1)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (6*16,%a4)

    move.l  %acc2, %d0              | block[5] = (a2
    sub.l   %d6, %d0                |           - b2)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (5*16,%a4)

    move.l  %acc3, %d0              | block[4] = (a3
    sub.l   %d7, %d0                |           - b3)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (4*16,%a4)

    movclr.l %acc3, %d0             | block[3] = (a3
    add.l   %d7, %d0                |           + b3)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (3*16,%a4)

    movclr.l %acc2, %d0             | block[2] = (a2
    add.l   %d6, %d0                |           + b2)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (2*16,%a4)

    movclr.l %acc1, %d0             | block[1] = (a1
    add.l   %d5, %d0                |           + b1)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (1*16,%a4)

    movclr.l %acc0, %d0             | block[0] = (a0
    add.l   %d4, %d0                |           + b0)
    asr.l   %d1, %d0                |           >> 17
    move.w  %d0, (%a4)+             | advance to next column

    subq.l  #1, %d3                 | loop 8 times
    bne.w   .col_loop

    rts

/* mpeg2_idct_copy
 *
 * Runs the IDCT on a block, writes the result clamped to 0..255 as 8 rows of
 * 8 bytes to the destination, and zeroes the 128-byte block while doing so.
 *
 * Stack arguments (after the return address, in this order):
 *   block pointer, destination pointer, stride (bytes between output rows).
 * NOTE(review): presumably declared in C as
 *   void mpeg2_idct_copy(int16_t *block, uint8_t *dest, int stride)
 * - confirm against the header.
 *
 * Each output row is stored with one movem.l of two longwords.
 * Preserves %d2-%d7/%a2-%a6; %d0-%d1/%a0-%a1 and the EMAC state are clobbered.
 *
 * Clamping idiom used throughout: after "cmp.l #255, value", bls (unsigned
 * lower-or-same) is taken for 0..255. Otherwise spl writes 0xFF into the low
 * byte when value - 255 is non-negative (value too large) and 0x00 when it is
 * negative (value below zero).
 */
    .align  2

mpeg2_idct_copy:
    lea.l   (-11*4,%sp), %sp
    movem.l %d2-%d7/%a2-%a6, (%sp)  | save some registers
    move.l  (11*4+4,%sp), %a0       | %a0 - block pointer for idct

    bsr.w   .idct                   | apply idct to block
    movem.l (11*4+4,%sp), %a0-%a2   | %a0 - block pointer
                                    | %a1 - destination pointer
                                    | %a2 - stride

    move.l  #255, %d1               | preload constant for clipping
    moveq.l #8, %d4                 | loop counter

.copy_clip_loop:
    move.w  (%a0), %d0              | load block[0]
    ext.l   %d0                     | sign extend
    cmp.l   %d1, %d0                | overflow?
    bls.b   1f
    spl.b   %d0                     |   yes: set appropriate limit value in low byte
1:
    move.b  %d0, %d2                | collect output bytes 0..3 in %d2
    lsl.l   #8, %d2

    move.w  (2,%a0), %d0            | load block[1]
    ext.l   %d0                     | sign extend
    cmp.l   %d1, %d0                | overflow?
    bls.b   1f
    spl.b   %d0                     |   yes: set appropriate limit value in low byte
1:
    move.b  %d0, %d2                | collect output bytes 0..3 in %d2
    lsl.l   #8, %d2
    clr.l   (%a0)+                  | clear block[0] and block[1],
                                    | %a0 now pointing to block[2]
    move.w  (%a0), %d0              | do b2 and b3
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.b  %d0, %d2
    lsl.l   #8, %d2

    move.w  (2,%a0), %d0
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.b  %d0, %d2                | %d2 now holds output bytes 0..3
    clr.l   (%a0)+

    move.w  (%a0), %d0              | do b4 and b5
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.b  %d0, %d3                | collect output bytes 4..7 in %d3
    lsl.l   #8, %d3

    move.w  (2,%a0), %d0
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.b  %d0, %d3
    lsl.l   #8, %d3
    clr.l   (%a0)+

    move.w  (%a0), %d0              | do b6 and b7
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.b  %d0, %d3
    lsl.l   #8, %d3

    move.w  (2,%a0), %d0
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.b  %d0, %d3                | %d3 now holds output bytes 4..7
    clr.l   (%a0)+

    movem.l %d2-%d3, (%a1)          | write all 8 output bytes at once
    add.l   %a2, %a1                | advance output pointer
    subq.l  #1, %d4                 | loop 8 times
    bne.w   .copy_clip_loop

    movem.l (%sp), %d2-%d7/%a2-%a6
    lea.l   (11*4,%sp), %sp
    rts

    .size   mpeg2_idct_copy, .-mpeg2_idct_copy

/* mpeg2_idct_add
 *
 * Adds a block to the 8x8 byte area at the destination, clamping every
 * result to 0..255.
 *
 * Stack arguments (after the return address, in this order):
 *   last, block pointer, destination pointer, stride (bytes between rows).
 * NOTE(review): presumably declared in C as
 *   void mpeg2_idct_add(int last, int16_t *block, uint8_t *dest, int stride)
 * - confirm against the header.
 *
 * Two paths:
 *   - last != 129, or ((block[0] >> 4) & 7) == 4:
 *       run the full IDCT, add the 64 results to the destination and zero
 *       the 128-byte block.
 *   - otherwise (DC only):
 *       DC = (block[0] + 64) >> 7 is added to all 64 destination bytes;
 *       only block[0] and block[63] are cleared.
 *
 * Each destination row is read and written with one movem.l of two
 * longwords. Preserves %d2-%d7/%a2-%a6; %d0-%d1/%a0-%a1 are clobbered, and
 * the full-IDCT path also clobbers the EMAC state.
 *
 * Clamping idiom: after "cmp.l #255, value", bls is taken for 0..255.
 * Otherwise spl writes 0xFF into the low byte when value - 255 is
 * non-negative and 0x00 when it is negative.
 */
    .align  2

mpeg2_idct_add:
    lea.l   (-11*4,%sp), %sp
    movem.l %d2-%d7/%a2-%a6, (%sp)
    movem.l (11*4+4,%sp), %d0/%a0-%a2  | %d0 - last value
                                    | %a0 - block pointer
                                    | %a1 - destination pointer
                                    | %a2 - stride

    cmp.l   #129, %d0               | last == 129 ?
    bne.b   .idct_add               |   no: perform idct + addition
    move.w  (%a0), %d0
    ext.l   %d0                     | ((block[0]
    asr.l   #4, %d0                 |   >> 4)
    and.l   #7, %d0                 |   & 7)
    subq.l  #4, %d0                 |   - 4 == 0 ?
    bne.w   .dc_add                 |   no: just perform addition

.idct_add:
    bsr.w   .idct                   | apply idct
    movem.l (11*4+8,%sp), %a0-%a2   | reload arguments %a0..%a2

    move.l  #255, %d2               | preload constant for clipping
    clr.l   %d3                     | used for splitting input words into bytes
                                    | (upper 24 bits stay zero, so move.b into
                                    |  %d3 yields a zero-extended byte)
    moveq.l #8, %d4                 | loop counter

.add_clip_loop:
    movem.l (%a1), %d6-%d7          | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
    swap    %d6                     | (b2 b3 b0 b1)
    swap    %d7                     | (b6 b7 b4 b5)

    move.w  (2,%a0), %d0            | load block[1]
    ext.l   %d0                     | sign extend
    move.b  %d6, %d3                | copy b1
    lsr.l   #8, %d6                 | prepare 1st buffer for next byte
    add.l   %d3, %d0                | add b1
    cmp.l   %d2, %d0                | overflow ?
    bls.b   1f
    spl.b   %d0                     |   yes: set appropriate limit value in low byte
1:
    move.w  (%a0), %d1              | load block[0]
    ext.l   %d1                     | sign extend
    move.b  %d6, %d3                | copy b0
    lsr.l   #8, %d6                 | prepare 1st buffer for next byte
    add.l   %d3, %d1                | add b0
    cmp.l   %d2, %d1                | overflow ?
    bls.b   1f
    spl.b   %d1                     |   yes: set appropriate limit value in low byte
1:
    move.b  %d1, %d5                | collect output bytes 0..3 in %d5
    lsl.l   #8, %d5
    move.b  %d0, %d5
    lsl.l   #8, %d5
    clr.l   (%a0)+                  | clear block[0] and block[1]
                                    | %a0 now pointing to block[2]
    move.w  (2,%a0), %d0            | do b3 and b2
    ext.l   %d0
    move.b  %d6, %d3
    lsr.l   #8, %d6
    add.l   %d3, %d0
    cmp.l   %d2, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.w  (%a0), %d1
    ext.l   %d1
    add.l   %d6, %d1                | only b2 is left in %d6, add it directly
    cmp.l   %d2, %d1
    bls.b   1f
    spl.b   %d1
1:
    move.b  %d1, %d5
    lsl.l   #8, %d5
    move.b  %d0, %d5
    clr.l   (%a0)+

    move.w  (2,%a0), %d0            | do b5 and b4
    ext.l   %d0
    move.b  %d7, %d3
    lsr.l   #8, %d7
    add.l   %d3, %d0
    cmp.l   %d2, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.w  (%a0), %d1
    ext.l   %d1
    move.b  %d7, %d3
    lsr.l   #8, %d7
    add.l   %d3, %d1
    cmp.l   %d2, %d1
    bls.b   1f
    spl.b   %d1
1:
    move.b  %d1, %d6                | collect output bytes 4..7 in %d6
    lsl.l   #8, %d6
    move.b  %d0, %d6
    lsl.l   #8, %d6
    clr.l   (%a0)+

    move.w  (2,%a0), %d0            | do b7 and b6
    ext.l   %d0
    move.b  %d7, %d3
    lsr.l   #8, %d7
    add.l   %d3, %d0
    cmp.l   %d2, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.w  (%a0), %d1
    ext.l   %d1
    add.l   %d7, %d1                | only b6 is left in %d7, add it directly
    cmp.l   %d2, %d1
    bls.b   1f
    spl.b   %d1
1:
    move.b  %d1, %d6
    lsl.l   #8, %d6
    move.b  %d0, %d6
    clr.l   (%a0)+

    movem.l %d5-%d6, (%a1)          | write all 8 output bytes at once
    add.l   %a2, %a1                | advance output pointer
    subq.l  #1, %d4                 | loop 8 times
    bne.w   .add_clip_loop

    bra.w   .idct_add_end

.dc_add:
    move.w  (%a0), %d0
    ext.l   %d0                     | %d0 = (block[0]
    add.l   #64, %d0                |        + 64)
    asr.l   #7, %d0                 |       >> 7
    clr.w   (%a0)                   | clear block[0]
    clr.w   (63*2,%a0)              | and block[63]
    move.l  %d0, %a0                | DC value in %a0

    move.l  #255, %d2               | preload constant for clipping
    clr.l   %d3                     | for splitting input words into bytes
    moveq.l #8, %d4                 | loop counter

.dc_clip_loop:
    movem.l (%a1), %d6-%d7          | (b0 b1 b2 b3) (b4 b5 b6 b7)
    swap    %d6                     | (b2 b3 b0 b1)
    swap    %d7                     | (b6 b7 b4 b5)

    move.l  %a0, %d0                | copy DC
    move.b  %d6, %d3                | copy b1
    lsr.l   #8, %d6                 | prepare 1st buffer for next byte
    add.l   %d3, %d0                | add b1
    cmp.l   %d2, %d0                | overflow ?
    bls.b   1f
    spl.b   %d0                     |   yes: set appropriate limit value in low byte
1:
    move.l  %a0, %d1                | copy DC
    move.b  %d6, %d3                | copy b0
    lsr.l   #8, %d6                 | prepare 1st buffer for next byte
    add.l   %d3, %d1                | add b0
    cmp.l   %d2, %d1                | overflow ?
    bls.b   1f
    spl.b   %d1                     |   yes: set appropriate limit value in low byte
1:
    move.b  %d1, %d5                | collect output bytes 0..3 in %d5
    lsl.l   #8, %d5
    move.b  %d0, %d5
    lsl.l   #8, %d5

    move.l  %a0, %d0                | do b3 and b2
    move.b  %d6, %d3
    lsr.l   #8, %d6
    add.l   %d3, %d0
    cmp.l   %d2, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.l  %a0, %d1
    add.l   %d6, %d1                | only b2 is left in %d6, add it directly
    cmp.l   %d2, %d1
    bls.b   1f
    spl.b   %d1
1:
    move.b  %d1, %d5
    lsl.l   #8, %d5
    move.b  %d0, %d5

    move.l  %a0, %d0                | do b5 and b4
    move.b  %d7, %d3
    lsr.l   #8, %d7
    add.l   %d3, %d0
    cmp.l   %d2, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.l  %a0, %d1
    move.b  %d7, %d3
    lsr.l   #8, %d7
    add.l   %d3, %d1
    cmp.l   %d2, %d1
    bls.b   1f
    spl.b   %d1
1:
    move.b  %d1, %d6                | collect output bytes 4..7 in %d6
    lsl.l   #8, %d6
    move.b  %d0, %d6
    lsl.l   #8, %d6

    move.l  %a0, %d0                | do b7 and b6
    move.b  %d7, %d3
    lsr.l   #8, %d7
    add.l   %d3, %d0
    cmp.l   %d2, %d0
    bls.b   1f
    spl.b   %d0
1:
    move.l  %a0, %d1
    add.l   %d7, %d1                | only b6 is left in %d7, add it directly
    cmp.l   %d2, %d1
    bls.b   1f
    spl.b   %d1
1:
    move.b  %d1, %d6
    lsl.l   #8, %d6
    move.b  %d0, %d6

    movem.l %d5-%d6, (%a1)          | write all 8 output bytes at once
    add.l   %a2, %a1                | advance output pointer
    subq.l  #1, %d4                 | loop 8 times
    bne.w   .dc_clip_loop

.idct_add_end:
    movem.l (%sp), %d2-%d7/%a2-%a6
    lea.l   (11*4,%sp), %sp
    rts

    .size   mpeg2_idct_add, .-mpeg2_idct_add