diff options
Diffstat (limited to 'apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S')
-rw-r--r-- | apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S | 575 |
1 files changed, 575 insertions, 0 deletions
diff --git a/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S new file mode 100644 index 0000000000..abc54b16cb --- /dev/null +++ b/apps/plugins/mpegplayer/libmpeg2/idct_coldfire.S | |||
@@ -0,0 +1,575 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2007 Jens Arnold | ||
11 | * Based on the work of Karim Boucher and Rani Hod | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public License | ||
15 | * as published by the Free Software Foundation; either version 2 | ||
16 | * of the License, or (at your option) any later version. | ||
17 | * | ||
18 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
19 | * KIND, either express or implied. | ||
20 | * | ||
21 | ****************************************************************************/ | ||
22 | |||
/* Entry points exported from this file; both are marked as function symbols. */
    .global     mpeg2_idct_copy
    .global     mpeg2_idct_add
    .type       mpeg2_idct_copy, @function
    .type       mpeg2_idct_add, @function
27 | |||
/* .idct - two-pass 8x8 inverse DCT on 16-bit coefficients using the EMAC.
 *
 * Input:   %a0 = block pointer.
 * Output:  the 64 result words are written back over the input block.
 * Scratch: the 128 bytes directly after the block (block+128 .. block+255)
 *          hold the transposed intermediate of the first pass.
 * Clobbers: %d0-%d7, %a0-%a6, %macsr, %acc0-%acc3.  Nothing is preserved,
 *          so the caller must save every register it still needs.
 *
 * Each 16-byte group is read as four longwords holding the coefficient
 * pairs (f0,f2) (f4,f6) (f1,f3) (f5,f7).  Every pass stores its eight
 * results 16 bytes apart, i.e. transposed, so running the same pass twice
 * restores the original orientation.
 */

/* Butterfly weights; they are packed pairwise into %a0-%a3 below. */
    .equ    IDCT_W0, 2048
    .equ    IDCT_W1, 2841
    .equ    IDCT_W2, 2676
    .equ    IDCT_W3, 2408
    .equ    IDCT_W4, 2048
    .equ    IDCT_W5, 1609
    .equ    IDCT_W6, 1108
    .equ    IDCT_W7,  565

/* Scale one result and store it into output slot \slot (16 bytes apart):
 *   \fetch   - move.l keeps the accumulator, movclr.l reads and clears it
 *   \acc     - accumulator holding the even sum
 *   \combine - sub.l or add.l
 *   \odd     - data register holding the odd sum
 * Expects the shift count in %d1 and the output pointer in %a4.
 */
.macro  idct_store  fetch, acc, combine, odd, slot
    \fetch  \acc, %d0
    \combine \odd, %d0
    asr.l   %d1, %d0
    move.w  %d0, (\slot*16,%a4)
.endm

/* One complete 1-D pass over eight 16-byte groups.
 *   \top   - label placed at the head of the loop
 *   \bias  - value added to f0 before the W0 multiply (rounding term)
 *   \shift - right shift applied to every result
 * Expects: %a6 = source, %a4 = destination, %d3 = 8, weights in %a0-%a3.
 * On exit: %a6 = source + 128, %a4 = destination + 16, %acc0-%acc3 cleared.
 */
.macro  idct_pass   top, bias, shift
\top:
    movem.l (%a6), %d0-%d2/%a5      | (f0,f2) (f4,f6) (f1,f3) (f5,f7)

    | Odd part: four dot products of (f1, f3, f5, f7).
    mac.w   %a0l, %d2u, %acc0       | b0 =  W1 * f1
    mac.w   %a1l, %d2l, %acc0       |     + W3 * f3
    mac.w   %a2l, %a5u, %acc0       |     + W5 * f5
    mac.w   %a3l, %a5l, %acc0       |     + W7 * f7

    mac.w   %a1l, %d2u, %acc1       | b1 =  W3 * f1
    msac.w  %a3l, %d2l, %acc1       |     - W7 * f3
    msac.w  %a0l, %a5u, %acc1       |     - W1 * f5
    msac.w  %a2l, %a5l, %acc1       |     - W5 * f7

    mac.w   %a2l, %d2u, %acc2       | b2 =  W5 * f1
    msac.w  %a0l, %d2l, %acc2       |     - W1 * f3
    mac.w   %a3l, %a5u, %acc2       |     + W7 * f5
    mac.w   %a1l, %a5l, %acc2       |     + W3 * f7

    mac.w   %a3l, %d2u, %acc3       | b3 =  W7 * f1
    msac.w  %a2l, %d2l, %acc3       |     - W5 * f3
    mac.w   %a1l, %a5u, %acc3       |     + W3 * f5
    msac.w  %a0l, %a5l, %acc3       |     - W1 * f7

    | These two are placed here to fill EMAC latency.
    lea.l   (16,%a6), %a6           | step to the next source group
    add.l   #(\bias<<16), %d0       | f0 += bias (f0 is the upper word)

    movclr.l %acc0, %d4             | b0
    movclr.l %acc1, %d5             | b1
    movclr.l %acc2, %d6             | b2
    movclr.l %acc3, %d7             | b3

    | Even part: four combinations of (f0, f2, f4, f6).
    mac.w   %a0u, %d0u, %acc0       | a0 =  W0 * f0
    mac.w   %a2u, %d1u, %acc0       |     + W4 * f4
    move.l  %acc0, %acc3            | seed a3 with W0 * f0 + W4 * f4
    mac.w   %a1u, %d0l, %acc0       |     + W2 * f2
    mac.w   %a3u, %d1l, %acc0       |     + W6 * f6

    mac.w   %a0u, %d0u, %acc1       | a1 =  W0 * f0
    msac.w  %a2u, %d1u, %acc1       |     - W4 * f4
    move.l  %acc1, %acc2            | seed a2 with W0 * f0 - W4 * f4
    mac.w   %a3u, %d0l, %acc1       |     + W6 * f2
    msac.w  %a1u, %d1l, %acc1       |     - W2 * f6

    msac.w  %a3u, %d0l, %acc2       | a2 = seed - W6 * f2
    mac.w   %a1u, %d1l, %acc2       |           + W2 * f6

    msac.w  %a1u, %d0l, %acc3       | a3 = seed - W2 * f2
    msac.w  %a3u, %d1l, %acc3       |           - W6 * f6

    moveq.l #\shift, %d1            | shift amount for all eight results

    | Differences first (accumulators kept), then sums (accumulators cleared).
    idct_store move.l,   %acc0, sub.l, %d4, 7   | out[7] = (a0 - b0) >> shift
    idct_store move.l,   %acc1, sub.l, %d5, 6   | out[6] = (a1 - b1) >> shift
    idct_store move.l,   %acc2, sub.l, %d6, 5   | out[5] = (a2 - b2) >> shift
    idct_store move.l,   %acc3, sub.l, %d7, 4   | out[4] = (a3 - b3) >> shift
    idct_store movclr.l, %acc3, add.l, %d7, 3   | out[3] = (a3 + b3) >> shift
    idct_store movclr.l, %acc2, add.l, %d6, 2   | out[2] = (a2 + b2) >> shift
    idct_store movclr.l, %acc1, add.l, %d5, 1   | out[1] = (a1 + b1) >> shift

    movclr.l %acc0, %d0             | out[0] = (a0
    add.l   %d4, %d0                |           + b0)
    asr.l   %d1, %d0                |          >> shift
    move.w  %d0, (%a4)+             | also steps to the next output column

    subq.l  #1, %d3                 | eight groups per pass
    bne.w   \top
.endm

    .align  2
.idct:
    move.l  %a0, %a6                | %a6 walks the source data

    move.l  #0, %macsr              | signed integer mode

    move.l  #((IDCT_W0<<16)+IDCT_W1), %a0   | W0, W1
    move.l  #((IDCT_W2<<16)+IDCT_W3), %a1   | W2, W3
    move.l  #((IDCT_W4<<16)+IDCT_W5), %a2   | W4, W5
    move.l  #((IDCT_W6<<16)+IDCT_W7), %a3   | W6, W7

    | First pass: block -> transposed temp buffer right behind the block.
    | Adding 1 to f0 contributes W0 = 2048, half of 1 << 12.
    lea.l   (128,%a6), %a4
    moveq.l #8, %d3                 | loop counter
    idct_pass .row_loop, 1, 12

    | Second pass: temp buffer -> block.  %a6 already points at the temp
    | buffer; %a4 ended 16 bytes into it and is moved back to the block.
    | Adding 32 to f0 contributes 32 * 2048 = 65536, half of 1 << 17.
    lea.l   (-16-128,%a4), %a4
    moveq.l #8, %d3                 | loop counter
    idct_pass .col_loop, 32, 17

    rts
242 | |||
/* mpeg2_idct_copy - run the IDCT on a block, then store the result as
 * 8 rows of 8 unsigned bytes.
 *
 * Stack arguments, in order: block pointer, destination pointer, stride.
 *
 * For each of the 8 rows the 16-bit results are clamped to 0..255, packed
 * into two longwords and written to the destination, which then advances
 * by stride bytes.  The block is cleared to zero as it is consumed.
 * Saves and restores %d2-%d7/%a2-%a6.
 */

/* Sign-extend the word in %d0 and clamp it to 0..255 (limit 255 in %d1).
 * The unsigned compare lets in-range values skip the fix-up.  Otherwise the
 * sign of (value - 255) picks the limit: plus gives 0xff, minus gives 0x00.
 */
.macro  copy_sat8
    ext.l   %d0
    cmp.l   %d1, %d0
    bls.b   1f
    spl.b   %d0
1:
.endm

/* Clamp the next four block words and pack them, first word in the most
 * significant byte, into \out.  Clears the four words and leaves %a0
 * pointing at the word after them.
 */
.macro  copy_quad   out
    move.w  (%a0), %d0              | first word of the pair
    copy_sat8
    move.b  %d0, \out
    lsl.l   #8, \out
    move.w  (2,%a0), %d0            | second word of the pair
    copy_sat8
    move.b  %d0, \out
    lsl.l   #8, \out
    clr.l   (%a0)+                  | clear both words, step past them

    move.w  (%a0), %d0              | same again for the next pair
    copy_sat8
    move.b  %d0, \out
    lsl.l   #8, \out
    move.w  (2,%a0), %d0
    copy_sat8
    move.b  %d0, \out               | fourth byte: no further shift
    clr.l   (%a0)+
.endm

    .align  2

mpeg2_idct_copy:
    lea.l   (-11*4,%sp), %sp
    movem.l %d2-%d7/%a2-%a6, (%sp)  | save the registers used below
    move.l  (11*4+4,%sp), %a0       | %a0 - block pointer for the idct

    bsr.w   .idct                   | transform the block in place
    movem.l (11*4+4,%sp), %a0-%a2   | %a0 - block pointer
                                    | %a1 - destination pointer
                                    | %a2 - stride

    move.l  #255, %d1               | clamp limit
    moveq.l #8, %d4                 | row counter

.copy_clip_loop:
    copy_quad %d2                   | output bytes 0..3
    copy_quad %d3                   | output bytes 4..7

    movem.l %d2-%d3, (%a1)          | store all 8 bytes of the row
    add.l   %a2, %a1                | next destination row
    subq.l  #1, %d4                 | 8 rows in total
    bne.w   .copy_clip_loop

    movem.l (%sp), %d2-%d7/%a2-%a6
    lea.l   (11*4,%sp), %sp
    rts
341 | |||
/* mpeg2_idct_add - add a residual block to 8 rows of 8 unsigned bytes.
 *
 * Stack arguments, in order: last, block pointer, destination pointer,
 * stride.
 *
 * Two paths:
 *  - DC only, taken when last == 129 and ((block[0] >> 4) & 7) != 4:
 *    the single value (block[0] + 64) >> 7 is added to every pixel, and
 *    block[0] and block[63] are cleared.
 *  - otherwise: the block goes through .idct, each result word is added
 *    to the matching pixel, and the block is cleared as it is consumed.
 * Every sum is clamped to 0..255.  Saves and restores %d2-%d7/%a2-%a6.
 *
 * Register use inside both loops:
 *   %d2 = 255 (clamp limit)      %d3 = zero-extended pixel byte
 *   %d4 = row counter            %d5/%d6 = packed output bytes 0..3 / 4..7
 *   %d6/%d7 = fetched pixels, halves swapped so bytes come out in the
 *             order 1, 0, 3, 2 (resp. 5, 4, 7, 6) when shifted right
 */

/* Clamp \val to 0..255 (limit in %d2).  The unsigned compare lets in-range
 * values through; otherwise the sign of (value - 255) selects 0xff or 0x00
 * for the low byte.
 */
.macro  add_sat8    val
    cmp.l   %d2, \val
    bls.b   1f
    spl.b   \val
1:
.endm

/* \val += low byte of \pix, then clamp; \pix is shifted right one byte. */
.macro  add_mix     pix, val
    move.b  \pix, %d3
    lsr.l   #8, \pix
    add.l   %d3, \val
    add_sat8 \val
.endm

/* Same for the last byte left in \pix, which can be added directly. */
.macro  add_mix_last pix, val
    add.l   \pix, \val
    add_sat8 \val
.endm

/* Four pixels of the idct path: add block words to the bytes in \pix and
 * pack the clamped sums into \out.  Clears the four block words and
 * leaves %a0 pointing at the word after them.
 */
.macro  add_quad    pix, out
    move.w  (2,%a0), %d0            | odd word of the first pair
    ext.l   %d0
    add_mix \pix, %d0
    move.w  (%a0), %d1              | even word of the first pair
    ext.l   %d1
    add_mix \pix, %d1
    move.b  %d1, \out
    lsl.l   #8, \out
    move.b  %d0, \out
    lsl.l   #8, \out
    clr.l   (%a0)+                  | clear both words, step past them

    move.w  (2,%a0), %d0            | odd word of the second pair
    ext.l   %d0
    add_mix \pix, %d0
    move.w  (%a0), %d1              | even word of the second pair
    ext.l   %d1
    add_mix_last \pix, %d1
    move.b  %d1, \out
    lsl.l   #8, \out
    move.b  %d0, \out               | fourth byte: no further shift
    clr.l   (%a0)+
.endm

/* Four pixels of the DC path: like add_quad, but every pixel receives the
 * constant held in %a0 and no block memory is touched.
 */
.macro  add_dc_quad pix, out
    move.l  %a0, %d0
    add_mix \pix, %d0
    move.l  %a0, %d1
    add_mix \pix, %d1
    move.b  %d1, \out
    lsl.l   #8, \out
    move.b  %d0, \out
    lsl.l   #8, \out

    move.l  %a0, %d0
    add_mix \pix, %d0
    move.l  %a0, %d1
    add_mix_last \pix, %d1
    move.b  %d1, \out
    lsl.l   #8, \out
    move.b  %d0, \out
.endm

    .align  2

mpeg2_idct_add:
    lea.l   (-11*4,%sp), %sp
    movem.l %d2-%d7/%a2-%a6, (%sp)
    movem.l (11*4+4,%sp), %d0/%a0-%a2   | %d0 - last value
                                        | %a0 - block pointer
                                        | %a1 - destination pointer
                                        | %a2 - stride

    cmp.l   #129, %d0               | last == 129 ?
    bne.b   .idct_add               |   no: full idct + addition
    move.w  (%a0), %d0
    ext.l   %d0                     | ((block[0]
    asr.l   #4, %d0                 |   >> 4)
    and.l   #7, %d0                 |   & 7)
    subq.l  #4, %d0                 |  - 4 == 0 ?
    bne.w   .dc_add                 |   no: DC-only addition is enough

.idct_add:
    bsr.w   .idct                   | transform the block in place
    movem.l (11*4+8,%sp), %a0-%a2   | reload block, destination, stride

    move.l  #255, %d2               | clamp limit
    clr.l   %d3                     | upper bytes stay zero for add_mix
    moveq.l #8, %d4                 | row counter

.add_clip_loop:
    movem.l (%a1), %d6-%d7          | (b0 b1 b2 b3) (b4 b5 b6 b7)
    swap    %d6                     | (b2 b3 b0 b1)
    swap    %d7                     | (b6 b7 b4 b5)

    add_quad %d6, %d5               | pixels 0..3 -> %d5
    add_quad %d7, %d6               | pixels 4..7 -> %d6

    movem.l %d5-%d6, (%a1)          | store all 8 bytes of the row
    add.l   %a2, %a1                | next destination row
    subq.l  #1, %d4                 | 8 rows in total
    bne.w   .add_clip_loop

    bra.w   .idct_add_end

.dc_add:
    move.w  (%a0), %d0
    ext.l   %d0                     | %d0 = (block[0]
    add.l   #64, %d0                |        + 64)
    asr.l   #7, %d0                 |       >> 7
    clr.w   (%a0)                   | clear block[0]
    clr.w   (63*2,%a0)              |   and block[63]
    move.l  %d0, %a0                | keep the DC value in %a0

    move.l  #255, %d2               | clamp limit
    clr.l   %d3                     | upper bytes stay zero for add_mix
    moveq.l #8, %d4                 | row counter

.dc_clip_loop:
    movem.l (%a1), %d6-%d7          | (b0 b1 b2 b3) (b4 b5 b6 b7)
    swap    %d6                     | (b2 b3 b0 b1)
    swap    %d7                     | (b6 b7 b4 b5)

    add_dc_quad %d6, %d5            | pixels 0..3 -> %d5
    add_dc_quad %d7, %d6            | pixels 4..7 -> %d6

    movem.l %d5-%d6, (%a1)          | store all 8 bytes of the row
    add.l   %a2, %a1                | next destination row
    subq.l  #1, %d4                 | 8 rows in total
    bne.w   .dc_clip_loop

.idct_add_end:
    movem.l (%sp), %d2-%d7/%a2-%a6
    lea.l   (11*4,%sp), %sp
    rts