summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libffmpegFLAC/coldfire.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libffmpegFLAC/coldfire.S')
-rw-r--r--  lib/rbcodec/codecs/libffmpegFLAC/coldfire.S | 535
1 files changed, 535 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libffmpegFLAC/coldfire.S b/lib/rbcodec/codecs/libffmpegFLAC/coldfire.S
new file mode 100644
index 0000000000..efbb907874
--- /dev/null
+++ b/lib/rbcodec/codecs/libffmpegFLAC/coldfire.S
@@ -0,0 +1,535 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2005 by Thom Johansen
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22/* The following are assembler optimised version of the LPC filtering
 23   routines needed for FLAC decoding. They are optimised for use with the
24 MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
25 */
26
27/* This routine deals with sample widths 16 and lower. All LPC filtering up to
28 order 10 is done in specially optimised unrolled loops, while every order
29 above this is handled by a slower default routine.
30 */
31 .section .icode,"ax",@progbits
32 .global lpc_decode_emac
33 .align 2
34lpc_decode_emac:
35 lea.l (-44, %sp), %sp
36 movem.l %d2-%d7/%a2-%a6, (%sp)
37 movem.l (44+4, %sp), %d0-%d2/%a0-%a1
38 /* d0 = blocksize, d1 = qlevel, d2 = pred_order
39 a0 = data, a1 = coeffs
40 */
41
42 /* the data pointer always lags behind history pointer by 'pred_order'
43 samples. since we have one loop for each order, we can hard code this
44 and free a register by not saving data pointer.
45 */
46 move.l %d2, %d3
47 neg.l %d3
48 lea.l (%a0, %d3.l*4), %a0 | history
49 clr.l %d3
50 move.l %d3, %macsr | we'll need integer mode for this
51 tst.l %d0
52 jeq .exit | zero samples to process, exit
53 moveq.l #10, %d3
54 cmp.l %d3, %d2
55 jgt .default | order is over 10, jump to default case
56 jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order
57| jumptable:
58 bra.w .exit | zero order filter isn't possible, exit function
59 bra.w .order1
60 bra.w .order2
61 bra.w .order3
62 bra.w .order4
63 bra.w .order5
64 bra.w .order6
65 bra.w .order7
66 bra.w .order8
67 bra.w .order9
68
69| last jump table entry coincides with target, so leave it out
70.order10:
71 movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
72 move.l (%a0)+, %a6 | load first history sample
731:
74 mac.l %a6, %a5, (%a0)+, %a6, %acc0
75 mac.l %a6, %a4, (%a0)+, %a6, %acc0
76 mac.l %a6, %a3, (%a0)+, %a6, %acc0
77 mac.l %a6, %a2, (%a0)+, %a6, %acc0
78 mac.l %a6, %a1, (%a0)+, %a6, %acc0
79 mac.l %a6, %d7, (%a0)+, %a6, %acc0
80 mac.l %a6, %d6, (%a0)+, %a6, %acc0
81 mac.l %a6, %d5, (%a0)+, %a6, %acc0
82 mac.l %a6, %d4, (%a0)+, %a6, %acc0
83 mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration
84 movclr.l %acc0, %d2 | get sum
85 asr.l %d1, %d2 | shift sum by qlevel bits
86 add.l %d2, (%a0) | add residual and save
87 lea.l (-8*4, %a0), %a0 | point history back at second element
88 subq.l #1, %d0 | decrement sample count
89 jne 1b | are we done?
90 jra .exit
91
92.order9:
93 movem.l (%a1), %d4-%d7/%a1-%a5
94 move.l (%a0)+, %a6
951:
96 mac.l %a6, %a5, (%a0)+, %a6, %acc0
97 mac.l %a6, %a4, (%a0)+, %a6, %acc0
98 mac.l %a6, %a3, (%a0)+, %a6, %acc0
99 mac.l %a6, %a2, (%a0)+, %a6, %acc0
100 mac.l %a6, %a1, (%a0)+, %a6, %acc0
101 mac.l %a6, %d7, (%a0)+, %a6, %acc0
102 mac.l %a6, %d6, (%a0)+, %a6, %acc0
103 mac.l %a6, %d5, (%a0)+, %a6, %acc0
104 mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0
105 movclr.l %acc0, %d2
106 asr.l %d1, %d2
107 add.l %d2, (%a0)
108 lea.l (-7*4, %a0), %a0
109 subq.l #1, %d0
110 jne 1b
111 jra .exit
112
113.order8:
114 movem.l (%a1), %d5-%d7/%a1-%a5
115 move.l (%a0)+, %a6
1161:
117 mac.l %a6, %a5, (%a0)+, %a6, %acc0
118 mac.l %a6, %a4, (%a0)+, %a6, %acc0
119 mac.l %a6, %a3, (%a0)+, %a6, %acc0
120 mac.l %a6, %a2, (%a0)+, %a6, %acc0
121 mac.l %a6, %a1, (%a0)+, %a6, %acc0
122 mac.l %a6, %d7, (%a0)+, %a6, %acc0
123 mac.l %a6, %d6, (%a0)+, %a6, %acc0
124 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0
125 movclr.l %acc0, %d2
126 asr.l %d1, %d2
127 add.l %d2, (%a0)
128 lea.l (-6*4, %a0), %a0
129 subq.l #1, %d0
130 jne 1b
131 jra .exit
132
133.order7:
134 movem.l (%a1), %d6-%d7/%a1-%a5
135 move.l (%a0)+, %a6
1361:
137 mac.l %a6, %a5, (%a0)+, %a6, %acc0
138 mac.l %a6, %a4, (%a0)+, %a6, %acc0
139 mac.l %a6, %a3, (%a0)+, %a6, %acc0
140 mac.l %a6, %a2, (%a0)+, %a6, %acc0
141 mac.l %a6, %a1, (%a0)+, %a6, %acc0
142 mac.l %a6, %d7, (%a0)+, %a6, %acc0
143 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
144 movclr.l %acc0, %d2
145 asr.l %d1, %d2
146 add.l %d2, (%a0)
147 lea.l (-5*4, %a0), %a0
148 subq.l #1, %d0
149 jne 1b
150 jra .exit
151
152.order6:
153 movem.l (%a1), %d7/%a1-%a5
154 move.l (%a0)+, %a6
1551:
156 mac.l %a6, %a5, (%a0)+, %a6, %acc0
157 mac.l %a6, %a4, (%a0)+, %a6, %acc0
158 mac.l %a6, %a3, (%a0)+, %a6, %acc0
159 mac.l %a6, %a2, (%a0)+, %a6, %acc0
160 mac.l %a6, %a1, (%a0)+, %a6, %acc0
161 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
162 movclr.l %acc0, %d2
163 asr.l %d1, %d2
164 add.l %d2, (%a0)
165 lea.l (-4*4, %a0), %a0
166 subq.l #1, %d0
167 jne 1b
168 jra .exit
169
170.order5:
171 movem.l (%a1), %a1-%a5
172 move.l (%a0)+, %a6
1731:
174 mac.l %a6, %a5, (%a0)+, %a6, %acc0
175 mac.l %a6, %a4, (%a0)+, %a6, %acc0
176 mac.l %a6, %a3, (%a0)+, %a6, %acc0
177 mac.l %a6, %a2, (%a0)+, %a6, %acc0
178 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
179 movclr.l %acc0, %d2
180 asr.l %d1, %d2
181 add.l %d2, (%a0)
182 lea.l (-3*4, %a0), %a0
183 subq.l #1, %d0
184 jne 1b
185 jra .exit
186
187.order4:
188 movem.l (%a1), %a2-%a5
189 move.l (%a0)+, %a6
1901:
191 mac.l %a6, %a5, (%a0)+, %a6, %acc0
192 mac.l %a6, %a4, (%a0)+, %a6, %acc0
193 mac.l %a6, %a3, (%a0)+, %a6, %acc0
194 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
195 movclr.l %acc0, %d2
196 asr.l %d1, %d2
197 add.l %d2, (%a0)
198 subq.l #8, %a0
199 subq.l #1, %d0
200 jne 1b
201 jra .exit
202
203.order3:
204 movem.l (%a1), %a3-%a5
205 move.l (%a0)+, %a6
2061:
207 mac.l %a6, %a5, (%a0)+, %a6, %acc0
208 mac.l %a6, %a4, (%a0)+, %a6, %acc0
209 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
210 movclr.l %acc0, %d2
211 asr.l %d1, %d2
212 add.l %d2, (%a0)
213 subq.l #4, %a0
214 subq.l #1, %d0
215 jne 1b
216 jra .exit
217
218.order2:
219 movem.l (%a1), %a4-%a5
220 move.l (%a0)+, %a6
2211:
222 mac.l %a6, %a5, (%a0)+, %a6, %acc0
223 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
224 movclr.l %acc0, %d2
225 asr.l %d1, %d2
226 add.l %d2, (%a0)
227 subq.l #1, %d0
228 jne 1b
229 jra .exit
230
231.order1:
232 | no point in using mac here
233 move.l (%a1), %a5
2341:
235 move.l %a5, %d2
236 muls.l (%a0)+, %d2
237 asr.l %d1, %d2
238 add.l %d2, (%a0)
239 subq.l #1, %d0
240 jne 1b
241 jra .exit
242
243.default:
244 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
245 do the rest by jump table. */
246 lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs
247 move.l %a0, %a3 | working copy of history pointer
248 move.l %d2, %d3
249 lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop
250 move.l (%a3)+, %a5 | preload data for loop
2511:
252 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
253 movem.l (%a2), %d4-%d7 | load four coefs
254 mac.l %a5, %d7, (%a3)+, %a5, %acc0
255 mac.l %a5, %d6, (%a3)+, %a5, %acc0
256 mac.l %a5, %d5, (%a3)+, %a5, %acc0
257 mac.l %a5, %d4, (%a3)+, %a5, %acc0
258 subq.l #1, %d3 | any more unrolled loop operations left?
259 jne 1b
260
261 moveq.l #3, %d3 | mask 0x00000003
262 and.l %d2, %d3 | get the remaining samples to be filtered
263 jmp.l (2, %pc, %d3*2) | then jump into mac.l chain
264| jumptable:
265 bra.b 3f | none left
266 bra.b 2f | one left
267 bra.b 1f | two left
268| three left
269 move.l -(%a2), %d4
270 mac.l %a5, %d4, (%a3)+, %a5, %acc0
2711:
272 move.l -(%a2), %d4
273 mac.l %a5, %d4, (%a3)+, %a5, %acc0
2742:
275 move.l -(%a2), %d4
276 mac.l %a5, %d4, (%a3)+, %a5, %acc0
2773:
278 movclr.l %acc0, %d3 | get result
279 asr.l %d1, %d3 | shift qlevel bits right
280 add.l %a5, %d3 | add residual, which is in a5 by now
281 move.l %d3, -(%a3) | save, a3 is also one past save location
282 addq.l #4, %a0 | increment history pointer
283 subq.l #1, %d0 | decrement sample count
284 jne .default | are we done?
285 jra .exit | if so, fall through to exit
286
287
288/* This routine deals with sample widths 24 and lower. All LPC filtering up to
289 order 8 is done in specially optimised unrolled loops, while every order
290 above this is handled by a slower default routine.
291 */
292 .global lpc_decode_emac_wide
293 .align 2
294lpc_decode_emac_wide:
295 lea.l (-44, %sp), %sp
296 movem.l %d2-%d7/%a2-%a6, (%sp)
297 movem.l (44+4, %sp), %d0-%d1/%d3/%a0-%a1
298 /* d0 = blocksize, d1 = qlevel, d3 = pred_order
299 a0 = data, a1 = coeffs
300 */
301
302 /* the data pointer always lags behind history pointer by 'pred_order'
303 samples. since we have one loop for each order, we can hard code this
304 and free a register by not saving data pointer.
305 */
306 move.l %d3, %d2
307 neg.l %d2
308 lea.l (%a0, %d2.l*4), %a0 | history
309 clr.l %d2
310 move.l %d2, %macsr | we'll need integer mode for this
311 tst.l %d0
312 jeq .exit | zero samples to process, exit
313 moveq.l #32, %d2
314 sub.l %d1, %d2 | calculate shift amount for extension byte
315 moveq.l #8, %d4
316 cmp.l %d4, %d3
317 jgt .wdefault | order is over 8, jump to default case
318 jmp.l (2, %pc, %d3.l*4) | jump to loop corresponding to pred_order
319| jumptable:
320 bra.w .exit | zero order filter isn't possible, exit function
321 bra.w .worder1
322 bra.w .worder2
323 bra.w .worder3
324 bra.w .worder4
325 bra.w .worder5
326 bra.w .worder6
327 bra.w .worder7
328
329| last jump table entry coincides with target, so leave it out
330.worder8:
331 movem.l (%a1), %d5-%d7/%a1-%a5 | load lpc coefs
332 move.l (%a0)+, %a6 | load first history sample
3331:
334 mac.l %a6, %a5, (%a0)+, %a6, %acc0
335 mac.l %a6, %a4, (%a0)+, %a6, %acc0
336 mac.l %a6, %a3, (%a0)+, %a6, %acc0
337 mac.l %a6, %a2, (%a0)+, %a6, %acc0
338 mac.l %a6, %a1, (%a0)+, %a6, %acc0
339 mac.l %a6, %d7, (%a0)+, %a6, %acc0
340 mac.l %a6, %d6, (%a0)+, %a6, %acc0
341 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration
342 move.l %accext01, %d4 | get top 8 bits of sum
343 movclr.l %acc0, %d3 | then botten 32 bits
344 lsr.l %d1, %d3 | shift bottom bits qlevel bits right
345 asl.l %d2, %d4 | shift top bits 32 - qlevel bits left
346 or.l %d4, %d3 | now combine results
347 add.l %d3, (%a0) | add residual and save
348 lea.l (-6*4, %a0), %a0 | point history back at second element
349 subq.l #1, %d0 | decrement sample count
350 jne 1b | are we done?
351 jra .exit
352
353.worder7:
354 movem.l (%a1), %d6-%d7/%a1-%a5
355 move.l (%a0)+, %a6
3561:
357 mac.l %a6, %a5, (%a0)+, %a6, %acc0
358 mac.l %a6, %a4, (%a0)+, %a6, %acc0
359 mac.l %a6, %a3, (%a0)+, %a6, %acc0
360 mac.l %a6, %a2, (%a0)+, %a6, %acc0
361 mac.l %a6, %a1, (%a0)+, %a6, %acc0
362 mac.l %a6, %d7, (%a0)+, %a6, %acc0
363 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
364 move.l %accext01, %d4
365 movclr.l %acc0, %d3
366 lsr.l %d1, %d3
367 asl.l %d2, %d4
368 or.l %d4, %d3
369 add.l %d3, (%a0)
370 lea.l (-5*4, %a0), %a0
371 subq.l #1, %d0
372 jne 1b
373 jra .exit
374
375.worder6:
376 movem.l (%a1), %d7/%a1-%a5
377 move.l (%a0)+, %a6
3781:
379 mac.l %a6, %a5, (%a0)+, %a6, %acc0
380 mac.l %a6, %a4, (%a0)+, %a6, %acc0
381 mac.l %a6, %a3, (%a0)+, %a6, %acc0
382 mac.l %a6, %a2, (%a0)+, %a6, %acc0
383 mac.l %a6, %a1, (%a0)+, %a6, %acc0
384 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
385 move.l %accext01, %d4
386 movclr.l %acc0, %d3
387 lsr.l %d1, %d3
388 asl.l %d2, %d4
389 or.l %d4, %d3
390 add.l %d3, (%a0)
391 lea.l (-4*4, %a0), %a0
392 subq.l #1, %d0
393 jne 1b
394 jra .exit
395
396.worder5:
397 movem.l (%a1), %a1-%a5
398 move.l (%a0)+, %a6
3991:
400 mac.l %a6, %a5, (%a0)+, %a6, %acc0
401 mac.l %a6, %a4, (%a0)+, %a6, %acc0
402 mac.l %a6, %a3, (%a0)+, %a6, %acc0
403 mac.l %a6, %a2, (%a0)+, %a6, %acc0
404 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
405 move.l %accext01, %d4
406 movclr.l %acc0, %d3
407 lsr.l %d1, %d3
408 asl.l %d2, %d4
409 or.l %d4, %d3
410 add.l %d3, (%a0)
411 lea.l (-3*4, %a0), %a0
412 subq.l #1, %d0
413 jne 1b
414 jra .exit
415
416.worder4:
417 movem.l (%a1), %a2-%a5
418 move.l (%a0)+, %a6
4191:
420 mac.l %a6, %a5, (%a0)+, %a6, %acc0
421 mac.l %a6, %a4, (%a0)+, %a6, %acc0
422 mac.l %a6, %a3, (%a0)+, %a6, %acc0
423 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
424 move.l %accext01, %d4
425 movclr.l %acc0, %d3
426 lsr.l %d1, %d3
427 asl.l %d2, %d4
428 or.l %d4, %d3
429 add.l %d3, (%a0)
430 subq.l #8, %a0
431 subq.l #1, %d0
432 jne 1b
433 jra .exit
434
435.worder3:
436 movem.l (%a1), %a3-%a5
437 move.l (%a0)+, %a6
4381:
439 mac.l %a6, %a5, (%a0)+, %a6, %acc0
440 mac.l %a6, %a4, (%a0)+, %a6, %acc0
441 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
442 move.l %accext01, %d4
443 movclr.l %acc0, %d3
444 lsr.l %d1, %d3
445 asl.l %d2, %d4
446 or.l %d4, %d3
447 add.l %d3, (%a0)
448 subq.l #4, %a0
449 subq.l #1, %d0
450 jne 1b
451 jra .exit
452
453.worder2:
454 movem.l (%a1), %a4-%a5
455 move.l (%a0)+, %a6
4561:
457 mac.l %a6, %a5, (%a0)+, %a6, %acc0
458 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
459 move.l %accext01, %d4
460 movclr.l %acc0, %d3
461 lsr.l %d1, %d3
462 asl.l %d2, %d4
463 or.l %d4, %d3
464 add.l %d3, (%a0)
465 subq.l #1, %d0
466 jne 1b
467 jra .exit
468
469.worder1:
470 move.l (%a1), %a5
471 move.l (%a0)+, %a6
4721:
473 mac.l %a6, %a5, (%a0), %a6, %acc0
474 move.l %accext01, %d4
475 movclr.l %acc0, %d3
476 lsr.l %d1, %d3
477 asl.l %d2, %d4
478 or.l %d4, %d3
479 add.l %a6, %d3 | residual is already in a6
480 move.l %d3, (%a0)+
481 subq.l #1, %d0
482 jne 1b
483 jra .exit
484
485.wdefault:
486 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
487 do the rest by jump table. */
488 lea.l (%a1, %d3.l*4), %a2 | need to start in the other end of coefs
489 move.l %a0, %a3 | working copy of history pointer
490 move.l %d3, %d4
491 lsr.l #2, %d4 | coefs/4, num of iterations needed in next loop
492 move.l (%a3)+, %a5 | preload data for loop
4931:
494 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
495 movem.l (%a2), %d5-%d7/%a4 | load four coefs
496 mac.l %a5, %a4, (%a3)+, %a5, %acc0
497 mac.l %a5, %d7, (%a3)+, %a5, %acc0
498 mac.l %a5, %d6, (%a3)+, %a5, %acc0
499 mac.l %a5, %d5, (%a3)+, %a5, %acc0
500 subq.l #1, %d4 | any more unrolled loop operations left?
501 jne 1b
502
503 moveq.l #3, %d4 | mask 0x00000003
504 and.l %d3, %d4 | get the remaining samples to be filtered
505 jmp.l (2, %pc, %d4*2) | then jump into mac.l chain
506| jumptable:
507 bra.b 3f | none left
508 bra.b 2f | one left
509 bra.b 1f | two left
510| three left
511 move.l -(%a2), %d4
512 mac.l %a5, %d4, (%a3)+, %a5, %acc0
5131:
514 move.l -(%a2), %d4
515 mac.l %a5, %d4, (%a3)+, %a5, %acc0
5162:
517 move.l -(%a2), %d4
518 mac.l %a5, %d4, (%a3)+, %a5, %acc0
5193:
520 move.l %accext01, %d5 | get high 32 bits of result
521 movclr.l %acc0, %d4 | get low 32 bits of result
522 lsr.l %d1, %d4 | shift qlevel bits right
523 asl.l %d2, %d5 | shift 32 - qlevel bits left
524 or.l %d5, %d4 | combine top and low bits after shift
525 add.l %a5, %d4 | add residual, which is in a5 by now
526 move.l %d4, -(%a3) | save, a3 is also one past save location
527 addq.l #4, %a0 | increment history pointer
528 subq.l #1, %d0 | decrement sample count
529 jne .wdefault | are we done?
530 | if so, fall through to exit
531
532.exit:
533 movem.l (%sp), %d2-%d7/%a2-%a6
534 lea.l (44, %sp), %sp
535 rts