summary refs log tree commit diff
diff options
context:
space:
mode:
author Thom Johansen <thomj@rockbox.org> 2006-10-25 00:59:38 +0000
committer Thom Johansen <thomj@rockbox.org> 2006-10-25 00:59:38 +0000
commit 4134e91950a3104bb364148768c6d8060b18a1d0 (patch)
tree d5e5833354308211af9619efc558bf0fc1027ec9
parent 73c0cfa6321e077c45d0a72882a3ad2ce9a453fd (diff)
download rockbox-4134e91950a3104bb364148768c6d8060b18a1d0.tar.gz
rockbox-4134e91950a3104bb364148768c6d8060b18a1d0.zip
Assembler optimised FLAC 24 bit handling routines for Coldfire based players. Decoding speed should be improved drastically. Haven't got so many 24 bit files myself, so let me know if something sounds off.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@11329 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/libffmpegFLAC/coldfire.S 321
-rw-r--r-- apps/codecs/libffmpegFLAC/coldfire.h 5
-rw-r--r-- apps/codecs/libffmpegFLAC/decoder.c 13
3 files changed, 299 insertions, 40 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
index 1d144ecc76..5f464be762 100644
--- a/apps/codecs/libffmpegFLAC/coldfire.S
+++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -17,11 +17,14 @@
17 * 17 *
18 ****************************************************************************/ 18 ****************************************************************************/
19 19
20/* The following is an assembler optimised version of the LPC filtering 20/* The following are assembler optimised versions of the LPC filtering
21 routines needed for FLAC decoding. It is optimised for use with the 21 routines needed for FLAC decoding. They are optimised for use with the
22 MCF5249 processor, or any other similar ColdFire core with the EMAC unit. 22 MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
23 All LPC filtering up to order 10 is done in specially optimised unrolled 23 */
24 loops, while every order above this is handled by a slower default routine. 24
25/* This routine deals with sample widths 16 and lower. All LPC filtering up to
26 order 10 is done in specially optimised unrolled loops, while every order
27 above this is handled by a slower default routine.
25 */ 28 */
26 .section .icode,"ax",@progbits 29 .section .icode,"ax",@progbits
27 .global lpc_decode_emac 30 .global lpc_decode_emac
@@ -65,7 +68,7 @@ lpc_decode_emac:
65.order10: 68.order10:
66 movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs 69 movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
67 move.l (%a0)+, %a6 | load first history sample 70 move.l (%a0)+, %a6 | load first history sample
68.loop10: 711:
69 mac.l %a6, %a5, (%a0)+, %a6, %acc0 72 mac.l %a6, %a5, (%a0)+, %a6, %acc0
70 mac.l %a6, %a4, (%a0)+, %a6, %acc0 73 mac.l %a6, %a4, (%a0)+, %a6, %acc0
71 mac.l %a6, %a3, (%a0)+, %a6, %acc0 74 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -81,13 +84,13 @@ lpc_decode_emac:
81 add.l %d2, (%a0) | add residual and save 84 add.l %d2, (%a0) | add residual and save
82 lea.l (-8*4, %a0), %a0 | point history back at second element 85 lea.l (-8*4, %a0), %a0 | point history back at second element
83 subq.l #1, %d0 | decrement sample count 86 subq.l #1, %d0 | decrement sample count
84 jne .loop10 | are we done? 87 jne 1b | are we done?
85 jra .exit 88 jra .exit
86 89
87.order9: 90.order9:
88 movem.l (%a1), %d4-%d7/%a1-%a5 91 movem.l (%a1), %d4-%d7/%a1-%a5
89 move.l (%a0)+, %a6 92 move.l (%a0)+, %a6
90.loop9: 931:
91 mac.l %a6, %a5, (%a0)+, %a6, %acc0 94 mac.l %a6, %a5, (%a0)+, %a6, %acc0
92 mac.l %a6, %a4, (%a0)+, %a6, %acc0 95 mac.l %a6, %a4, (%a0)+, %a6, %acc0
93 mac.l %a6, %a3, (%a0)+, %a6, %acc0 96 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -102,13 +105,13 @@ lpc_decode_emac:
102 add.l %d2, (%a0) 105 add.l %d2, (%a0)
103 lea.l (-7*4, %a0), %a0 106 lea.l (-7*4, %a0), %a0
104 subq.l #1, %d0 107 subq.l #1, %d0
105 jne .loop9 108 jne 1b
106 jra .exit 109 jra .exit
107 110
108.order8: 111.order8:
109 movem.l (%a1), %d5-%d7/%a1-%a5 112 movem.l (%a1), %d5-%d7/%a1-%a5
110 move.l (%a0)+, %a6 113 move.l (%a0)+, %a6
111.loop8: 1141:
112 mac.l %a6, %a5, (%a0)+, %a6, %acc0 115 mac.l %a6, %a5, (%a0)+, %a6, %acc0
113 mac.l %a6, %a4, (%a0)+, %a6, %acc0 116 mac.l %a6, %a4, (%a0)+, %a6, %acc0
114 mac.l %a6, %a3, (%a0)+, %a6, %acc0 117 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -122,13 +125,13 @@ lpc_decode_emac:
122 add.l %d2, (%a0) 125 add.l %d2, (%a0)
123 lea.l (-6*4, %a0), %a0 126 lea.l (-6*4, %a0), %a0
124 subq.l #1, %d0 127 subq.l #1, %d0
125 jne .loop8 128 jne 1b
126 jra .exit 129 jra .exit
127 130
128.order7: 131.order7:
129 movem.l (%a1), %d6-%d7/%a1-%a5 132 movem.l (%a1), %d6-%d7/%a1-%a5
130 move.l (%a0)+, %a6 133 move.l (%a0)+, %a6
131.loop7: 1341:
132 mac.l %a6, %a5, (%a0)+, %a6, %acc0 135 mac.l %a6, %a5, (%a0)+, %a6, %acc0
133 mac.l %a6, %a4, (%a0)+, %a6, %acc0 136 mac.l %a6, %a4, (%a0)+, %a6, %acc0
134 mac.l %a6, %a3, (%a0)+, %a6, %acc0 137 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -141,13 +144,13 @@ lpc_decode_emac:
141 add.l %d2, (%a0) 144 add.l %d2, (%a0)
142 lea.l (-5*4, %a0), %a0 145 lea.l (-5*4, %a0), %a0
143 subq.l #1, %d0 146 subq.l #1, %d0
144 jne .loop7 147 jne 1b
145 jra .exit 148 jra .exit
146 149
147.order6: 150.order6:
148 movem.l (%a1), %d7/%a1-%a5 151 movem.l (%a1), %d7/%a1-%a5
149 move.l (%a0)+, %a6 152 move.l (%a0)+, %a6
150.loop6: 1531:
151 mac.l %a6, %a5, (%a0)+, %a6, %acc0 154 mac.l %a6, %a5, (%a0)+, %a6, %acc0
152 mac.l %a6, %a4, (%a0)+, %a6, %acc0 155 mac.l %a6, %a4, (%a0)+, %a6, %acc0
153 mac.l %a6, %a3, (%a0)+, %a6, %acc0 156 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -159,13 +162,13 @@ lpc_decode_emac:
159 add.l %d2, (%a0) 162 add.l %d2, (%a0)
160 lea.l (-4*4, %a0), %a0 163 lea.l (-4*4, %a0), %a0
161 subq.l #1, %d0 164 subq.l #1, %d0
162 jne .loop6 165 jne 1b
163 jra .exit 166 jra .exit
164 167
165.order5: 168.order5:
166 movem.l (%a1), %a1-%a5 169 movem.l (%a1), %a1-%a5
167 move.l (%a0)+, %a6 170 move.l (%a0)+, %a6
168.loop5: 1711:
169 mac.l %a6, %a5, (%a0)+, %a6, %acc0 172 mac.l %a6, %a5, (%a0)+, %a6, %acc0
170 mac.l %a6, %a4, (%a0)+, %a6, %acc0 173 mac.l %a6, %a4, (%a0)+, %a6, %acc0
171 mac.l %a6, %a3, (%a0)+, %a6, %acc0 174 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -176,13 +179,13 @@ lpc_decode_emac:
176 add.l %d2, (%a0) 179 add.l %d2, (%a0)
177 lea.l (-3*4, %a0), %a0 180 lea.l (-3*4, %a0), %a0
178 subq.l #1, %d0 181 subq.l #1, %d0
179 jne .loop5 182 jne 1b
180 jra .exit 183 jra .exit
181 184
182.order4: 185.order4:
183 movem.l (%a1), %a2-%a5 186 movem.l (%a1), %a2-%a5
184 move.l (%a0)+, %a6 187 move.l (%a0)+, %a6
185.loop4: 1881:
186 mac.l %a6, %a5, (%a0)+, %a6, %acc0 189 mac.l %a6, %a5, (%a0)+, %a6, %acc0
187 mac.l %a6, %a4, (%a0)+, %a6, %acc0 190 mac.l %a6, %a4, (%a0)+, %a6, %acc0
188 mac.l %a6, %a3, (%a0)+, %a6, %acc0 191 mac.l %a6, %a3, (%a0)+, %a6, %acc0
@@ -192,13 +195,13 @@ lpc_decode_emac:
192 add.l %d2, (%a0) 195 add.l %d2, (%a0)
193 subq.l #8, %a0 196 subq.l #8, %a0
194 subq.l #1, %d0 197 subq.l #1, %d0
195 jne .loop4 198 jne 1b
196 jra .exit 199 jra .exit
197 200
198.order3: 201.order3:
199 movem.l (%a1), %a3-%a5 202 movem.l (%a1), %a3-%a5
200 move.l (%a0)+, %a6 203 move.l (%a0)+, %a6
201.loop3: 2041:
202 mac.l %a6, %a5, (%a0)+, %a6, %acc0 205 mac.l %a6, %a5, (%a0)+, %a6, %acc0
203 mac.l %a6, %a4, (%a0)+, %a6, %acc0 206 mac.l %a6, %a4, (%a0)+, %a6, %acc0
204 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 207 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
@@ -207,32 +210,32 @@ lpc_decode_emac:
207 add.l %d2, (%a0) 210 add.l %d2, (%a0)
208 subq.l #4, %a0 211 subq.l #4, %a0
209 subq.l #1, %d0 212 subq.l #1, %d0
210 jne .loop3 213 jne 1b
211 jra .exit 214 jra .exit
212 215
213.order2: 216.order2:
214 movem.l (%a1), %a4-%a5 217 movem.l (%a1), %a4-%a5
215 move.l (%a0)+, %a6 218 move.l (%a0)+, %a6
216.loop2: 2191:
217 mac.l %a6, %a5, (%a0)+, %a6, %acc0 220 mac.l %a6, %a5, (%a0)+, %a6, %acc0
218 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded 221 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
219 movclr.l %acc0, %d2 222 movclr.l %acc0, %d2
220 asr.l %d1, %d2 223 asr.l %d1, %d2
221 add.l %d2, (%a0) 224 add.l %d2, (%a0)
222 subq.l #1, %d0 225 subq.l #1, %d0
223 jne .loop2 226 jne 1b
224 jra .exit 227 jra .exit
225 228
226.order1: 229.order1:
227 | no point in using mac here 230 | no point in using mac here
228 move.l (%a1), %a5 231 move.l (%a1), %a5
229.loop1: 2321:
230 move.l %a5, %d2 233 move.l %a5, %d2
231 muls.l (%a0)+, %d2 234 muls.l (%a0)+, %d2
232 asr.l %d1, %d2 235 asr.l %d1, %d2
233 add.l %d2, (%a0) 236 add.l %d2, (%a0)
234 subq.l #1, %d0 237 subq.l #1, %d0
235 jne .loop1 238 jne 1b
236 jra .exit 239 jra .exit
237 240
238.default: 241.default:
@@ -243,7 +246,7 @@ lpc_decode_emac:
243 move.l %d2, %d3 246 move.l %d2, %d3
244 lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop 247 lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop
245 move.l (%a3)+, %a5 | preload data for loop 248 move.l (%a3)+, %a5 | preload data for loop
246.dloop1: 2491:
247 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards 250 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
248 movem.l (%a2), %d4-%d7 | load four coefs 251 movem.l (%a2), %d4-%d7 | load four coefs
249 mac.l %a5, %d7, (%a3)+, %a5, %acc0 252 mac.l %a5, %d7, (%a3)+, %a5, %acc0
@@ -251,33 +254,277 @@ lpc_decode_emac:
251 mac.l %a5, %d5, (%a3)+, %a5, %acc0 254 mac.l %a5, %d5, (%a3)+, %a5, %acc0
252 mac.l %a5, %d4, (%a3)+, %a5, %acc0 255 mac.l %a5, %d4, (%a3)+, %a5, %acc0
253 subq.l #1, %d3 | any more unrolled loop operations left? 256 subq.l #1, %d3 | any more unrolled loop operations left?
254 jne .dloop1 257 jne 1b
255 258
256 moveq.l #3, %d3 | mask 0x00000003 259 moveq.l #3, %d3 | mask 0x00000003
257 and.l %d2, %d3 | get the remaining samples to be filtered 260 and.l %d2, %d3 | get the remaining samples to be filtered
258 jmp.l (2, %pc, %d3*2) | then jump into mac.l chain 261 jmp.l (2, %pc, %d3*2) | then jump into mac.l chain
259| jumptable: 262| jumptable:
260 bra.b .dsave 263 bra.b 3f | none left
261 bra.b .oneleft 264 bra.b 2f | one left
262 bra.b .twoleft 265 bra.b 1f | two left
263| implicit .threeleft 266| three left
264 move.l -(%a2), %d4 267 move.l -(%a2), %d4
265 mac.l %a5, %d4, (%a3)+, %a5, %acc0 268 mac.l %a5, %d4, (%a3)+, %a5, %acc0
266.twoleft: 2691:
267 move.l -(%a2), %d4 270 move.l -(%a2), %d4
268 mac.l %a5, %d4, (%a3)+, %a5, %acc0 271 mac.l %a5, %d4, (%a3)+, %a5, %acc0
269.oneleft: 2722:
270 move.l -(%a2), %d4 273 move.l -(%a2), %d4
271 mac.l %a5, %d4, (%a3)+, %a5, %acc0 | need this fetch to not break line below 274 mac.l %a5, %d4, (%a3)+, %a5, %acc0
272 2753:
273.dsave:
274 subq.l #4, %a3 | we're one past the save location
275 movclr.l %acc0, %d3 | get result 276 movclr.l %acc0, %d3 | get result
276 asr.l %d1, %d3 | shift qlevel bits right 277 asr.l %d1, %d3 | shift qlevel bits right
277 add.l %d3, (%a3) | add residual and save 278 add.l %a5, %d3 | add residual, which is in a5 by now
279 move.l %d3, -(%a3) | save, a3 is also one past save location
278 addq.l #4, %a0 | increment history pointer 280 addq.l #4, %a0 | increment history pointer
279 subq.l #1, %d0 | decrement sample count 281 subq.l #1, %d0 | decrement sample count
280 jne .default | are we done? 282 jne .default | are we done?
283 jra .exit | if so, jump to exit
284
285
286/* This routine deals with sample widths 24 and lower. All LPC filtering up to
287 order 8 is done in specially optimised unrolled loops, while every order
288 above this is handled by a slower default routine.
289 */
290 .global lpc_decode_emac_wide
291 .align 2
292lpc_decode_emac_wide:
293 lea.l (-44, %sp), %sp
294 movem.l %d2-%d7/%a2-%a6, (%sp)
295 movem.l (44+4, %sp), %d0-%d1/%d3/%a0-%a1
296 /* d0 = blocksize, d1 = qlevel, d3 = pred_order
297 a0 = data, a1 = coeffs
298 */
299
300 /* the data pointer always lags behind history pointer by 'pred_order'
301 samples. since we have one loop for each order, we can hard code this
302 and free a register by not saving data pointer.
303 */
304 move.l %d3, %d2
305 neg.l %d2
306 lea.l (%a0, %d2.l*4), %a0 | history
307 clr.l %d2
308 move.l %d2, %macsr | we'll need integer mode for this
309 tst.l %d0
310 jeq .exit | zero samples to process, exit
311 moveq.l #32, %d2
312 sub.l %d1, %d2 | calculate shift amount for extension byte
313 moveq.l #8, %d4
314 cmp.l %d4, %d3
315 jgt .wdefault | order is over 8, jump to default case
316 jmp.l (2, %pc, %d3.l*4) | jump to loop corresponding to pred_order
317| jumptable:
318 bra.w .exit | zero order filter isn't possible, exit function
319 bra.w .worder1
320 bra.w .worder2
321 bra.w .worder3
322 bra.w .worder4
323 bra.w .worder5
324 bra.w .worder6
325 bra.w .worder7
326
327| last jump table entry coincides with target, so leave it out
328.worder8:
329 movem.l (%a1), %d5-%d7/%a1-%a5 | load lpc coefs
330 move.l (%a0)+, %a6 | load first history sample
3311:
332 mac.l %a6, %a5, (%a0)+, %a6, %acc0
333 mac.l %a6, %a4, (%a0)+, %a6, %acc0
334 mac.l %a6, %a3, (%a0)+, %a6, %acc0
335 mac.l %a6, %a2, (%a0)+, %a6, %acc0
336 mac.l %a6, %a1, (%a0)+, %a6, %acc0
337 mac.l %a6, %d7, (%a0)+, %a6, %acc0
338 mac.l %a6, %d6, (%a0)+, %a6, %acc0
339 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration
340 move.l %accext01, %d4 | get top 8 bits of sum
341 movclr.l %acc0, %d3 | then bottom 32 bits
342 lsr.l %d1, %d3 | shift bottom bits qlevel bits right
343 asl.l %d2, %d4 | shift top bits 32 - qlevel bits left
344 or.l %d4, %d3 | now combine results
345 add.l %d3, (%a0) | add residual and save
346 lea.l (-6*4, %a0), %a0 | point history back at second element
347 subq.l #1, %d0 | decrement sample count
348 jne 1b | are we done?
349 jra .exit
350
351.worder7:
352 movem.l (%a1), %d6-%d7/%a1-%a5
353 move.l (%a0)+, %a6
3541:
355 mac.l %a6, %a5, (%a0)+, %a6, %acc0
356 mac.l %a6, %a4, (%a0)+, %a6, %acc0
357 mac.l %a6, %a3, (%a0)+, %a6, %acc0
358 mac.l %a6, %a2, (%a0)+, %a6, %acc0
359 mac.l %a6, %a1, (%a0)+, %a6, %acc0
360 mac.l %a6, %d7, (%a0)+, %a6, %acc0
361 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
362 move.l %accext01, %d4
363 movclr.l %acc0, %d3
364 lsr.l %d1, %d3
365 asl.l %d2, %d4
366 or.l %d4, %d3
367 add.l %d3, (%a0)
368 lea.l (-5*4, %a0), %a0
369 subq.l #1, %d0
370 jne 1b
371 jra .exit
372
373.worder6:
374 movem.l (%a1), %d7/%a1-%a5
375 move.l (%a0)+, %a6
3761:
377 mac.l %a6, %a5, (%a0)+, %a6, %acc0
378 mac.l %a6, %a4, (%a0)+, %a6, %acc0
379 mac.l %a6, %a3, (%a0)+, %a6, %acc0
380 mac.l %a6, %a2, (%a0)+, %a6, %acc0
381 mac.l %a6, %a1, (%a0)+, %a6, %acc0
382 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
383 move.l %accext01, %d4
384 movclr.l %acc0, %d3
385 lsr.l %d1, %d3
386 asl.l %d2, %d4
387 or.l %d4, %d3
388 add.l %d3, (%a0)
389 lea.l (-4*4, %a0), %a0
390 subq.l #1, %d0
391 jne 1b
392 jra .exit
393
394.worder5:
395 movem.l (%a1), %a1-%a5
396 move.l (%a0)+, %a6
3971:
398 mac.l %a6, %a5, (%a0)+, %a6, %acc0
399 mac.l %a6, %a4, (%a0)+, %a6, %acc0
400 mac.l %a6, %a3, (%a0)+, %a6, %acc0
401 mac.l %a6, %a2, (%a0)+, %a6, %acc0
402 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
403 move.l %accext01, %d4
404 movclr.l %acc0, %d3
405 lsr.l %d1, %d3
406 asl.l %d2, %d4
407 or.l %d4, %d3
408 add.l %d3, (%a0)
409 lea.l (-3*4, %a0), %a0
410 subq.l #1, %d0
411 jne 1b
412 jra .exit
413
414.worder4:
415 movem.l (%a1), %a2-%a5
416 move.l (%a0)+, %a6
4171:
418 mac.l %a6, %a5, (%a0)+, %a6, %acc0
419 mac.l %a6, %a4, (%a0)+, %a6, %acc0
420 mac.l %a6, %a3, (%a0)+, %a6, %acc0
421 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
422 move.l %accext01, %d4
423 movclr.l %acc0, %d3
424 lsr.l %d1, %d3
425 asl.l %d2, %d4
426 or.l %d4, %d3
427 add.l %d3, (%a0)
428 subq.l #8, %a0
429 subq.l #1, %d0
430 jne 1b
431 jra .exit
432
433.worder3:
434 movem.l (%a1), %a3-%a5
435 move.l (%a0)+, %a6
4361:
437 mac.l %a6, %a5, (%a0)+, %a6, %acc0
438 mac.l %a6, %a4, (%a0)+, %a6, %acc0
439 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
440 move.l %accext01, %d4
441 movclr.l %acc0, %d3
442 lsr.l %d1, %d3
443 asl.l %d2, %d4
444 or.l %d4, %d3
445 add.l %d3, (%a0)
446 subq.l #4, %a0
447 subq.l #1, %d0
448 jne 1b
449 jra .exit
450
451.worder2:
452 movem.l (%a1), %a4-%a5
453 move.l (%a0)+, %a6
4541:
455 mac.l %a6, %a5, (%a0)+, %a6, %acc0
456 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
457 move.l %accext01, %d4
458 movclr.l %acc0, %d3
459 lsr.l %d1, %d3
460 asl.l %d2, %d4
461 or.l %d4, %d3
462 add.l %d3, (%a0)
463 subq.l #1, %d0
464 jne 1b
465 jra .exit
466
467.worder1:
468 move.l (%a1), %a5
469 move.l (%a0)+, %a6
4701:
471 mac.l %a6, %a5, (%a0), %a6, %acc0
472 move.l %accext01, %d4
473 movclr.l %acc0, %d3
474 lsr.l %d1, %d3
475 asl.l %d2, %d4
476 or.l %d4, %d3
477 add.l %a6, %d3 | residual is already in a6
478 move.l %d3, (%a0)+
479 subq.l #1, %d0
480 jne 1b
481 jra .exit
482
483.wdefault:
484 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
485 do the rest by jump table. */
486 lea.l (%a1, %d3.l*4), %a2 | need to start in the other end of coefs
487 move.l %a0, %a3 | working copy of history pointer
488 move.l %d3, %d4
489 lsr.l #2, %d4 | coefs/4, num of iterations needed in next loop
490 move.l (%a3)+, %a5 | preload data for loop
4911:
492 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
493 movem.l (%a2), %d5-%d7/%a4 | load four coefs
494 mac.l %a5, %a4, (%a3)+, %a5, %acc0
495 mac.l %a5, %d7, (%a3)+, %a5, %acc0
496 mac.l %a5, %d6, (%a3)+, %a5, %acc0
497 mac.l %a5, %d5, (%a3)+, %a5, %acc0
498 subq.l #1, %d4 | any more unrolled loop operations left?
499 jne 1b
500
501 moveq.l #3, %d4 | mask 0x00000003
502 and.l %d3, %d4 | get the remaining samples to be filtered
503 jmp.l (2, %pc, %d4*2) | then jump into mac.l chain
504| jumptable:
505 bra.b 3f | none left
506 bra.b 2f | one left
507 bra.b 1f | two left
508| three left
509 move.l -(%a2), %d4
510 mac.l %a5, %d4, (%a3)+, %a5, %acc0
5111:
512 move.l -(%a2), %d4
513 mac.l %a5, %d4, (%a3)+, %a5, %acc0
5142:
515 move.l -(%a2), %d4
516 mac.l %a5, %d4, (%a3)+, %a5, %acc0
5173:
518 move.l %accext01, %d5 | get high 32 bits of result
519 movclr.l %acc0, %d4 | get low 32 bits of result
520 lsr.l %d1, %d4 | shift qlevel bits right
521 asl.l %d2, %d5 | shift 32 - qlevel bits left
522 or.l %d5, %d4 | combine top and low bits after shift
523 add.l %a5, %d4 | add residual, which is in a5 by now
524 move.l %d4, -(%a3) | save, a3 is also one past save location
525 addq.l #4, %a0 | increment history pointer
526 subq.l #1, %d0 | decrement sample count
527 jne .wdefault | are we done?
281 | if so, fall through to exit 528 | if so, fall through to exit
282 529
283.exit: 530.exit:
diff --git a/apps/codecs/libffmpegFLAC/coldfire.h b/apps/codecs/libffmpegFLAC/coldfire.h
index 5493f549f7..d6f0996935 100644
--- a/apps/codecs/libffmpegFLAC/coldfire.h
+++ b/apps/codecs/libffmpegFLAC/coldfire.h
@@ -3,6 +3,9 @@
3 3
4#include "bitstream.h" 4#include "bitstream.h"
5 5
6void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs); 6void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data,
7 int* coeffs);
8void lpc_decode_emac_wide(int blocksize, int qlevel, int pred_order,
9 int32_t* data, int* coeffs);
7 10
8#endif 11#endif
diff --git a/apps/codecs/libffmpegFLAC/decoder.c b/apps/codecs/libffmpegFLAC/decoder.c
index 9c85864e26..4dbae972cb 100644
--- a/apps/codecs/libffmpegFLAC/decoder.c
+++ b/apps/codecs/libffmpegFLAC/decoder.c
@@ -262,10 +262,12 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order)
262 if ((s->bps + coeff_prec + av_log2(pred_order)) <= 32) { 262 if ((s->bps + coeff_prec + av_log2(pred_order)) <= 32) {
263 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) 263 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
264 (void)sum; 264 (void)sum;
265 lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order, decoded + pred_order, coeffs); 265 lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order,
266 decoded + pred_order, coeffs);
266 #elif defined(CPU_ARM) && !defined(SIMULATOR) 267 #elif defined(CPU_ARM) && !defined(SIMULATOR)
267 (void)sum; 268 (void)sum;
268 lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order, decoded + pred_order, coeffs); 269 lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order,
270 decoded + pred_order, coeffs);
269 #else 271 #else
270 for (i = pred_order; i < s->blocksize; i++) 272 for (i = pred_order; i < s->blocksize; i++)
271 { 273 {
@@ -276,6 +278,12 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order)
276 } 278 }
277 #endif 279 #endif
278 } else { 280 } else {
281 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
282 (void)wsum;
283 (void)j;
284 lpc_decode_emac_wide(s->blocksize - pred_order, qlevel, pred_order,
285 decoded + pred_order, coeffs);
286 #else
279 for (i = pred_order; i < s->blocksize; i++) 287 for (i = pred_order; i < s->blocksize; i++)
280 { 288 {
281 wsum = 0; 289 wsum = 0;
@@ -283,6 +291,7 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order)
283 wsum += (int64_t)coeffs[j] * (int64_t)decoded[i-j-1]; 291 wsum += (int64_t)coeffs[j] * (int64_t)decoded[i-j-1];
284 decoded[i] += wsum >> qlevel; 292 decoded[i] += wsum >> qlevel;
285 } 293 }
294 #endif
286 } 295 }
287 296
288 return 0; 297 return 0;