diff options
-rw-r--r-- | apps/codecs/libffmpegFLAC/coldfire.S | 321 | ||||
-rw-r--r-- | apps/codecs/libffmpegFLAC/coldfire.h | 5 | ||||
-rw-r--r-- | apps/codecs/libffmpegFLAC/decoder.c | 13 |
3 files changed, 299 insertions, 40 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S index 1d144ecc76..5f464be762 100644 --- a/apps/codecs/libffmpegFLAC/coldfire.S +++ b/apps/codecs/libffmpegFLAC/coldfire.S | |||
@@ -17,11 +17,14 @@ | |||
17 | * | 17 | * |
18 | ****************************************************************************/ | 18 | ****************************************************************************/ |
19 | 19 | ||
20 | /* The following is an assembler optimised version of the LPC filtering | 20 | /* The following are assembler optimised versions of the LPC filtering |
21 | routines needed for FLAC decoding. It is optimised for use with the | 21 | routines needed for FLAC decoding. They are optimised for use with the |
22 | MCF5249 processor, or any other similar ColdFire core with the EMAC unit. | 22 | MCF5249 processor, or any other similar ColdFire core with the EMAC unit. |
23 | All LPC filtering up to order 10 is done in specially optimised unrolled | 23 | */ |
24 | loops, while every order above this is handled by a slower default routine. | 24 | |
25 | /* This routine deals with sample widths 16 and lower. All LPC filtering up to | ||
26 | order 10 is done in specially optimised unrolled loops, while every order | ||
27 | above this is handled by a slower default routine. | ||
25 | */ | 28 | */ |
26 | .section .icode,"ax",@progbits | 29 | .section .icode,"ax",@progbits |
27 | .global lpc_decode_emac | 30 | .global lpc_decode_emac |
@@ -65,7 +68,7 @@ lpc_decode_emac: | |||
65 | .order10: | 68 | .order10: |
66 | movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs | 69 | movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs |
67 | move.l (%a0)+, %a6 | load first history sample | 70 | move.l (%a0)+, %a6 | load first history sample |
68 | .loop10: | 71 | 1: |
69 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 72 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
70 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 73 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
71 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 74 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -81,13 +84,13 @@ lpc_decode_emac: | |||
81 | add.l %d2, (%a0) | add residual and save | 84 | add.l %d2, (%a0) | add residual and save |
82 | lea.l (-8*4, %a0), %a0 | point history back at second element | 85 | lea.l (-8*4, %a0), %a0 | point history back at second element |
83 | subq.l #1, %d0 | decrement sample count | 86 | subq.l #1, %d0 | decrement sample count |
84 | jne .loop10 | are we done? | 87 | jne 1b | are we done? |
85 | jra .exit | 88 | jra .exit |
86 | 89 | ||
87 | .order9: | 90 | .order9: |
88 | movem.l (%a1), %d4-%d7/%a1-%a5 | 91 | movem.l (%a1), %d4-%d7/%a1-%a5 |
89 | move.l (%a0)+, %a6 | 92 | move.l (%a0)+, %a6 |
90 | .loop9: | 93 | 1: |
91 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 94 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
92 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 95 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
93 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 96 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -102,13 +105,13 @@ lpc_decode_emac: | |||
102 | add.l %d2, (%a0) | 105 | add.l %d2, (%a0) |
103 | lea.l (-7*4, %a0), %a0 | 106 | lea.l (-7*4, %a0), %a0 |
104 | subq.l #1, %d0 | 107 | subq.l #1, %d0 |
105 | jne .loop9 | 108 | jne 1b |
106 | jra .exit | 109 | jra .exit |
107 | 110 | ||
108 | .order8: | 111 | .order8: |
109 | movem.l (%a1), %d5-%d7/%a1-%a5 | 112 | movem.l (%a1), %d5-%d7/%a1-%a5 |
110 | move.l (%a0)+, %a6 | 113 | move.l (%a0)+, %a6 |
111 | .loop8: | 114 | 1: |
112 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 115 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
113 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 116 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
114 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 117 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -122,13 +125,13 @@ lpc_decode_emac: | |||
122 | add.l %d2, (%a0) | 125 | add.l %d2, (%a0) |
123 | lea.l (-6*4, %a0), %a0 | 126 | lea.l (-6*4, %a0), %a0 |
124 | subq.l #1, %d0 | 127 | subq.l #1, %d0 |
125 | jne .loop8 | 128 | jne 1b |
126 | jra .exit | 129 | jra .exit |
127 | 130 | ||
128 | .order7: | 131 | .order7: |
129 | movem.l (%a1), %d6-%d7/%a1-%a5 | 132 | movem.l (%a1), %d6-%d7/%a1-%a5 |
130 | move.l (%a0)+, %a6 | 133 | move.l (%a0)+, %a6 |
131 | .loop7: | 134 | 1: |
132 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 135 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
133 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 136 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
134 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 137 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -141,13 +144,13 @@ lpc_decode_emac: | |||
141 | add.l %d2, (%a0) | 144 | add.l %d2, (%a0) |
142 | lea.l (-5*4, %a0), %a0 | 145 | lea.l (-5*4, %a0), %a0 |
143 | subq.l #1, %d0 | 146 | subq.l #1, %d0 |
144 | jne .loop7 | 147 | jne 1b |
145 | jra .exit | 148 | jra .exit |
146 | 149 | ||
147 | .order6: | 150 | .order6: |
148 | movem.l (%a1), %d7/%a1-%a5 | 151 | movem.l (%a1), %d7/%a1-%a5 |
149 | move.l (%a0)+, %a6 | 152 | move.l (%a0)+, %a6 |
150 | .loop6: | 153 | 1: |
151 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 154 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
152 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 155 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
153 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 156 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -159,13 +162,13 @@ lpc_decode_emac: | |||
159 | add.l %d2, (%a0) | 162 | add.l %d2, (%a0) |
160 | lea.l (-4*4, %a0), %a0 | 163 | lea.l (-4*4, %a0), %a0 |
161 | subq.l #1, %d0 | 164 | subq.l #1, %d0 |
162 | jne .loop6 | 165 | jne 1b |
163 | jra .exit | 166 | jra .exit |
164 | 167 | ||
165 | .order5: | 168 | .order5: |
166 | movem.l (%a1), %a1-%a5 | 169 | movem.l (%a1), %a1-%a5 |
167 | move.l (%a0)+, %a6 | 170 | move.l (%a0)+, %a6 |
168 | .loop5: | 171 | 1: |
169 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 172 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
170 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 173 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
171 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 174 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -176,13 +179,13 @@ lpc_decode_emac: | |||
176 | add.l %d2, (%a0) | 179 | add.l %d2, (%a0) |
177 | lea.l (-3*4, %a0), %a0 | 180 | lea.l (-3*4, %a0), %a0 |
178 | subq.l #1, %d0 | 181 | subq.l #1, %d0 |
179 | jne .loop5 | 182 | jne 1b |
180 | jra .exit | 183 | jra .exit |
181 | 184 | ||
182 | .order4: | 185 | .order4: |
183 | movem.l (%a1), %a2-%a5 | 186 | movem.l (%a1), %a2-%a5 |
184 | move.l (%a0)+, %a6 | 187 | move.l (%a0)+, %a6 |
185 | .loop4: | 188 | 1: |
186 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 189 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
187 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 190 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
188 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | 191 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
@@ -192,13 +195,13 @@ lpc_decode_emac: | |||
192 | add.l %d2, (%a0) | 195 | add.l %d2, (%a0) |
193 | subq.l #8, %a0 | 196 | subq.l #8, %a0 |
194 | subq.l #1, %d0 | 197 | subq.l #1, %d0 |
195 | jne .loop4 | 198 | jne 1b |
196 | jra .exit | 199 | jra .exit |
197 | 200 | ||
198 | .order3: | 201 | .order3: |
199 | movem.l (%a1), %a3-%a5 | 202 | movem.l (%a1), %a3-%a5 |
200 | move.l (%a0)+, %a6 | 203 | move.l (%a0)+, %a6 |
201 | .loop3: | 204 | 1: |
202 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 205 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
203 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | 206 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
204 | mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 | 207 | mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 |
@@ -207,32 +210,32 @@ lpc_decode_emac: | |||
207 | add.l %d2, (%a0) | 210 | add.l %d2, (%a0) |
208 | subq.l #4, %a0 | 211 | subq.l #4, %a0 |
209 | subq.l #1, %d0 | 212 | subq.l #1, %d0 |
210 | jne .loop3 | 213 | jne 1b |
211 | jra .exit | 214 | jra .exit |
212 | 215 | ||
213 | .order2: | 216 | .order2: |
214 | movem.l (%a1), %a4-%a5 | 217 | movem.l (%a1), %a4-%a5 |
215 | move.l (%a0)+, %a6 | 218 | move.l (%a0)+, %a6 |
216 | .loop2: | 219 | 1: |
217 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | 220 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
218 | mac.l %a6, %a4, %acc0 | data for next iteration is already loaded | 221 | mac.l %a6, %a4, %acc0 | data for next iteration is already loaded |
219 | movclr.l %acc0, %d2 | 222 | movclr.l %acc0, %d2 |
220 | asr.l %d1, %d2 | 223 | asr.l %d1, %d2 |
221 | add.l %d2, (%a0) | 224 | add.l %d2, (%a0) |
222 | subq.l #1, %d0 | 225 | subq.l #1, %d0 |
223 | jne .loop2 | 226 | jne 1b |
224 | jra .exit | 227 | jra .exit |
225 | 228 | ||
226 | .order1: | 229 | .order1: |
227 | | no point in using mac here | 230 | | no point in using mac here |
228 | move.l (%a1), %a5 | 231 | move.l (%a1), %a5 |
229 | .loop1: | 232 | 1: |
230 | move.l %a5, %d2 | 233 | move.l %a5, %d2 |
231 | muls.l (%a0)+, %d2 | 234 | muls.l (%a0)+, %d2 |
232 | asr.l %d1, %d2 | 235 | asr.l %d1, %d2 |
233 | add.l %d2, (%a0) | 236 | add.l %d2, (%a0) |
234 | subq.l #1, %d0 | 237 | subq.l #1, %d0 |
235 | jne .loop1 | 238 | jne 1b |
236 | jra .exit | 239 | jra .exit |
237 | 240 | ||
238 | .default: | 241 | .default: |
@@ -243,7 +246,7 @@ lpc_decode_emac: | |||
243 | move.l %d2, %d3 | 246 | move.l %d2, %d3 |
244 | lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop | 247 | lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop |
245 | move.l (%a3)+, %a5 | preload data for loop | 248 | move.l (%a3)+, %a5 | preload data for loop |
246 | .dloop1: | 249 | 1: |
247 | lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards | 250 | lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards |
248 | movem.l (%a2), %d4-%d7 | load four coefs | 251 | movem.l (%a2), %d4-%d7 | load four coefs |
249 | mac.l %a5, %d7, (%a3)+, %a5, %acc0 | 252 | mac.l %a5, %d7, (%a3)+, %a5, %acc0 |
@@ -251,33 +254,277 @@ lpc_decode_emac: | |||
251 | mac.l %a5, %d5, (%a3)+, %a5, %acc0 | 254 | mac.l %a5, %d5, (%a3)+, %a5, %acc0 |
252 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | 255 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 |
253 | subq.l #1, %d3 | any more unrolled loop operations left? | 256 | subq.l #1, %d3 | any more unrolled loop operations left? |
254 | jne .dloop1 | 257 | jne 1b |
255 | 258 | ||
256 | moveq.l #3, %d3 | mask 0x00000003 | 259 | moveq.l #3, %d3 | mask 0x00000003 |
257 | and.l %d2, %d3 | get the remaining samples to be filtered | 260 | and.l %d2, %d3 | get the remaining samples to be filtered |
258 | jmp.l (2, %pc, %d3*2) | then jump into mac.l chain | 261 | jmp.l (2, %pc, %d3*2) | then jump into mac.l chain |
259 | | jumptable: | 262 | | jumptable: |
260 | bra.b .dsave | 263 | bra.b 3f | none left |
261 | bra.b .oneleft | 264 | bra.b 2f | one left |
262 | bra.b .twoleft | 265 | bra.b 1f | two left |
263 | | implicit .threeleft | 266 | | three left |
264 | move.l -(%a2), %d4 | 267 | move.l -(%a2), %d4 |
265 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | 268 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 |
266 | .twoleft: | 269 | 1: |
267 | move.l -(%a2), %d4 | 270 | move.l -(%a2), %d4 |
268 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | 271 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 |
269 | .oneleft: | 272 | 2: |
270 | move.l -(%a2), %d4 | 273 | move.l -(%a2), %d4 |
271 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | need this fetch to not break line below | 274 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 |
272 | 275 | 3: | |
273 | .dsave: | ||
274 | subq.l #4, %a3 | we're one past the save location | ||
275 | movclr.l %acc0, %d3 | get result | 276 | movclr.l %acc0, %d3 | get result |
276 | asr.l %d1, %d3 | shift qlevel bits right | 277 | asr.l %d1, %d3 | shift qlevel bits right |
277 | add.l %d3, (%a3) | add residual and save | 278 | add.l %a5, %d3 | add residual, which is in a5 by now |
279 | move.l %d3, -(%a3) | save, a3 is also one past save location | ||
278 | addq.l #4, %a0 | increment history pointer | 280 | addq.l #4, %a0 | increment history pointer |
279 | subq.l #1, %d0 | decrement sample count | 281 | subq.l #1, %d0 | decrement sample count |
280 | jne .default | are we done? | 282 | jne .default | are we done? |
283 | jra .exit | if so, fall through to exit | ||
284 | |||
285 | |||
286 | /* This routine deals with sample widths 24 and lower. All LPC filtering up to | ||
287 | order 8 is done in specially optimised unrolled loops, while every order | ||
288 | above this is handled by a slower default routine. | ||
289 | */ | ||
290 | .global lpc_decode_emac_wide | ||
291 | .align 2 | ||
292 | lpc_decode_emac_wide: | ||
293 | lea.l (-44, %sp), %sp | ||
294 | movem.l %d2-%d7/%a2-%a6, (%sp) | ||
295 | movem.l (44+4, %sp), %d0-%d1/%d3/%a0-%a1 | ||
296 | /* d0 = blocksize, d1 = qlevel, d3 = pred_order | ||
297 | a0 = data, a1 = coeffs | ||
298 | */ | ||
299 | |||
300 | /* the data pointer always lags behind history pointer by 'pred_order' | ||
301 | samples. since we have one loop for each order, we can hard code this | ||
302 | and free a register by not saving data pointer. | ||
303 | */ | ||
304 | move.l %d3, %d2 | ||
305 | neg.l %d2 | ||
306 | lea.l (%a0, %d2.l*4), %a0 | history | ||
307 | clr.l %d2 | ||
308 | move.l %d2, %macsr | we'll need integer mode for this | ||
309 | tst.l %d0 | ||
310 | jeq .exit | zero samples to process, exit | ||
311 | moveq.l #32, %d2 | ||
312 | sub.l %d1, %d2 | calculate shift amount for extension byte | ||
313 | moveq.l #8, %d4 | ||
314 | cmp.l %d4, %d3 | ||
315 | jgt .wdefault | order is over 8, jump to default case | ||
316 | jmp.l (2, %pc, %d3.l*4) | jump to loop corresponding to pred_order | ||
317 | | jumptable: | ||
318 | bra.w .exit | zero order filter isn't possible, exit function | ||
319 | bra.w .worder1 | ||
320 | bra.w .worder2 | ||
321 | bra.w .worder3 | ||
322 | bra.w .worder4 | ||
323 | bra.w .worder5 | ||
324 | bra.w .worder6 | ||
325 | bra.w .worder7 | ||
326 | |||
327 | | last jump table entry coincides with target, so leave it out | ||
328 | .worder8: | ||
329 | movem.l (%a1), %d5-%d7/%a1-%a5 | load lpc coefs | ||
330 | move.l (%a0)+, %a6 | load first history sample | ||
331 | 1: | ||
332 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
333 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
334 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
335 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 | ||
336 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 | ||
337 | mac.l %a6, %d7, (%a0)+, %a6, %acc0 | ||
338 | mac.l %a6, %d6, (%a0)+, %a6, %acc0 | ||
339 | mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration | ||
340 | move.l %accext01, %d4 | get top 8 bits of sum | ||
341 | movclr.l %acc0, %d3 | then bottom 32 bits | ||
342 | lsr.l %d1, %d3 | shift bottom bits qlevel bits right | ||
343 | asl.l %d2, %d4 | shift top bits 32 - qlevel bits left | ||
344 | or.l %d4, %d3 | now combine results | ||
345 | add.l %d3, (%a0) | add residual and save | ||
346 | lea.l (-6*4, %a0), %a0 | point history back at second element | ||
347 | subq.l #1, %d0 | decrement sample count | ||
348 | jne 1b | are we done? | ||
349 | jra .exit | ||
350 | |||
351 | .worder7: | ||
352 | movem.l (%a1), %d6-%d7/%a1-%a5 | ||
353 | move.l (%a0)+, %a6 | ||
354 | 1: | ||
355 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
356 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
357 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
358 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 | ||
359 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 | ||
360 | mac.l %a6, %d7, (%a0)+, %a6, %acc0 | ||
361 | mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0 | ||
362 | move.l %accext01, %d4 | ||
363 | movclr.l %acc0, %d3 | ||
364 | lsr.l %d1, %d3 | ||
365 | asl.l %d2, %d4 | ||
366 | or.l %d4, %d3 | ||
367 | add.l %d3, (%a0) | ||
368 | lea.l (-5*4, %a0), %a0 | ||
369 | subq.l #1, %d0 | ||
370 | jne 1b | ||
371 | jra .exit | ||
372 | |||
373 | .worder6: | ||
374 | movem.l (%a1), %d7/%a1-%a5 | ||
375 | move.l (%a0)+, %a6 | ||
376 | 1: | ||
377 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
378 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
379 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
380 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 | ||
381 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 | ||
382 | mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0 | ||
383 | move.l %accext01, %d4 | ||
384 | movclr.l %acc0, %d3 | ||
385 | lsr.l %d1, %d3 | ||
386 | asl.l %d2, %d4 | ||
387 | or.l %d4, %d3 | ||
388 | add.l %d3, (%a0) | ||
389 | lea.l (-4*4, %a0), %a0 | ||
390 | subq.l #1, %d0 | ||
391 | jne 1b | ||
392 | jra .exit | ||
393 | |||
394 | .worder5: | ||
395 | movem.l (%a1), %a1-%a5 | ||
396 | move.l (%a0)+, %a6 | ||
397 | 1: | ||
398 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
399 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
400 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
401 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 | ||
402 | mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0 | ||
403 | move.l %accext01, %d4 | ||
404 | movclr.l %acc0, %d3 | ||
405 | lsr.l %d1, %d3 | ||
406 | asl.l %d2, %d4 | ||
407 | or.l %d4, %d3 | ||
408 | add.l %d3, (%a0) | ||
409 | lea.l (-3*4, %a0), %a0 | ||
410 | subq.l #1, %d0 | ||
411 | jne 1b | ||
412 | jra .exit | ||
413 | |||
414 | .worder4: | ||
415 | movem.l (%a1), %a2-%a5 | ||
416 | move.l (%a0)+, %a6 | ||
417 | 1: | ||
418 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
419 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
420 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
421 | mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0 | ||
422 | move.l %accext01, %d4 | ||
423 | movclr.l %acc0, %d3 | ||
424 | lsr.l %d1, %d3 | ||
425 | asl.l %d2, %d4 | ||
426 | or.l %d4, %d3 | ||
427 | add.l %d3, (%a0) | ||
428 | subq.l #8, %a0 | ||
429 | subq.l #1, %d0 | ||
430 | jne 1b | ||
431 | jra .exit | ||
432 | |||
433 | .worder3: | ||
434 | movem.l (%a1), %a3-%a5 | ||
435 | move.l (%a0)+, %a6 | ||
436 | 1: | ||
437 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
438 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
439 | mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 | ||
440 | move.l %accext01, %d4 | ||
441 | movclr.l %acc0, %d3 | ||
442 | lsr.l %d1, %d3 | ||
443 | asl.l %d2, %d4 | ||
444 | or.l %d4, %d3 | ||
445 | add.l %d3, (%a0) | ||
446 | subq.l #4, %a0 | ||
447 | subq.l #1, %d0 | ||
448 | jne 1b | ||
449 | jra .exit | ||
450 | |||
451 | .worder2: | ||
452 | movem.l (%a1), %a4-%a5 | ||
453 | move.l (%a0)+, %a6 | ||
454 | 1: | ||
455 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
456 | mac.l %a6, %a4, %acc0 | data for next iteration is already loaded | ||
457 | move.l %accext01, %d4 | ||
458 | movclr.l %acc0, %d3 | ||
459 | lsr.l %d1, %d3 | ||
460 | asl.l %d2, %d4 | ||
461 | or.l %d4, %d3 | ||
462 | add.l %d3, (%a0) | ||
463 | subq.l #1, %d0 | ||
464 | jne 1b | ||
465 | jra .exit | ||
466 | |||
467 | .worder1: | ||
468 | move.l (%a1), %a5 | ||
469 | move.l (%a0)+, %a6 | ||
470 | 1: | ||
471 | mac.l %a6, %a5, (%a0), %a6, %acc0 | ||
472 | move.l %accext01, %d4 | ||
473 | movclr.l %acc0, %d3 | ||
474 | lsr.l %d1, %d3 | ||
475 | asl.l %d2, %d4 | ||
476 | or.l %d4, %d3 | ||
477 | add.l %a6, %d3 | residual is already in a6 | ||
478 | move.l %d3, (%a0)+ | ||
479 | subq.l #1, %d0 | ||
480 | jne 1b | ||
481 | jra .exit | ||
482 | |||
483 | .wdefault: | ||
484 | /* we do the filtering in an unrolled by 4 loop as far as we can, and then | ||
485 | do the rest by jump table. */ | ||
486 | lea.l (%a1, %d3.l*4), %a2 | need to start in the other end of coefs | ||
487 | move.l %a0, %a3 | working copy of history pointer | ||
488 | move.l %d3, %d4 | ||
489 | lsr.l #2, %d4 | coefs/4, num of iterations needed in next loop | ||
490 | move.l (%a3)+, %a5 | preload data for loop | ||
491 | 1: | ||
492 | lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards | ||
493 | movem.l (%a2), %d5-%d7/%a4 | load four coefs | ||
494 | mac.l %a5, %a4, (%a3)+, %a5, %acc0 | ||
495 | mac.l %a5, %d7, (%a3)+, %a5, %acc0 | ||
496 | mac.l %a5, %d6, (%a3)+, %a5, %acc0 | ||
497 | mac.l %a5, %d5, (%a3)+, %a5, %acc0 | ||
498 | subq.l #1, %d4 | any more unrolled loop operations left? | ||
499 | jne 1b | ||
500 | |||
501 | moveq.l #3, %d4 | mask 0x00000003 | ||
502 | and.l %d3, %d4 | get the remaining samples to be filtered | ||
503 | jmp.l (2, %pc, %d4*2) | then jump into mac.l chain | ||
504 | | jumptable: | ||
505 | bra.b 3f | none left | ||
506 | bra.b 2f | one left | ||
507 | bra.b 1f | two left | ||
508 | | three left | ||
509 | move.l -(%a2), %d4 | ||
510 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | ||
511 | 1: | ||
512 | move.l -(%a2), %d4 | ||
513 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | ||
514 | 2: | ||
515 | move.l -(%a2), %d4 | ||
516 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | ||
517 | 3: | ||
518 | move.l %accext01, %d5 | get high 32 bits of result | ||
519 | movclr.l %acc0, %d4 | get low 32 bits of result | ||
520 | lsr.l %d1, %d4 | shift qlevel bits right | ||
521 | asl.l %d2, %d5 | shift 32 - qlevel bits left | ||
522 | or.l %d5, %d4 | combine top and low bits after shift | ||
523 | add.l %a5, %d4 | add residual, which is in a5 by now | ||
524 | move.l %d4, -(%a3) | save, a3 is also one past save location | ||
525 | addq.l #4, %a0 | increment history pointer | ||
526 | subq.l #1, %d0 | decrement sample count | ||
527 | jne .wdefault | are we done? | ||
281 | | if so, fall through to exit | 528 | | if so, fall through to exit |
282 | 529 | ||
283 | .exit: | 530 | .exit: |
diff --git a/apps/codecs/libffmpegFLAC/coldfire.h b/apps/codecs/libffmpegFLAC/coldfire.h index 5493f549f7..d6f0996935 100644 --- a/apps/codecs/libffmpegFLAC/coldfire.h +++ b/apps/codecs/libffmpegFLAC/coldfire.h | |||
@@ -3,6 +3,9 @@ | |||
3 | 3 | ||
4 | #include "bitstream.h" | 4 | #include "bitstream.h" |
5 | 5 | ||
6 | void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs); | 6 | void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, |
7 | int* coeffs); | ||
8 | void lpc_decode_emac_wide(int blocksize, int qlevel, int pred_order, | ||
9 | int32_t* data, int* coeffs); | ||
7 | 10 | ||
8 | #endif | 11 | #endif |
diff --git a/apps/codecs/libffmpegFLAC/decoder.c b/apps/codecs/libffmpegFLAC/decoder.c index 9c85864e26..4dbae972cb 100644 --- a/apps/codecs/libffmpegFLAC/decoder.c +++ b/apps/codecs/libffmpegFLAC/decoder.c | |||
@@ -262,10 +262,12 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order) | |||
262 | if ((s->bps + coeff_prec + av_log2(pred_order)) <= 32) { | 262 | if ((s->bps + coeff_prec + av_log2(pred_order)) <= 32) { |
263 | #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) | 263 | #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) |
264 | (void)sum; | 264 | (void)sum; |
265 | lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order, decoded + pred_order, coeffs); | 265 | lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order, |
266 | decoded + pred_order, coeffs); | ||
266 | #elif defined(CPU_ARM) && !defined(SIMULATOR) | 267 | #elif defined(CPU_ARM) && !defined(SIMULATOR) |
267 | (void)sum; | 268 | (void)sum; |
268 | lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order, decoded + pred_order, coeffs); | 269 | lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order, |
270 | decoded + pred_order, coeffs); | ||
269 | #else | 271 | #else |
270 | for (i = pred_order; i < s->blocksize; i++) | 272 | for (i = pred_order; i < s->blocksize; i++) |
271 | { | 273 | { |
@@ -276,6 +278,12 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order) | |||
276 | } | 278 | } |
277 | #endif | 279 | #endif |
278 | } else { | 280 | } else { |
281 | #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) | ||
282 | (void)wsum; | ||
283 | (void)j; | ||
284 | lpc_decode_emac_wide(s->blocksize - pred_order, qlevel, pred_order, | ||
285 | decoded + pred_order, coeffs); | ||
286 | #else | ||
279 | for (i = pred_order; i < s->blocksize; i++) | 287 | for (i = pred_order; i < s->blocksize; i++) |
280 | { | 288 | { |
281 | wsum = 0; | 289 | wsum = 0; |
@@ -283,6 +291,7 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order) | |||
283 | wsum += (int64_t)coeffs[j] * (int64_t)decoded[i-j-1]; | 291 | wsum += (int64_t)coeffs[j] * (int64_t)decoded[i-j-1]; |
284 | decoded[i] += wsum >> qlevel; | 292 | decoded[i] += wsum >> qlevel; |
285 | } | 293 | } |
294 | #endif | ||
286 | } | 295 | } |
287 | 296 | ||
288 | return 0; | 297 | return 0; |