diff options
-rw-r--r-- | apps/codecs/libffmpegFLAC/coldfire.S | 237 | ||||
-rw-r--r-- | apps/codecs/libffmpegFLAC/coldfire.h | 8 |
2 files changed, 245 insertions, 0 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S new file mode 100644 index 0000000000..7e19e4b695 --- /dev/null +++ b/apps/codecs/libffmpegFLAC/coldfire.S | |||
@@ -0,0 +1,237 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2005 by Thom Johansen | ||
11 | * | ||
12 | * All files in this archive are subject to the GNU General Public License. | ||
13 | * See the file COPYING in the source tree root for full license agreement. | ||
14 | * | ||
15 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
16 | * KIND, either express or implied. | ||
17 | * | ||
18 | ****************************************************************************/ | ||
19 | |||
20 | /* The following is an assembler optimised version of the LPC filtering | ||
21 | routines needed for FLAC decoding. It is optimised for use with the | ||
22 | MCF5249 processor, or any other similar ColdFire core with the EMAC unit. | ||
23 | All LPC filtering up to order 8 is done in specially optimised unrolled | ||
24 | loops, while every order above this is handled by a slower default routine. | ||
25 | */ | ||
26 | .text | ||
27 | .global lpc_decode_emac | ||
28 | .align 2 | ||
29 | lpc_decode_emac: /* entry point: restores samples in place by adding the LPC prediction to each value in data[]; arguments are read from the stack */ | ||
30 | lea.l (-40, %sp), %sp /* reserve 40 bytes, one 32-bit slot for each of the ten registers saved below */ | ||
31 | movem.l %d2-%d7/%a2-%a5, (%sp) /* save every register the loops overwrite; d0-d1 and a0-a1 are not saved */ | ||
32 | movem.l (40+4, %sp), %d0-%d2/%a0-%a1 /* five arguments, found above the 40-byte save area plus the 4-byte return address */ | ||
33 | /* d0 = blocksize, d1 = qlevel, d2 = pred_order | ||
34 | a0 = data, a1 = coeffs | ||
35 | */ | ||
36 | |||
37 | /* the history pointer always trails the data pointer by 'pred_order' | ||
38 | samples. since we have one loop for each order, we can hard code this | ||
39 | offset and free a register by not keeping a separate data pointer. | ||
40 | */ | ||
41 | move.l %d2, %d3 | ||
42 | neg.l %d3 /* d3 = -pred_order, used to step a0 back by pred_order 32-bit samples */ | ||
43 | lea.l (%a0, %d3.l*4), %a0 | history | ||
44 | clr.l %d3 /* NOTE(review): MACSR is set to 0 here and not restored on return, and %acc0 is never cleared before the first mac.l, so it is presumably expected to be zero on entry - confirm */ | ||
45 | move.l %d3, %macsr | we'll need integer mode for this | ||
46 | tst.l %d0 | ||
47 | jeq .exit | zero samples to process, exit | ||
48 | moveq.l #8, %d3 /* 8 is the highest order that has its own unrolled loop */ | ||
49 | cmp.l %d3, %d2 /* signed comparison. NOTE(review): a negative pred_order is not rejected and would index below .jumptable - assumes callers pass pred_order >= 0 */ | ||
50 | jgt .default | order is over 8, jump to default case | ||
51 | lea.l .jumptable, %a4 | ||
52 | move.l (%a4, %d2.l*4), %a4 /* a4 = .jumptable[pred_order] */ | ||
53 | jmp (%a4) | ||
54 | .align 4 | avoid unaligned fetch | ||
55 | .jumptable: /* one 32-bit code address per order, indexed by pred_order 0..8 */ | ||
56 | .long .exit /* order 0 goes straight to the return path and leaves data[] untouched */ | ||
57 | .long .order1 | ||
58 | .long .order2 | ||
59 | .long .order3 | ||
60 | .long .order4 | ||
61 | .long .order5 | ||
62 | .long .order6 | ||
63 | .long .order7 | ||
64 | .long .order8 | ||
65 | |||
66 | .order8: /* order 8 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
67 | movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs | ||
68 | move.l (%a0)+, %a5 | load first history sample | ||
69 | .loop8: /* one output sample per pass; each mac.l adds a5 * coef to acc0 and fetches the next sample into a5 */ | ||
70 | mac.l %a5, %a4, (%a0)+, %a5, %acc0 /* oldest sample * coeffs[7] (a4 is the last register filled by the movem) */ | ||
71 | mac.l %a5, %a3, (%a0)+, %a5, %acc0 | ||
72 | mac.l %a5, %a2, (%a0)+, %a5, %acc0 | ||
73 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | ||
74 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | ||
75 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | ||
76 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* after this fetch a0 points at the output position */ | ||
77 | mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration | ||
78 | movclr.l %acc0, %d2 | get sum | ||
79 | asr.l %d1, %d2 | shift sum by lp_quantization bits | ||
80 | add.l %d2, (%a0) | add residual and save | ||
81 | lea.l (-6*4, %a0), %a0 | history pointer points at second element | ||
82 | subq.l #1, %d0 | decrement counter | ||
83 | jne .loop8 | are we done? | ||
84 | jra .exit | ||
85 | |||
86 | .order7: /* order 7 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
87 | movem.l (%a1), %d3-%d7/%a2-%a3 /* d3-d7 = coeffs[0..4], a2-a3 = coeffs[5..6] */ | ||
88 | move.l (%a0)+, %a5 /* a5 = oldest history sample, a0 now points at the next one */ | ||
89 | .loop7: /* one output sample per pass; each mac.l adds a5 * coef to acc0 and fetches the next sample into a5 */ | ||
90 | mac.l %a5, %a3, (%a0)+, %a5, %acc0 /* oldest sample * coeffs[6] */ | ||
91 | mac.l %a5, %a2, (%a0)+, %a5, %acc0 | ||
92 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | ||
93 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | ||
94 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | ||
95 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* after this fetch a0 points at the output position */ | ||
96 | mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0 /* newest sample * coeffs[0]; a5 is reloaded with the sample that starts the next pass, a0 does not move */ | ||
97 | movclr.l %acc0, %d2 /* d2 = accumulated sum; acc0 is cleared for the next pass */ | ||
98 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
99 | add.l %d2, (%a0) /* add the prediction to the residual stored at the output position */ | ||
100 | lea.l (-5*4, %a0), %a0 /* rewind a0 to just past the sample now held in a5 */ | ||
101 | subq.l #1, %d0 /* one more sample done */ | ||
102 | jne .loop7 | ||
103 | jra .exit | ||
104 | |||
105 | .order6: /* order 6 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
106 | movem.l (%a1), %d3-%d7/%a2 /* d3-d7 = coeffs[0..4], a2 = coeffs[5] */ | ||
107 | move.l (%a0)+, %a5 /* a5 = oldest history sample, a0 now points at the next one */ | ||
108 | .loop6: /* one output sample per pass; each mac.l adds a5 * coef to acc0 and fetches the next sample into a5 */ | ||
109 | mac.l %a5, %a2, (%a0)+, %a5, %acc0 /* oldest sample * coeffs[5] */ | ||
110 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | ||
111 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | ||
112 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | ||
113 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* after this fetch a0 points at the output position */ | ||
114 | mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0 /* newest sample * coeffs[0]; a5 is reloaded with the sample that starts the next pass, a0 does not move */ | ||
115 | movclr.l %acc0, %d2 /* d2 = accumulated sum; acc0 is cleared for the next pass */ | ||
116 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
117 | add.l %d2, (%a0) /* add the prediction to the residual stored at the output position */ | ||
118 | lea.l (-4*4, %a0), %a0 /* rewind a0 to just past the sample now held in a5 */ | ||
119 | subq.l #1, %d0 /* one more sample done */ | ||
120 | jne .loop6 | ||
121 | jra .exit | ||
122 | |||
123 | .order5: /* order 5 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
124 | movem.l (%a1), %d3-%d7 /* d3-d7 = coeffs[0..4] */ | ||
125 | move.l (%a0)+, %a5 /* a5 = oldest history sample, a0 now points at the next one */ | ||
126 | .loop5: /* one output sample per pass; each mac.l adds a5 * coef to acc0 and fetches the next sample into a5 */ | ||
127 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 /* oldest sample * coeffs[4] */ | ||
128 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | ||
129 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | ||
130 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* after this fetch a0 points at the output position */ | ||
131 | mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0 /* newest sample * coeffs[0]; a5 is reloaded with the sample that starts the next pass, a0 does not move */ | ||
132 | movclr.l %acc0, %d2 /* d2 = accumulated sum; acc0 is cleared for the next pass */ | ||
133 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
134 | add.l %d2, (%a0) /* add the prediction to the residual stored at the output position */ | ||
135 | lea.l (-3*4, %a0), %a0 /* rewind a0 to just past the sample now held in a5 */ | ||
136 | subq.l #1, %d0 /* one more sample done */ | ||
137 | jne .loop5 | ||
138 | jra .exit | ||
139 | |||
140 | .order4: /* order 4 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
141 | movem.l (%a1), %d3-%d6 /* d3-d6 = coeffs[0..3] */ | ||
142 | move.l (%a0)+, %a5 /* a5 = oldest history sample, a0 now points at the next one */ | ||
143 | .loop4: /* one output sample per pass; each mac.l adds a5 * coef to acc0 and fetches the next sample into a5 */ | ||
144 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 /* oldest sample * coeffs[3] */ | ||
145 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | ||
146 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* after this fetch a0 points at the output position */ | ||
147 | mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0 /* newest sample * coeffs[0]; a5 is reloaded with the sample that starts the next pass, a0 does not move */ | ||
148 | movclr.l %acc0, %d2 /* d2 = accumulated sum; acc0 is cleared for the next pass */ | ||
149 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
150 | add.l %d2, (%a0) /* add the prediction to the residual stored at the output position */ | ||
151 | subq.l #8, %a0 /* rewind two samples, to just past the sample now held in a5 */ | ||
152 | subq.l #1, %d0 /* one more sample done */ | ||
153 | jne .loop4 | ||
154 | jra .exit | ||
155 | |||
156 | .order3: /* order 3 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
157 | movem.l (%a1), %d3-%d5 /* d3-d5 = coeffs[0..2] */ | ||
158 | move.l (%a0)+, %a5 /* a5 = oldest history sample, a0 now points at the next one */ | ||
159 | .loop3: /* one output sample per pass; each mac.l adds a5 * coef to acc0 and fetches the next sample into a5 */ | ||
160 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 /* oldest sample * coeffs[2] */ | ||
161 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* after this fetch a0 points at the output position */ | ||
162 | mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0 /* newest sample * coeffs[0]; a5 is reloaded with the sample that starts the next pass, a0 does not move */ | ||
163 | movclr.l %acc0, %d2 /* d2 = accumulated sum; acc0 is cleared for the next pass */ | ||
164 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
165 | add.l %d2, (%a0) /* add the prediction to the residual stored at the output position */ | ||
166 | subq.l #4, %a0 /* rewind one sample, to just past the sample now held in a5 */ | ||
167 | subq.l #1, %d0 /* one more sample done */ | ||
168 | jne .loop3 | ||
169 | jra .exit | ||
170 | |||
171 | .order2: /* order 2 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = oldest history sample, a1 = coeffs */ | ||
172 | movem.l (%a1), %d3-%d4 /* d3 = coeffs[0], d4 = coeffs[1] */ | ||
173 | move.l (%a0)+, %a5 /* a5 = oldest history sample, a0 now points at the newer one */ | ||
174 | .loop2: /* state at the top of every pass: a5 = older of the two samples, a0 points at the newer one */ | ||
175 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 /* older sample * coeffs[1]; fetch the newer sample, a0 moves on to the output position */ | ||
176 | mac.l %a5, %d3, %acc0 | data for next iteration is already loaded | ||
177 | movclr.l %acc0, %d2 /* d2 = accumulated sum; acc0 is cleared for the next pass */ | ||
178 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
179 | add.l %d2, (%a0) /* add the prediction to the residual; a0 is already where the next pass needs it, so no rewind */ | ||
180 | subq.l #1, %d0 /* one more sample done */ | ||
181 | jne .loop2 | ||
182 | jra .exit | ||
183 | |||
184 | .order1: /* order 1 filter. on entry: d0 = samples left (non-zero), d1 = qlevel, a0 = previous sample, a1 = coeffs */ | ||
185 | | no point in using mac here | ||
186 | move.l (%a1), %d3 /* d3 = coeffs[0], kept for the whole loop */ | ||
187 | .loop1: /* one output sample per pass */ | ||
188 | move.l %d3, %d2 /* muls.l overwrites its destination, so multiply a copy of the coef */ | ||
189 | muls.l (%a0)+, %d2 /* d2 = coeffs[0] * previous sample; a0 advances to the output position */ | ||
190 | asr.l %d1, %d2 /* arithmetic shift right by qlevel */ | ||
191 | add.l %d2, (%a0) /* add the prediction to the residual; the result is the previous sample of the next pass */ | ||
192 | subq.l #1, %d0 /* one more sample done */ | ||
193 | jne .loop1 | ||
194 | jra .exit | ||
195 | |||
196 | .default: /* generic path, reached when pred_order > 8; everything down to the jne .default runs once per output sample */ | ||
197 | /* for each output sample the coefs are consumed four at a time in an | ||
198 | unrolled loop as far as possible; the remaining pred_order % 4 coefs are then handled one at a time. | ||
199 | */ | ||
200 | lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs | ||
201 | move.l %a0, %a3 | working copy of history pointer | ||
202 | move.l %d2, %d3 | ||
203 | lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop | ||
204 | move.l (%a3)+, %a5 | preload data for loop | ||
205 | .dloop1: /* runs pred_order/4 times; pred_order > 8 on this path, so the count is at least 2 and the test at the bottom is safe */ | ||
206 | lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards | ||
207 | movem.l (%a2), %d4-%d7 | load four coefs | ||
208 | mac.l %a5, %d7, (%a3)+, %a5, %acc0 /* d7 holds the highest-indexed coef of the group and pairs with the oldest sample of the group */ | ||
209 | mac.l %a5, %d6, (%a3)+, %a5, %acc0 | ||
210 | mac.l %a5, %d5, (%a3)+, %a5, %acc0 | ||
211 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | ||
212 | subq.l #1, %d3 | any more unrolled loop operations left? | ||
213 | jne .dloop1 | ||
214 | |||
215 | move.l %d2, %d3 /* d3 = pred_order again; the unrolled loop counted its copy down to zero */ | ||
216 | moveq.l #3, %d4 | mask 0x00000003 | ||
217 | and.l %d4, %d3 | get the remaining samples to be filtered | ||
218 | jeq .dsave | no remaining samples | ||
219 | .dloop2: /* handles the pred_order % 4 coefs left over, walking a2 further down towards coeffs[0] */ | ||
220 | move.l -(%a2), %d4 | get lpc coef | ||
221 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 /* same multiply-and-fetch step as in the unrolled loop */ | ||
222 | subq.l #1, %d3 | any more iterations left? | ||
223 | jne .dloop2 | ||
224 | .dsave: /* acc0 now holds the sum over all pred_order coefs */ | ||
225 | movclr.l %acc0, %d3 | get result | ||
226 | asr.l %d1, %d3 | shift lp_quantization bits right | ||
227 | subq.l #4, %a3 | we're one past the save location | ||
228 | add.l %d3, (%a3) | add residual and save | ||
229 | addq.l #4, %a0 | increment history pointer | ||
230 | subq.l #1, %d0 | decrement data_len | ||
231 | jne .default | are we done? | ||
232 | | if so, fall through to exit | ||
233 | |||
234 | .exit: /* common return path for every filter order and for the zero-sample case */ | ||
235 | movem.l (%sp), %d2-%d7/%a2-%a5 /* restore the ten registers stored at function entry */ | ||
236 | lea.l (40, %sp), %sp /* release the 40-byte save area */ | ||
237 | rts | ||
diff --git a/apps/codecs/libffmpegFLAC/coldfire.h b/apps/codecs/libffmpegFLAC/coldfire.h new file mode 100644 index 0000000000..5493f549f7 --- /dev/null +++ b/apps/codecs/libffmpegFLAC/coldfire.h | |||
@@ -0,0 +1,8 @@ | |||
1 | #ifndef _FLAC_COLDFIRE_H | ||
2 | #define _FLAC_COLDFIRE_H | ||
3 | |||
4 | #include "bitstream.h" /* presumably supplies int32_t - confirm */ | ||
5 | |||
6 | void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs); /* implemented in coldfire.S; adds the LPC prediction to each of the blocksize values in data[] in place and reads the pred_order values before data[0] as history */ | ||
7 | |||
8 | #endif /* _FLAC_COLDFIRE_H */ | ||