diff options
Diffstat (limited to 'apps')
-rw-r--r-- | apps/codecs/libffmpegFLAC/coldfire.S | 212 |
1 file changed, 132 insertions, 80 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
index 33964cdbc1..1d144ecc76 100644
--- a/apps/codecs/libffmpegFLAC/coldfire.S
+++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -20,16 +20,16 @@ | |||
20 | /* The following is an assembler optimised version of the LPC filtering | 20 | /* The following is an assembler optimised version of the LPC filtering |
21 | routines needed for FLAC decoding. It is optimised for use with the | 21 | routines needed for FLAC decoding. It is optimised for use with the |
22 | MCF5249 processor, or any other similar ColdFire core with the EMAC unit. | 22 | MCF5249 processor, or any other similar ColdFire core with the EMAC unit. |
23 | All LPC filtering up to order 8 is done in specially optimised unrolled | 23 | All LPC filtering up to order 10 is done in specially optimised unrolled |
24 | loops, while every order above this is handled by a slower default routine. | 24 | loops, while every order above this is handled by a slower default routine. |
25 | */ | 25 | */ |
26 | .section .icode,"ax",@progbits | 26 | .section .icode,"ax",@progbits |
27 | .global lpc_decode_emac | 27 | .global lpc_decode_emac |
28 | .align 2 | 28 | .align 2 |
29 | lpc_decode_emac: | 29 | lpc_decode_emac: |
30 | lea.l (-40, %sp), %sp | 30 | lea.l (-44, %sp), %sp |
31 | movem.l %d2-%d7/%a2-%a5, (%sp) | 31 | movem.l %d2-%d7/%a2-%a6, (%sp) |
32 | movem.l (40+4, %sp), %d0-%d2/%a0-%a1 | 32 | movem.l (44+4, %sp), %d0-%d2/%a0-%a1 |
33 | /* d0 = blocksize, d1 = qlevel, d2 = pred_order | 33 | /* d0 = blocksize, d1 = qlevel, d2 = pred_order |
34 | a0 = data, a1 = coeffs | 34 | a0 = data, a1 = coeffs |
35 | */ | 35 | */ |
@@ -39,17 +39,17 @@ lpc_decode_emac: | |||
39 | and free a register by not saving data pointer. | 39 | and free a register by not saving data pointer. |
40 | */ | 40 | */ |
41 | move.l %d2, %d3 | 41 | move.l %d2, %d3 |
42 | neg.l %d3 | 42 | neg.l %d3 |
43 | lea.l (%a0, %d3.l*4), %a0 | history | 43 | lea.l (%a0, %d3.l*4), %a0 | history |
44 | clr.l %d3 | 44 | clr.l %d3 |
45 | move.l %d3, %macsr | we'll need integer mode for this | 45 | move.l %d3, %macsr | we'll need integer mode for this |
46 | tst.l %d0 | 46 | tst.l %d0 |
47 | jeq .exit | zero samples to process, exit | 47 | jeq .exit | zero samples to process, exit |
48 | moveq.l #8, %d3 | 48 | moveq.l #10, %d3 |
49 | cmp.l %d3, %d2 | 49 | cmp.l %d3, %d2 |
50 | jgt .default | order is over 8, jump to default case | 50 | jgt .default | order is over 10, jump to default case |
51 | jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order | 51 | jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order |
52 | .jumptable: | 52 | | jumptable: |
53 | bra.w .exit | zero order filter isn't possible, exit function | 53 | bra.w .exit | zero order filter isn't possible, exit function |
54 | bra.w .order1 | 54 | bra.w .order1 |
55 | bra.w .order2 | 55 | bra.w .order2 |
@@ -58,39 +58,84 @@ lpc_decode_emac: | |||
58 | bra.w .order5 | 58 | bra.w .order5 |
59 | bra.w .order6 | 59 | bra.w .order6 |
60 | bra.w .order7 | 60 | bra.w .order7 |
61 | bra.w .order8 | ||
62 | bra.w .order9 | ||
61 | 63 | ||
62 | | last jump table entry coincides with target, so leave it out | 64 | | last jump table entry coincides with target, so leave it out |
63 | .order8: | 65 | .order10: |
64 | movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs | 66 | movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs |
65 | move.l (%a0)+, %a5 | load first history sample | 67 | move.l (%a0)+, %a6 | load first history sample |
66 | .loop8: | 68 | .loop10: |
67 | mac.l %a5, %a4, (%a0)+, %a5, %acc0 | 69 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
68 | mac.l %a5, %a3, (%a0)+, %a5, %acc0 | 70 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
69 | mac.l %a5, %a2, (%a0)+, %a5, %acc0 | 71 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
70 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | 72 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 |
71 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | 73 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 |
72 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | 74 | mac.l %a6, %d7, (%a0)+, %a6, %acc0 |
73 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 75 | mac.l %a6, %d6, (%a0)+, %a6, %acc0 |
74 | mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration | 76 | mac.l %a6, %d5, (%a0)+, %a6, %acc0 |
77 | mac.l %a6, %d4, (%a0)+, %a6, %acc0 | ||
78 | mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration | ||
75 | movclr.l %acc0, %d2 | get sum | 79 | movclr.l %acc0, %d2 | get sum |
76 | asr.l %d1, %d2 | shift sum by lp_quantization bits | 80 | asr.l %d1, %d2 | shift sum by qlevel bits |
77 | add.l %d2, (%a0) | add residual and save | 81 | add.l %d2, (%a0) | add residual and save |
78 | lea.l (-6*4, %a0), %a0 | point history back at second element | 82 | lea.l (-8*4, %a0), %a0 | point history back at second element |
79 | subq.l #1, %d0 | decrement counter | 83 | subq.l #1, %d0 | decrement sample count |
80 | jne .loop8 | are we done? | 84 | jne .loop10 | are we done? |
85 | jra .exit | ||
86 | |||
87 | .order9: | ||
88 | movem.l (%a1), %d4-%d7/%a1-%a5 | ||
89 | move.l (%a0)+, %a6 | ||
90 | .loop9: | ||
91 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
92 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
93 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
94 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 | ||
95 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 | ||
96 | mac.l %a6, %d7, (%a0)+, %a6, %acc0 | ||
97 | mac.l %a6, %d6, (%a0)+, %a6, %acc0 | ||
98 | mac.l %a6, %d5, (%a0)+, %a6, %acc0 | ||
99 | mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0 | ||
100 | movclr.l %acc0, %d2 | ||
101 | asr.l %d1, %d2 | ||
102 | add.l %d2, (%a0) | ||
103 | lea.l (-7*4, %a0), %a0 | ||
104 | subq.l #1, %d0 | ||
105 | jne .loop9 | ||
106 | jra .exit | ||
107 | |||
108 | .order8: | ||
109 | movem.l (%a1), %d5-%d7/%a1-%a5 | ||
110 | move.l (%a0)+, %a6 | ||
111 | .loop8: | ||
112 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 | ||
113 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 | ||
114 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 | ||
115 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 | ||
116 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 | ||
117 | mac.l %a6, %d7, (%a0)+, %a6, %acc0 | ||
118 | mac.l %a6, %d6, (%a0)+, %a6, %acc0 | ||
119 | mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | ||
120 | movclr.l %acc0, %d2 | ||
121 | asr.l %d1, %d2 | ||
122 | add.l %d2, (%a0) | ||
123 | lea.l (-6*4, %a0), %a0 | ||
124 | subq.l #1, %d0 | ||
125 | jne .loop8 | ||
81 | jra .exit | 126 | jra .exit |
82 | 127 | ||
83 | .order7: | 128 | .order7: |
84 | movem.l (%a1), %d3-%d7/%a2-%a3 | 129 | movem.l (%a1), %d6-%d7/%a1-%a5 |
85 | move.l (%a0)+, %a5 | 130 | move.l (%a0)+, %a6 |
86 | .loop7: | 131 | .loop7: |
87 | mac.l %a5, %a3, (%a0)+, %a5, %acc0 | 132 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
88 | mac.l %a5, %a2, (%a0)+, %a5, %acc0 | 133 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
89 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | 134 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
90 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | 135 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 |
91 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | 136 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 |
92 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 137 | mac.l %a6, %d7, (%a0)+, %a6, %acc0 |
93 | mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0 | 138 | mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0 |
94 | movclr.l %acc0, %d2 | 139 | movclr.l %acc0, %d2 |
95 | asr.l %d1, %d2 | 140 | asr.l %d1, %d2 |
96 | add.l %d2, (%a0) | 141 | add.l %d2, (%a0) |
@@ -100,15 +145,15 @@ lpc_decode_emac: | |||
100 | jra .exit | 145 | jra .exit |
101 | 146 | ||
102 | .order6: | 147 | .order6: |
103 | movem.l (%a1), %d3-%d7/%a2 | 148 | movem.l (%a1), %d7/%a1-%a5 |
104 | move.l (%a0)+, %a5 | 149 | move.l (%a0)+, %a6 |
105 | .loop6: | 150 | .loop6: |
106 | mac.l %a5, %a2, (%a0)+, %a5, %acc0 | 151 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
107 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | 152 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
108 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | 153 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
109 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | 154 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 |
110 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 155 | mac.l %a6, %a1, (%a0)+, %a6, %acc0 |
111 | mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0 | 156 | mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0 |
112 | movclr.l %acc0, %d2 | 157 | movclr.l %acc0, %d2 |
113 | asr.l %d1, %d2 | 158 | asr.l %d1, %d2 |
114 | add.l %d2, (%a0) | 159 | add.l %d2, (%a0) |
@@ -118,14 +163,14 @@ lpc_decode_emac: | |||
118 | jra .exit | 163 | jra .exit |
119 | 164 | ||
120 | .order5: | 165 | .order5: |
121 | movem.l (%a1), %d3-%d7 | 166 | movem.l (%a1), %a1-%a5 |
122 | move.l (%a0)+, %a5 | 167 | move.l (%a0)+, %a6 |
123 | .loop5: | 168 | .loop5: |
124 | mac.l %a5, %d7, (%a0)+, %a5, %acc0 | 169 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
125 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | 170 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
126 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | 171 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
127 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 172 | mac.l %a6, %a2, (%a0)+, %a6, %acc0 |
128 | mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0 | 173 | mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0 |
129 | movclr.l %acc0, %d2 | 174 | movclr.l %acc0, %d2 |
130 | asr.l %d1, %d2 | 175 | asr.l %d1, %d2 |
131 | add.l %d2, (%a0) | 176 | add.l %d2, (%a0) |
@@ -135,13 +180,13 @@ lpc_decode_emac: | |||
135 | jra .exit | 180 | jra .exit |
136 | 181 | ||
137 | .order4: | 182 | .order4: |
138 | movem.l (%a1), %d3-%d6 | 183 | movem.l (%a1), %a2-%a5 |
139 | move.l (%a0)+, %a5 | 184 | move.l (%a0)+, %a6 |
140 | .loop4: | 185 | .loop4: |
141 | mac.l %a5, %d6, (%a0)+, %a5, %acc0 | 186 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
142 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | 187 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
143 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 188 | mac.l %a6, %a3, (%a0)+, %a6, %acc0 |
144 | mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0 | 189 | mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0 |
145 | movclr.l %acc0, %d2 | 190 | movclr.l %acc0, %d2 |
146 | asr.l %d1, %d2 | 191 | asr.l %d1, %d2 |
147 | add.l %d2, (%a0) | 192 | add.l %d2, (%a0) |
@@ -151,12 +196,12 @@ lpc_decode_emac: | |||
151 | jra .exit | 196 | jra .exit |
152 | 197 | ||
153 | .order3: | 198 | .order3: |
154 | movem.l (%a1), %d3-%d5 | 199 | movem.l (%a1), %a3-%a5 |
155 | move.l (%a0)+, %a5 | 200 | move.l (%a0)+, %a6 |
156 | .loop3: | 201 | .loop3: |
157 | mac.l %a5, %d5, (%a0)+, %a5, %acc0 | 202 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
158 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 203 | mac.l %a6, %a4, (%a0)+, %a6, %acc0 |
159 | mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0 | 204 | mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 |
160 | movclr.l %acc0, %d2 | 205 | movclr.l %acc0, %d2 |
161 | asr.l %d1, %d2 | 206 | asr.l %d1, %d2 |
162 | add.l %d2, (%a0) | 207 | add.l %d2, (%a0) |
@@ -166,11 +211,11 @@ lpc_decode_emac: | |||
166 | jra .exit | 211 | jra .exit |
167 | 212 | ||
168 | .order2: | 213 | .order2: |
169 | movem.l (%a1), %d3-%d4 | 214 | movem.l (%a1), %a4-%a5 |
170 | move.l (%a0)+, %a5 | 215 | move.l (%a0)+, %a6 |
171 | .loop2: | 216 | .loop2: |
172 | mac.l %a5, %d4, (%a0)+, %a5, %acc0 | 217 | mac.l %a6, %a5, (%a0)+, %a6, %acc0 |
173 | mac.l %a5, %d3, %acc0 | data for next iteration is already loaded | 218 | mac.l %a6, %a4, %acc0 | data for next iteration is already loaded |
174 | movclr.l %acc0, %d2 | 219 | movclr.l %acc0, %d2 |
175 | asr.l %d1, %d2 | 220 | asr.l %d1, %d2 |
176 | add.l %d2, (%a0) | 221 | add.l %d2, (%a0) |
@@ -180,9 +225,9 @@ lpc_decode_emac: | |||
180 | 225 | ||
181 | .order1: | 226 | .order1: |
182 | | no point in using mac here | 227 | | no point in using mac here |
183 | move.l (%a1), %d3 | 228 | move.l (%a1), %a5 |
184 | .loop1: | 229 | .loop1: |
185 | move.l %d3, %d2 | 230 | move.l %a5, %d2 |
186 | muls.l (%a0)+, %d2 | 231 | muls.l (%a0)+, %d2 |
187 | asr.l %d1, %d2 | 232 | asr.l %d1, %d2 |
188 | add.l %d2, (%a0) | 233 | add.l %d2, (%a0) |
@@ -192,8 +237,7 @@ lpc_decode_emac: | |||
192 | 237 | ||
193 | .default: | 238 | .default: |
194 | /* we do the filtering in an unrolled by 4 loop as far as we can, and then | 239 | /* we do the filtering in an unrolled by 4 loop as far as we can, and then |
195 | do the rest in an ordinary one by one sample loop. | 240 | do the rest by jump table. */ |
196 | */ | ||
197 | lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs | 241 | lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs |
198 | move.l %a0, %a3 | working copy of history pointer | 242 | move.l %a0, %a3 | working copy of history pointer |
199 | move.l %d2, %d3 | 243 | move.l %d2, %d3 |
@@ -209,26 +253,34 @@ lpc_decode_emac: | |||
209 | subq.l #1, %d3 | any more unrolled loop operations left? | 253 | subq.l #1, %d3 | any more unrolled loop operations left? |
210 | jne .dloop1 | 254 | jne .dloop1 |
211 | 255 | ||
212 | move.l %d2, %d3 | 256 | moveq.l #3, %d3 | mask 0x00000003 |
213 | moveq.l #3, %d4 | mask 0x00000003 | 257 | and.l %d2, %d3 | get the remaining samples to be filtered |
214 | and.l %d4, %d3 | get the remaining samples to be filtered | 258 | jmp.l (2, %pc, %d3*2) | then jump into mac.l chain |
215 | jeq .dsave | no remaining samples | 259 | | jumptable: |
216 | .dloop2: | 260 | bra.b .dsave |
217 | move.l -(%a2), %d4 | get lpc coef | 261 | bra.b .oneleft |
262 | bra.b .twoleft | ||
263 | | implicit .threeleft | ||
264 | move.l -(%a2), %d4 | ||
265 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | ||
266 | .twoleft: | ||
267 | move.l -(%a2), %d4 | ||
218 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | 268 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 |
219 | subq.l #1, %d3 | any more iterations left? | 269 | .oneleft: |
220 | jne .dloop2 | 270 | move.l -(%a2), %d4 |
271 | mac.l %a5, %d4, (%a3)+, %a5, %acc0 | need this fetch to not break line below | ||
272 | |||
221 | .dsave: | 273 | .dsave: |
222 | movclr.l %acc0, %d3 | get result | ||
223 | asr.l %d1, %d3 | shift lp_quantization bits right | ||
224 | subq.l #4, %a3 | we're one past the save location | 274 | subq.l #4, %a3 | we're one past the save location |
275 | movclr.l %acc0, %d3 | get result | ||
276 | asr.l %d1, %d3 | shift qlevel bits right | ||
225 | add.l %d3, (%a3) | add residual and save | 277 | add.l %d3, (%a3) | add residual and save |
226 | addq.l #4, %a0 | increment history pointer | 278 | addq.l #4, %a0 | increment history pointer |
227 | subq.l #1, %d0 | decrement data_len | 279 | subq.l #1, %d0 | decrement sample count |
228 | jne .default | are we done? | 280 | jne .default | are we done? |
229 | | if so, fall through to exit | 281 | | if so, fall through to exit |
230 | 282 | ||
231 | .exit: | 283 | .exit: |
232 | movem.l (%sp), %d2-%d7/%a2-%a5 | 284 | movem.l (%sp), %d2-%d7/%a2-%a6 |
233 | lea.l (40, %sp), %sp | 285 | lea.l (44, %sp), %sp |
234 | rts | 286 | rts |