summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Thom Johansen <thomj@rockbox.org> 2005-11-05 17:54:37 +0000
committer: Thom Johansen <thomj@rockbox.org> 2005-11-05 17:54:37 +0000
commit: 97a21a3b36aa74d133af5bf5411cbf6d576f8a86 (patch)
tree: 85e235134730fdb1c8bc3bcc6f5e5f1b70d12c6a
parent: 63fbc0729f66ad55413579da4cb93b9ea51db223 (diff)
download: rockbox-97a21a3b36aa74d133af5bf5411cbf6d576f8a86.tar.gz
download: rockbox-97a21a3b36aa74d133af5bf5411cbf6d576f8a86.zip
Unrolled loops up to order 10 plus slight optimisation of default case.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@7759 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/libffmpegFLAC/coldfire.S 212
1 file changed, 132 insertions, 80 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
index 33964cdbc1..1d144ecc76 100644
--- a/apps/codecs/libffmpegFLAC/coldfire.S
+++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -20,16 +20,16 @@
20/* The following is an assembler optimised version of the LPC filtering 20/* The following is an assembler optimised version of the LPC filtering
21 routines needed for FLAC decoding. It is optimised for use with the 21 routines needed for FLAC decoding. It is optimised for use with the
22 MCF5249 processor, or any other similar ColdFire core with the EMAC unit. 22 MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
23 All LPC filtering up to order 8 is done in specially optimised unrolled 23 All LPC filtering up to order 10 is done in specially optimised unrolled
24 loops, while every order above this is handled by a slower default routine. 24 loops, while every order above this is handled by a slower default routine.
25 */ 25 */
26 .section .icode,"ax",@progbits 26 .section .icode,"ax",@progbits
27 .global lpc_decode_emac 27 .global lpc_decode_emac
28 .align 2 28 .align 2
29lpc_decode_emac: 29lpc_decode_emac:
30 lea.l (-40, %sp), %sp 30 lea.l (-44, %sp), %sp
31 movem.l %d2-%d7/%a2-%a5, (%sp) 31 movem.l %d2-%d7/%a2-%a6, (%sp)
32 movem.l (40+4, %sp), %d0-%d2/%a0-%a1 32 movem.l (44+4, %sp), %d0-%d2/%a0-%a1
33 /* d0 = blocksize, d1 = qlevel, d2 = pred_order 33 /* d0 = blocksize, d1 = qlevel, d2 = pred_order
34 a0 = data, a1 = coeffs 34 a0 = data, a1 = coeffs
35 */ 35 */
@@ -39,17 +39,17 @@ lpc_decode_emac:
39 and free a register by not saving data pointer. 39 and free a register by not saving data pointer.
40 */ 40 */
41 move.l %d2, %d3 41 move.l %d2, %d3
42 neg.l %d3 42 neg.l %d3
43 lea.l (%a0, %d3.l*4), %a0 | history 43 lea.l (%a0, %d3.l*4), %a0 | history
44 clr.l %d3 44 clr.l %d3
45 move.l %d3, %macsr | we'll need integer mode for this 45 move.l %d3, %macsr | we'll need integer mode for this
46 tst.l %d0 46 tst.l %d0
47 jeq .exit | zero samples to process, exit 47 jeq .exit | zero samples to process, exit
48 moveq.l #8, %d3 48 moveq.l #10, %d3
49 cmp.l %d3, %d2 49 cmp.l %d3, %d2
50 jgt .default | order is over 8, jump to default case 50 jgt .default | order is over 10, jump to default case
51 jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order 51 jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order
52.jumptable: 52| jumptable:
53 bra.w .exit | zero order filter isn't possible, exit function 53 bra.w .exit | zero order filter isn't possible, exit function
54 bra.w .order1 54 bra.w .order1
55 bra.w .order2 55 bra.w .order2
@@ -58,39 +58,84 @@ lpc_decode_emac:
58 bra.w .order5 58 bra.w .order5
59 bra.w .order6 59 bra.w .order6
60 bra.w .order7 60 bra.w .order7
61 bra.w .order8
62 bra.w .order9
61 63
62| last jump table entry coincides with target, so leave it out 64| last jump table entry coincides with target, so leave it out
63.order8: 65.order10:
64 movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs 66 movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
65 move.l (%a0)+, %a5 | load first history sample 67 move.l (%a0)+, %a6 | load first history sample
66.loop8: 68.loop10:
67 mac.l %a5, %a4, (%a0)+, %a5, %acc0 69 mac.l %a6, %a5, (%a0)+, %a6, %acc0
68 mac.l %a5, %a3, (%a0)+, %a5, %acc0 70 mac.l %a6, %a4, (%a0)+, %a6, %acc0
69 mac.l %a5, %a2, (%a0)+, %a5, %acc0 71 mac.l %a6, %a3, (%a0)+, %a6, %acc0
70 mac.l %a5, %d7, (%a0)+, %a5, %acc0 72 mac.l %a6, %a2, (%a0)+, %a6, %acc0
71 mac.l %a5, %d6, (%a0)+, %a5, %acc0 73 mac.l %a6, %a1, (%a0)+, %a6, %acc0
72 mac.l %a5, %d5, (%a0)+, %a5, %acc0 74 mac.l %a6, %d7, (%a0)+, %a6, %acc0
73 mac.l %a5, %d4, (%a0)+, %a5, %acc0 75 mac.l %a6, %d6, (%a0)+, %a6, %acc0
74 mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration 76 mac.l %a6, %d5, (%a0)+, %a6, %acc0
77 mac.l %a6, %d4, (%a0)+, %a6, %acc0
78 mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration
75 movclr.l %acc0, %d2 | get sum 79 movclr.l %acc0, %d2 | get sum
76 asr.l %d1, %d2 | shift sum by lp_quantization bits 80 asr.l %d1, %d2 | shift sum by qlevel bits
77 add.l %d2, (%a0) | add residual and save 81 add.l %d2, (%a0) | add residual and save
78 lea.l (-6*4, %a0), %a0 | point history back at second element 82 lea.l (-8*4, %a0), %a0 | point history back at second element
79 subq.l #1, %d0 | decrement counter 83 subq.l #1, %d0 | decrement sample count
80 jne .loop8 | are we done? 84 jne .loop10 | are we done?
85 jra .exit
86
87.order9:
88 movem.l (%a1), %d4-%d7/%a1-%a5
89 move.l (%a0)+, %a6
90.loop9:
91 mac.l %a6, %a5, (%a0)+, %a6, %acc0
92 mac.l %a6, %a4, (%a0)+, %a6, %acc0
93 mac.l %a6, %a3, (%a0)+, %a6, %acc0
94 mac.l %a6, %a2, (%a0)+, %a6, %acc0
95 mac.l %a6, %a1, (%a0)+, %a6, %acc0
96 mac.l %a6, %d7, (%a0)+, %a6, %acc0
97 mac.l %a6, %d6, (%a0)+, %a6, %acc0
98 mac.l %a6, %d5, (%a0)+, %a6, %acc0
99 mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0
100 movclr.l %acc0, %d2
101 asr.l %d1, %d2
102 add.l %d2, (%a0)
103 lea.l (-7*4, %a0), %a0
104 subq.l #1, %d0
105 jne .loop9
106 jra .exit
107
108.order8:
109 movem.l (%a1), %d5-%d7/%a1-%a5
110 move.l (%a0)+, %a6
111.loop8:
112 mac.l %a6, %a5, (%a0)+, %a6, %acc0
113 mac.l %a6, %a4, (%a0)+, %a6, %acc0
114 mac.l %a6, %a3, (%a0)+, %a6, %acc0
115 mac.l %a6, %a2, (%a0)+, %a6, %acc0
116 mac.l %a6, %a1, (%a0)+, %a6, %acc0
117 mac.l %a6, %d7, (%a0)+, %a6, %acc0
118 mac.l %a6, %d6, (%a0)+, %a6, %acc0
119 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0
120 movclr.l %acc0, %d2
121 asr.l %d1, %d2
122 add.l %d2, (%a0)
123 lea.l (-6*4, %a0), %a0
124 subq.l #1, %d0
125 jne .loop8
81 jra .exit 126 jra .exit
82 127
83.order7: 128.order7:
84 movem.l (%a1), %d3-%d7/%a2-%a3 129 movem.l (%a1), %d6-%d7/%a1-%a5
85 move.l (%a0)+, %a5 130 move.l (%a0)+, %a6
86.loop7: 131.loop7:
87 mac.l %a5, %a3, (%a0)+, %a5, %acc0 132 mac.l %a6, %a5, (%a0)+, %a6, %acc0
88 mac.l %a5, %a2, (%a0)+, %a5, %acc0 133 mac.l %a6, %a4, (%a0)+, %a6, %acc0
89 mac.l %a5, %d7, (%a0)+, %a5, %acc0 134 mac.l %a6, %a3, (%a0)+, %a6, %acc0
90 mac.l %a5, %d6, (%a0)+, %a5, %acc0 135 mac.l %a6, %a2, (%a0)+, %a6, %acc0
91 mac.l %a5, %d5, (%a0)+, %a5, %acc0 136 mac.l %a6, %a1, (%a0)+, %a6, %acc0
92 mac.l %a5, %d4, (%a0)+, %a5, %acc0 137 mac.l %a6, %d7, (%a0)+, %a6, %acc0
93 mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0 138 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
94 movclr.l %acc0, %d2 139 movclr.l %acc0, %d2
95 asr.l %d1, %d2 140 asr.l %d1, %d2
96 add.l %d2, (%a0) 141 add.l %d2, (%a0)
@@ -100,15 +145,15 @@ lpc_decode_emac:
100 jra .exit 145 jra .exit
101 146
102.order6: 147.order6:
103 movem.l (%a1), %d3-%d7/%a2 148 movem.l (%a1), %d7/%a1-%a5
104 move.l (%a0)+, %a5 149 move.l (%a0)+, %a6
105.loop6: 150.loop6:
106 mac.l %a5, %a2, (%a0)+, %a5, %acc0 151 mac.l %a6, %a5, (%a0)+, %a6, %acc0
107 mac.l %a5, %d7, (%a0)+, %a5, %acc0 152 mac.l %a6, %a4, (%a0)+, %a6, %acc0
108 mac.l %a5, %d6, (%a0)+, %a5, %acc0 153 mac.l %a6, %a3, (%a0)+, %a6, %acc0
109 mac.l %a5, %d5, (%a0)+, %a5, %acc0 154 mac.l %a6, %a2, (%a0)+, %a6, %acc0
110 mac.l %a5, %d4, (%a0)+, %a5, %acc0 155 mac.l %a6, %a1, (%a0)+, %a6, %acc0
111 mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0 156 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
112 movclr.l %acc0, %d2 157 movclr.l %acc0, %d2
113 asr.l %d1, %d2 158 asr.l %d1, %d2
114 add.l %d2, (%a0) 159 add.l %d2, (%a0)
@@ -118,14 +163,14 @@ lpc_decode_emac:
118 jra .exit 163 jra .exit
119 164
120.order5: 165.order5:
121 movem.l (%a1), %d3-%d7 166 movem.l (%a1), %a1-%a5
122 move.l (%a0)+, %a5 167 move.l (%a0)+, %a6
123.loop5: 168.loop5:
124 mac.l %a5, %d7, (%a0)+, %a5, %acc0 169 mac.l %a6, %a5, (%a0)+, %a6, %acc0
125 mac.l %a5, %d6, (%a0)+, %a5, %acc0 170 mac.l %a6, %a4, (%a0)+, %a6, %acc0
126 mac.l %a5, %d5, (%a0)+, %a5, %acc0 171 mac.l %a6, %a3, (%a0)+, %a6, %acc0
127 mac.l %a5, %d4, (%a0)+, %a5, %acc0 172 mac.l %a6, %a2, (%a0)+, %a6, %acc0
128 mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0 173 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
129 movclr.l %acc0, %d2 174 movclr.l %acc0, %d2
130 asr.l %d1, %d2 175 asr.l %d1, %d2
131 add.l %d2, (%a0) 176 add.l %d2, (%a0)
@@ -135,13 +180,13 @@ lpc_decode_emac:
135 jra .exit 180 jra .exit
136 181
137.order4: 182.order4:
138 movem.l (%a1), %d3-%d6 183 movem.l (%a1), %a2-%a5
139 move.l (%a0)+, %a5 184 move.l (%a0)+, %a6
140.loop4: 185.loop4:
141 mac.l %a5, %d6, (%a0)+, %a5, %acc0 186 mac.l %a6, %a5, (%a0)+, %a6, %acc0
142 mac.l %a5, %d5, (%a0)+, %a5, %acc0 187 mac.l %a6, %a4, (%a0)+, %a6, %acc0
143 mac.l %a5, %d4, (%a0)+, %a5, %acc0 188 mac.l %a6, %a3, (%a0)+, %a6, %acc0
144 mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0 189 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
145 movclr.l %acc0, %d2 190 movclr.l %acc0, %d2
146 asr.l %d1, %d2 191 asr.l %d1, %d2
147 add.l %d2, (%a0) 192 add.l %d2, (%a0)
@@ -151,12 +196,12 @@ lpc_decode_emac:
151 jra .exit 196 jra .exit
152 197
153.order3: 198.order3:
154 movem.l (%a1), %d3-%d5 199 movem.l (%a1), %a3-%a5
155 move.l (%a0)+, %a5 200 move.l (%a0)+, %a6
156.loop3: 201.loop3:
157 mac.l %a5, %d5, (%a0)+, %a5, %acc0 202 mac.l %a6, %a5, (%a0)+, %a6, %acc0
158 mac.l %a5, %d4, (%a0)+, %a5, %acc0 203 mac.l %a6, %a4, (%a0)+, %a6, %acc0
159 mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0 204 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
160 movclr.l %acc0, %d2 205 movclr.l %acc0, %d2
161 asr.l %d1, %d2 206 asr.l %d1, %d2
162 add.l %d2, (%a0) 207 add.l %d2, (%a0)
@@ -166,11 +211,11 @@ lpc_decode_emac:
166 jra .exit 211 jra .exit
167 212
168.order2: 213.order2:
169 movem.l (%a1), %d3-%d4 214 movem.l (%a1), %a4-%a5
170 move.l (%a0)+, %a5 215 move.l (%a0)+, %a6
171.loop2: 216.loop2:
172 mac.l %a5, %d4, (%a0)+, %a5, %acc0 217 mac.l %a6, %a5, (%a0)+, %a6, %acc0
173 mac.l %a5, %d3, %acc0 | data for next iteration is already loaded 218 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
174 movclr.l %acc0, %d2 219 movclr.l %acc0, %d2
175 asr.l %d1, %d2 220 asr.l %d1, %d2
176 add.l %d2, (%a0) 221 add.l %d2, (%a0)
@@ -180,9 +225,9 @@ lpc_decode_emac:
180 225
181.order1: 226.order1:
182 | no point in using mac here 227 | no point in using mac here
183 move.l (%a1), %d3 228 move.l (%a1), %a5
184.loop1: 229.loop1:
185 move.l %d3, %d2 230 move.l %a5, %d2
186 muls.l (%a0)+, %d2 231 muls.l (%a0)+, %d2
187 asr.l %d1, %d2 232 asr.l %d1, %d2
188 add.l %d2, (%a0) 233 add.l %d2, (%a0)
@@ -192,8 +237,7 @@ lpc_decode_emac:
192 237
193.default: 238.default:
194 /* we do the filtering in an unrolled by 4 loop as far as we can, and then 239 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
195 do the rest in an ordinary one by one sample loop. 240 do the rest by jump table. */
196 */
197 lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs 241 lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs
198 move.l %a0, %a3 | working copy of history pointer 242 move.l %a0, %a3 | working copy of history pointer
199 move.l %d2, %d3 243 move.l %d2, %d3
@@ -209,26 +253,34 @@ lpc_decode_emac:
209 subq.l #1, %d3 | any more unrolled loop operations left? 253 subq.l #1, %d3 | any more unrolled loop operations left?
210 jne .dloop1 254 jne .dloop1
211 255
212 move.l %d2, %d3 256 moveq.l #3, %d3 | mask 0x00000003
213 moveq.l #3, %d4 | mask 0x00000003 257 and.l %d2, %d3 | get the remaining samples to be filtered
214 and.l %d4, %d3 | get the remaining samples to be filtered 258 jmp.l (2, %pc, %d3*2) | then jump into mac.l chain
215 jeq .dsave | no remaining samples 259| jumptable:
216.dloop2: 260 bra.b .dsave
217 move.l -(%a2), %d4 | get lpc coef 261 bra.b .oneleft
262 bra.b .twoleft
263| implicit .threeleft
264 move.l -(%a2), %d4
265 mac.l %a5, %d4, (%a3)+, %a5, %acc0
266.twoleft:
267 move.l -(%a2), %d4
218 mac.l %a5, %d4, (%a3)+, %a5, %acc0 268 mac.l %a5, %d4, (%a3)+, %a5, %acc0
219 subq.l #1, %d3 | any more iterations left? 269.oneleft:
220 jne .dloop2 270 move.l -(%a2), %d4
271 mac.l %a5, %d4, (%a3)+, %a5, %acc0 | need this fetch to not break line below
272
221.dsave: 273.dsave:
222 movclr.l %acc0, %d3 | get result
223 asr.l %d1, %d3 | shift lp_quantization bits right
224 subq.l #4, %a3 | we're one past the save location 274 subq.l #4, %a3 | we're one past the save location
275 movclr.l %acc0, %d3 | get result
276 asr.l %d1, %d3 | shift qlevel bits right
225 add.l %d3, (%a3) | add residual and save 277 add.l %d3, (%a3) | add residual and save
226 addq.l #4, %a0 | increment history pointer 278 addq.l #4, %a0 | increment history pointer
227 subq.l #1, %d0 | decrement data_len 279 subq.l #1, %d0 | decrement sample count
228 jne .default | are we done? 280 jne .default | are we done?
229 | if so, fall through to exit 281 | if so, fall through to exit
230 282
231.exit: 283.exit:
232 movem.l (%sp), %d2-%d7/%a2-%a5 284 movem.l (%sp), %d2-%d7/%a2-%a6
233 lea.l (40, %sp), %sp 285 lea.l (44, %sp), %sp
234 rts 286 rts