diff options
author | Jens Arnold <amiconn@rockbox.org> | 2009-07-15 20:36:31 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2009-07-15 20:36:31 +0000 |
commit | 0a2197b84608bb6f2e279bc2e909fba14aab674b (patch) | |
tree | 8453f1dc51da1b1c0de020c9d3a4b08261431906 /apps | |
parent | be54b7f6a2312a007814284b3ae48ed5da892704 (diff) | |
download | rockbox-0a2197b84608bb6f2e279bc2e909fba14aab674b.tar.gz rockbox-0a2197b84608bb6f2e279bc2e909fba14aab674b.zip |
Further ARMv6 idct optimisation, ~5.5% speedup.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21884 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps')
-rw-r--r-- | apps/plugins/mpegplayer/idct_armv6.S | 161 |
1 files changed, 60 insertions, 101 deletions
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S index 6b940065c8..fbffa4dfa9 100644 --- a/apps/plugins/mpegplayer/idct_armv6.S +++ b/apps/plugins/mpegplayer/idct_armv6.S | |||
@@ -75,15 +75,17 @@ | |||
75 | add r11, r11, r3 @ a1 = a1' + a2' | 75 | add r11, r11, r3 @ a1 = a1' + a2' |
76 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' | 76 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' |
77 | 77 | ||
78 | @ Special store order for making the column pass calculate columns in | ||
79 | @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages. | ||
78 | sub r2, r10, r6 @ block[7] = (a0 - b0) | 80 | sub r2, r10, r6 @ block[7] = (a0 - b0) |
79 | mov r2, r2, asr #12 @ >> 12 | 81 | mov r2, r2, asr #12 @ >> 12 |
80 | strh r2, [r1, #7*16] | 82 | strh r2, [r1, #7*16] |
81 | sub r2, r11, r7 @ block[6] = (a1 - b1) | 83 | sub r2, r11, r7 @ block[6] = (a1 - b1) |
82 | mov r2, r2, asr #12 @ >> 12 | 84 | mov r2, r2, asr #12 @ >> 12 |
83 | strh r2, [r1, #6*16] | 85 | strh r2, [r1, #5*16] |
84 | sub r2, r3, r8 @ block[5] = (a2 - b2) | 86 | sub r2, r3, r8 @ block[5] = (a2 - b2) |
85 | mov r2, r2, asr #12 @ >> 12 | 87 | mov r2, r2, asr #12 @ >> 12 |
86 | strh r2, [r1, #5*16] | 88 | strh r2, [r1, #6*16] |
87 | sub r2, r12, r9 @ block[4] = (a3 - b3) | 89 | sub r2, r12, r9 @ block[4] = (a3 - b3) |
88 | mov r2, r2, asr #12 @ >> 12 | 90 | mov r2, r2, asr #12 @ >> 12 |
89 | strh r2, [r1, #4*16] | 91 | strh r2, [r1, #4*16] |
@@ -92,10 +94,10 @@ | |||
92 | strh r2, [r1, #3*16] | 94 | strh r2, [r1, #3*16] |
93 | add r2, r3, r8 @ block[2] = (a2 + b2) | 95 | add r2, r3, r8 @ block[2] = (a2 + b2) |
94 | mov r2, r2, asr #12 @ >> 12 | 96 | mov r2, r2, asr #12 @ >> 12 |
95 | strh r2, [r1, #2*16] | 97 | strh r2, [r1, #1*16] |
96 | add r2, r11, r7 @ block[1] = (a1 + b1) | 98 | add r2, r11, r7 @ block[1] = (a1 + b1) |
97 | mov r2, r2, asr #12 @ >> 12 | 99 | mov r2, r2, asr #12 @ >> 12 |
98 | strh r2, [r1, #1*16] | 100 | strh r2, [r1, #2*16] |
99 | add r2, r10, r6 @ block[0] = (a0 + b0) | 101 | add r2, r10, r6 @ block[0] = (a0 + b0) |
100 | mov r2, r2, asr #12 @ >> 12 | 102 | mov r2, r2, asr #12 @ >> 12 |
101 | strh r2, [r1], #2 @ advance to next temp column | 103 | strh r2, [r1], #2 @ advance to next temp column |
@@ -200,34 +202,23 @@ mpeg2_idct_copy: | |||
200 | bl .idct | 202 | bl .idct |
201 | ldmfd sp!, {r1-r2} | 203 | ldmfd sp!, {r1-r2} |
202 | 204 | ||
203 | add r12, r0, #128 | 205 | add r3, r0, #128 |
204 | ldrd r4, [r0] | ||
205 | mov r8, #0 | 206 | mov r8, #0 |
206 | mov r9, #0 | 207 | mov r9, #0 |
207 | mov r10, #0 | 208 | mov r10, #0 |
208 | mov r11, #0 | 209 | mov r11, #0 |
209 | 1: | 210 | 1: @ idct data is in order 0-2-1-3-4-6-5-7, |
210 | ldrd r6, [r0, #8] | 211 | ldmia r0, {r4-r7} @ see above |
212 | stmia r0!, {r8-r11} | ||
211 | usat16 r4, #8, r4 | 213 | usat16 r4, #8, r4 |
212 | strb r4, [r1, #0] | ||
213 | mov r4, r4, lsr #16 | ||
214 | strb r4, [r1, #1] | ||
215 | usat16 r5, #8, r5 | 214 | usat16 r5, #8, r5 |
216 | strb r5, [r1, #2] | 215 | orr r4, r4, r5, lsl #8 |
217 | mov r5, r5, lsr #16 | ||
218 | strb r5, [r1, #3] | ||
219 | ldrd r4, [r0, #16] | ||
220 | usat16 r6, #8, r6 | 216 | usat16 r6, #8, r6 |
221 | strb r6, [r1, #4] | ||
222 | mov r6, r6, lsr #16 | ||
223 | strb r6, [r1, #5] | ||
224 | usat16 r7, #8, r7 | 217 | usat16 r7, #8, r7 |
225 | strb r7, [r1, #6] | 218 | orr r5, r6, r7, lsl #8 |
226 | mov r7, r7, lsr #16 | 219 | strd r4, [r1] @ r4, r5 |
227 | strb r7, [r1, #7] | ||
228 | stmia r0!, {r8-r11} | ||
229 | add r1, r1, r2 | 220 | add r1, r1, r2 |
230 | cmp r0, r12 | 221 | cmp r0, r3 |
231 | blo 1b | 222 | blo 1b |
232 | 223 | ||
233 | ldmfd sp!, {r4-r11, pc} | 224 | ldmfd sp!, {r4-r11, pc} |
@@ -244,93 +235,61 @@ mpeg2_idct_add: | |||
244 | stmfd sp!, {r2-r11, lr} | 235 | stmfd sp!, {r2-r11, lr} |
245 | bl .idct | 236 | bl .idct |
246 | ldmfd sp!, {r1-r2} | 237 | ldmfd sp!, {r1-r2} |
238 | |||
239 | add r3, r0, #128 | ||
240 | mov r10, #0 | ||
247 | mov r11, #0 | 241 | mov r11, #0 |
248 | add r12, r0, #128 | 242 | mov r12, #0 |
249 | 2: | 243 | mov lr, #0 |
250 | ldmia r0, {r3-r6} | 244 | ldrd r8, [r1] @ r8, r9 |
251 | ldrb r7, [r1, #0] | 245 | 2: @ idct data is in order 0-2-1-3-4-6-5-7, |
252 | ldrb r8, [r1, #1] | 246 | ldmia r0, {r4-r7} @ see above |
253 | ldrb r9, [r1, #2] | 247 | stmia r0!, {r10-r12, lr} |
254 | ldrb r10, [r1, #3] | 248 | uxtab16 r4, r4, r8 |
255 | str r11, [r0], #4 | 249 | uxtab16 r5, r5, r8, ror #8 |
256 | orr r7, r7, r8, lsl #16 | ||
257 | sadd16 r3, r3, r7 | ||
258 | usat16 r3, #8, r3 | ||
259 | strb r3, [r1, #0] | ||
260 | mov r3, r3, lsr #16 | ||
261 | strb r3, [r1, #1] | ||
262 | str r11, [r0], #4 | ||
263 | orr r9, r9, r10, lsl #16 | ||
264 | sadd16 r4, r4, r9 | ||
265 | usat16 r4, #8, r4 | 250 | usat16 r4, #8, r4 |
266 | strb r4, [r1, #2] | ||
267 | mov r4, r4, lsr #16 | ||
268 | strb r4, [r1, #3] | ||
269 | ldrb r7, [r1, #4] | ||
270 | ldrb r8, [r1, #5] | ||
271 | ldrb r9, [r1, #6] | ||
272 | ldrb r10, [r1, #7] | ||
273 | str r11, [r0], #4 | ||
274 | orr r7, r7, r8, lsl #16 | ||
275 | sadd16 r5, r5, r7 | ||
276 | usat16 r5, #8, r5 | 251 | usat16 r5, #8, r5 |
277 | strb r5, [r1, #4] | 252 | orr r4, r4, r5, lsl #8 |
278 | mov r5, r5, lsr #16 | 253 | uxtab16 r6, r6, r9 |
279 | strb r5, [r1, #5] | 254 | uxtab16 r7, r7, r9, ror #8 |
280 | str r11, [r0], #4 | ||
281 | orr r9, r9, r10, lsl #16 | ||
282 | sadd16 r6, r6, r9 | ||
283 | usat16 r6, #8, r6 | 255 | usat16 r6, #8, r6 |
284 | strb r6, [r1, #6] | 256 | usat16 r7, #8, r7 |
285 | mov r6, r6, lsr #16 | 257 | orr r5, r6, r7, lsl #8 |
286 | strb r6, [r1, #7] | 258 | strd r4, [r1] @ r4, r5 |
287 | add r1, r1, r2 | 259 | add r1, r1, r2 |
288 | cmp r0, r12 | 260 | cmp r0, r3 |
261 | ldrlod r8, [r1] @ r8, r9 | ||
289 | blo 2b | 262 | blo 2b |
263 | |||
290 | ldmfd sp!, {r4-r11, pc} | 264 | ldmfd sp!, {r4-r11, pc} |
291 | 265 | ||
292 | 3: | 266 | 3: |
293 | stmfd sp!, {r4-r5, lr} | 267 | stmfd sp!, {r4, lr} |
294 | ldrsh r1, [r0, #0] /* r1 = block[0] */ | 268 | ldrsh r4, [r0, #0] @ r4 = block[0] |
295 | mov r4, #0 | 269 | mov r12, #0 |
296 | strh r4, [r0, #0] /* block[0] = 0 */ | 270 | strh r12, [r0, #0] @ block[0] = 0 |
297 | strh r4, [r0, #126] /* block[63] = 0 */ | 271 | strh r12, [r0, #126] @ block[63] = 0 |
298 | add r1, r1, #64 /* r1 = DC << 7 */ | 272 | add r4, r4, #64 |
299 | add r0, r2, r3, asl #3 | 273 | mov r4, r4, asr #7 @ r4 = DC |
274 | mov r4, r4, lsl #16 @ spread to 2 halfwords | ||
275 | orr r4, r4, r4, lsr #16 | ||
276 | ldrd r0, [r2] @ r0, r1 | ||
277 | add r12, r2, r3, asl #3 | ||
300 | 4: | 278 | 4: |
301 | ldrb r4, [r2, #0] | 279 | uxtab16 lr, r4, r0, ror #8 |
302 | ldrb r5, [r2, #1] | 280 | uxtab16 r0, r4, r0 |
303 | ldrb r12, [r2, #2] | 281 | usat16 lr, #8, lr |
304 | ldrb lr, [r2, #3] | 282 | usat16 r0, #8, r0 |
305 | add r4, r4, r1, asr #7 | 283 | orr r0, r0, lr, lsl #8 |
306 | usat r4, #8, r4 | 284 | uxtab16 lr, r4, r1, ror #8 |
307 | strb r4, [r2, #0] | 285 | uxtab16 r1, r4, r1 |
308 | add r5, r5, r1, asr #7 | 286 | usat16 lr, #8, lr |
309 | usat r5, #8, r5 | 287 | usat16 r1, #8, r1 |
310 | strb r5, [r2, #1] | 288 | orr r1, r1, lr, lsl #8 |
311 | add r12, r12, r1, asr #7 | 289 | strd r0, [r2] @ r0, r1 |
312 | usat r12, #8, r12 | ||
313 | strb r12, [r2, #2] | ||
314 | add lr, lr, r1, asr #7 | ||
315 | usat lr, #8, lr | ||
316 | strb lr, [r2, #3] | ||
317 | ldrb r4, [r2, #4] | ||
318 | ldrb r5, [r2, #5] | ||
319 | ldrb r12, [r2, #6] | ||
320 | ldrb lr, [r2, #7] | ||
321 | add r4, r4, r1, asr #7 | ||
322 | usat r4, #8, r4 | ||
323 | strb r4, [r2, #4] | ||
324 | add r5, r5, r1, asr #7 | ||
325 | usat r5, #8, r5 | ||
326 | strb r5, [r2, #5] | ||
327 | add r12, r12, r1, asr #7 | ||
328 | usat r12, #8, r12 | ||
329 | strb r12, [r2, #6] | ||
330 | add lr, lr, r1, asr #7 | ||
331 | usat lr, #8, lr | ||
332 | strb lr, [r2, #7] | ||
333 | add r2, r2, r3 | 290 | add r2, r2, r3 |
334 | cmp r2, r0 | 291 | cmp r2, r12 |
292 | ldrlod r0, [r2] @ r0, r1 | ||
335 | blo 4b | 293 | blo 4b |
336 | ldmfd sp!, {r4-r5, pc} | 294 | |
295 | ldmfd sp!, {r4, pc} | ||