diff options
author | Andrew Mahone <andrew.mahone@gmail.com> | 2009-06-28 02:32:43 +0000 |
---|---|---|
committer | Andrew Mahone <andrew.mahone@gmail.com> | 2009-06-28 02:32:43 +0000 |
commit | 815dcfdd3502bd23c4f2705ff2b044755dd512cc (patch) | |
tree | 004612371e0b949a02410b4b60fbba5acd132371 | |
parent | 99ae7bcc438495d468322b0a81864a12a782f37b (diff) | |
download | rockbox-815dcfdd3502bd23c4f2705ff2b044755dd512cc.tar.gz rockbox-815dcfdd3502bd23c4f2705ff2b044755dd512cc.zip |
Use a hand-written constants table on ARMv5+ for the JPEG IDCT, and load four 16-bit constants at a time with ldrd. This is not useful on ARMv4, since one load per constant would still be needed, and the limited range of ldrsh would force multiple copies of the table.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21535 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- | apps/recorder/jpeg_idct_arm.S | 162 |
1 files changed, 86 insertions, 76 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S index 01b08c4b5a..d84e5e7962 100644 --- a/apps/recorder/jpeg_idct_arm.S +++ b/apps/recorder/jpeg_idct_arm.S | |||
@@ -113,7 +113,11 @@ jpeg_idct2h: | |||
113 | results can not be stored merged. | 113 | results can not be stored merged. |
114 | */ | 114 | */ |
115 | stmdb sp!, { r4-r5, lr } | 115 | stmdb sp!, { r4-r5, lr } |
116 | #if ARM_ARCH < 5 | ||
116 | ldr r14, =4112 | 117 | ldr r14, =4112 |
118 | #else | ||
119 | ldrsh r14, .Lpool4+2 | ||
120 | #endif | ||
117 | 1: | 121 | 1: |
118 | ldrsh r12, [r0] | 122 | ldrsh r12, [r0] |
119 | ldrsh r4, [r0, #2] | 123 | ldrsh r4, [r0, #2] |
@@ -140,7 +144,7 @@ jpeg_idct2h: | |||
140 | ldmia sp!, { r4-r5, pc } | 144 | ldmia sp!, { r4-r5, pc } |
141 | #else | 145 | #else |
142 | stmdb sp!, { r4, lr } | 146 | stmdb sp!, { r4, lr } |
143 | ldr r14, =4112 | 147 | ldrsh r14, .Lpool4+2 |
144 | 1: | 148 | 1: |
145 | ldr r12, [r0] | 149 | ldr r12, [r0] |
146 | sadd16 r12, r12, r14 | 150 | sadd16 r12, r12, r14 |
@@ -198,27 +202,26 @@ jpeg_idct4v: | |||
198 | ldmia sp!, { r4-r7, pc } | 202 | ldmia sp!, { r4-r7, pc } |
199 | #elif ARM_ARCH < 6 | 203 | #elif ARM_ARCH < 6 |
200 | stmdb sp!, { r4-r8, lr } | 204 | stmdb sp!, { r4-r8, lr } |
201 | ldr r8, =1024 | 205 | mov r8, #1024 |
202 | ldr r14, =4433 | 206 | ldrd r4, .Lpool4 |
203 | ldr r12, =3302955134 | ||
204 | 1: | 207 | 1: |
205 | ldrsh r5, [r0, #48] | 208 | ldrsh r14, [r0, #48] |
206 | ldrsh r3, [r0, #16] | 209 | ldrsh r3, [r0, #16] |
207 | ldrsh r4, [r0, #32] | 210 | ldrsh r12, [r0, #32] |
208 | ldrsh r2, [r0] | 211 | ldrsh r2, [r0] |
209 | add r6, r3, r5 /* r6 = z1 = d1 + d3 */ | 212 | add r6, r3, r14 /* r6 = z1 = d1 + d3 */ |
210 | add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */ | 213 | add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */ |
211 | smlabb r6, r14, r6, r8 /* z1 *= 4433 */ | 214 | smlabb r6, r5, r6, r8 /* z1 *= 4433 */ |
212 | sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */ | 215 | sub r2, r2, r12 /* r2 = tmp12 >> 2= d0 - d2 */ |
213 | smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */ | 216 | smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */ |
214 | smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */ | 217 | smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */ |
215 | mov r7, r7, lsl #2 | 218 | mov r7, r7, lsl #2 |
216 | mov r2, r2, lsl #2 | 219 | mov r2, r2, lsl #2 |
217 | add r4, r7, r3, asr #11 /* r4 = o0 */ | 220 | add r12, r7, r3, asr #11 /* r12 = o0 */ |
218 | sub r7, r7, r3, asr #11 /* r7 = o3 */ | 221 | sub r7, r7, r3, asr #11 /* r7 = o3 */ |
219 | add r3, r2, r5, asr #11 /* r3 = o1 */ | 222 | add r3, r2, r14, asr #11 /* r3 = o1 */ |
220 | sub r2, r2, r5, asr #11 /* r2 = o2 */ | 223 | sub r2, r2, r14, asr #11 /* r2 = o2 */ |
221 | strh r4, [r0] | 224 | strh r12, [r0] |
222 | strh r7, [r0, #48] | 225 | strh r7, [r0, #48] |
223 | strh r3, [r0, #16] | 226 | strh r3, [r0, #16] |
224 | strh r2, [r0, #32] | 227 | strh r2, [r0, #32] |
@@ -228,9 +231,8 @@ jpeg_idct4v: | |||
228 | ldmia sp!, { r4-r8, pc } | 231 | ldmia sp!, { r4-r8, pc } |
229 | #else | 232 | #else |
230 | stmdb sp!, { r4-r10, lr } | 233 | stmdb sp!, { r4-r10, lr } |
231 | ldr r2, =1024 | 234 | ldrd r2, .Lpool4 |
232 | ldr r3, =4433 | 235 | mov r12, #1024 |
233 | ldr r12, =3302955134 | ||
234 | 1: | 236 | 1: |
235 | ldr r6, [r0, #32] | 237 | ldr r6, [r0, #32] |
236 | ldr r4, [r0] | 238 | ldr r4, [r0] |
@@ -247,12 +249,12 @@ jpeg_idct4v: | |||
247 | /* multiplication expands values beyond 16 bits, so this part needs to be | 249 | /* multiplication expands values beyond 16 bits, so this part needs to be |
248 | split. the values will be merged below so that the rest of the addition | 250 | split. the values will be merged below so that the rest of the addition |
249 | can be done in parallel */ | 251 | can be done in parallel */ |
250 | smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */ | 252 | smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */ |
251 | smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */ | 253 | smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */ |
252 | smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */ | 254 | smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */ |
253 | smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */ | 255 | smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */ |
254 | smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */ | 256 | smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */ |
255 | smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */ | 257 | smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */ |
256 | mov r8, r8, lsl #2 /* complete the parallel shift started */ | 258 | mov r8, r8, lsl #2 /* complete the parallel shift started */ |
257 | mov r4, r4, lsl #2 /* with the earlier bic instructions */ | 259 | mov r4, r4, lsl #2 /* with the earlier bic instructions */ |
258 | /* tmp2 are in r10, r5; tmp0 are in r14, r6 */ | 260 | /* tmp2 are in r10, r5; tmp0 are in r14, r6 */ |
@@ -276,6 +278,17 @@ jpeg_idct4v: | |||
276 | #endif | 278 | #endif |
277 | .size jpeg_idct4v, .-jpeg_idct4v | 279 | .size jpeg_idct4v, .-jpeg_idct4v |
278 | 280 | ||
281 | #if ARM_ARCH > 4 | ||
282 | .align 4 | ||
283 | .Lpool4: | ||
284 | .short -15137 | ||
285 | .short 4112 | ||
286 | .short 4433 | ||
287 | .short 6270 | ||
288 | |||
289 | .align 2 | ||
290 | #endif | ||
291 | |||
279 | jpeg_idct4h: | 292 | jpeg_idct4h: |
280 | #if ARM_ARCH < 5 | 293 | #if ARM_ARCH < 5 |
281 | stmdb sp!, { r4-r10, lr } | 294 | stmdb sp!, { r4-r10, lr } |
@@ -328,88 +341,85 @@ jpeg_idct4h: | |||
328 | cmp r0, r2 | 341 | cmp r0, r2 |
329 | bcc 1b | 342 | bcc 1b |
330 | ldmia sp!, { r4-r10, pc } | 343 | ldmia sp!, { r4-r10, pc } |
331 | #elif ARM_ARCH < 6 | 344 | #elif ARM_ARCH < 6 || 1 |
332 | stmdb sp!, { r4-r10, lr } | 345 | stmdb sp!, { r4-r9, lr } |
333 | ldr r10, =4433 | 346 | ldrd r4, .Lpool4 |
334 | ldr r14, =4112 | ||
335 | ldr r12, =3302955134 | ||
336 | 1: | 347 | 1: |
337 | ldrsh r7, [r0, #6] | 348 | ldrsh r7, [r0, #6] |
338 | ldrsh r5, [r0, #2] | 349 | ldrsh r14, [r0, #2] |
339 | ldrsh r4, [r0] | 350 | ldrsh r12, [r0] |
340 | ldrsh r6, [r0, #4] | 351 | ldrsh r6, [r0, #4] |
341 | add r8, r5, r7 /* r8 = z1 = d1 + d3 */ | 352 | add r8, r14, r7 /* r8 = z1 = d1 + d3 */ |
342 | add r4, r4, r14 | 353 | add r12, r12, r4, lsr #16 |
343 | smulbb r8, r10, r8 /* z1 *= 4433 */ | 354 | smulbb r8, r5, r8 /* z1 *= 4433 */ |
344 | add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */ | 355 | add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */ |
345 | smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */ | 356 | smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */ |
346 | smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */ | 357 | smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */ |
347 | sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */ | 358 | sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */ |
348 | add r6, r5, r9, lsl #13 /* r6 = o0 */ | 359 | add r6, r14, r9, lsl #13 /* r6 = o0 */ |
349 | rsb r9, r5, r9, lsl #13 /* r9 = o3 */ | 360 | rsb r9, r14, r9, lsl #13 /* r9 = o3 */ |
350 | add r5, r7, r4, lsl #13 /* r5 = o1 */ | 361 | add r14, r7, r12, lsl #13 /* r14= o1 */ |
351 | rsb r4, r7, r4, lsl #13 /* r4 = o2 */ | 362 | rsb r12, r7, r12, lsl #13 /* r12= o2 */ |
352 | mov r6, r6, asr #18 | 363 | mov r6, r6, asr #18 |
353 | mov r5, r5, asr #18 | 364 | mov r14, r14, asr #18 |
354 | mov r4, r4, asr #18 | 365 | mov r12, r12, asr #18 |
355 | mov r9, r9, asr #18 | 366 | mov r9, r9, asr #18 |
356 | cmp r6, #255 | 367 | cmp r6, #255 |
357 | mvnhi r6, r6, asr #31 | 368 | mvnhi r6, r6, asr #31 |
358 | cmp r5, #255 | 369 | cmp r14, #255 |
359 | mvnhi r5, r5, asr #31 | 370 | mvnhi r14, r14, asr #31 |
360 | cmp r4, #255 | 371 | cmp r12, #255 |
361 | mvnhi r4, r4, asr #31 | 372 | mvnhi r12, r12, asr #31 |
362 | cmp r9, #255 | 373 | cmp r9, #255 |
363 | mvnhi r9, r9, asr #31 | 374 | mvnhi r9, r9, asr #31 |
364 | #ifdef HAVE_LCD_COLOR | 375 | #ifdef HAVE_LCD_COLOR |
365 | strb r6, [r1] | 376 | strb r6, [r1] |
366 | strb r5, [r1, #4] | 377 | strb r14, [r1, #4] |
367 | strb r4, [r1, #8] | 378 | strb r12, [r1, #8] |
368 | strb r9, [r1, #12] | 379 | strb r9, [r1, #12] |
369 | #else | 380 | #else |
370 | strb r6, [r1] | 381 | strb r6, [r1] |
371 | strb r5, [r1, #1] | 382 | strb r14, [r1, #1] |
372 | strb r4, [r1, #2] | 383 | strb r12, [r1, #2] |
373 | strb r9, [r1, #3] | 384 | strb r9, [r1, #3] |
374 | #endif | 385 | #endif |
375 | add r0, r0, #16 | 386 | add r0, r0, #16 |
376 | add r1, r1, r3 | 387 | add r1, r1, r3 |
377 | cmp r0, r2 | 388 | cmp r0, r2 |
378 | bcc 1b | 389 | bcc 1b |
379 | ldmia sp!, { r4-r10, pc } | 390 | ldmia sp!, { r4-r9, pc } |
380 | #else | 391 | #else |
381 | stmdb sp!, { r4-r9, lr } | 392 | stmdb sp!, { r4-r9, lr } |
382 | ldr r9, =4433 | 393 | ldrd r4, .Lpool4 |
383 | ldr r14, =4112 | 394 | mov r9, r4, lsr #16 |
384 | ldr r12, =3302955134 | ||
385 | 1: | 395 | 1: |
386 | ldmia r0, { r4-r5 } | 396 | ldmia r0, { r12, r14 } |
387 | sadd16 r4, r4, r14 | 397 | sadd16 r12, r12, r9 |
388 | sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */ | 398 | sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */ |
389 | ssub16 r7, r4, r5 /* r7lo = d0 - d2 */ | 399 | ssub16 r7, r12, r14 /* r7lo = d0 - d2 */ |
390 | smulbt r8, r9, r6 | 400 | smulbt r8, r5, r6 |
391 | sxth r6, r6 | 401 | sxth r6, r6 |
392 | smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */ | 402 | smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */ |
393 | smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */ | 403 | smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */ |
394 | sxth r7, r7 | 404 | sxth r7, r7 |
395 | add r8, r4, r6, lsl #13 /* r8 = o0 */ | 405 | add r8, r12, r6, lsl #13 /* r8 = o0 */ |
396 | rsb r6, r4, r6, lsl #13 /* r6 = o3 */ | 406 | rsb r6, r12, r6, lsl #13 /* r6 = o3 */ |
397 | add r4, r5, r7, lsl #13 /* r4 = o1 */ | 407 | add r12, r14, r7, lsl #13 /* r12= o1 */ |
398 | rsb r5, r5, r7, lsl #13 /* r5 = o2 */ | 408 | rsb r14, r14, r7, lsl #13 /* r14= o2 */ |
399 | usat r8, #8, r8, asr #18 | 409 | usat r8, #8, r8, asr #18 |
400 | usat r6, #8, r6, asr #18 | 410 | usat r6, #8, r6, asr #18 |
401 | usat r4, #8, r4, asr #18 | 411 | usat r12, #8, r12, asr #18 |
402 | usat r5, #8, r5, asr #18 | 412 | usat r14, #8, r14, asr #18 |
403 | #ifdef HAVE_LCD_COLOR | 413 | #ifdef HAVE_LCD_COLOR |
404 | strb r8, [r1] | 414 | strb r8, [r1] |
405 | strb r6, [r1, #12] | 415 | strb r6, [r1, #12] |
406 | strb r4, [r1, #4] | 416 | strb r12, [r1, #4] |
407 | strb r5, [r1, #8] | 417 | strb r14, [r1, #8] |
408 | #else | 418 | #else |
409 | strb r8, [r1] | 419 | strb r8, [r1] |
410 | strb r6, [r1, #3] | 420 | strb r6, [r1, #3] |
411 | strb r4, [r1, #1] | 421 | strb r12, [r1, #1] |
412 | strb r5, [r1, #2] | 422 | strb r14, [r1, #2] |
413 | #endif | 423 | #endif |
414 | add r0, r0, #16 | 424 | add r0, r0, #16 |
415 | add r1, r1, r3 | 425 | add r1, r1, r3 |
@@ -450,7 +460,7 @@ jpeg_idct8v: | |||
450 | mov r11, r11, asr #16 /* r11 = z3 = d6 */ | 460 | mov r11, r11, asr #16 /* r11 = z3 = d6 */ |
451 | add r8, r8, #8192 | 461 | add r8, r8, #8192 |
452 | add r9, r10, r11 | 462 | add r9, r10, r11 |
453 | mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */ | 463 | mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */ |
454 | mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ | 464 | mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ |
455 | ldr r14, =6270 | 465 | ldr r14, =6270 |
456 | mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ | 466 | mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ |