diff options
author | Andrew Mahone <andrew.mahone@gmail.com> | 2009-06-27 09:21:22 +0000 |
---|---|---|
committer | Andrew Mahone <andrew.mahone@gmail.com> | 2009-06-27 09:21:22 +0000 |
commit | 3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6 (patch) | |
tree | 9cc77098cf188b151ef905068da6b1ae6e982df3 /apps | |
parent | 07a55a747d161f164c4cb1c73e2d697a84e4e5de (diff) | |
download | rockbox-3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6.tar.gz rockbox-3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6.zip |
ARM assembly 8-point IDCT, both passes. No ARMv5/6 optimizations yet, aside from usat for final output.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21526 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps')
-rw-r--r-- | apps/recorder/jpeg_idct_arm.S | 265 | ||||
-rw-r--r-- | apps/recorder/jpeg_load.c | 17 |
2 files changed, 275 insertions, 7 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S index b9c94e5639..01b08c4b5a 100644 --- a/apps/recorder/jpeg_idct_arm.S +++ b/apps/recorder/jpeg_idct_arm.S | |||
@@ -35,6 +35,10 @@ | |||
35 | .type jpeg_idct4v, %function | 35 | .type jpeg_idct4v, %function |
36 | .global jpeg_idct4h | 36 | .global jpeg_idct4h |
37 | .type jpeg_idct4h, %function | 37 | .type jpeg_idct4h, %function |
38 | .global jpeg_idct8v | ||
39 | .type jpeg_idct8v, %function | ||
40 | .global jpeg_idct8h | ||
41 | .type jpeg_idct8h, %function | ||
38 | 42 | ||
39 | jpeg_idct1h: | 43 | jpeg_idct1h: |
40 | /* In the common case of one pass through the loop, the extra add should be | 44 | /* In the common case of one pass through the loop, the extra add should be |
@@ -414,3 +418,264 @@ jpeg_idct4h: | |||
414 | ldmia sp!, { r4-r9, pc } | 418 | ldmia sp!, { r4-r9, pc } |
415 | #endif | 419 | #endif |
416 | .size jpeg_idct4h, .-jpeg_idct4h | 420 | .size jpeg_idct4h, .-jpeg_idct4h |
421 | |||
422 | jpeg_idct8v: /* 8-point IDCT pass with 16-bit output: r0 = input cursor, r1 = input end; each loop reads eight packed 16-bit values at r0 and writes eight 16-bit results through r2 */ | |
423 | stmdb sp!, { r4-r11, lr } | |
424 | add r2, r0, #128 /* r2 = output cursor, starting 128 bytes past the input */ | |
425 | 1: | |
426 | ldmia r0!, { r4-r7 } /* load 16 bytes, r0 += 16; low/high halves as used below: r4 = d0/d1, r5 = d2/d3, r6 = d4/d5, r7 = d6/d7 */ | |
427 | mov r8, r4, lsl #16 /* r8 = d0 << 16 */ | |
428 | orrs r9, r6, r7 /* Z set iff d4..d7 are all zero */ | |
429 | orreqs r9, r5, r4, lsr #16 /* if so, also test d1..d3 */ | |
430 | bne 2f /* something besides d0 is nonzero: run the full transform */ | |
431 | mov r8, r8, asr #14 /* only d0 may be nonzero: every output is d0 << 2 */ | |
432 | strh r8, [r2] | |
433 | strh r8, [r2, #16] | |
434 | strh r8, [r2, #32] | |
435 | strh r8, [r2, #48] | |
436 | strh r8, [r2, #64] | |
437 | strh r8, [r2, #80] | |
438 | strh r8, [r2, #96] | |
439 | strh r8, [r2, #112] | |
440 | cmp r0, r1 | |
441 | add r2, r2, #2 /* next output slot; add without s keeps the cmp flags */ | |
442 | bcc 1b /* loop while r0 < r1 (unsigned) */ | |
443 | ldmia sp!, { r4-r11, pc } | |
444 | 2: | |
445 | ldr r14, =4433 | |
446 | ldr r12, =-15137 | |
447 | mov r10, r5, lsl #16 | |
448 | mov r11, r7, lsl #16 | |
449 | mov r10, r10, asr #16 /* r10 = z2 = d2 */ | |
450 | mov r11, r11, asr #16 /* r11 = z3 = d6 */ | |
451 | add r8, r8, #8192 /* rounding bias; becomes 1024 after the asr #3 below */ | |
452 | add r9, r10, r11 | |
453 | mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */ | |
454 | mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ | |
455 | ldr r14, =6270 | |
456 | mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ | |
457 | mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */ | |
458 | mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */ | |
459 | add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */ | |
460 | sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */ | |
461 | add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */ | |
462 | sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */ | |
463 | add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */ | |
464 | sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */ | |
465 | stmdb sp, { r8-r11 } /* save tmp10 tmp13 tmp11 tmp12 just below sp (no writeback). NOTE(review): assumes nothing writes below sp before the ldmdb - confirm */ | |
466 | mov r4, r4, asr #16 /* r4 = tmp3 = d1 */ | |
467 | mov r5, r5, asr #16 /* r5 = tmp2 = d3 */ | |
468 | mov r6, r6, asr #16 /* r6 = tmp1 = d5 */ | |
469 | mov r7, r7, asr #16 /* r7 = tmp0 = d7 */ | |
470 | ldr r10, =9633 | |
471 | ldr r11, =-16069 | |
472 | add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */ | |
473 | add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */ | |
474 | add r9, r12, r14 /* r9 = z3 + z4 */ | |
475 | mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */ | |
476 | ldr r10, =-3196 | |
477 | mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */ | |
478 | ldr r11, =-7373 | |
479 | mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */ | |
480 | ldr r10, =2446 | |
481 | add r9, r4, r7 /* r9 = tmp0 + tmp3 */ | |
482 | mla r8, r11, r9, r12 /* r8 = z1 + z3, with z1 = (tmp0 + tmp3) * -7373 */ | |
483 | mla r9, r11, r9, r14 /* r9 = z1 + z4 */ | |
484 | ldr r11, =12299 | |
485 | mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */ | |
486 | ldr r10, =-20995 | |
487 | mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */ | |
488 | ldr r11, =25172 | |
489 | add r9, r5, r6 /* r9 = tmp1 + tmp2 */ | |
490 | mla r12, r10, r9, r12 /* r12 = z2 + z3, with z2 = (tmp1 + tmp2) * -20995 */ | |
491 | mla r14, r10, r9, r14 /* r14 = z2 + z4 */ | |
492 | ldr r10, =16819 | |
493 | mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */ | |
494 | mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */ | |
495 | ldmdb sp, { r8-r11 } /* reload tmp10 tmp13 tmp11 tmp12 */ | |
496 | add r12, r8, r4 /* o0 */ | |
497 | sub r14, r8, r4 /* o7 */ | |
498 | add r8, r9, r7 /* o3 */ | |
499 | sub r9, r9, r7 /* o4 */ | |
500 | add r4, r10, r5 /* o1 */ | |
501 | sub r5, r10, r5 /* o6 */ | |
502 | add r10, r11, r6 /* o2 */ | |
503 | sub r11, r11, r6 /* o5 */ | |
504 | /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */ | |
505 | mov r12, r12, asr #11 /* descale every output by 11 bits */ | |
506 | mov r4, r4, asr #11 | |
507 | mov r10, r10, asr #11 | |
508 | mov r8, r8, asr #11 | |
509 | mov r9, r9, asr #11 | |
510 | mov r11, r11, asr #11 | |
511 | mov r5, r5, asr #11 | |
512 | mov r14, r14, asr #11 | |
513 | strh r12, [r2] /* store o0..o7 as halfwords, 16 bytes apart */ | |
514 | strh r4, [r2, #16] | |
515 | strh r10, [r2, #32] | |
516 | strh r8, [r2, #48] | |
517 | strh r9, [r2, #64] | |
518 | strh r11, [r2, #80] | |
519 | strh r5, [r2, #96] | |
520 | strh r14, [r2, #112] | |
521 | cmp r0, r1 | |
522 | add r2, r2, #2 /* next output slot; flags from cmp are preserved */ | |
523 | bcc 1b /* loop while r0 < r1 (unsigned) */ | |
524 | ldmia sp!, { r4-r11, pc } | |
525 | .size jpeg_idct8v, .-jpeg_idct8v | |
526 | |||
527 | jpeg_idct8h: /* 8-point IDCT pass with 8-bit output: r0 = input cursor, r1 = output byte pointer, r2 = input end, r3 = byte step added to r1 after each group of eight outputs */ | |
528 | stmdb sp!, { r4-r11, lr } | |
529 | 1: | |
530 | ldmia r0!, { r4-r7 } /* load 16 bytes, r0 += 16; low/high halves as used below: r4 = d0/d1, r5 = d2/d3, r6 = d4/d5, r7 = d6/d7 */ | |
531 | ldr r14, =4112 | |
532 | mov r8, r4, lsl #16 /* r8 = d0 << 16 */ | |
533 | add r8, r8, r14, lsl #16 /* r8 = (d0 + 4112) << 16 */ | |
534 | orrs r9, r6, r7 /* Z set iff d4..d7 are all zero */ | |
535 | orreqs r9, r5, r4, lsr #16 /* if so, also test d1..d3 */ | |
536 | bne 2f /* something besides d0 is nonzero: run the full transform */ | |
537 | mov r8, r8, asr #21 /* only d0 may be nonzero: every output is (d0 + 4112) >> 5 */ | |
538 | cmp r8, #255 | |
539 | mvnhi r8, r8, asr #31 /* outside 0..255: negative -> 0, too large -> all ones (strb stores 0xff) */ | |
540 | #ifdef HAVE_LCD_COLOR /* this variant writes the output bytes 4 apart */ | |
541 | strb r8, [r1] | |
542 | strb r8, [r1, #4] | |
543 | strb r8, [r1, #8] | |
544 | strb r8, [r1, #12] | |
545 | strb r8, [r1, #16] | |
546 | strb r8, [r1, #20] | |
547 | strb r8, [r1, #24] | |
548 | strb r8, [r1, #28] | |
549 | #else /* this variant writes the output bytes contiguously */ | |
550 | strb r8, [r1] | |
551 | strb r8, [r1, #1] | |
552 | strb r8, [r1, #2] | |
553 | strb r8, [r1, #3] | |
554 | strb r8, [r1, #4] | |
555 | strb r8, [r1, #5] | |
556 | strb r8, [r1, #6] | |
557 | strb r8, [r1, #7] | |
558 | #endif | |
559 | add r1, r1, r3 /* advance the output pointer by the caller-supplied step */ | |
560 | cmp r0, r2 | |
561 | bcc 1b /* loop while r0 < r2 (unsigned) */ | |
562 | ldmia sp!, { r4-r11, pc } | |
563 | 2: | |
564 | ldr r14, =4433 | |
565 | ldr r12, =-15137 | |
566 | mov r10, r5, lsl #16 | |
567 | mov r11, r7, lsl #16 | |
568 | mov r10, r10, asr #16 /* r10 = z2 = d2 */ | |
569 | mov r11, r11, asr #16 /* r11 = z3 = d6 */ | |
570 | add r9, r10, r11 | |
571 | mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */ | |
572 | mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ | |
573 | ldr r14, =6270 | |
574 | mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ | |
575 | mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */ | |
576 | mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */ | |
577 | add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */ | |
578 | sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */ | |
579 | add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */ | |
580 | sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */ | |
581 | add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */ | |
582 | sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */ | |
583 | stmdb sp, { r8-r11 } /* save tmp10 tmp13 tmp11 tmp12 just below sp (no writeback). NOTE(review): assumes nothing writes below sp before the ldmdb - confirm */ | |
584 | mov r4, r4, asr #16 /* r4 = tmp3 = d1 */ | |
585 | mov r5, r5, asr #16 /* r5 = tmp2 = d3 */ | |
586 | mov r6, r6, asr #16 /* r6 = tmp1 = d5 */ | |
587 | mov r7, r7, asr #16 /* r7 = tmp0 = d7 */ | |
588 | ldr r10, =9633 | |
589 | ldr r11, =-16069 | |
590 | add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */ | |
591 | add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */ | |
592 | add r9, r12, r14 /* r9 = z3 + z4 */ | |
593 | mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */ | |
594 | ldr r10, =-3196 | |
595 | mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */ | |
596 | ldr r11, =-7373 | |
597 | mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */ | |
598 | ldr r10, =2446 | |
599 | add r9, r4, r7 /* r9 = tmp0 + tmp3 */ | |
600 | mla r8, r11, r9, r12 /* r8 = z1 + z3, with z1 = (tmp0 + tmp3) * -7373 */ | |
601 | mla r9, r11, r9, r14 /* r9 = z1 + z4 */ | |
602 | ldr r11, =12299 | |
603 | mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */ | |
604 | ldr r10, =-20995 | |
605 | mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */ | |
606 | ldr r11, =25172 | |
607 | add r9, r5, r6 /* r9 = tmp1 + tmp2 */ | |
608 | mla r12, r10, r9, r12 /* r12 = z2 + z3, with z2 = (tmp1 + tmp2) * -20995 */ | |
609 | mla r14, r10, r9, r14 /* r14 = z2 + z4 */ | |
610 | ldr r10, =16819 | |
611 | mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */ | |
612 | mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */ | |
613 | ldmdb sp, { r8-r11 } /* reload tmp10 tmp13 tmp11 tmp12 */ | |
614 | add r12, r8, r4 /* o0 */ | |
615 | sub r14, r8, r4 /* o7 */ | |
616 | add r8, r9, r7 /* o3 */ | |
617 | sub r9, r9, r7 /* o4 */ | |
618 | add r4, r10, r5 /* o1 */ | |
619 | sub r5, r10, r5 /* o6 */ | |
620 | add r10, r11, r6 /* o2 */ | |
621 | sub r11, r11, r6 /* o5 */ | |
622 | /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */ | |
623 | #if ARM_ARCH < 6 /* no usat: shift right by 18, then clamp to 0..255 by hand */ | |
624 | mov r12, r12, asr #18 | |
625 | cmp r12, #255 | |
626 | mvnhi r12, r12, asr #31 /* negative -> 0, above 255 -> all ones (strb stores 0xff) */ | |
627 | mov r4, r4, asr #18 | |
628 | cmp r4, #255 | |
629 | mvnhi r4, r4, asr #31 | |
630 | mov r10, r10, asr #18 | |
631 | cmp r10, #255 | |
632 | mvnhi r10, r10, asr #31 | |
633 | mov r8, r8, asr #18 | |
634 | cmp r8, #255 | |
635 | mvnhi r8, r8, asr #31 | |
636 | mov r9, r9, asr #18 | |
637 | cmp r9, #255 | |
638 | mvnhi r9, r9, asr #31 | |
639 | mov r11, r11, asr #18 | |
640 | cmp r11, #255 | |
641 | mvnhi r11, r11, asr #31 | |
642 | mov r5, r5, asr #18 | |
643 | cmp r5, #255 | |
644 | mvnhi r5, r5, asr #31 | |
645 | mov r14, r14, asr #18 | |
646 | cmp r14, #255 | |
647 | mvnhi r14, r14, asr #31 | |
648 | #else /* usat does the asr #18 and the 0..255 saturation in one instruction */ | |
649 | usat r12, #8, r12, asr #18 | |
650 | usat r4, #8, r4, asr #18 | |
651 | usat r10, #8, r10, asr #18 | |
652 | usat r8, #8, r8, asr #18 | |
653 | usat r9, #8, r9, asr #18 | |
654 | usat r11, #8, r11, asr #18 | |
655 | usat r5, #8, r5, asr #18 | |
656 | usat r14, #8, r14, asr #18 | |
657 | #endif | |
658 | #ifdef HAVE_LCD_COLOR /* this variant writes the output bytes 4 apart */ | |
659 | strb r12, [r1] | |
660 | strb r4, [r1, #4] | |
661 | strb r10, [r1, #8] | |
662 | strb r8, [r1, #12] | |
663 | strb r9, [r1, #16] | |
664 | strb r11, [r1, #20] | |
665 | strb r5, [r1, #24] | |
666 | strb r14, [r1, #28] | |
667 | #else /* this variant writes the output bytes contiguously */ | |
668 | strb r12, [r1] | |
669 | strb r4, [r1, #1] | |
670 | strb r10, [r1, #2] | |
671 | strb r8, [r1, #3] | |
672 | strb r9, [r1, #4] | |
673 | strb r11, [r1, #5] | |
674 | strb r5, [r1, #6] | |
675 | strb r14, [r1, #7] | |
676 | #endif | |
677 | add r1, r1, r3 /* advance the output pointer by the caller-supplied step */ | |
678 | cmp r0, r2 | |
679 | bcc 1b /* loop while r0 < r2 (unsigned) */ | |
680 | ldmia sp!, { r4-r11, pc } | |
681 | .size jpeg_idct8h, .-jpeg_idct8h | |
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c index fa2df5b993..5ffa4a54a0 100644 --- a/apps/recorder/jpeg_load.c +++ b/apps/recorder/jpeg_load.c | |||
@@ -382,13 +382,6 @@ static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowst | |||
382 | DS_OUT)); | 382 | DS_OUT)); |
383 | } | 383 | } |
384 | } | 384 | } |
385 | #else | ||
386 | extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
387 | extern void jpeg_idct2v(int16_t *ws, int16_t *end); | ||
388 | extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
389 | extern void jpeg_idct4v(int16_t *ws, int16_t *end); | ||
390 | extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
391 | #endif | ||
392 | 385 | ||
393 | /* vertical-pass 8-point IDCT */ | 386 | /* vertical-pass 8-point IDCT */ |
394 | static void jpeg_idct8v(int16_t *ws, int16_t *end) | 387 | static void jpeg_idct8v(int16_t *ws, int16_t *end) |
@@ -599,6 +592,16 @@ static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowst | |||
599 | } | 592 | } |
600 | } | 593 | } |
601 | 594 | ||
595 | #else | ||
596 | extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
597 | extern void jpeg_idct2v(int16_t *ws, int16_t *end); | ||
598 | extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
599 | extern void jpeg_idct4v(int16_t *ws, int16_t *end); | ||
600 | extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
601 | extern void jpeg_idct8v(int16_t *ws, int16_t *end); | ||
602 | extern void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); | ||
603 | #endif | ||
604 | |||
602 | #ifdef HAVE_LCD_COLOR | 605 | #ifdef HAVE_LCD_COLOR |
603 | /* vertical-pass 16-point IDCT */ | 606 | /* vertical-pass 16-point IDCT */ |
604 | static void jpeg_idct16v(int16_t *ws, int16_t *end) | 607 | static void jpeg_idct16v(int16_t *ws, int16_t *end) |