summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Andrew Mahone <andrew.mahone@gmail.com> 2009-07-02 09:57:03 +0000
committer: Andrew Mahone <andrew.mahone@gmail.com> 2009-07-02 09:57:03 +0000
commit: 017c1a1027627e601cc5c22e43e42e1735835259 (patch)
tree: 1025d3aed96f33b48751bd276a5cc506e11856e0
parent: 293b499093baef544f7148a0fcfa18d28ed3d1ea (diff)
download: rockbox-017c1a1027627e601cc5c22e43e42e1735835259.tar.gz
rockbox-017c1a1027627e601cc5c22e43e42e1735835259.zip
Core JPEG IDCT8 optimizations for ARMv5+, small optimizations for ARMv4.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21612 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/recorder/jpeg_idct_arm.S 247
1 file changed, 233 insertions, 14 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index d84e5e7962..46ac479caa 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -434,6 +434,7 @@ jpeg_idct8v:
434 add r2, r0, #128 434 add r2, r0, #128
4351: 4351:
436 ldmia r0!, { r4-r7 } 436 ldmia r0!, { r4-r7 }
437#if ARM_ARCH < 5
437 mov r8, r4, lsl #16 438 mov r8, r4, lsl #16
438 orrs r9, r6, r7 439 orrs r9, r6, r7
439 orreqs r9, r5, r4, lsr #16 440 orreqs r9, r5, r4, lsr #16
@@ -528,25 +529,125 @@ jpeg_idct8v:
528 strh r11, [r2, #80] 529 strh r11, [r2, #80]
529 strh r5, [r2, #96] 530 strh r5, [r2, #96]
530 strh r14, [r2, #112] 531 strh r14, [r2, #112]
532#else /* ARMv5+ */
533 mov r12, r4, lsl #16
534 orrs r9, r6, r7
535 orreqs r9, r5, r4, lsr #16
536 bne 2f
537 mov r12, r12, asr #14
538 strh r12, [r2]
539 strh r12, [r2, #16]
540 strh r12, [r2, #32]
541 strh r12, [r2, #48]
542 strh r12, [r2, #64]
543 strh r12, [r2, #80]
544 strh r12, [r2, #96]
545 strh r12, [r2, #112]
546 add r2, r2, #2
547 cmp r0, r1
548 bcc 1b
549 ldmia sp!, { r4-r11, pc }
5502:
551 ldrd r8, .Lpool8
552 add r12, r12, #8192
553 add r10, r5, r7 /* r10[15:0] = d2 + d6 */
554 sub r14, r12, r6, lsl #16 /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
555 smulbb r10, r8, r10 /* r10 = z1 = (d2 + d6) * 4433 */
556 add r12, r12, r6, lsl #16 /* r12 = tmp0 << 3= (d0 + d4) << 16 */
557 smlatb r11, r8, r7, r10 /* r11 = tmp2 = z1 - d6 * 15137 */
558 smlabb r10, r9, r5, r10 /* r10 = tmp3 = z1 + d2 * 6270 */
559 add r8, r11, r14, asr #3 /* r8 = tmp11 */
560 rsb r11, r11, r14, asr #3 /* r11 = tmp12 */
561 add r14, r10, r12, asr #3 /* r14 = tmp10 */
562 rsb r12, r10, r12, asr #3 /* r12 = tmp13 */
563 stmdb sp, { r8, r11, r12, r14 }/* tmp11 tmp12 tmp13 tmp10 */
564 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
565 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
566 add r12, r6, r4, asr #16 /* r12 = z4 = tmp1 + tmp3 */
567 add r14, r7, r5, asr #16 /* r14 = z3 = tmp0 + tmp2 */
568 add r8, r12, r14 /* r8 = z3 + z4 */
569 ldrd r10, .Lpool8+8
570 smultb r8, r9, r8 /* r8 = z5 = (z3 + z4) * 9633 */
571 add r9, r7, r4, asr #16 /* r9 = z1 = tmp0 + tmp3 */
572 smlabb r14, r10, r14, r8 /* r14 = z3 = z5 - z3 * 16069 */
573 smlatb r12, r10, r12, r8 /* r12 = z4 = z5 - z4 * 3196 */
574 smlabb r8, r11, r9, r14 /* r8 = z3 - z1 * 7373 */
575 smlabb r9, r11, r9, r12 /* r9 = z4 - z1 * 7373 */
576 add r10, r6, r5, asr #16 /* r10 = z2 = tmp1 + tmp2 */
577 smlatb r12, r11, r10, r12 /* r12 = z4 - z2 * 20995 */
578 smlatb r14, r11, r10, r14 /* r14 = z3 - z2 * 20995 */
579 ldrd r10, .Lpool8+16
580 smlabb r7, r10, r7, r8 /* r7 = tmp0 */
581 smlatt r4, r10, r4, r9 /* r4 = tmp3 */
582 smlabb r6, r11, r6, r12 /* r6 = tmp1 */
583 smlatt r5, r11, r5, r14 /* r5 = tmp2 */
584 ldmdb sp, { r8-r11 } /* tmp11 tmp12 tmp13 tmp10 */
585 add r12, r8, r5 /* o1 */
586 sub r14, r8, r5 /* o6 */
587 add r8, r9, r6 /* o2 */
588 sub r9, r9, r6 /* o5 */
589 add r6, r10, r7 /* o3 */
590 sub r7, r10, r7 /* o4 */
591 add r10, r11, r4 /* o0 */
592 sub r11, r11, r4 /* o7 */
593 mov r12, r12, asr #11
594 mov r14, r14, asr #11
595 mov r8, r8, asr #11
596 mov r9, r9, asr #11
597 mov r6, r6, asr #11
598 mov r7, r7, asr #11
599 mov r10, r10, asr #11
600 mov r11, r11, asr #11
601 strh r10, [r2]
602 strh r12, [r2, #16]
603 strh r8, [r2, #32]
604 strh r6, [r2, #48]
605 strh r7, [r2, #64]
606 strh r9, [r2, #80]
607 strh r14, [r2, #96]
608 strh r11, [r2, #112]
609#endif
531 cmp r0, r1 610 cmp r0, r1
532 add r2, r2, #2 611 add r2, r2, #2
533 bcc 1b 612 bcc 1b
534 ldmia sp!, { r4-r11, pc } 613 ldmia sp!, { r4-r11, pc }
535 .size jpeg_idct8v, .-jpeg_idct8v 614 .size jpeg_idct8v, .-jpeg_idct8v
536 615
616#if ARM_ARCH > 4 /* multiplier pool referenced only by the ARMv5+ paths of jpeg_idct8v and jpeg_idct8h */
617 .align 4 /* NOTE(review): presumably keeps the pool suitably aligned for the ldrd loads at .Lpool8, +8 and +16 - confirm */
618.Lpool8: /* 16-bit constants; each ldrd pulls four of them into a register pair, 'b' operand forms pick the earlier one in a register, 't' forms the later */
619 .short 4433 /* +0: times (d2 + d6), gives z1 of the even part */
620 .short -15137 /* +2: times d6, accumulated onto z1 to give tmp2 */
621 .short 6270 /* +4: times d2, accumulated onto z1 to give tmp3 */
622 .short 9633 /* +6: times (z3 + z4), gives z5 of the odd part */
623 .short -16069 /* +8: times z3, accumulated onto z5 */
624 .short -3196 /* +10: times z4, accumulated onto z5 */
625 .short -7373 /* +12: times z1, accumulated onto both z3 and z4 */
626 .short -20995 /* +14: times z2, accumulated onto both z3 and z4 */
627 .short 2446 /* +16: times r7 (d7 after asr #16), odd-part tmp0 */
628 .short 12299 /* +18: times the top halfword of r4, odd-part tmp3 */
629 .short 16819 /* +20: times r6 (d5 after asr #16), odd-part tmp1 */
630 .short 25172 /* +22: times the top halfword of r5, odd-part tmp2 */
631 .align 2 /* re-align before the code that follows (jpeg_idct8h) */
632#endif
633
537jpeg_idct8h: 634jpeg_idct8h:
538 stmdb sp!, { r4-r11, lr } 635 stmdb sp!, { r4-r11, lr }
5391: 6361:
540 ldmia r0!, { r4-r7 } 637 ldmia r0!, { r4-r7 }
541 ldr r14, =4112 638 ldr r14, =(4112<<16)
542 mov r8, r4, lsl #16 639#if ARM_ARCH < 5
543 add r8, r8, r14, lsl #16 640 add r8, r14, r4, lsl #16
544 orrs r9, r6, r7 641 orrs r9, r6, r7
545 orreqs r9, r5, r4, lsr #16 642 orreqs r9, r5, r4, lsr #16
546 bne 2f 643 bne 2f
644#if ARM_ARCH < 6
547 mov r8, r8, asr #21 645 mov r8, r8, asr #21
548 cmp r8, #255 646 cmp r8, #255
549 mvnhi r8, r8, asr #31 647 mvnhi r8, r8, asr #31
648#else
649 usat r8, #8, r8, asr #21
650#endif
550#ifdef HAVE_LCD_COLOR 651#ifdef HAVE_LCD_COLOR
551 strb r8, [r1] 652 strb r8, [r1]
552 strb r8, [r1, #4] 653 strb r8, [r1, #4]
@@ -630,7 +731,6 @@ jpeg_idct8h:
630 add r10, r11, r6 /* o2 */ 731 add r10, r11, r6 /* o2 */
631 sub r11, r11, r6 /* o5 */ 732 sub r11, r11, r6 /* o5 */
632 /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */ 733 /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
633#if ARM_ARCH < 6
634 mov r12, r12, asr #18 734 mov r12, r12, asr #18
635 cmp r12, #255 735 cmp r12, #255
636 mvnhi r12, r12, asr #31 736 mvnhi r12, r12, asr #31
@@ -655,16 +755,6 @@ jpeg_idct8h:
655 mov r14, r14, asr #18 755 mov r14, r14, asr #18
656 cmp r14, #255 756 cmp r14, #255
657 mvnhi r14, r14, asr #31 757 mvnhi r14, r14, asr #31
658#else
659 usat r12, #8, r12, asr #18
660 usat r4, #8, r4, asr #18
661 usat r10, #8, r10, asr #18
662 usat r8, #8, r8, asr #18
663 usat r9, #8, r9, asr #18
664 usat r11, #8, r11, asr #18
665 usat r5, #8, r5, asr #18
666 usat r14, #8, r14, asr #18
667#endif
668#ifdef HAVE_LCD_COLOR 758#ifdef HAVE_LCD_COLOR
669 strb r12, [r1] 759 strb r12, [r1]
670 strb r4, [r1, #4] 760 strb r4, [r1, #4]
@@ -684,6 +774,135 @@ jpeg_idct8h:
684 strb r5, [r1, #6] 774 strb r5, [r1, #6]
685 strb r14, [r1, #7] 775 strb r14, [r1, #7]
686#endif 776#endif
777#else /* ARMv5+ */
778 add r12, r14, r4, lsl #16
779 orrs r9, r6, r7
780 orreqs r9, r5, r4, lsr #16
781 bne 2f
782 mov r12, r12, asr #21
783 cmp r12, #255
784 mvnhi r12, r12, asr #31
785#ifdef HAVE_LCD_COLOR
786 strb r12, [r1]
787 strb r12, [r1, #4]
788 strb r12, [r1, #8]
789 strb r12, [r1, #12]
790 strb r12, [r1, #16]
791 strb r12, [r1, #20]
792 strb r12, [r1, #24]
793 strb r12, [r1, #28]
794#else
795 strb r12, [r1]
796 strb r12, [r1, #1]
797 strb r12, [r1, #2]
798 strb r12, [r1, #3]
799 strb r12, [r1, #4]
800 strb r12, [r1, #5]
801 strb r12, [r1, #6]
802 strb r12, [r1, #7]
803#endif
804 add r1, r1, r3
805 cmp r0, r2
806 bcc 1b
807 ldmia sp!, { r4-r11, pc }
8082:
809 ldrd r8, .Lpool8
810 add r10, r5, r7 /* r10[15:0] = d2 + d6 */
811 sub r14, r12, r6, lsl #16 /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
812 smulbb r10, r8, r10 /* r10 = z1 = (d2 + d6) * 4433 */
813 add r12, r12, r6, lsl #16 /* r12 = tmp0 << 3= (d0 + d4) << 16 */
814 smlatb r11, r8, r7, r10 /* r11 = tmp2 = z1 - d6 * 15137 */
815 smlabb r10, r9, r5, r10 /* r10 = tmp3 = z1 + d2 * 6270 */
816 add r8, r11, r14, asr #3 /* r8 = tmp11 */
817 rsb r11, r11, r14, asr #3 /* r11 = tmp12 */
818 add r14, r10, r12, asr #3 /* r14 = tmp10 */
819 rsb r12, r10, r12, asr #3 /* r12 = tmp13 */
820 stmdb sp, { r8, r11, r12, r14 }/* tmp11 tmp12 tmp13 tmp10 */
821 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
822 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
823 add r12, r6, r4, asr #16 /* r12 = z4 = tmp1 + tmp3 */
824 add r14, r7, r5, asr #16 /* r14 = z3 = tmp0 + tmp2 */
825 add r8, r12, r14 /* r8 = z3 + z4 */
826 ldrd r10, .Lpool8+8
827 smultb r8, r9, r8 /* r8 = z5 = (z3 + z4) * 9633 */
828 add r9, r7, r4, asr #16 /* r9 = z1 = tmp0 + tmp3 */
829 smlabb r14, r10, r14, r8 /* r14 = z3 = z5 - z3 * 16069 */
830 smlatb r12, r10, r12, r8 /* r12 = z4 = z5 - z4 * 3196 */
831 smlabb r8, r11, r9, r14 /* r8 = z3 - z1 * 7373 */
832 smlabb r9, r11, r9, r12 /* r9 = z4 - z1 * 7373 */
833 add r10, r6, r5, asr #16 /* r10 = z2 = tmp1 + tmp2 */
834 smlatb r12, r11, r10, r12 /* r12 = z4 - z2 * 20995 */
835 smlatb r14, r11, r10, r14 /* r14 = z3 - z2 * 20995 */
836 ldrd r10, .Lpool8+16
837 smlabb r7, r10, r7, r8 /* r7 = tmp0 */
838 smlatt r4, r10, r4, r9 /* r4 = tmp3 */
839 smlabb r6, r11, r6, r12 /* r6 = tmp1 */
840 smlatt r5, r11, r5, r14 /* r5 = tmp2 */
841 ldmdb sp, { r8-r11 } /* tmp11 tmp12 tmp13 tmp10 */
842 add r12, r8, r5 /* o1 */
843 sub r14, r8, r5 /* o6 */
844 add r8, r9, r6 /* o2 */
845 sub r9, r9, r6 /* o5 */
846 add r6, r10, r7 /* o3 */
847 sub r7, r10, r7 /* o4 */
848 add r10, r11, r4 /* o0 */
849 sub r11, r11, r4 /* o7 */
850 /* output in order: r10 r12 r8 r6 r7 r9 r14 r11 */
851#if ARM_ARCH < 6
852 mov r10, r10, asr #18
853 cmp r10, #255
854 mvnhi r10, r10, asr #31
855 mov r12, r12, asr #18
856 cmp r12, #255
857 mvnhi r12, r12, asr #31
858 mov r8, r8, asr #18
859 cmp r8, #255
860 mvnhi r8, r8, asr #31
861 mov r6, r6, asr #18
862 cmp r6, #255
863 mvnhi r6, r6, asr #31
864 mov r7, r7, asr #18
865 cmp r7, #255
866 mvnhi r7, r7, asr #31
867 mov r9, r9, asr #18
868 cmp r9, #255
869 mvnhi r9, r9, asr #31
870 mov r14, r14, asr #18
871 cmp r14, #255
872 mvnhi r14, r14, asr #31
873 mov r11, r11, asr #18
874 cmp r11, #255
875 mvnhi r11, r11, asr #31
876#else
877 usat r10, #8, r10, asr #18
878 usat r12, #8, r12, asr #18
879 usat r8, #8, r8, asr #18
880 usat r6, #8, r6, asr #18
881 usat r7, #8, r7, asr #18
882 usat r9, #8, r9, asr #18
883 usat r14, #8, r14, asr #18
884 usat r11, #8, r11, asr #18
885#endif
886#ifdef HAVE_LCD_COLOR
887 strb r10, [r1]
888 strb r12, [r1, #4]
889 strb r8, [r1, #8]
890 strb r6, [r1, #12]
891 strb r7, [r1, #16]
892 strb r9, [r1, #20]
893 strb r14, [r1, #24]
894 strb r11, [r1, #28]
895#else
896 strb r10, [r1]
897 strb r12, [r1, #1]
898 strb r8, [r1, #2]
899 strb r6, [r1, #3]
900 strb r7, [r1, #4]
901 strb r9, [r1, #5]
902 strb r14, [r1, #6]
903 strb r11, [r1, #7]
904#endif
905#endif
687 add r1, r1, r3 906 add r1, r1, r3
688 cmp r0, r2 907 cmp r0, r2
689 bcc 1b 908 bcc 1b