diff options
author | Jens Arnold <amiconn@rockbox.org> | 2006-08-11 14:13:01 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2006-08-11 14:13:01 +0000 |
commit | 71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 (patch) | |
tree | b9a97081ec04d4d311a7b45747393e68837912a2 /apps/plugins/lib/gray_core.c | |
parent | bcd94a9b01d19d87a437cd8158a758f206b30825 (diff) | |
download | rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.tar.gz rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.zip |
New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins/lib/gray_core.c')
-rw-r--r-- | apps/plugins/lib/gray_core.c | 1417 |
1 file changed, 993 insertions, 424 deletions
diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c index e65a7f259e..809e88dba1 100644 --- a/apps/plugins/lib/gray_core.c +++ b/apps/plugins/lib/gray_core.c | |||
@@ -649,7 +649,8 @@ void gray_update_rect(int x, int y, int width, int height) | |||
649 | bbuf = _gray_info.back_buffer + srcofs_row; | 649 | bbuf = _gray_info.back_buffer + srcofs_row; |
650 | 650 | ||
651 | #ifdef CPU_ARM | 651 | #ifdef CPU_ARM |
652 | asm volatile ( | 652 | asm volatile |
653 | ( | ||
653 | "ldr r0, [%[cbuf]] \n" | 654 | "ldr r0, [%[cbuf]] \n" |
654 | "ldr r1, [%[bbuf]] \n" | 655 | "ldr r1, [%[bbuf]] \n" |
655 | "eor r1, r0, r1 \n" | 656 | "eor r1, r0, r1 \n" |
@@ -668,137 +669,281 @@ void gray_update_rect(int x, int y, int width, int height) | |||
668 | 669 | ||
669 | if (change != 0) | 670 | if (change != 0) |
670 | { | 671 | { |
671 | unsigned char *addr, *end; | 672 | unsigned char *addr; |
672 | unsigned mask, trash; | 673 | unsigned mask, depth, trash; |
673 | 674 | ||
674 | pat_ptr = &pat_stack[8]; | 675 | pat_ptr = &pat_stack[8]; |
675 | 676 | ||
676 | /* precalculate the bit patterns with random shifts | 677 | /* precalculate the bit patterns with random shifts |
677 | * for all 8 pixels and put them on an extra "stack" */ | 678 | * for all 8 pixels and put them on an extra "stack" */ |
678 | asm volatile ( | 679 | asm volatile |
679 | "mov r3, #8 \n" /* loop count */ | 680 | ( |
680 | "mov %[mask], #0 \n" | 681 | "mov r3, #8 \n" /* loop count */ |
681 | 682 | "mov %[mask], #0 \n" | |
682 | ".ur_pre_loop: \n" | 683 | |
683 | "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ | 684 | ".ur_pre_loop: \n" |
684 | "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ | 685 | "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ |
685 | "ldrb r1, [%[bbuf]] \n" /* read back buffer */ | 686 | "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ |
686 | "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ | 687 | "ldrb r1, [%[bbuf]] \n" /* read back buffer */ |
687 | "mov r2, #0 \n" /* preset for skipped pixel */ | 688 | "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ |
688 | "cmp r0, r1 \n" /* no change? */ | 689 | "mov r2, #0 \n" /* preset for skipped pixel */ |
689 | "beq .ur_skip \n" /* -> skip */ | 690 | "cmp r0, r1 \n" /* no change? */ |
690 | 691 | "beq .ur_skip \n" /* -> skip */ | |
691 | "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ | 692 | |
692 | 693 | "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ | |
693 | "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ | 694 | |
694 | "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" | 695 | "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ |
695 | "add %[rnd], %[rnd], #74 \n" /* add another 74 */ | 696 | "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" |
696 | /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ | 697 | "add %[rnd], %[rnd], #74 \n" /* add another 74 */ |
697 | "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ | 698 | /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ |
698 | 699 | "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ | |
699 | "cmp r1, %[dpth] \n" /* random >= depth ? */ | 700 | |
700 | "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ | 701 | "cmp r1, %[dpth] \n" /* random >= depth ? */ |
701 | 702 | "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ | |
702 | "mov r0, r2, lsl r1 \n" /** rotate pattern **/ | 703 | |
703 | "sub r1, %[dpth], r1 \n" | 704 | "mov r0, r2, lsl r1 \n" /** rotate pattern **/ |
704 | "orr r2, r0, r2, lsr r1 \n" | 705 | "sub r1, %[dpth], r1 \n" |
705 | 706 | "orr r2, r0, r2, lsr r1 \n" | |
706 | "orr %[mask], %[mask], #1 \n" /* set mask bit */ | 707 | |
708 | "orr %[mask], %[mask], #1 \n" /* set mask bit */ | ||
707 | 709 | ||
708 | ".ur_skip: \n" | 710 | ".ur_skip: \n" |
709 | "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ | 711 | "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ |
710 | 712 | ||
711 | "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ | 713 | "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ |
712 | "bne .ur_pre_loop \n" | 714 | "bne .ur_pre_loop \n" |
713 | : /* outputs */ | 715 | : /* outputs */ |
714 | [cbuf]"+r"(cbuf), | 716 | [cbuf]"+r"(cbuf), |
715 | [bbuf]"+r"(bbuf), | 717 | [bbuf]"+r"(bbuf), |
716 | [patp]"+r"(pat_ptr), | 718 | [patp]"+r"(pat_ptr), |
717 | [rnd] "+r"(_gray_random_buffer), | 719 | [rnd] "+r"(_gray_random_buffer), |
718 | [mask]"=&r"(mask) | 720 | [mask]"=&r"(mask) |
719 | : /* inputs */ | 721 | : /* inputs */ |
720 | [bpat]"r"(_gray_info.bitpattern), | 722 | [bpat]"r"(_gray_info.bitpattern), |
721 | [dpth]"r"(_gray_info.depth), | 723 | [dpth]"r"(_gray_info.depth), |
722 | [rmsk]"r"(_gray_info.randmask) | 724 | [rmsk]"r"(_gray_info.randmask) |
723 | : /* clobbers */ | 725 | : /* clobbers */ |
724 | "r0", "r1", "r2", "r3" | 726 | "r0", "r1", "r2", "r3" |
725 | ); | 727 | ); |
726 | 728 | ||
727 | addr = dst_row; | 729 | addr = dst_row; |
728 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 730 | depth = _gray_info.depth; |
729 | 731 | ||
730 | /* set the bits for all 8 pixels in all bytes according to the | 732 | /* set the bits for all 8 pixels in all bytes according to the |
731 | * precalculated patterns on the pattern stack */ | 733 | * precalculated patterns on the pattern stack */ |
732 | asm volatile ( | 734 | asm volatile |
733 | "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ | 735 | ( |
736 | "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */ | ||
737 | |||
738 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ | ||
739 | |||
740 | "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/ | ||
741 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
742 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */ | ||
743 | "eor r0, r1, r5, lsl #4 \n" | ||
744 | "and r0, r0, %[rx] \n" | ||
745 | "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ | ||
746 | "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ | ||
747 | "eor r0, r2, r6, lsl #4 \n" | ||
748 | "and r0, r0, %[rx] \n" | ||
749 | "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ | ||
750 | "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */ | |
751 | "eor r0, r3, r7, lsl #4 \n" | ||
752 | "and r0, r0, %[rx] \n" | ||
753 | "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ | ||
754 | "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ | ||
755 | "eor r0, r4, r8, lsl #4 \n" | ||
756 | "and r0, r0, %[rx] \n" | ||
757 | "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ | ||
758 | "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ | ||
759 | |||
760 | "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/ | ||
761 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
762 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */ | ||
763 | "eor r0, r1, r3, lsl #2 \n" | ||
764 | "and r0, r0, %[rx] \n" | ||
765 | "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ | ||
766 | "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ | ||
767 | "eor r0, r2, r4, lsl #2 \n" | ||
768 | "and r0, r0, %[rx] \n" | ||
769 | "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ | ||
770 | "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ | ||
771 | "eor r0, r5, r7, lsl #2 \n" | ||
772 | "and r0, r0, %[rx] \n" | ||
773 | "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | ||
774 | "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ | ||
775 | "eor r0, r6, r8, lsl #2 \n" | ||
776 | "and r0, r0, %[rx] \n" | ||
777 | "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ | ||
778 | "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ | ||
779 | |||
780 | "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/ | ||
781 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
782 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */ | ||
783 | "eor r0, r1, r2, lsl #1 \n" | ||
784 | "and r0, r0, %[rx] \n" | ||
785 | "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
786 | "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
787 | "eor r0, r3, r4, lsl #1 \n" | ||
788 | "and r0, r0, %[rx] \n" | ||
789 | "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
790 | "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
791 | "eor r0, r5, r6, lsl #1 \n" | ||
792 | "and r0, r0, %[rx] \n" | ||
793 | "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
794 | "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
795 | "eor r0, r7, r8, lsl #1 \n" | ||
796 | "and r0, r0, %[rx] \n" | ||
797 | "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
798 | "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
799 | |||
800 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
801 | "ands %[mask], %[mask], #0xff \n" | ||
802 | "beq .ur_sloop \n" /* short loop if no bits to keep */ | ||
803 | |||
804 | ".ur_floop: \n" /** full loop (bits to keep)**/ | ||
805 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ | ||
806 | "bhs .ur_f8 \n" | ||
807 | |||
808 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ | ||
809 | "add %[addr], %[addr], r0 \n" /* for this round */ | ||
810 | |||
811 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ | ||
812 | "add pc, pc, r0 \n" | ||
813 | ".ur_ftable: \n" | ||
814 | ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */ | ||
815 | ".byte .ur_f1 - .ur_ftable - 4 \n" | ||
816 | ".byte .ur_f2 - .ur_ftable - 4 \n" | ||
817 | ".byte .ur_f3 - .ur_ftable - 4 \n" | ||
818 | ".byte .ur_f4 - .ur_ftable - 4 \n" | ||
819 | ".byte .ur_f5 - .ur_ftable - 4 \n" | ||
820 | ".byte .ur_f6 - .ur_ftable - 4 \n" | ||
821 | ".byte .ur_f7 - .ur_ftable - 4 \n" | ||
822 | |||
823 | ".ur_f8: \n" | ||
824 | "add %[addr], %[addr], %[psiz], lsl #3 \n" | ||
825 | /* Point behind the last plane for this round. Note: We're using the | ||
826 | * registers backwards in order to reuse the streak for the last round. | ||
827 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
828 | * the bit order would be destroyed which results in more flicker. */ | ||
829 | "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ | ||
830 | "and r0, r0, %[mask] \n" /* mask out replaced bits */ | ||
831 | "orr r0, r0, r8 \n" /* set new bits */ | ||
832 | "strb r0, [%[addr]] \n" /* store byte */ | ||
833 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ | ||
834 | ".ur_f7: \n" | ||
835 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
836 | "and r0, r0, %[mask] \n" | ||
837 | "orr r0, r0, r7 \n" | ||
838 | "strb r0, [%[addr]] \n" | ||
839 | "mov r7, r7, lsr #8 \n" | ||
840 | ".ur_f6: \n" | ||
841 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
842 | "and r0, r0, %[mask] \n" | ||
843 | "orr r0, r0, r6 \n" | ||
844 | "strb r0, [%[addr]] \n" | ||
845 | "mov r6, r6, lsr #8 \n" | ||
846 | ".ur_f5: \n" | ||
847 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
848 | "and r0, r0, %[mask] \n" | ||
849 | "orr r0, r0, r5 \n" | ||
850 | "strb r0, [%[addr]] \n" | ||
851 | "mov r5, r5, lsr #8 \n" | ||
852 | ".ur_f4: \n" | ||
853 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
854 | "and r0, r0, %[mask] \n" | ||
855 | "orr r0, r0, r4 \n" | ||
856 | "strb r0, [%[addr]] \n" | ||
857 | "mov r4, r4, lsr #8 \n" | ||
858 | ".ur_f3: \n" | ||
859 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
860 | "and r0, r0, %[mask] \n" | ||
861 | "orr r0, r0, r3 \n" | ||
862 | "strb r0, [%[addr]] \n" | ||
863 | "mov r3, r3, lsr #8 \n" | ||
864 | ".ur_f2: \n" | ||
865 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
866 | "and r0, r0, %[mask] \n" | ||
867 | "orr r0, r0, r2 \n" | ||
868 | "strb r0, [%[addr]] \n" | ||
869 | "mov r2, r2, lsr #8 \n" | ||
870 | ".ur_f1: \n" | ||
871 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
872 | "and r0, r0, %[mask] \n" | ||
873 | "orr r0, r0, r1 \n" | ||
874 | "strb r0, [%[addr]] \n" | ||
875 | "mov r1, r1, lsr #8 \n" | ||
876 | ".ur_f0: \n" | ||
877 | |||
878 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ | ||
879 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ | ||
880 | "bhi .ur_floop \n" | ||
881 | |||
882 | "b .ur_end \n" | ||
883 | |||
884 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | ||
885 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ | ||
886 | "bhs .ur_s8 \n" | ||
887 | |||
888 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ | ||
889 | "add %[addr], %[addr], r0 \n" /* for this round */ | ||
734 | 890 | ||
735 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | 891 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ |
736 | "ands %[mask], %[mask], #0xff \n" | 892 | "add pc, pc, r0 \n" |
737 | "beq .ur_sloop \n" /* short loop if nothing to keep */ | 893 | ".ur_stable: \n" |
738 | 894 | ".byte .ur_s0 - .ur_stable - 4 \n" | |
739 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | 895 | ".byte .ur_s1 - .ur_stable - 4 \n" |
740 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 896 | ".byte .ur_s2 - .ur_stable - 4 \n" |
741 | "adc r0, r0, r0 \n" /* put bit into LSB for byte */ | 897 | ".byte .ur_s3 - .ur_stable - 4 \n" |
742 | "movs r8, r8, lsr #1 \n" | 898 | ".byte .ur_s4 - .ur_stable - 4 \n" |
743 | "adc r0, r0, r0 \n" | 899 | ".byte .ur_s5 - .ur_stable - 4 \n" |
744 | "movs r7, r7, lsr #1 \n" | 900 | ".byte .ur_s6 - .ur_stable - 4 \n" |
745 | "adc r0, r0, r0 \n" | 901 | ".byte .ur_s7 - .ur_stable - 4 \n" |
746 | "movs r6, r6, lsr #1 \n" | 902 | |
747 | "adc r0, r0, r0 \n" | 903 | ".ur_s8: \n" |
748 | "movs r5, r5, lsr #1 \n" | 904 | "add %[addr], %[addr], %[psiz], lsl #3 \n" |
749 | "adc r0, r0, r0 \n" | 905 | /* Point behind the last plane for this round. See above. */ |
750 | "movs r4, r4, lsr #1 \n" | 906 | "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ |
751 | "adc r0, r0, r0 \n" | 907 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ |
752 | "movs r3, r3, lsr #1 \n" | 908 | ".ur_s7: \n" |
753 | "adc r0, r0, r0 \n" | 909 | "strb r7, [%[addr], -%[psiz]]! \n" |
754 | "movs r2, r2, lsr #1 \n" | 910 | "mov r7, r7, lsr #8 \n" |
755 | "adc r0, r0, r0 \n" | 911 | ".ur_s6: \n" |
756 | 912 | "strb r6, [%[addr], -%[psiz]]! \n" | |
757 | "ldrb r1, [%[addr]] \n" /* read old value */ | 913 | "mov r6, r6, lsr #8 \n" |
758 | "and r1, r1, %[mask] \n" /* mask out replaced bits */ | 914 | ".ur_s5: \n" |
759 | "orr r1, r1, r0 \n" /* set new bits */ | 915 | "strb r5, [%[addr], -%[psiz]]! \n" |
760 | "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ | 916 | "mov r5, r5, lsr #8 \n" |
761 | 917 | ".ur_s4: \n" | |
762 | "cmp %[end], %[addr] \n" /* loop for all bitplanes */ | 918 | "strb r4, [%[addr], -%[psiz]]! \n" |
763 | "bne .ur_floop \n" | 919 | "mov r4, r4, lsr #8 \n" |
764 | 920 | ".ur_s3: \n" | |
765 | "b .ur_end \n" | 921 | "strb r3, [%[addr], -%[psiz]]! \n" |
766 | 922 | "mov r3, r3, lsr #8 \n" | |
767 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | 923 | ".ur_s2: \n" |
768 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 924 | "strb r2, [%[addr], -%[psiz]]! \n" |
769 | "adc r0, r0, r0 \n" /* put bit into LSB for byte */ | 925 | "mov r2, r2, lsr #8 \n" |
770 | "movs r8, r8, lsr #1 \n" | 926 | ".ur_s1: \n" |
771 | "adc r0, r0, r0 \n" | 927 | "strb r1, [%[addr], -%[psiz]]! \n" |
772 | "movs r7, r7, lsr #1 \n" | 928 | "mov r1, r1, lsr #8 \n" |
773 | "adc r0, r0, r0 \n" | 929 | ".ur_s0: \n" |
774 | "movs r6, r6, lsr #1 \n" | 930 | |
775 | "adc r0, r0, r0 \n" | 931 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ |
776 | "movs r5, r5, lsr #1 \n" | 932 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ |
777 | "adc r0, r0, r0 \n" | 933 | "bhi .ur_sloop \n" |
778 | "movs r4, r4, lsr #1 \n" | 934 | |
779 | "adc r0, r0, r0 \n" | 935 | ".ur_end: \n" |
780 | "movs r3, r3, lsr #1 \n" | 936 | : /* outputs */ |
781 | "adc r0, r0, r0 \n" | 937 | [addr]"+r"(addr), |
782 | "movs r2, r2, lsr #1 \n" | 938 | [mask]"+r"(mask), |
783 | "adc r0, r0, r0 \n" | 939 | [dpth]"+r"(depth), |
784 | 940 | [rx] "=&r"(trash) | |
785 | "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ | 941 | : /* inputs */ |
786 | 942 | [psiz]"r"(_gray_info.plane_size), | |
787 | "cmp %[end], %[addr] \n" /* loop for all bitplanes */ | 943 | [patp]"[rx]"(pat_ptr) |
788 | "bne .ur_sloop \n" | 944 | : /* clobbers */ |
789 | 945 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | |
790 | ".ur_end: \n" | 946 | ); |
791 | : /* outputs */ | ||
792 | [addr]"+r"(addr), | ||
793 | [mask]"+r"(mask), | ||
794 | [rx] "=&r"(trash) | ||
795 | : /* inputs */ | ||
796 | [psiz]"r"(_gray_info.plane_size), | ||
797 | [end] "r"(end), | ||
798 | [patp]"[rx]"(pat_ptr) | ||
799 | : /* clobbers */ | ||
800 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | ||
801 | ); | ||
802 | } | 947 | } |
803 | #else /* C version, for reference*/ | 948 | #else /* C version, for reference*/ |
804 | #warning C version of gray_update_rect() used | 949 | #warning C version of gray_update_rect() used |
@@ -873,7 +1018,7 @@ void gray_update_rect(int x, int y, int width, int height) | |||
873 | 1018 | ||
874 | for (i = 7; i >= 0; i--) | 1019 | for (i = 7; i >= 0; i--) |
875 | data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); | 1020 | data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); |
876 | 1021 | ||
877 | *addr = (*addr & mask) | data; | 1022 | *addr = (*addr & mask) | data; |
878 | addr += _gray_info.plane_size; | 1023 | addr += _gray_info.plane_size; |
879 | test <<= 1; | 1024 | test <<= 1; |
@@ -935,13 +1080,13 @@ void gray_update_rect(int x, int y, int width, int height) | |||
935 | 1080 | ||
936 | #if CONFIG_CPU == SH7034 | 1081 | #if CONFIG_CPU == SH7034 |
937 | asm volatile ( | 1082 | asm volatile ( |
938 | "mov.l @%[cbuf],r1 \n" | 1083 | "mov.l @%[cbuf], r1 \n" |
939 | "mov.l @%[bbuf],r2 \n" | 1084 | "mov.l @%[bbuf], r2 \n" |
940 | "xor r1,r2 \n" | 1085 | "xor r1, r2 \n" |
941 | "mov.l @(4,%[cbuf]),r1 \n" | 1086 | "mov.l @(4,%[cbuf]), r1 \n" |
942 | "mov.l @(4,%[bbuf]),%[chg] \n" | 1087 | "mov.l @(4,%[bbuf]), %[chg]\n" |
943 | "xor r1,%[chg] \n" | 1088 | "xor r1, %[chg] \n" |
944 | "or r2,%[chg] \n" | 1089 | "or r2, %[chg] \n" |
945 | : /* outputs */ | 1090 | : /* outputs */ |
946 | [chg] "=r"(change) | 1091 | [chg] "=r"(change) |
947 | : /* inputs */ | 1092 | : /* inputs */ |
@@ -953,176 +1098,402 @@ void gray_update_rect(int x, int y, int width, int height) | |||
953 | 1098 | ||
954 | if (change != 0) | 1099 | if (change != 0) |
955 | { | 1100 | { |
956 | unsigned char *addr, *end; | 1101 | unsigned char *addr; |
957 | unsigned mask, trash; | 1102 | unsigned mask, depth, trash; |
958 | 1103 | ||
959 | pat_ptr = &pat_stack[8]; | 1104 | pat_ptr = &pat_stack[8]; |
960 | 1105 | ||
961 | /* precalculate the bit patterns with random shifts | 1106 | /* precalculate the bit patterns with random shifts |
962 | * for all 8 pixels and put them on an extra "stack" */ | 1107 | * for all 8 pixels and put them on an extra "stack" */ |
963 | asm volatile ( | 1108 | asm volatile |
964 | "mov #8,r3 \n" /* loop count */ | 1109 | ( |
965 | 1110 | "mov #8, r3 \n" /* loop count */ | |
966 | ".ur_pre_loop: \n" | 1111 | |
967 | "mov.b @%[cbuf]+,r0\n" /* read current buffer */ | 1112 | ".ur_pre_loop: \n" |
968 | "mov.b @%[bbuf],r1 \n" /* read back buffer */ | 1113 | "mov.b @%[cbuf]+, r0 \n" /* read current buffer */ |
969 | "mov #0,r2 \n" /* preset for skipped pixel */ | 1114 | "mov.b @%[bbuf], r1 \n" /* read back buffer */ |
970 | "mov.b r0,@%[bbuf] \n" /* update back buffer */ | 1115 | "mov #0, r2 \n" /* preset for skipped pixel */ |
971 | "add #1,%[bbuf] \n" | 1116 | "mov.b r0, @%[bbuf] \n" /* update back buffer */ |
972 | "cmp/eq r0,r1 \n" /* no change? */ | 1117 | "add #1, %[bbuf] \n" |
973 | "bt .ur_skip \n" /* -> skip */ | 1118 | "cmp/eq r0, r1 \n" /* no change? */ |
974 | 1119 | "bt .ur_skip \n" /* -> skip */ | |
975 | "shll2 r0 \n" /* pixel value -> pattern offset */ | 1120 | |
976 | "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ | 1121 | "shll2 r0 \n" /* pixel value -> pattern offset */ |
977 | 1122 | "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */ | |
978 | "mov #75,r0 \n" | 1123 | |
979 | "mulu r0,%[rnd] \n" /* multiply by 75 */ | 1124 | "mov #75, r0 \n" |
980 | "sts macl,%[rnd] \n" | 1125 | "mulu r0, %[rnd] \n" /* multiply by 75 */ |
981 | "add #74,%[rnd] \n" /* add another 74 */ | 1126 | "sts macl, %[rnd] \n" |
982 | /* Since the lower bits are not very random: */ | 1127 | "add #74, %[rnd] \n" /* add another 74 */ |
983 | "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ | 1128 | /* Since the lower bits are not very random: */ |
984 | "and %[rmsk],r1 \n" /* mask out unneeded bits */ | 1129 | "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */ |
985 | 1130 | "and %[rmsk], r1 \n" /* mask out unneeded bits */ | |
986 | "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ | 1131 | |
987 | "bf .ur_ntrim \n" | 1132 | "cmp/hs %[dpth], r1 \n" /* random >= depth ? */ |
988 | "sub %[dpth],r1 \n" /* yes: random -= depth; */ | 1133 | "bf .ur_ntrim \n" |
989 | ".ur_ntrim: \n" | 1134 | "sub %[dpth], r1 \n" /* yes: random -= depth; */ |
1135 | ".ur_ntrim: \n" | ||
990 | 1136 | ||
991 | "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ | 1137 | "mov.l .ashlsi3, r0 \n" /** rotate pattern **/ |
992 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ | 1138 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ |
993 | "mov r1,r5 \n" | 1139 | "mov r1, r5 \n" |
994 | 1140 | ||
995 | "mov %[dpth],r5 \n" | 1141 | "mov %[dpth], r5 \n" |
996 | "sub r1,r5 \n" /* r5 = depth - r1 */ | 1142 | "sub r1, r5 \n" /* r5 = depth - r1 */ |
997 | "mov.l .lshrsi3,r1 \n" | 1143 | "mov.l .lshrsi3, r1 \n" |
998 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ | 1144 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ |
999 | "mov r0,r2 \n" /* store previous result in r2 */ | 1145 | "mov r0, r2 \n" /* store previous result in r2 */ |
1000 | 1146 | ||
1001 | "or r0,r2 \n" /* rotated_pattern = r2 | r0 */ | 1147 | "or r0, r2 \n" /* rotated_pattern = r2 | r0 */ |
1002 | "clrt \n" /* mask bit = 0 (replace) */ | 1148 | "clrt \n" /* mask bit = 0 (replace) */ |
1003 | 1149 | ||
1004 | ".ur_skip: \n" /* T == 1 if skipped */ | 1150 | ".ur_skip: \n" /* T == 1 if skipped */ |
1005 | "rotcr %[mask] \n" /* get mask bit */ | 1151 | "rotcr %[mask] \n" /* get mask bit */ |
1006 | "mov.l r2,@-%[patp]\n" /* push on pattern stack */ | 1152 | "mov.l r2, @-%[patp] \n" /* push on pattern stack */ |
1007 | 1153 | ||
1008 | "add #-1,r3 \n" /* loop 8 times (pixel block) */ | 1154 | "add #-1, r3 \n" /* loop 8 times (pixel block) */ |
1009 | "cmp/pl r3 \n" | 1155 | "cmp/pl r3 \n" |
1010 | "bt .ur_pre_loop\n" | 1156 | "bt .ur_pre_loop \n" |
1011 | 1157 | ||
1012 | "shlr8 %[mask] \n" /* shift mask to low byte */ | 1158 | "shlr8 %[mask] \n" /* shift mask to low byte */ |
1013 | "shlr16 %[mask] \n" | 1159 | "shlr16 %[mask] \n" |
1014 | : /* outputs */ | 1160 | : /* outputs */ |
1015 | [cbuf]"+r"(cbuf), | 1161 | [cbuf]"+r"(cbuf), |
1016 | [bbuf]"+r"(bbuf), | 1162 | [bbuf]"+r"(bbuf), |
1017 | [rnd] "+r"(_gray_random_buffer), | 1163 | [rnd] "+r"(_gray_random_buffer), |
1018 | [patp]"+r"(pat_ptr), | 1164 | [patp]"+r"(pat_ptr), |
1019 | [mask]"=&r"(mask) | 1165 | [mask]"=&r"(mask) |
1020 | : /* inputs */ | 1166 | : /* inputs */ |
1021 | [dpth]"r"(_gray_info.depth), | 1167 | [dpth]"r"(_gray_info.depth), |
1022 | [bpat]"r"(_gray_info.bitpattern), | 1168 | [bpat]"r"(_gray_info.bitpattern), |
1023 | [rmsk]"r"(_gray_info.randmask) | 1169 | [rmsk]"r"(_gray_info.randmask) |
1024 | : /* clobbers */ | 1170 | : /* clobbers */ |
1025 | "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" | 1171 | "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" |
1026 | ); | 1172 | ); |
1027 | 1173 | ||
1028 | addr = dst_row; | 1174 | addr = dst_row; |
1029 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 1175 | depth = _gray_info.depth; |
1030 | 1176 | ||
1031 | /* set the bits for all 8 pixels in all bytes according to the | 1177 | /* set the bits for all 8 pixels in all bytes according to the |
1032 | * precalculated patterns on the pattern stack */ | 1178 | * precalculated patterns on the pattern stack */ |
1033 | asm volatile ( | 1179 | asm volatile |
1034 | "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ | 1180 | ( |
1035 | "mov.l @%[patp]+,r2 \n" | 1181 | "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */ |
1036 | "mov.l @%[patp]+,r3 \n" | 1182 | "mov.l @%[patp]+, r7 \n" |
1037 | "mov.l @%[patp]+,r6 \n" | 1183 | "mov.l @%[patp]+, r6 \n" |
1038 | "mov.l @%[patp]+,r7 \n" | 1184 | "mov.l @%[patp]+, r5 \n" |
1039 | "mov.l @%[patp]+,r8 \n" | 1185 | "mov.l @%[patp]+, r4 \n" |
1040 | "mov.l @%[patp]+,r9 \n" | 1186 | "mov.l @%[patp]+, r3 \n" |
1041 | "mov.l @%[patp],r10 \n" | 1187 | "mov.l @%[patp]+, r2 \n" |
1042 | 1188 | "mov.l @%[patp], r1 \n" | |
1043 | "tst %[mask],%[mask] \n" | 1189 | |
1044 | "bt .ur_sloop \n" /* short loop if nothing to keep */ | 1190 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ |
1045 | 1191 | ||
1046 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | 1192 | "mov.l .ur_mask4, %[rx] \n" /* bitmask = ...11110000 */ |
1047 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1193 | "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/ |
1048 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1194 | "shll2 r0 \n" |
1049 | "shlr r2 \n" | 1195 | "shll2 r0 \n" |
1050 | "rotcl r0 \n" | 1196 | "xor r1, r0 \n" |
1051 | "shlr r3 \n" | 1197 | "and %[rx], r0 \n" |
1052 | "rotcl r0 \n" | 1198 | "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ |
1053 | "shlr r6 \n" | 1199 | "shlr2 r0 \n" |
1054 | "rotcl r0 \n" | 1200 | "shlr2 r0 \n" |
1055 | "shlr r7 \n" | 1201 | "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ |
1056 | "rotcl r0 \n" | 1202 | "mov r6, r0 \n" |
1057 | "shlr r8 \n" | 1203 | "shll2 r0 \n" |
1058 | "rotcl r0 \n" | 1204 | "shll2 r0 \n" |
1059 | "shlr r9 \n" | 1205 | "xor r2, r0 \n" |
1060 | "rotcl r0 \n" | 1206 | "and %[rx], r0 \n" |
1061 | "shlr r10 \n" | 1207 | "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ |
1062 | "mov.b @%[addr],%[rx] \n" /* read old value */ | 1208 | "shlr2 r0 \n" |
1063 | "rotcl r0 \n" | 1209 | "shlr2 r0 \n" |
1064 | "and %[mask],%[rx] \n" /* mask out replaced bits */ | 1210 | "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ |
1065 | "or %[rx],r0 \n" /* set new bits */ | 1211 | "mov r7, r0 \n" |
1066 | "mov.b r0,@%[addr] \n" /* store value to bitplane */ | 1212 | "shll2 r0 \n" |
1067 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1213 | "shll2 r0 \n" |
1068 | "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ | 1214 | "xor r3, r0 \n" |
1069 | "bt .ur_floop \n" | 1215 | "and %[rx], r0 \n" |
1070 | 1216 | "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ | |
1071 | "bra .ur_end \n" | 1217 | "shlr2 r0 \n" |
1072 | "nop \n" | 1218 | "shlr2 r0 \n" |
1073 | 1219 | "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ | |
1074 | /* References to C library routines used in the precalc block */ | 1220 | "mov r8, r0 \n" |
1075 | ".align 2 \n" | 1221 | "shll2 r0 \n" |
1076 | ".ashlsi3: \n" /* C library routine: */ | 1222 | "shll2 r0 \n" |
1077 | ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ | 1223 | "xor r4, r0 \n" |
1078 | ".lshrsi3: \n" /* C library routine: */ | 1224 | "and %[rx], r0 \n" |
1079 | ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ | 1225 | "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ |
1080 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | 1226 | "shlr2 r0 \n" |
1081 | 1227 | "shlr2 r0 \n" | |
1082 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | 1228 | "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ |
1083 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1229 | |
1084 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1230 | "mov.l .ur_mask2, %[rx] \n" /* bitmask = ...11001100 */ |
1085 | "shlr r2 \n" | 1231 | "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/ |
1086 | "rotcl r0 \n" | 1232 | "shll2 r0 \n" |
1087 | "shlr r3 \n" | 1233 | "xor r1, r0 \n" |
1088 | "rotcl r0 \n" | 1234 | "and %[rx], r0 \n" |
1089 | "shlr r6 \n" | 1235 | "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ |
1090 | "rotcl r0 \n" | 1236 | "shlr2 r0 \n" |
1091 | "shlr r7 \n" | 1237 | "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ |
1092 | "rotcl r0 \n" | 1238 | "mov r4, r0 \n" |
1093 | "shlr r8 \n" | 1239 | "shll2 r0 \n" |
1094 | "rotcl r0 \n" | 1240 | "xor r2, r0 \n" |
1095 | "shlr r9 \n" | 1241 | "and %[rx], r0 \n" |
1096 | "rotcl r0 \n" | 1242 | "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ |
1097 | "shlr r10 \n" | 1243 | "shlr2 r0 \n" |
1098 | "rotcl r0 \n" | 1244 | "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ |
1099 | "mov.b r0,@%[addr] \n" /* store byte to bitplane */ | 1245 | "mov r7, r0 \n" |
1100 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1246 | "shll2 r0 \n" |
1101 | "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ | 1247 | "xor r5, r0 \n" |
1102 | "bt .ur_sloop \n" | 1248 | "and %[rx], r0 \n" |
1103 | 1249 | "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | |
1104 | ".ur_end: \n" | 1250 | "shlr2 r0 \n" |
1105 | : /* outputs */ | 1251 | "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ |
1106 | [addr]"+r"(addr), | 1252 | "mov r8, r0 \n" |
1107 | [mask]"+r"(mask), | 1253 | "shll2 r0 \n" |
1108 | [rx] "=&r"(trash) | 1254 | "xor r6, r0 \n" |
1109 | : /* inputs */ | 1255 | "and %[rx], r0 \n" |
1110 | [psiz]"r"(_gray_info.plane_size), | 1256 | "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ |
1111 | [end] "r"(end), | 1257 | "shlr2 r0 \n" |
1112 | [patp]"[rx]"(pat_ptr) | 1258 | "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ |
1113 | : /* clobbers */ | 1259 | |
1114 | "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" | 1260 | "mov.l .ur_mask1, %[rx] \n" /* bitmask = ...10101010 */ |
1261 | "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/ | ||
1262 | "shll r0 \n" | ||
1263 | "xor r1, r0 \n" | ||
1264 | "and %[rx], r0 \n" | ||
1265 | "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
1266 | "shlr r0 \n" | ||
1267 | "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
1268 | "mov r4, r0 \n" | ||
1269 | "shll r0 \n" | ||
1270 | "xor r3, r0 \n" | ||
1271 | "and %[rx], r0 \n" | ||
1272 | "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
1273 | "shlr r0 \n" | ||
1274 | "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
1275 | "mov r6, r0 \n" | ||
1276 | "shll r0 \n" | ||
1277 | "xor r5, r0 \n" | ||
1278 | "and %[rx], r0 \n" | ||
1279 | "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
1280 | "shlr r0 \n" | ||
1281 | "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
1282 | "mov r8, r0 \n" | ||
1283 | "shll r0 \n" | ||
1284 | "xor r7, r0 \n" | ||
1285 | "and %[rx], r0 \n" | ||
1286 | "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
1287 | "shlr r0 \n" | ||
1288 | "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
1289 | |||
1290 | "tst %[mask], %[mask] \n" | ||
1291 | "bt .ur_sloop \n" /* short loop if nothing to keep */ | ||
1292 | |||
1293 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | ||
1294 | "mov #8, r0 \n" | ||
1295 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ | ||
1296 | "bt .ur_f8 \n" | ||
1297 | |||
1298 | "mulu %[psiz], %[dpth] \n" | ||
1299 | "mova .ur_ftable, r0 \n" | ||
1300 | "mov.b @(r0, %[dpth]), %[rx] \n" | ||
1301 | "add %[rx], r0 \n" | ||
1302 | "sts macl, %[rx] \n" /* point behind the last plane.. */ | ||
1303 | "jmp @r0 \n" /* jump into streak */ | ||
1304 | "add %[rx], %[addr] \n" /* ..for this round */ | ||
1305 | |||
1306 | ".align 2 \n" | ||
1307 | ".ur_ftable: \n" | ||
1308 | ".byte .ur_f0 - .ur_ftable \n" | ||
1309 | ".byte .ur_f1 - .ur_ftable \n" | ||
1310 | ".byte .ur_f2 - .ur_ftable \n" | ||
1311 | ".byte .ur_f3 - .ur_ftable \n" | ||
1312 | ".byte .ur_f4 - .ur_ftable \n" | ||
1313 | ".byte .ur_f5 - .ur_ftable \n" | ||
1314 | ".byte .ur_f6 - .ur_ftable \n" | ||
1315 | ".byte .ur_f7 - .ur_ftable \n" | ||
1316 | |||
1317 | ".ur_f8: \n" | ||
1318 | "mov %[psiz], %[rx] \n" | ||
1319 | "shll2 %[rx] \n" | ||
1320 | "add %[rx], %[rx] \n" | ||
1321 | "add %[rx], %[addr] \n" | ||
1322 | /* Point behind the last plane for this round. Note: We're using the | ||
1323 | * registers backwards in order to reuse the streak for the last round. | ||
1324 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
1325 | * the bit order would be destroyed which results in more flicker. */ | ||
1326 | "sub %[psiz], %[addr] \n" | ||
1327 | "mov.b @%[addr], r0 \n" /* load old byte */ | ||
1328 | "and %[mask], r0 \n" /* mask out replaced bits */ | ||
1329 | "or r8, r0 \n" /* set new bits */ | ||
1330 | "mov.b r0, @%[addr] \n" /* store byte */ | ||
1331 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1332 | ".ur_f7: \n" | ||
1333 | "sub %[psiz], %[addr] \n" | ||
1334 | "mov.b @%[addr], r0 \n" | ||
1335 | "and %[mask], r0 \n" | ||
1336 | "or r7, r0 \n" | ||
1337 | "mov.b r0, @%[addr] \n" | ||
1338 | "shlr8 r7 \n" | ||
1339 | ".ur_f6: \n" | ||
1340 | "sub %[psiz], %[addr] \n" | ||
1341 | "mov.b @%[addr], r0 \n" | ||
1342 | "and %[mask], r0 \n" | ||
1343 | "or r6, r0 \n" | ||
1344 | "mov.b r0, @%[addr] \n" | ||
1345 | "shlr8 r6 \n" | ||
1346 | ".ur_f5: \n" | ||
1347 | "sub %[psiz], %[addr] \n" | ||
1348 | "mov.b @%[addr], r0 \n" | ||
1349 | "and %[mask], r0 \n" | ||
1350 | "or r5, r0 \n" | ||
1351 | "mov.b r0, @%[addr] \n" | ||
1352 | "shlr8 r5 \n" | ||
1353 | ".ur_f4: \n" | ||
1354 | "sub %[psiz], %[addr] \n" | ||
1355 | "mov.b @%[addr], r0 \n" | ||
1356 | "and %[mask], r0 \n" | ||
1357 | "or r4, r0 \n" | ||
1358 | "mov.b r0, @%[addr] \n" | ||
1359 | "shlr8 r4 \n" | ||
1360 | ".ur_f3: \n" | ||
1361 | "sub %[psiz], %[addr] \n" | ||
1362 | "mov.b @%[addr], r0 \n" | ||
1363 | "and %[mask], r0 \n" | ||
1364 | "or r3, r0 \n" | ||
1365 | "mov.b r0, @%[addr] \n" | ||
1366 | "shlr8 r3 \n" | ||
1367 | ".ur_f2: \n" | ||
1368 | "sub %[psiz], %[addr] \n" | ||
1369 | "mov.b @%[addr], r0 \n" | ||
1370 | "and %[mask], r0 \n" | ||
1371 | "or r2, r0 \n" | ||
1372 | "mov.b r0, @%[addr] \n" | ||
1373 | "shlr8 r2 \n" | ||
1374 | ".ur_f1: \n" | ||
1375 | "sub %[psiz], %[addr] \n" | ||
1376 | "mov.b @%[addr], r0 \n" | ||
1377 | "and %[mask], r0 \n" | ||
1378 | "or r1, r0 \n" | ||
1379 | "mov.b r0, @%[addr] \n" | ||
1380 | "shlr8 r1 \n" | ||
1381 | ".ur_f0: \n" | ||
1382 | |||
1383 | "add %[rx], %[addr] \n" /* correct address */ | ||
1384 | "add #-8, %[dpth] \n" | ||
1385 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1386 | "bt .ur_floop \n" | ||
1387 | |||
1388 | "bra .ur_end \n" | ||
1389 | "nop \n" | ||
1390 | |||
1391 | /* References to C library routines used in the precalc block */ | ||
1392 | ".align 2 \n" | ||
1393 | ".ashlsi3: \n" /* C library routine: */ | ||
1394 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ | ||
1395 | ".lshrsi3: \n" /* C library routine: */ | ||
1396 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ | ||
1397 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | ||
1398 | |||
1399 | /* Bitmasks for the bit block rotation */ | ||
1400 | ".ur_mask4: \n" | ||
1401 | ".long 0xF0F0F0F0 \n" | ||
1402 | ".ur_mask2: \n" | ||
1403 | ".long 0xCCCCCCCC \n" | ||
1404 | ".ur_mask1: \n" | ||
1405 | ".long 0xAAAAAAAA \n" | ||
1406 | |||
1407 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | ||
1408 | "mov #8, r0 \n" | ||
1409 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ | ||
1410 | "bt .ur_s8 \n" | ||
1411 | |||
1412 | "mulu %[psiz], %[dpth] \n" | ||
1413 | "mova .ur_stable, r0 \n" | ||
1414 | "mov.b @(r0, %[dpth]), %[rx] \n" | ||
1415 | "add %[rx], r0 \n" | ||
1416 | "sts macl, %[rx] \n" /* point behind the last plane.. */ | ||
1417 | "jmp @r0 \n" /* jump into streak */ | ||
1418 | "add %[rx], %[addr] \n" /* ..for this round */ | ||
1419 | |||
1420 | ".align 2 \n" | ||
1421 | ".ur_stable: \n" | ||
1422 | ".byte .ur_s0 - .ur_stable \n" | ||
1423 | ".byte .ur_s1 - .ur_stable \n" | ||
1424 | ".byte .ur_s2 - .ur_stable \n" | ||
1425 | ".byte .ur_s3 - .ur_stable \n" | ||
1426 | ".byte .ur_s4 - .ur_stable \n" | ||
1427 | ".byte .ur_s5 - .ur_stable \n" | ||
1428 | ".byte .ur_s6 - .ur_stable \n" | ||
1429 | ".byte .ur_s7 - .ur_stable \n" | ||
1430 | |||
1431 | ".ur_s8: \n" | ||
1432 | "mov %[psiz], %[rx] \n" /* Point behind the last plane */ | ||
1433 | "shll2 %[rx] \n" /* for this round. */ | ||
1434 | "add %[rx], %[rx] \n" /* See above. */ | ||
1435 | "add %[rx], %[addr] \n" | ||
1436 | |||
1437 | "sub %[psiz], %[addr] \n" | ||
1438 | "mov.b r8, @%[addr] \n" /* store byte */ | ||
1439 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1440 | ".ur_s7: \n" | ||
1441 | "sub %[psiz], %[addr] \n" | ||
1442 | "mov.b r7, @%[addr] \n" | ||
1443 | "shlr8 r7 \n" | ||
1444 | ".ur_s6: \n" | ||
1445 | "sub %[psiz], %[addr] \n" | ||
1446 | "mov.b r6, @%[addr] \n" | ||
1447 | "shlr8 r6 \n" | ||
1448 | ".ur_s5: \n" | ||
1449 | "sub %[psiz], %[addr] \n" | ||
1450 | "mov.b r5, @%[addr] \n" | ||
1451 | "shlr8 r5 \n" | ||
1452 | ".ur_s4: \n" | ||
1453 | "sub %[psiz], %[addr] \n" | ||
1454 | "mov.b r4, @%[addr] \n" | ||
1455 | "shlr8 r4 \n" | ||
1456 | ".ur_s3: \n" | ||
1457 | "sub %[psiz], %[addr] \n" | ||
1458 | "mov.b r3, @%[addr] \n" | ||
1459 | "shlr8 r3 \n" | ||
1460 | ".ur_s2: \n" | ||
1461 | "sub %[psiz], %[addr] \n" | ||
1462 | "mov.b r2, @%[addr] \n" | ||
1463 | "shlr8 r2 \n" | ||
1464 | ".ur_s1: \n" | ||
1465 | "sub %[psiz], %[addr] \n" | ||
1466 | "mov.b r1, @%[addr] \n" | ||
1467 | "shlr8 r1 \n" | ||
1468 | ".ur_s0: \n" | ||
1469 | |||
1470 | "add %[rx], %[addr] \n" /* correct address */ | ||
1471 | "add #-8, %[dpth] \n" | ||
1472 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1473 | "bt .ur_sloop \n" | ||
1474 | |||
1475 | ".ur_end: \n" | ||
1476 | : /* outputs */ | ||
1477 | [addr]"+r"(addr), | ||
1478 | [dpth]"+r"(depth), | ||
1479 | [rx] "=&r"(trash) | ||
1480 | : /* inputs */ | ||
1481 | [mask]"r"(mask), | ||
1482 | [psiz]"r"(_gray_info.plane_size), | ||
1483 | [patp]"[rx]"(pat_ptr) | ||
1484 | : /* clobbers */ | ||
1485 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl" | ||
1115 | ); | 1486 | ); |
1116 | } | 1487 | } |
1117 | #elif defined(CPU_COLDFIRE) | 1488 | #elif defined(CPU_COLDFIRE) |
1118 | asm volatile ( | 1489 | asm volatile ( |
1119 | "move.l (%[cbuf]),%%d0 \n" | 1490 | "move.l (%[cbuf]), %%d0 \n" |
1120 | "move.l (%[bbuf]),%%d1 \n" | 1491 | "move.l (%[bbuf]), %%d1 \n" |
1121 | "eor.l %%d0,%%d1 \n" | 1492 | "eor.l %%d0, %%d1 \n" |
1122 | "move.l (4,%[cbuf]),%%d0 \n" | 1493 | "move.l (4,%[cbuf]), %%d0 \n" |
1123 | "move.l (4,%[bbuf]),%[chg] \n" | 1494 | "move.l (4,%[bbuf]), %[chg] \n" |
1124 | "eor.l %%d0,%[chg] \n" | 1495 | "eor.l %%d0, %[chg] \n" |
1125 | "or.l %%d1,%[chg] \n" | 1496 | "or.l %%d1, %[chg] \n" |
1126 | : /* outputs */ | 1497 | : /* outputs */ |
1127 | [chg] "=&d"(change) | 1498 | [chg] "=&d"(change) |
1128 | : /* inputs */ | 1499 | : /* inputs */ |
@@ -1134,160 +1505,359 @@ void gray_update_rect(int x, int y, int width, int height) | |||
1134 | 1505 | ||
1135 | if (change != 0) | 1506 | if (change != 0) |
1136 | { | 1507 | { |
1137 | unsigned char *addr, *end; | 1508 | unsigned char *addr; |
1138 | unsigned mask, trash; | 1509 | unsigned mask, depth, trash; |
1139 | 1510 | ||
1140 | pat_ptr = &pat_stack[8]; | 1511 | pat_ptr = &pat_stack[8]; |
1141 | 1512 | ||
1142 | /* precalculate the bit patterns with random shifts | 1513 | /* precalculate the bit patterns with random shifts |
1143 | * for all 8 pixels and put them on an extra "stack" */ | 1514 | * for all 8 pixels and put them on an extra "stack" */ |
1144 | asm volatile ( | 1515 | asm volatile |
1145 | "moveq.l #8,%%d3 \n" /* loop count */ | 1516 | ( |
1146 | "clr.l %[mask] \n" | 1517 | "moveq.l #8, %%d3 \n" /* loop count */ |
1147 | 1518 | "clr.l %[mask] \n" | |
1148 | ".ur_pre_loop: \n" | 1519 | |
1149 | "clr.l %%d0 \n" | 1520 | ".ur_pre_loop: \n" |
1150 | "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ | 1521 | "clr.l %%d0 \n" |
1151 | "clr.l %%d1 \n" | 1522 | "move.b (%[cbuf])+, %%d0 \n" /* read current buffer */ |
1152 | "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ | 1523 | "clr.l %%d1 \n" |
1153 | "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ | 1524 | "move.b (%[bbuf]), %%d1 \n" /* read back buffer */ |
1154 | "clr.l %%d2 \n" /* preset for skipped pixel */ | 1525 | "move.b %%d0, (%[bbuf])+ \n" /* update back buffer */ |
1155 | "cmp.l %%d0,%%d1 \n" /* no change? */ | 1526 | "clr.l %%d2 \n" /* preset for skipped pixel */ |
1156 | "beq.b .ur_skip \n" /* -> skip */ | 1527 | "cmp.l %%d0, %%d1 \n" /* no change? */ |
1157 | 1528 | "beq.b .ur_skip \n" /* -> skip */ | |
1158 | "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ | 1529 | |
1159 | 1530 | "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */ | |
1160 | "mulu.w #75,%[rnd] \n" /* multiply by 75 */ | 1531 | |
1161 | "add.l #74,%[rnd] \n" /* add another 74 */ | 1532 | "mulu.w #75, %[rnd] \n" /* multiply by 75 */ |
1162 | /* Since the lower bits are not very random: */ | 1533 | "add.l #74, %[rnd] \n" /* add another 74 */ |
1163 | "move.l %[rnd],%%d1 \n" | 1534 | /* Since the lower bits are not very random: */ |
1164 | "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ | 1535 | "move.l %[rnd], %%d1 \n" |
1165 | "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ | 1536 | "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */ |
1166 | 1537 | "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */ | |
1167 | "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ | 1538 | |
1168 | "blo.b .ur_ntrim \n" | 1539 | "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */ |
1169 | "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ | 1540 | "blo.b .ur_ntrim \n" |
1170 | ".ur_ntrim: \n" | 1541 | "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */ |
1171 | 1542 | ".ur_ntrim: \n" | |
1172 | "move.l %%d2,%%d0 \n" /** rotate pattern **/ | 1543 | |
1173 | "lsl.l %%d1,%%d0 \n" | 1544 | "move.l %%d2, %%d0 \n" /** rotate pattern **/ |
1174 | "sub.l %[dpth],%%d1 \n" | 1545 | "lsl.l %%d1, %%d0 \n" |
1175 | "neg.l %%d1 \n" /* d1 = depth - d1 */ | 1546 | "sub.l %[dpth], %%d1 \n" |
1176 | "lsr.l %%d1,%%d2 \n" | 1547 | "neg.l %%d1 \n" /* d1 = depth - d1 */ |
1177 | "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ | 1548 | "lsr.l %%d1, %%d2 \n" |
1178 | 1549 | "or.l %%d0, %%d2 \n" /* rotated_pattern = d2 | d0 */ | |
1179 | "or.l #0x0100,%[mask] \n" /* set mask bit */ | 1550 | |
1180 | 1551 | "or.l #0x0100, %[mask] \n" /* set mask bit */ | |
1181 | ".ur_skip: \n" | 1552 | |
1182 | "lsr.l #1,%[mask] \n" /* shift mask */ | 1553 | ".ur_skip: \n" |
1183 | "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ | 1554 | "lsr.l #1, %[mask] \n" /* shift mask */ |
1184 | 1555 | "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */ | |
1185 | "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ | 1556 | |
1186 | "bne.b .ur_pre_loop \n" | 1557 | "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */ |
1187 | : /* outputs */ | 1558 | "bne.b .ur_pre_loop \n" |
1188 | [cbuf]"+a"(cbuf), | 1559 | : /* outputs */ |
1189 | [bbuf]"+a"(bbuf), | 1560 | [cbuf]"+a"(cbuf), |
1190 | [patp]"+a"(pat_ptr), | 1561 | [bbuf]"+a"(bbuf), |
1191 | [rnd] "+d"(_gray_random_buffer), | 1562 | [patp]"+a"(pat_ptr), |
1192 | [mask]"=&d"(mask) | 1563 | [rnd] "+d"(_gray_random_buffer), |
1193 | : /* inputs */ | 1564 | [mask]"=&d"(mask) |
1194 | [bpat]"a"(_gray_info.bitpattern), | 1565 | : /* inputs */ |
1195 | [dpth]"d"(_gray_info.depth), | 1566 | [bpat]"a"(_gray_info.bitpattern), |
1196 | [rmsk]"d"(_gray_info.randmask) | 1567 | [dpth]"d"(_gray_info.depth), |
1197 | : /* clobbers */ | 1568 | [rmsk]"d"(_gray_info.randmask) |
1198 | "d0", "d1", "d2", "d3" | 1569 | : /* clobbers */ |
1570 | "d0", "d1", "d2", "d3" | ||
1199 | ); | 1571 | ); |
1200 | 1572 | ||
1201 | addr = dst_row; | 1573 | addr = dst_row; |
1202 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 1574 | mask = ~mask & 0xff; |
1575 | depth = _gray_info.depth; | ||
1203 | 1576 | ||
1204 | /* set the bits for all 8 pixels in all bytes according to the | 1577 | /* set the bits for all 8 pixels in all bytes according to the |
1205 | * precalculated patterns on the pattern stack */ | 1578 | * precalculated patterns on the pattern stack */ |
1206 | asm volatile ( | 1579 | asm volatile |
1207 | "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" | 1580 | ( |
1208 | /* pop all 8 patterns */ | 1581 | "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */ |
1209 | "not.l %[mask] \n" /* "set" mask -> "keep" mask */ | 1582 | /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */ |
1210 | "and.l #0xFF,%[mask] \n" | 1583 | |
1211 | "beq.b .ur_sstart \n" /* short loop if nothing to keep */ | 1584 | /** Rotate the four 8x8 bit "blocks" within d1..d7/a0 **/ |
1212 | 1585 | ||
1213 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | 1586 | "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/ |
1214 | "clr.l %%d0 \n" | 1587 | "lsl.l #4, %%d0 \n" |
1215 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1588 | /* move.l %[ax], %%d5 */ /* already in d5 */ |
1216 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1589 | "eor.l %%d5, %%d0 \n" |
1217 | "lsr.l #1,%%d3 \n" | 1590 | "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */ |
1218 | "addx.l %%d0,%%d0 \n" | 1591 | "eor.l %%d0, %%d5 \n" |
1219 | "lsr.l #1,%%d4 \n" | 1592 | "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */ |
1220 | "addx.l %%d0,%%d0 \n" | 1593 | "lsr.l #4, %%d0 \n" |
1221 | "lsr.l #1,%%d5 \n" | 1594 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */ |
1222 | "addx.l %%d0,%%d0 \n" | 1595 | "move.l %%d2, %%d0 \n" |
1223 | "lsr.l #1,%%d6 \n" | 1596 | "lsl.l #4, %%d0 \n" |
1224 | "addx.l %%d0,%%d0 \n" | 1597 | "eor.l %%d6, %%d0 \n" |
1225 | "move.l %%a0,%%d1 \n" | 1598 | "and.l #0xF0F0F0F0, %%d0 \n" |
1226 | "lsr.l #1,%%d1 \n" | 1599 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */ |
1227 | "addx.l %%d0,%%d0 \n" | 1600 | "lsr.l #4, %%d0 \n" |
1228 | "move.l %%d1,%%a0 \n" | 1601 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */ |
1229 | "move.l %%a1,%%d1 \n" | 1602 | "move.l %%d3, %%d0 \n" |
1230 | "lsr.l #1,%%d1 \n" | 1603 | "lsl.l #4, %%d0 \n" |
1231 | "addx.l %%d0,%%d0 \n" | 1604 | "eor.l %%d7, %%d0 \n" |
1232 | "move.l %%d1,%%a1 \n" | 1605 | "and.l #0xF0F0F0F0, %%d0 \n" |
1233 | "move.l %[ax],%%d1 \n" | 1606 | "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */ |
1234 | "lsr.l #1,%%d1 \n" | 1607 | "lsr.l #4, %%d0 \n" |
1235 | "addx.l %%d0,%%d0 \n" | 1608 | "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4b7b6b5b4 */ |
1236 | "move.l %%d1,%[ax] \n" | 1609 | "move.l %%d4, %%d0 \n" |
1237 | 1610 | "lsl.l #4, %%d0 \n" | |
1238 | "move.b (%[addr]),%%d1 \n" /* read old value */ | 1611 | "move.l %%a0, %%d5 \n" |
1239 | "and.l %[mask],%%d1 \n" /* mask out replaced bits */ | 1612 | "eor.l %%d5, %%d0 \n" |
1240 | "or.l %%d0,%%d1 \n" /* set new bits */ | 1613 | "and.l #0xF0F0F0F0, %%d0 \n" |
1241 | "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ | 1614 | "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */ |
1242 | 1615 | /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */ | |
1243 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1616 | "lsr.l #4, %%d0 \n" |
1244 | "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ | 1617 | "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */ |
1245 | "bhi.b .ur_floop \n" | 1618 | |
1246 | 1619 | "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/ | |
1247 | "bra.b .ur_end \n" | 1620 | "lsl.l #2, %%d0 \n" |
1248 | 1621 | /* move.l %%a0, %%d5 */ /* still in d5 */ | |
1249 | ".ur_sstart: \n" | 1622 | "eor.l %%d5, %%d0 \n" |
1250 | "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ | 1623 | "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */ |
1251 | 1624 | "eor.l %%d0, %%d5 \n" | |
1252 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | 1625 | "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */ |
1253 | "clr.l %%d0 \n" | 1626 | "lsr.l #2, %%d0 \n" |
1254 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1627 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */ |
1255 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1628 | "move.l %[ax], %%d5 \n" |
1256 | "lsr.l #1,%%d3 \n" | 1629 | "move.l %%d5, %%d0 \n" |
1257 | "addx.l %%d0,%%d0 \n" | 1630 | "lsl.l #2, %%d0 \n" |
1258 | "lsr.l #1,%%d4 \n" | 1631 | "eor.l %%d7, %%d0 \n" |
1259 | "addx.l %%d0,%%d0 \n" | 1632 | "and.l #0xCCCCCCCC, %%d0 \n" |
1260 | "lsr.l #1,%%d5 \n" | 1633 | "eor.l %%d0, %%d7 \n" /* d7 = ...h1h0f1f0d1d0b1b0 */ |
1261 | "addx.l %%d0,%%d0 \n" | 1634 | "lsr.l #2, %%d0 \n" |
1262 | "lsr.l #1,%%d6 \n" | 1635 | "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */ |
1263 | "addx.l %%d0,%%d0 \n" | 1636 | /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */ |
1264 | "lsr.l #1,%[mask] \n" | 1637 | "move.l %%d2, %%d0 \n" |
1265 | "addx.l %%d0,%%d0 \n" | 1638 | "lsl.l #2, %%d0 \n" |
1266 | "move.l %%a1,%%d1 \n" | 1639 | "eor.l %%d4, %%d0 \n" |
1267 | "lsr.l #1,%%d1 \n" | 1640 | "and.l #0xCCCCCCCC, %%d0 \n" |
1268 | "addx.l %%d0,%%d0 \n" | 1641 | "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */ |
1269 | "move.l %%d1,%%a1 \n" | 1642 | "lsr.l #2, %%d0 \n" |
1270 | "move.l %[ax],%%d1 \n" | 1643 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */ |
1271 | "lsr.l #1,%%d1 \n" | 1644 | "move.l %%d1, %%d0 \n" |
1272 | "addx.l %%d0,%%d0 \n" | 1645 | "lsl.l #2, %%d0 \n" |
1273 | "move.l %%d1,%[ax] \n" | 1646 | "eor.l %%d3, %%d0 \n" |
1274 | 1647 | "and.l #0xCCCCCCCC, %%d0 \n" | |
1275 | "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ | 1648 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */ |
1276 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1649 | "lsr.l #2, %%d0 \n" |
1277 | "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ | 1650 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */ |
1278 | "bhi.b .ur_sloop \n" | 1651 | |
1279 | 1652 | "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/ | |
1280 | ".ur_end: \n" | 1653 | "lsl.l #1, %%d0 \n" |
1281 | : /* outputs */ | 1654 | "eor.l %%d2, %%d0 \n" |
1282 | [addr]"+a"(addr), | 1655 | "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */ |
1283 | [mask]"+d"(mask), | 1656 | "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */ |
1284 | [ax] "=&a"(trash) | 1657 | "lsr.l #1, %%d0 \n" |
1285 | : /* inputs */ | 1658 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */ |
1286 | [psiz]"a"(_gray_info.plane_size), | 1659 | "move.l %%d3, %%d0 \n" |
1287 | [end] "a"(end), | 1660 | "lsl.l #1, %%d0 \n" |
1288 | [patp]"[ax]"(pat_ptr) | 1661 | "eor.l %%d4, %%d0 \n" |
1289 | : /* clobbers */ | 1662 | "and.l #0xAAAAAAAA, %%d0 \n" |
1290 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" | 1663 | "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */ |
1664 | "lsr.l #1, %%d0 \n" | ||
1665 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */ | ||
1666 | /* move.l %[ax], %%d5 */ /* still in d5 */ | ||
1667 | "move.l %%d5, %%d0 \n" | ||
1668 | "lsl.l #1, %%d0 \n" | ||
1669 | "eor.l %%d6, %%d0 \n" | ||
1670 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1671 | "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */ | ||
1672 | "lsr.l #1, %%d0 \n" | ||
1673 | "eor.l %%d0, %%d5 \n" | ||
1674 | "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */ | ||
1675 | "move.l %%d7, %%d0 \n" | ||
1676 | "lsl.l #1, %%d0 \n" | ||
1677 | "move.l %%a0, %%d5 \n" | ||
1678 | "eor.l %%d5, %%d0 \n" | ||
1679 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1680 | "eor.l %%d0, %%d5 \n" | ||
1681 | "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ | ||
1682 | "lsr.l #1, %%d0 \n" | ||
1683 | "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ | ||
1684 | |||
1685 | "tst.l %[mask] \n" | ||
1686 | "jeq .ur_sloop \n" /* short loop if nothing to keep */ | ||
1687 | |||
1688 | "move.l %[mask], %%d5 \n" /* need mask in data reg. */ | ||
1689 | "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ | ||
1690 | |||
1691 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | ||
1692 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1693 | "bhs.s .ur_f8 \n" | ||
1694 | |||
1695 | "move.l %[psiz], %%d0 \n" | ||
1696 | "move.l %[dpth], %%d1 \n" | ||
1697 | "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ | ||
1698 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1699 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1700 | "bra.s .ur_f1 \n" /* dpth == 0 should never happen */ | ||
1701 | "bra.s .ur_f2 \n" | ||
1702 | "bra.s .ur_f3 \n" | ||
1703 | "bra.s .ur_f4 \n" | ||
1704 | "bra.s .ur_f5 \n" | ||
1705 | "bra.s .ur_f6 \n" | ||
1706 | "bra.s .ur_f7 \n" | ||
1707 | |||
1708 | ".ur_f8: \n" | ||
1709 | "move.l %[psiz], %%d0 \n" | ||
1710 | "lsl.l #3, %%d0 \n" | ||
1711 | "add.l %%d0, %[addr] \n" | ||
1712 | /* Point behind the last plane for this round. Note: We're using the | ||
1713 | * registers backwards in order to reuse the streak for the last round. | ||
1714 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
1715 | * the bit order would be destroyed which results in more flicker. */ | ||
1716 | "sub.l %[psiz], %[addr] \n" | ||
1717 | "move.b (%[addr]), %%d0 \n" /* load old byte */ | ||
1718 | "and.l %%d5, %%d0 \n" /* mask out replaced bits */ | ||
1719 | "move.l %[mask], %%d1 \n" | ||
1720 | "or.l %%d1, %%d0 \n" /* set new bits */ | ||
1721 | "move.b %%d0, (%[addr]) \n" /* store byte */ | ||
1722 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1723 | "move.l %%d1, %[mask] \n" | ||
1724 | ".ur_f7: \n" | ||
1725 | "sub.l %[psiz], %[addr] \n" | ||
1726 | "move.b (%[addr]), %%d0 \n" | ||
1727 | "and.l %%d5, %%d0 \n" | ||
1728 | "or.l %%d2, %%d0 \n" | ||
1729 | "move.b %%d0, (%[addr]) \n" | ||
1730 | "lsr.l #8, %%d2 \n" | ||
1731 | ".ur_f6: \n" | ||
1732 | "sub.l %[psiz], %[addr] \n" | ||
1733 | "move.b (%[addr]), %%d0 \n" | ||
1734 | "and.l %%d5, %%d0 \n" | ||
1735 | "or.l %%d3, %%d0 \n" | ||
1736 | "move.b %%d0, (%[addr]) \n" | ||
1737 | "lsr.l #8, %%d3 \n" | ||
1738 | ".ur_f5: \n" | ||
1739 | "sub.l %[psiz], %[addr] \n" | ||
1740 | "move.b (%[addr]), %%d0 \n" | ||
1741 | "and.l %%d5, %%d0 \n" | ||
1742 | "or.l %%d4, %%d0 \n" | ||
1743 | "move.b %%d0, (%[addr]) \n" | ||
1744 | "lsr.l #8, %%d4 \n" | ||
1745 | ".ur_f4: \n" | ||
1746 | "sub.l %[psiz], %[addr] \n" | ||
1747 | "move.b (%[addr]), %%d0 \n" | ||
1748 | "and.l %%d5, %%d0 \n" | ||
1749 | "move.l %[ax], %%d1 \n" | ||
1750 | "or.l %%d1, %%d0 \n" | ||
1751 | "move.b %%d0, (%[addr]) \n" | ||
1752 | "lsr.l #8, %%d1 \n" | ||
1753 | "move.l %%d1, %[ax] \n" | ||
1754 | ".ur_f3: \n" | ||
1755 | "sub.l %[psiz], %[addr] \n" | ||
1756 | "move.b (%[addr]), %%d0 \n" | ||
1757 | "and.l %%d5, %%d0 \n" | ||
1758 | "or.l %%d6, %%d0 \n" | ||
1759 | "move.b %%d0, (%[addr]) \n" | ||
1760 | "lsr.l #8, %%d6 \n" | ||
1761 | ".ur_f2: \n" | ||
1762 | "sub.l %[psiz], %[addr] \n" | ||
1763 | "move.b (%[addr]), %%d0 \n" | ||
1764 | "and.l %%d5, %%d0 \n" | ||
1765 | "or.l %%d7, %%d0 \n" | ||
1766 | "move.b %%d0, (%[addr]) \n" | ||
1767 | "lsr.l #8, %%d7 \n" | ||
1768 | ".ur_f1: \n" | ||
1769 | "sub.l %[psiz], %[addr] \n" | ||
1770 | "move.b (%[addr]), %%d0 \n" | ||
1771 | "and.l %%d5, %%d0 \n" | ||
1772 | "move.l %%a0, %%d1 \n" | ||
1773 | "or.l %%d1, %%d0 \n" | ||
1774 | "move.b %%d0, (%[addr]) \n" | ||
1775 | "lsr.l #8, %%d1 \n" | ||
1776 | "move.l %%d1, %%a0 \n" | ||
1777 | |||
1778 | "move.l %[psiz], %%d0 \n" | ||
1779 | "lsl.l #3, %%d0 \n" | ||
1780 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
1781 | "subq.l #8, %[dpth] \n" | ||
1782 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
1783 | "jgt .ur_floop \n" /* next round if anything left */ | ||
1784 | |||
1785 | "jra .ur_end \n" | ||
1786 | |||
1787 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | ||
1788 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1789 | "bhs.s .ur_s8 \n" | ||
1790 | |||
1791 | "move.l %[psiz], %%d0 \n" | ||
1792 | "move.l %[dpth], %%d5 \n" | ||
1793 | "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ | ||
1794 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1795 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1796 | "bra.s .ur_s1 \n" /* dpth == 0 should never happen */ | ||
1797 | "bra.s .ur_s2 \n" | ||
1798 | "bra.s .ur_s3 \n" | ||
1799 | "bra.s .ur_s4 \n" | ||
1800 | "bra.s .ur_s5 \n" | ||
1801 | "bra.s .ur_s6 \n" | ||
1802 | "bra.s .ur_s7 \n" | ||
1803 | |||
1804 | ".ur_s8: \n" | ||
1805 | "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ | ||
1806 | "lsl.l #3, %%d0 \n" /* for this round. */ | ||
1807 | "add.l %%d0, %[addr] \n" /* See above. */ | ||
1808 | |||
1809 | "sub.l %[psiz], %[addr] \n" | ||
1810 | "move.b %%d1, (%[addr]) \n" /* store byte */ | ||
1811 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1812 | ".ur_s7: \n" | ||
1813 | "sub.l %[psiz], %[addr] \n" | ||
1814 | "move.b %%d2, (%[addr]) \n" | ||
1815 | "lsr.l #8, %%d2 \n" | ||
1816 | ".ur_s6: \n" | ||
1817 | "sub.l %[psiz], %[addr] \n" | ||
1818 | "move.b %%d3, (%[addr]) \n" | ||
1819 | "lsr.l #8, %%d3 \n" | ||
1820 | ".ur_s5: \n" | ||
1821 | "sub.l %[psiz], %[addr] \n" | ||
1822 | "move.b %%d4, (%[addr]) \n" | ||
1823 | "lsr.l #8, %%d4 \n" | ||
1824 | ".ur_s4: \n" | ||
1825 | "sub.l %[psiz], %[addr] \n" | ||
1826 | "move.l %[ax], %%d5 \n" | ||
1827 | "move.b %%d5, (%[addr]) \n" | ||
1828 | "lsr.l #8, %%d5 \n" | ||
1829 | "move.l %%d5, %[ax] \n" | ||
1830 | ".ur_s3: \n" | ||
1831 | "sub.l %[psiz], %[addr] \n" | ||
1832 | "move.b %%d6, (%[addr]) \n" | ||
1833 | "lsr.l #8, %%d6 \n" | ||
1834 | ".ur_s2: \n" | ||
1835 | "sub.l %[psiz], %[addr] \n" | ||
1836 | "move.b %%d7, (%[addr]) \n" | ||
1837 | "lsr.l #8, %%d7 \n" | ||
1838 | ".ur_s1: \n" | ||
1839 | "sub.l %[psiz], %[addr] \n" | ||
1840 | "move.l %%a0, %%d5 \n" | ||
1841 | "move.b %%d5, (%[addr]) \n" | ||
1842 | "lsr.l #8, %%d5 \n" | ||
1843 | "move.l %%d5, %%a0 \n" | ||
1844 | |||
1845 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
1846 | "subq.l #8, %[dpth] \n" | ||
1847 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
1848 | "jgt .ur_sloop \n" /* next round if anything left */ | ||
1849 | |||
1850 | ".ur_end: \n" | ||
1851 | : /* outputs */ | ||
1852 | [addr]"+a"(addr), | ||
1853 | [dpth]"+a"(depth), | ||
1854 | [mask]"+a"(mask), | ||
1855 | [ax] "=&a"(trash) | ||
1856 | : /* inputs */ | ||
1857 | [psiz]"a"(_gray_info.plane_size), | ||
1858 | [patp]"[ax]"(pat_ptr) | ||
1859 | : /* clobbers */ | ||
1860 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0" | ||
1291 | ); | 1861 | ); |
1292 | } | 1862 | } |
1293 | #else /* C version, for reference*/ | 1863 | #else /* C version, for reference*/ |
@@ -1680,4 +2250,3 @@ static void gray_screendump_hook(int fd) | |||
1680 | } | 2250 | } |
1681 | 2251 | ||
1682 | #endif /* HAVE_LCD_BITMAP */ | 2252 | #endif /* HAVE_LCD_BITMAP */ |
1683 | |||