Diffstat (limited to 'apps')
-rw-r--r--  apps/plugins/lib/gray_core.c  1417
-rw-r--r--  apps/plugins/lib/gray_draw.c  1156
2 files changed, 1858 insertions, 715 deletions
diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c
index e65a7f259e..809e88dba1 100644
--- a/apps/plugins/lib/gray_core.c
+++ b/apps/plugins/lib/gray_core.c
@@ -649,7 +649,8 @@ void gray_update_rect(int x, int y, int width, int height)
649 | bbuf = _gray_info.back_buffer + srcofs_row; | 649 | bbuf = _gray_info.back_buffer + srcofs_row; |
650 | 650 | ||
651 | #ifdef CPU_ARM | 651 | #ifdef CPU_ARM |
652 | asm volatile ( | 652 | asm volatile |
653 | ( | ||
653 | "ldr r0, [%[cbuf]] \n" | 654 | "ldr r0, [%[cbuf]] \n" |
654 | "ldr r1, [%[bbuf]] \n" | 655 | "ldr r1, [%[bbuf]] \n" |
655 | "eor r1, r0, r1 \n" | 656 | "eor r1, r0, r1 \n" |
@@ -668,137 +669,281 @@ void gray_update_rect(int x, int y, int width, int height)
668 | 669 | ||
669 | if (change != 0) | 670 | if (change != 0) |
670 | { | 671 | { |
671 | unsigned char *addr, *end; | 672 | unsigned char *addr; |
672 | unsigned mask, trash; | 673 | unsigned mask, depth, trash; |
673 | 674 | ||
674 | pat_ptr = &pat_stack[8]; | 675 | pat_ptr = &pat_stack[8]; |
675 | 676 | ||
676 | /* precalculate the bit patterns with random shifts | 677 | /* precalculate the bit patterns with random shifts |
677 | * for all 8 pixels and put them on an extra "stack" */ | 678 | * for all 8 pixels and put them on an extra "stack" */ |
678 | asm volatile ( | 679 | asm volatile |
679 | "mov r3, #8 \n" /* loop count */ | 680 | ( |
680 | "mov %[mask], #0 \n" | 681 | "mov r3, #8 \n" /* loop count */ |
681 | 682 | "mov %[mask], #0 \n" | |
682 | ".ur_pre_loop: \n" | 683 | |
683 | "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ | 684 | ".ur_pre_loop: \n" |
684 | "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ | 685 | "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ |
685 | "ldrb r1, [%[bbuf]] \n" /* read back buffer */ | 686 | "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ |
686 | "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ | 687 | "ldrb r1, [%[bbuf]] \n" /* read back buffer */ |
687 | "mov r2, #0 \n" /* preset for skipped pixel */ | 688 | "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ |
688 | "cmp r0, r1 \n" /* no change? */ | 689 | "mov r2, #0 \n" /* preset for skipped pixel */ |
689 | "beq .ur_skip \n" /* -> skip */ | 690 | "cmp r0, r1 \n" /* no change? */ |
690 | 691 | "beq .ur_skip \n" /* -> skip */ | |
691 | "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ | 692 | |
692 | 693 | "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ | |
693 | "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ | 694 | |
694 | "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" | 695 | "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ |
695 | "add %[rnd], %[rnd], #74 \n" /* add another 74 */ | 696 | "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" |
696 | /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ | 697 | "add %[rnd], %[rnd], #74 \n" /* add another 74 */ |
697 | "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ | 698 | /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ |
698 | 699 | "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ | |
699 | "cmp r1, %[dpth] \n" /* random >= depth ? */ | 700 | |
700 | "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ | 701 | "cmp r1, %[dpth] \n" /* random >= depth ? */ |
701 | 702 | "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ | |
702 | "mov r0, r2, lsl r1 \n" /** rotate pattern **/ | 703 | |
703 | "sub r1, %[dpth], r1 \n" | 704 | "mov r0, r2, lsl r1 \n" /** rotate pattern **/ |
704 | "orr r2, r0, r2, lsr r1 \n" | 705 | "sub r1, %[dpth], r1 \n" |
705 | 706 | "orr r2, r0, r2, lsr r1 \n" | |
706 | "orr %[mask], %[mask], #1 \n" /* set mask bit */ | 707 | |
708 | "orr %[mask], %[mask], #1 \n" /* set mask bit */ | ||
707 | 709 | ||
708 | ".ur_skip: \n" | 710 | ".ur_skip: \n" |
709 | "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ | 711 | "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ |
710 | 712 | ||
711 | "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ | 713 | "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ |
712 | "bne .ur_pre_loop \n" | 714 | "bne .ur_pre_loop \n" |
713 | : /* outputs */ | 715 | : /* outputs */ |
714 | [cbuf]"+r"(cbuf), | 716 | [cbuf]"+r"(cbuf), |
715 | [bbuf]"+r"(bbuf), | 717 | [bbuf]"+r"(bbuf), |
716 | [patp]"+r"(pat_ptr), | 718 | [patp]"+r"(pat_ptr), |
717 | [rnd] "+r"(_gray_random_buffer), | 719 | [rnd] "+r"(_gray_random_buffer), |
718 | [mask]"=&r"(mask) | 720 | [mask]"=&r"(mask) |
719 | : /* inputs */ | 721 | : /* inputs */ |
720 | [bpat]"r"(_gray_info.bitpattern), | 722 | [bpat]"r"(_gray_info.bitpattern), |
721 | [dpth]"r"(_gray_info.depth), | 723 | [dpth]"r"(_gray_info.depth), |
722 | [rmsk]"r"(_gray_info.randmask) | 724 | [rmsk]"r"(_gray_info.randmask) |
723 | : /* clobbers */ | 725 | : /* clobbers */ |
724 | "r0", "r1", "r2", "r3" | 726 | "r0", "r1", "r2", "r3" |
725 | ); | 727 | ); |
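What this precalc block computes, as a hedged C sketch (it mirrors the C reference version further down in the file; names follow the asm operands): each changed pixel's bit pattern is fetched, rotated by a pseudo-random amount so the bitplane phases decorrelate, and pushed on the pattern stack, while 'mask' collects one "replace this pixel" bit per pixel:

    unsigned mask = 0;
    for (int i = 0; i < 8; i++)
    {
        unsigned long pat = 0;
        unsigned char cur = *cbuf++;
        mask <<= 1;
        if (cur != *bbuf)                     /* pixel changed? */
        {
            unsigned rnd;
            pat = _gray_info.bitpattern[cur];
            _gray_random_buffer = 75 * _gray_random_buffer + 74;  /* LCG step */
            rnd = (_gray_random_buffer >> 8) & _gray_info.randmask;
            if (rnd >= _gray_info.depth)      /* cheap rnd %= depth */
                rnd -= _gray_info.depth;
            /* rotate the depth-bit pattern by rnd */
            pat = (pat << rnd) | (pat >> (_gray_info.depth - rnd));
            mask |= 1;                        /* mark pixel for replacement */
        }
        *bbuf++ = cur;                        /* update back buffer */
        *--pat_ptr = pat;                     /* push on pattern stack */
    }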
726 | 728 | ||
727 | addr = dst_row; | 729 | addr = dst_row; |
728 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 730 | depth = _gray_info.depth; |
729 | 731 | ||
730 | /* set the bits for all 8 pixels in all bytes according to the | 732 | /* set the bits for all 8 pixels in all bytes according to the |
731 | * precalculated patterns on the pattern stack */ | 733 | * precalculated patterns on the pattern stack */ |
732 | asm volatile ( | 734 | asm volatile |
733 | "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ | 735 | ( |
736 | "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */ | ||
737 | |||
738 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ | ||
739 | |||
740 | "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/ | ||
741 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
742 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */ | ||
743 | "eor r0, r1, r5, lsl #4 \n" | ||
744 | "and r0, r0, %[rx] \n" | ||
745 | "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ | ||
746 | "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ | ||
747 | "eor r0, r2, r6, lsl #4 \n" | ||
748 | "and r0, r0, %[rx] \n" | ||
749 | "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ | ||
750 | "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ | ||
751 | "eor r0, r3, r7, lsl #4 \n" | ||
752 | "and r0, r0, %[rx] \n" | ||
753 | "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ | ||
754 | "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ | ||
755 | "eor r0, r4, r8, lsl #4 \n" | ||
756 | "and r0, r0, %[rx] \n" | ||
757 | "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ | ||
758 | "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ | ||
759 | |||
760 | "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/ | ||
761 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
762 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */ | ||
763 | "eor r0, r1, r3, lsl #2 \n" | ||
764 | "and r0, r0, %[rx] \n" | ||
765 | "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ | ||
766 | "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ | ||
767 | "eor r0, r2, r4, lsl #2 \n" | ||
768 | "and r0, r0, %[rx] \n" | ||
769 | "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ | ||
770 | "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ | ||
771 | "eor r0, r5, r7, lsl #2 \n" | ||
772 | "and r0, r0, %[rx] \n" | ||
773 | "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | ||
774 | "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ | ||
775 | "eor r0, r6, r8, lsl #2 \n" | ||
776 | "and r0, r0, %[rx] \n" | ||
777 | "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ | ||
778 | "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ | ||
779 | |||
780 | "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/ | ||
781 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
782 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */ | ||
783 | "eor r0, r1, r2, lsl #1 \n" | ||
784 | "and r0, r0, %[rx] \n" | ||
785 | "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
786 | "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
787 | "eor r0, r3, r4, lsl #1 \n" | ||
788 | "and r0, r0, %[rx] \n" | ||
789 | "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
790 | "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
791 | "eor r0, r5, r6, lsl #1 \n" | ||
792 | "and r0, r0, %[rx] \n" | ||
793 | "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
794 | "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
795 | "eor r0, r7, r8, lsl #1 \n" | ||
796 | "and r0, r0, %[rx] \n" | ||
797 | "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
798 | "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
799 | |||
800 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
801 | "ands %[mask], %[mask], #0xff \n" | ||
802 | "beq .ur_sloop \n" /* short loop if no bits to keep */ | ||
803 | |||
804 | ".ur_floop: \n" /** full loop (bits to keep)**/ | ||
805 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ | ||
806 | "bhs .ur_f8 \n" | ||
807 | |||
808 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ | ||
809 | "add %[addr], %[addr], r0 \n" /* for this round */ | ||
810 | |||
811 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ | ||
812 | "add pc, pc, r0 \n" | ||
813 | ".ur_ftable: \n" | ||
814 | ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */ | ||
815 | ".byte .ur_f1 - .ur_ftable - 4 \n" | ||
816 | ".byte .ur_f2 - .ur_ftable - 4 \n" | ||
817 | ".byte .ur_f3 - .ur_ftable - 4 \n" | ||
818 | ".byte .ur_f4 - .ur_ftable - 4 \n" | ||
819 | ".byte .ur_f5 - .ur_ftable - 4 \n" | ||
820 | ".byte .ur_f6 - .ur_ftable - 4 \n" | ||
821 | ".byte .ur_f7 - .ur_ftable - 4 \n" | ||
822 | |||
823 | ".ur_f8: \n" | ||
824 | "add %[addr], %[addr], %[psiz], lsl #3 \n" | ||
825 | /* Point behind the last plane for this round. Note: We're using the | ||
826 | * registers backwards in order to reuse the streak for the last round. | ||
827 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
828 | * the bit order would be destroyed which results in more flicker. */ | ||
829 | "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ | ||
830 | "and r0, r0, %[mask] \n" /* mask out replaced bits */ | ||
831 | "orr r0, r0, r8 \n" /* set new bits */ | ||
832 | "strb r0, [%[addr]] \n" /* store byte */ | ||
833 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ | ||
834 | ".ur_f7: \n" | ||
835 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
836 | "and r0, r0, %[mask] \n" | ||
837 | "orr r0, r0, r7 \n" | ||
838 | "strb r0, [%[addr]] \n" | ||
839 | "mov r7, r7, lsr #8 \n" | ||
840 | ".ur_f6: \n" | ||
841 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
842 | "and r0, r0, %[mask] \n" | ||
843 | "orr r0, r0, r6 \n" | ||
844 | "strb r0, [%[addr]] \n" | ||
845 | "mov r6, r6, lsr #8 \n" | ||
846 | ".ur_f5: \n" | ||
847 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
848 | "and r0, r0, %[mask] \n" | ||
849 | "orr r0, r0, r5 \n" | ||
850 | "strb r0, [%[addr]] \n" | ||
851 | "mov r5, r5, lsr #8 \n" | ||
852 | ".ur_f4: \n" | ||
853 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
854 | "and r0, r0, %[mask] \n" | ||
855 | "orr r0, r0, r4 \n" | ||
856 | "strb r0, [%[addr]] \n" | ||
857 | "mov r4, r4, lsr #8 \n" | ||
858 | ".ur_f3: \n" | ||
859 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
860 | "and r0, r0, %[mask] \n" | ||
861 | "orr r0, r0, r3 \n" | ||
862 | "strb r0, [%[addr]] \n" | ||
863 | "mov r3, r3, lsr #8 \n" | ||
864 | ".ur_f2: \n" | ||
865 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
866 | "and r0, r0, %[mask] \n" | ||
867 | "orr r0, r0, r2 \n" | ||
868 | "strb r0, [%[addr]] \n" | ||
869 | "mov r2, r2, lsr #8 \n" | ||
870 | ".ur_f1: \n" | ||
871 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
872 | "and r0, r0, %[mask] \n" | ||
873 | "orr r0, r0, r1 \n" | ||
874 | "strb r0, [%[addr]] \n" | ||
875 | "mov r1, r1, lsr #8 \n" | ||
876 | ".ur_f0: \n" | ||
877 | |||
878 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ | ||
879 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ | ||
880 | "bhi .ur_floop \n" | ||
881 | |||
882 | "b .ur_end \n" | ||
883 | |||
884 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | ||
885 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ | ||
886 | "bhs .ur_s8 \n" | ||
887 | |||
888 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ | ||
889 | "add %[addr], %[addr], r0 \n" /* for this round */ | ||
734 | 890 | ||
735 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | 891 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ |
736 | "ands %[mask], %[mask], #0xff \n" | 892 | "add pc, pc, r0 \n" |
737 | "beq .ur_sloop \n" /* short loop if nothing to keep */ | 893 | ".ur_stable: \n" |
738 | 894 | ".byte .ur_s0 - .ur_stable - 4 \n" | |
739 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | 895 | ".byte .ur_s1 - .ur_stable - 4 \n" |
740 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 896 | ".byte .ur_s2 - .ur_stable - 4 \n" |
741 | "adc r0, r0, r0 \n" /* put bit into LSB for byte */ | 897 | ".byte .ur_s3 - .ur_stable - 4 \n" |
742 | "movs r8, r8, lsr #1 \n" | 898 | ".byte .ur_s4 - .ur_stable - 4 \n" |
743 | "adc r0, r0, r0 \n" | 899 | ".byte .ur_s5 - .ur_stable - 4 \n" |
744 | "movs r7, r7, lsr #1 \n" | 900 | ".byte .ur_s6 - .ur_stable - 4 \n" |
745 | "adc r0, r0, r0 \n" | 901 | ".byte .ur_s7 - .ur_stable - 4 \n" |
746 | "movs r6, r6, lsr #1 \n" | 902 | |
747 | "adc r0, r0, r0 \n" | 903 | ".ur_s8: \n" |
748 | "movs r5, r5, lsr #1 \n" | 904 | "add %[addr], %[addr], %[psiz], lsl #3 \n" |
749 | "adc r0, r0, r0 \n" | 905 | /* Point behind the last plane for this round. See above. */ |
750 | "movs r4, r4, lsr #1 \n" | 906 | "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ |
751 | "adc r0, r0, r0 \n" | 907 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ |
752 | "movs r3, r3, lsr #1 \n" | 908 | ".ur_s7: \n" |
753 | "adc r0, r0, r0 \n" | 909 | "strb r7, [%[addr], -%[psiz]]! \n" |
754 | "movs r2, r2, lsr #1 \n" | 910 | "mov r7, r7, lsr #8 \n" |
755 | "adc r0, r0, r0 \n" | 911 | ".ur_s6: \n" |
756 | 912 | "strb r6, [%[addr], -%[psiz]]! \n" | |
757 | "ldrb r1, [%[addr]] \n" /* read old value */ | 913 | "mov r6, r6, lsr #8 \n" |
758 | "and r1, r1, %[mask] \n" /* mask out replaced bits */ | 914 | ".ur_s5: \n" |
759 | "orr r1, r1, r0 \n" /* set new bits */ | 915 | "strb r5, [%[addr], -%[psiz]]! \n" |
760 | "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ | 916 | "mov r5, r5, lsr #8 \n" |
761 | 917 | ".ur_s4: \n" | |
762 | "cmp %[end], %[addr] \n" /* loop for all bitplanes */ | 918 | "strb r4, [%[addr], -%[psiz]]! \n" |
763 | "bne .ur_floop \n" | 919 | "mov r4, r4, lsr #8 \n" |
764 | 920 | ".ur_s3: \n" | |
765 | "b .ur_end \n" | 921 | "strb r3, [%[addr], -%[psiz]]! \n" |
766 | 922 | "mov r3, r3, lsr #8 \n" | |
767 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | 923 | ".ur_s2: \n" |
768 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 924 | "strb r2, [%[addr], -%[psiz]]! \n" |
769 | "adc r0, r0, r0 \n" /* put bit into LSB for byte */ | 925 | "mov r2, r2, lsr #8 \n" |
770 | "movs r8, r8, lsr #1 \n" | 926 | ".ur_s1: \n" |
771 | "adc r0, r0, r0 \n" | 927 | "strb r1, [%[addr], -%[psiz]]! \n" |
772 | "movs r7, r7, lsr #1 \n" | 928 | "mov r1, r1, lsr #8 \n" |
773 | "adc r0, r0, r0 \n" | 929 | ".ur_s0: \n" |
774 | "movs r6, r6, lsr #1 \n" | 930 | |
775 | "adc r0, r0, r0 \n" | 931 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ |
776 | "movs r5, r5, lsr #1 \n" | 932 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ |
777 | "adc r0, r0, r0 \n" | 933 | "bhi .ur_sloop \n" |
778 | "movs r4, r4, lsr #1 \n" | 934 | |
779 | "adc r0, r0, r0 \n" | 935 | ".ur_end: \n" |
780 | "movs r3, r3, lsr #1 \n" | 936 | : /* outputs */ |
781 | "adc r0, r0, r0 \n" | 937 | [addr]"+r"(addr), |
782 | "movs r2, r2, lsr #1 \n" | 938 | [mask]"+r"(mask), |
783 | "adc r0, r0, r0 \n" | 939 | [dpth]"+r"(depth), |
784 | 940 | [rx] "=&r"(trash) | |
785 | "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ | 941 | : /* inputs */ |
786 | 942 | [psiz]"r"(_gray_info.plane_size), | |
787 | "cmp %[end], %[addr] \n" /* loop for all bitplanes */ | 943 | [patp]"[rx]"(pat_ptr) |
788 | "bne .ur_sloop \n" | 944 | : /* clobbers */ |
789 | 945 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | |
790 | ".ur_end: \n" | 946 | ); |
791 | : /* outputs */ | ||
792 | [addr]"+r"(addr), | ||
793 | [mask]"+r"(mask), | ||
794 | [rx] "=&r"(trash) | ||
795 | : /* inputs */ | ||
796 | [psiz]"r"(_gray_info.plane_size), | ||
797 | [end] "r"(end), | ||
798 | [patp]"[rx]"(pat_ptr) | ||
799 | : /* clobbers */ | ||
800 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | ||
801 | ); | ||
802 | } | 947 | } |
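The three "comb" stages introduced above are a standard masked-XOR (delta-swap) transpose: they turn eight registers each holding one pixel's rotated pattern into eight registers each holding one byte per bitplane. A hedged C equivalent of all three stages:

    /* transpose the four 8x8 bit blocks held in p[0..7]: before, p[i] is
       pixel i's pattern; after, byte j of p[i] holds bit (8*j + i) of all
       8 pixels, i.e. the byte for bitplane 8*j + i */
    static void transpose_8x8x4(unsigned long p[8])
    {
        unsigned long t;
        int i, j;
        for (i = 0; i < 4; i++)               /* stage 1: 4-bit comb */
        {
            t = (p[i] ^ (p[i+4] << 4)) & 0xF0F0F0F0UL;
            p[i]   ^= t;
            p[i+4] ^= t >> 4;
        }
        for (i = 0; i < 8; i += 4)            /* stage 2: 2-bit comb */
            for (j = i; j < i + 2; j++)
            {
                t = (p[j] ^ (p[j+2] << 2)) & 0xCCCCCCCCUL;
                p[j]   ^= t;
                p[j+2] ^= t >> 2;
            }
        for (i = 0; i < 8; i += 2)            /* stage 3: 1-bit comb */
        {
            t = (p[i] ^ (p[i+1] << 1)) & 0xAAAAAAAAUL;
            p[i]   ^= t;
            p[i+1] ^= t >> 1;
        }
    }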
803 | #else /* C version, for reference*/ | 948 | #else /* C version, for reference*/ |
804 | #warning C version of gray_update_rect() used | 949 | #warning C version of gray_update_rect() used |
@@ -873,7 +1018,7 @@ void gray_update_rect(int x, int y, int width, int height)
873 | 1018 | ||
874 | for (i = 7; i >= 0; i--) | 1019 | for (i = 7; i >= 0; i--) |
875 | data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); | 1020 | data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); |
876 | 1021 | ||
877 | *addr = (*addr & mask) | data; | 1022 | *addr = (*addr & mask) | data; |
878 | addr += _gray_info.plane_size; | 1023 | addr += _gray_info.plane_size; |
879 | test <<= 1; | 1024 | test <<= 1; |
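For contrast, the C reference version shown in this hunk gathers one bitplane byte per full pass over the pattern stack; a hedged sketch of the per-plane loop surrounding the fragment above (addr, mask, pat_stack and test as in that code):

    unsigned char *end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
    do
    {
        unsigned data = 0;
        for (int i = 7; i >= 0; i--)
            data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
        *addr = (*addr & mask) | data;        /* keep masked bits, set new */
        addr += _gray_info.plane_size;        /* next bitplane */
        test <<= 1;                           /* next pattern bit */
    }
    while (addr < end);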
@@ -935,13 +1080,13 @@ void gray_update_rect(int x, int y, int width, int height)
935 | 1080 | ||
936 | #if CONFIG_CPU == SH7034 | 1081 | #if CONFIG_CPU == SH7034 |
937 | asm volatile ( | 1082 | asm volatile ( |
938 | "mov.l @%[cbuf],r1 \n" | 1083 | "mov.l @%[cbuf], r1 \n" |
939 | "mov.l @%[bbuf],r2 \n" | 1084 | "mov.l @%[bbuf], r2 \n" |
940 | "xor r1,r2 \n" | 1085 | "xor r1, r2 \n" |
941 | "mov.l @(4,%[cbuf]),r1 \n" | 1086 | "mov.l @(4,%[cbuf]), r1 \n" |
942 | "mov.l @(4,%[bbuf]),%[chg] \n" | 1087 | "mov.l @(4,%[bbuf]), %[chg]\n" |
943 | "xor r1,%[chg] \n" | 1088 | "xor r1, %[chg] \n" |
944 | "or r2,%[chg] \n" | 1089 | "or r2, %[chg] \n" |
945 | : /* outputs */ | 1090 | : /* outputs */ |
946 | [chg] "=r"(change) | 1091 | [chg] "=r"(change) |
947 | : /* inputs */ | 1092 | : /* inputs */ |
@@ -953,176 +1098,402 @@ void gray_update_rect(int x, int y, int width, int height)
953 | 1098 | ||
954 | if (change != 0) | 1099 | if (change != 0) |
955 | { | 1100 | { |
956 | unsigned char *addr, *end; | 1101 | unsigned char *addr; |
957 | unsigned mask, trash; | 1102 | unsigned mask, depth, trash; |
958 | 1103 | ||
959 | pat_ptr = &pat_stack[8]; | 1104 | pat_ptr = &pat_stack[8]; |
960 | 1105 | ||
961 | /* precalculate the bit patterns with random shifts | 1106 | /* precalculate the bit patterns with random shifts |
962 | * for all 8 pixels and put them on an extra "stack" */ | 1107 | * for all 8 pixels and put them on an extra "stack" */ |
963 | asm volatile ( | 1108 | asm volatile |
964 | "mov #8,r3 \n" /* loop count */ | 1109 | ( |
965 | 1110 | "mov #8, r3 \n" /* loop count */ | |
966 | ".ur_pre_loop: \n" | 1111 | |
967 | "mov.b @%[cbuf]+,r0\n" /* read current buffer */ | 1112 | ".ur_pre_loop: \n" |
968 | "mov.b @%[bbuf],r1 \n" /* read back buffer */ | 1113 | "mov.b @%[cbuf]+, r0 \n" /* read current buffer */ |
969 | "mov #0,r2 \n" /* preset for skipped pixel */ | 1114 | "mov.b @%[bbuf], r1 \n" /* read back buffer */ |
970 | "mov.b r0,@%[bbuf] \n" /* update back buffer */ | 1115 | "mov #0, r2 \n" /* preset for skipped pixel */ |
971 | "add #1,%[bbuf] \n" | 1116 | "mov.b r0, @%[bbuf] \n" /* update back buffer */ |
972 | "cmp/eq r0,r1 \n" /* no change? */ | 1117 | "add #1, %[bbuf] \n" |
973 | "bt .ur_skip \n" /* -> skip */ | 1118 | "cmp/eq r0, r1 \n" /* no change? */ |
974 | 1119 | "bt .ur_skip \n" /* -> skip */ | |
975 | "shll2 r0 \n" /* pixel value -> pattern offset */ | 1120 | |
976 | "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ | 1121 | "shll2 r0 \n" /* pixel value -> pattern offset */ |
977 | 1122 | "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */ | |
978 | "mov #75,r0 \n" | 1123 | |
979 | "mulu r0,%[rnd] \n" /* multiply by 75 */ | 1124 | "mov #75, r0 \n" |
980 | "sts macl,%[rnd] \n" | 1125 | "mulu r0, %[rnd] \n" /* multiply by 75 */ |
981 | "add #74,%[rnd] \n" /* add another 74 */ | 1126 | "sts macl, %[rnd] \n" |
982 | /* Since the lower bits are not very random: */ | 1127 | "add #74, %[rnd] \n" /* add another 74 */ |
983 | "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ | 1128 | /* Since the lower bits are not very random: */ |
984 | "and %[rmsk],r1 \n" /* mask out unneeded bits */ | 1129 | "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */ |
985 | 1130 | "and %[rmsk], r1 \n" /* mask out unneeded bits */ | |
986 | "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ | 1131 | |
987 | "bf .ur_ntrim \n" | 1132 | "cmp/hs %[dpth], r1 \n" /* random >= depth ? */ |
988 | "sub %[dpth],r1 \n" /* yes: random -= depth; */ | 1133 | "bf .ur_ntrim \n" |
989 | ".ur_ntrim: \n" | 1134 | "sub %[dpth], r1 \n" /* yes: random -= depth; */ |
1135 | ".ur_ntrim: \n" | ||
990 | 1136 | ||
991 | "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ | 1137 | "mov.l .ashlsi3, r0 \n" /** rotate pattern **/ |
992 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ | 1138 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ |
993 | "mov r1,r5 \n" | 1139 | "mov r1, r5 \n" |
994 | 1140 | ||
995 | "mov %[dpth],r5 \n" | 1141 | "mov %[dpth], r5 \n" |
996 | "sub r1,r5 \n" /* r5 = depth - r1 */ | 1142 | "sub r1, r5 \n" /* r5 = depth - r1 */ |
997 | "mov.l .lshrsi3,r1 \n" | 1143 | "mov.l .lshrsi3, r1 \n" |
998 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ | 1144 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ |
999 | "mov r0,r2 \n" /* store previous result in r2 */ | 1145 | "mov r0, r2 \n" /* store previous result in r2 */ |
1000 | 1146 | ||
1001 | "or r0,r2 \n" /* rotated_pattern = r2 | r0 */ | 1147 | "or r0, r2 \n" /* rotated_pattern = r2 | r0 */ |
1002 | "clrt \n" /* mask bit = 0 (replace) */ | 1148 | "clrt \n" /* mask bit = 0 (replace) */ |
1003 | 1149 | ||
1004 | ".ur_skip: \n" /* T == 1 if skipped */ | 1150 | ".ur_skip: \n" /* T == 1 if skipped */ |
1005 | "rotcr %[mask] \n" /* get mask bit */ | 1151 | "rotcr %[mask] \n" /* get mask bit */ |
1006 | "mov.l r2,@-%[patp]\n" /* push on pattern stack */ | 1152 | "mov.l r2, @-%[patp] \n" /* push on pattern stack */ |
1007 | 1153 | ||
1008 | "add #-1,r3 \n" /* loop 8 times (pixel block) */ | 1154 | "add #-1, r3 \n" /* loop 8 times (pixel block) */ |
1009 | "cmp/pl r3 \n" | 1155 | "cmp/pl r3 \n" |
1010 | "bt .ur_pre_loop\n" | 1156 | "bt .ur_pre_loop \n" |
1011 | 1157 | ||
1012 | "shlr8 %[mask] \n" /* shift mask to low byte */ | 1158 | "shlr8 %[mask] \n" /* shift mask to low byte */ |
1013 | "shlr16 %[mask] \n" | 1159 | "shlr16 %[mask] \n" |
1014 | : /* outputs */ | 1160 | : /* outputs */ |
1015 | [cbuf]"+r"(cbuf), | 1161 | [cbuf]"+r"(cbuf), |
1016 | [bbuf]"+r"(bbuf), | 1162 | [bbuf]"+r"(bbuf), |
1017 | [rnd] "+r"(_gray_random_buffer), | 1163 | [rnd] "+r"(_gray_random_buffer), |
1018 | [patp]"+r"(pat_ptr), | 1164 | [patp]"+r"(pat_ptr), |
1019 | [mask]"=&r"(mask) | 1165 | [mask]"=&r"(mask) |
1020 | : /* inputs */ | 1166 | : /* inputs */ |
1021 | [dpth]"r"(_gray_info.depth), | 1167 | [dpth]"r"(_gray_info.depth), |
1022 | [bpat]"r"(_gray_info.bitpattern), | 1168 | [bpat]"r"(_gray_info.bitpattern), |
1023 | [rmsk]"r"(_gray_info.randmask) | 1169 | [rmsk]"r"(_gray_info.randmask) |
1024 | : /* clobbers */ | 1170 | : /* clobbers */ |
1025 | "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" | 1171 | "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" |
1026 | ); | 1172 | ); |
1027 | 1173 | ||
1028 | addr = dst_row; | 1174 | addr = dst_row; |
1029 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 1175 | depth = _gray_info.depth; |
1030 | 1176 | ||
1031 | /* set the bits for all 8 pixels in all bytes according to the | 1177 | /* set the bits for all 8 pixels in all bytes according to the |
1032 | * precalculated patterns on the pattern stack */ | 1178 | * precalculated patterns on the pattern stack */ |
1033 | asm volatile ( | 1179 | asm volatile |
1034 | "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ | 1180 | ( |
1035 | "mov.l @%[patp]+,r2 \n" | 1181 | "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */ |
1036 | "mov.l @%[patp]+,r3 \n" | 1182 | "mov.l @%[patp]+, r7 \n" |
1037 | "mov.l @%[patp]+,r6 \n" | 1183 | "mov.l @%[patp]+, r6 \n" |
1038 | "mov.l @%[patp]+,r7 \n" | 1184 | "mov.l @%[patp]+, r5 \n" |
1039 | "mov.l @%[patp]+,r8 \n" | 1185 | "mov.l @%[patp]+, r4 \n" |
1040 | "mov.l @%[patp]+,r9 \n" | 1186 | "mov.l @%[patp]+, r3 \n" |
1041 | "mov.l @%[patp],r10 \n" | 1187 | "mov.l @%[patp]+, r2 \n" |
1042 | 1188 | "mov.l @%[patp], r1 \n" | |
1043 | "tst %[mask],%[mask] \n" | 1189 | |
1044 | "bt .ur_sloop \n" /* short loop if nothing to keep */ | 1190 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ |
1045 | 1191 | ||
1046 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | 1192 | "mov.l .ur_mask4, %[rx] \n" /* bitmask = ...11110000 */ |
1047 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1193 | "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/ |
1048 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1194 | "shll2 r0 \n" |
1049 | "shlr r2 \n" | 1195 | "shll2 r0 \n" |
1050 | "rotcl r0 \n" | 1196 | "xor r1, r0 \n" |
1051 | "shlr r3 \n" | 1197 | "and %[rx], r0 \n" |
1052 | "rotcl r0 \n" | 1198 | "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ |
1053 | "shlr r6 \n" | 1199 | "shlr2 r0 \n" |
1054 | "rotcl r0 \n" | 1200 | "shlr2 r0 \n" |
1055 | "shlr r7 \n" | 1201 | "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ |
1056 | "rotcl r0 \n" | 1202 | "mov r6, r0 \n" |
1057 | "shlr r8 \n" | 1203 | "shll2 r0 \n" |
1058 | "rotcl r0 \n" | 1204 | "shll2 r0 \n" |
1059 | "shlr r9 \n" | 1205 | "xor r2, r0 \n" |
1060 | "rotcl r0 \n" | 1206 | "and %[rx], r0 \n" |
1061 | "shlr r10 \n" | 1207 | "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ |
1062 | "mov.b @%[addr],%[rx] \n" /* read old value */ | 1208 | "shlr2 r0 \n" |
1063 | "rotcl r0 \n" | 1209 | "shlr2 r0 \n" |
1064 | "and %[mask],%[rx] \n" /* mask out replaced bits */ | 1210 | "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ |
1065 | "or %[rx],r0 \n" /* set new bits */ | 1211 | "mov r7, r0 \n" |
1066 | "mov.b r0,@%[addr] \n" /* store value to bitplane */ | 1212 | "shll2 r0 \n" |
1067 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1213 | "shll2 r0 \n" |
1068 | "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ | 1214 | "xor r3, r0 \n" |
1069 | "bt .ur_floop \n" | 1215 | "and %[rx], r0 \n" |
1070 | 1216 | "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ | |
1071 | "bra .ur_end \n" | 1217 | "shlr2 r0 \n" |
1072 | "nop \n" | 1218 | "shlr2 r0 \n" |
1073 | 1219 | "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ | |
1074 | /* References to C library routines used in the precalc block */ | 1220 | "mov r8, r0 \n" |
1075 | ".align 2 \n" | 1221 | "shll2 r0 \n" |
1076 | ".ashlsi3: \n" /* C library routine: */ | 1222 | "shll2 r0 \n" |
1077 | ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ | 1223 | "xor r4, r0 \n" |
1078 | ".lshrsi3: \n" /* C library routine: */ | 1224 | "and %[rx], r0 \n" |
1079 | ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ | 1225 | "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ |
1080 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | 1226 | "shlr2 r0 \n" |
1081 | 1227 | "shlr2 r0 \n" | |
1082 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | 1228 | "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ |
1083 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1229 | |
1084 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1230 | "mov.l .ur_mask2, %[rx] \n" /* bitmask = ...11001100 */ |
1085 | "shlr r2 \n" | 1231 | "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/ |
1086 | "rotcl r0 \n" | 1232 | "shll2 r0 \n" |
1087 | "shlr r3 \n" | 1233 | "xor r1, r0 \n" |
1088 | "rotcl r0 \n" | 1234 | "and %[rx], r0 \n" |
1089 | "shlr r6 \n" | 1235 | "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ |
1090 | "rotcl r0 \n" | 1236 | "shlr2 r0 \n" |
1091 | "shlr r7 \n" | 1237 | "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ |
1092 | "rotcl r0 \n" | 1238 | "mov r4, r0 \n" |
1093 | "shlr r8 \n" | 1239 | "shll2 r0 \n" |
1094 | "rotcl r0 \n" | 1240 | "xor r2, r0 \n" |
1095 | "shlr r9 \n" | 1241 | "and %[rx], r0 \n" |
1096 | "rotcl r0 \n" | 1242 | "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ |
1097 | "shlr r10 \n" | 1243 | "shlr2 r0 \n" |
1098 | "rotcl r0 \n" | 1244 | "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ |
1099 | "mov.b r0,@%[addr] \n" /* store byte to bitplane */ | 1245 | "mov r7, r0 \n" |
1100 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1246 | "shll2 r0 \n" |
1101 | "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ | 1247 | "xor r5, r0 \n" |
1102 | "bt .ur_sloop \n" | 1248 | "and %[rx], r0 \n" |
1103 | 1249 | "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | |
1104 | ".ur_end: \n" | 1250 | "shlr2 r0 \n" |
1105 | : /* outputs */ | 1251 | "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ |
1106 | [addr]"+r"(addr), | 1252 | "mov r8, r0 \n" |
1107 | [mask]"+r"(mask), | 1253 | "shll2 r0 \n" |
1108 | [rx] "=&r"(trash) | 1254 | "xor r6, r0 \n" |
1109 | : /* inputs */ | 1255 | "and %[rx], r0 \n" |
1110 | [psiz]"r"(_gray_info.plane_size), | 1256 | "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ |
1111 | [end] "r"(end), | 1257 | "shlr2 r0 \n" |
1112 | [patp]"[rx]"(pat_ptr) | 1258 | "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ |
1113 | : /* clobbers */ | 1259 | |
1114 | "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" | 1260 | "mov.l .ur_mask1, %[rx] \n" /* bitmask = ...10101010 */ |
1261 | "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/ | ||
1262 | "shll r0 \n" | ||
1263 | "xor r1, r0 \n" | ||
1264 | "and %[rx], r0 \n" | ||
1265 | "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
1266 | "shlr r0 \n" | ||
1267 | "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
1268 | "mov r4, r0 \n" | ||
1269 | "shll r0 \n" | ||
1270 | "xor r3, r0 \n" | ||
1271 | "and %[rx], r0 \n" | ||
1272 | "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
1273 | "shlr r0 \n" | ||
1274 | "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
1275 | "mov r6, r0 \n" | ||
1276 | "shll r0 \n" | ||
1277 | "xor r5, r0 \n" | ||
1278 | "and %[rx], r0 \n" | ||
1279 | "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
1280 | "shlr r0 \n" | ||
1281 | "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
1282 | "mov r8, r0 \n" | ||
1283 | "shll r0 \n" | ||
1284 | "xor r7, r0 \n" | ||
1285 | "and %[rx], r0 \n" | ||
1286 | "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
1287 | "shlr r0 \n" | ||
1288 | "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
1289 | |||
1290 | "tst %[mask], %[mask] \n" | ||
1291 | "bt .ur_sloop \n" /* short loop if nothing to keep */ | ||
1292 | |||
1293 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | ||
1294 | "mov #8, r0 \n" | ||
1295 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ | ||
1296 | "bt .ur_f8 \n" | ||
1297 | |||
1298 | "mulu %[psiz], %[dpth] \n" | ||
1299 | "mova .ur_ftable, r0 \n" | ||
1300 | "mov.b @(r0, %[dpth]), %[rx] \n" | ||
1301 | "add %[rx], r0 \n" | ||
1302 | "sts macl, %[rx] \n" /* point behind the last plane.. */ | ||
1303 | "jmp @r0 \n" /* jump into streak */ | ||
1304 | "add %[rx], %[addr] \n" /* ..for this round */ | ||
1305 | |||
1306 | ".align 2 \n" | ||
1307 | ".ur_ftable: \n" | ||
1308 | ".byte .ur_f0 - .ur_ftable \n" | ||
1309 | ".byte .ur_f1 - .ur_ftable \n" | ||
1310 | ".byte .ur_f2 - .ur_ftable \n" | ||
1311 | ".byte .ur_f3 - .ur_ftable \n" | ||
1312 | ".byte .ur_f4 - .ur_ftable \n" | ||
1313 | ".byte .ur_f5 - .ur_ftable \n" | ||
1314 | ".byte .ur_f6 - .ur_ftable \n" | ||
1315 | ".byte .ur_f7 - .ur_ftable \n" | ||
1316 | |||
1317 | ".ur_f8: \n" | ||
1318 | "mov %[psiz], %[rx] \n" | ||
1319 | "shll2 %[rx] \n" | ||
1320 | "add %[rx], %[rx] \n" | ||
1321 | "add %[rx], %[addr] \n" | ||
1322 | /* Point behind the last plane for this round. Note: We're using the | ||
1323 | * registers backwards in order to reuse the streak for the last round. | ||
1324 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
1325 | * the bit order would be destroyed which results in more flicker. */ | ||
1326 | "sub %[psiz], %[addr] \n" | ||
1327 | "mov.b @%[addr], r0 \n" /* load old byte */ | ||
1328 | "and %[mask], r0 \n" /* mask out replaced bits */ | ||
1329 | "or r8, r0 \n" /* set new bits */ | ||
1330 | "mov.b r0, @%[addr] \n" /* store byte */ | ||
1331 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1332 | ".ur_f7: \n" | ||
1333 | "sub %[psiz], %[addr] \n" | ||
1334 | "mov.b @%[addr], r0 \n" | ||
1335 | "and %[mask], r0 \n" | ||
1336 | "or r7, r0 \n" | ||
1337 | "mov.b r0, @%[addr] \n" | ||
1338 | "shlr8 r7 \n" | ||
1339 | ".ur_f6: \n" | ||
1340 | "sub %[psiz], %[addr] \n" | ||
1341 | "mov.b @%[addr], r0 \n" | ||
1342 | "and %[mask], r0 \n" | ||
1343 | "or r6, r0 \n" | ||
1344 | "mov.b r0, @%[addr] \n" | ||
1345 | "shlr8 r6 \n" | ||
1346 | ".ur_f5: \n" | ||
1347 | "sub %[psiz], %[addr] \n" | ||
1348 | "mov.b @%[addr], r0 \n" | ||
1349 | "and %[mask], r0 \n" | ||
1350 | "or r5, r0 \n" | ||
1351 | "mov.b r0, @%[addr] \n" | ||
1352 | "shlr8 r5 \n" | ||
1353 | ".ur_f4: \n" | ||
1354 | "sub %[psiz], %[addr] \n" | ||
1355 | "mov.b @%[addr], r0 \n" | ||
1356 | "and %[mask], r0 \n" | ||
1357 | "or r4, r0 \n" | ||
1358 | "mov.b r0, @%[addr] \n" | ||
1359 | "shlr8 r4 \n" | ||
1360 | ".ur_f3: \n" | ||
1361 | "sub %[psiz], %[addr] \n" | ||
1362 | "mov.b @%[addr], r0 \n" | ||
1363 | "and %[mask], r0 \n" | ||
1364 | "or r3, r0 \n" | ||
1365 | "mov.b r0, @%[addr] \n" | ||
1366 | "shlr8 r3 \n" | ||
1367 | ".ur_f2: \n" | ||
1368 | "sub %[psiz], %[addr] \n" | ||
1369 | "mov.b @%[addr], r0 \n" | ||
1370 | "and %[mask], r0 \n" | ||
1371 | "or r2, r0 \n" | ||
1372 | "mov.b r0, @%[addr] \n" | ||
1373 | "shlr8 r2 \n" | ||
1374 | ".ur_f1: \n" | ||
1375 | "sub %[psiz], %[addr] \n" | ||
1376 | "mov.b @%[addr], r0 \n" | ||
1377 | "and %[mask], r0 \n" | ||
1378 | "or r1, r0 \n" | ||
1379 | "mov.b r0, @%[addr] \n" | ||
1380 | "shlr8 r1 \n" | ||
1381 | ".ur_f0: \n" | ||
1382 | |||
1383 | "add %[rx], %[addr] \n" /* correct address */ | ||
1384 | "add #-8, %[dpth] \n" | ||
1385 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1386 | "bt .ur_floop \n" | ||
1387 | |||
1388 | "bra .ur_end \n" | ||
1389 | "nop \n" | ||
1390 | |||
1391 | /* References to C library routines used in the precalc block */ | ||
1392 | ".align 2 \n" | ||
1393 | ".ashlsi3: \n" /* C library routine: */ | ||
1394 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ | ||
1395 | ".lshrsi3: \n" /* C library routine: */ | ||
1396 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ | ||
1397 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | ||
1398 | |||
1399 | /* Bitmasks for the bit block rotation */ | ||
1400 | ".ur_mask4: \n" | ||
1401 | ".long 0xF0F0F0F0 \n" | ||
1402 | ".ur_mask2: \n" | ||
1403 | ".long 0xCCCCCCCC \n" | ||
1404 | ".ur_mask1: \n" | ||
1405 | ".long 0xAAAAAAAA \n" | ||
1406 | |||
1407 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | ||
1408 | "mov #8, r0 \n" | ||
1409 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ | ||
1410 | "bt .ur_s8 \n" | ||
1411 | |||
1412 | "mulu %[psiz], %[dpth] \n" | ||
1413 | "mova .ur_stable, r0 \n" | ||
1414 | "mov.b @(r0, %[dpth]), %[rx] \n" | ||
1415 | "add %[rx], r0 \n" | ||
1416 | "sts macl, %[rx] \n" /* point behind the last plane.. */ | ||
1417 | "jmp @r0 \n" /* jump into streak */ | ||
1418 | "add %[rx], %[addr] \n" /* ..for this round */ | ||
1419 | |||
1420 | ".align 2 \n" | ||
1421 | ".ur_stable: \n" | ||
1422 | ".byte .ur_s0 - .ur_stable \n" | ||
1423 | ".byte .ur_s1 - .ur_stable \n" | ||
1424 | ".byte .ur_s2 - .ur_stable \n" | ||
1425 | ".byte .ur_s3 - .ur_stable \n" | ||
1426 | ".byte .ur_s4 - .ur_stable \n" | ||
1427 | ".byte .ur_s5 - .ur_stable \n" | ||
1428 | ".byte .ur_s6 - .ur_stable \n" | ||
1429 | ".byte .ur_s7 - .ur_stable \n" | ||
1430 | |||
1431 | ".ur_s8: \n" | ||
1432 | "mov %[psiz], %[rx] \n" /* Point behind the last plane */ | ||
1433 | "shll2 %[rx] \n" /* for this round. */ | ||
1434 | "add %[rx], %[rx] \n" /* See above. */ | ||
1435 | "add %[rx], %[addr] \n" | ||
1436 | |||
1437 | "sub %[psiz], %[addr] \n" | ||
1438 | "mov.b r8, @%[addr] \n" /* store byte */ | ||
1439 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1440 | ".ur_s7: \n" | ||
1441 | "sub %[psiz], %[addr] \n" | ||
1442 | "mov.b r7, @%[addr] \n" | ||
1443 | "shlr8 r7 \n" | ||
1444 | ".ur_s6: \n" | ||
1445 | "sub %[psiz], %[addr] \n" | ||
1446 | "mov.b r6, @%[addr] \n" | ||
1447 | "shlr8 r6 \n" | ||
1448 | ".ur_s5: \n" | ||
1449 | "sub %[psiz], %[addr] \n" | ||
1450 | "mov.b r5, @%[addr] \n" | ||
1451 | "shlr8 r5 \n" | ||
1452 | ".ur_s4: \n" | ||
1453 | "sub %[psiz], %[addr] \n" | ||
1454 | "mov.b r4, @%[addr] \n" | ||
1455 | "shlr8 r4 \n" | ||
1456 | ".ur_s3: \n" | ||
1457 | "sub %[psiz], %[addr] \n" | ||
1458 | "mov.b r3, @%[addr] \n" | ||
1459 | "shlr8 r3 \n" | ||
1460 | ".ur_s2: \n" | ||
1461 | "sub %[psiz], %[addr] \n" | ||
1462 | "mov.b r2, @%[addr] \n" | ||
1463 | "shlr8 r2 \n" | ||
1464 | ".ur_s1: \n" | ||
1465 | "sub %[psiz], %[addr] \n" | ||
1466 | "mov.b r1, @%[addr] \n" | ||
1467 | "shlr8 r1 \n" | ||
1468 | ".ur_s0: \n" | ||
1469 | |||
1470 | "add %[rx], %[addr] \n" /* correct address */ | ||
1471 | "add #-8, %[dpth] \n" | ||
1472 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1473 | "bt .ur_sloop \n" | ||
1474 | |||
1475 | ".ur_end: \n" | ||
1476 | : /* outputs */ | ||
1477 | [addr]"+r"(addr), | ||
1478 | [dpth]"+r"(depth), | ||
1479 | [rx] "=&r"(trash) | ||
1480 | : /* inputs */ | ||
1481 | [mask]"r"(mask), | ||
1482 | [psiz]"r"(_gray_info.plane_size), | ||
1483 | [patp]"[rx]"(pat_ptr) | ||
1484 | : /* clobbers */ | ||
1485 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl" | ||
1115 | ); | 1486 | ); |
1116 | } | 1487 | } |
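All three asm variants now share the same plane-writing structure, replacing the old one-byte-per-plane gather loop bounded by 'end': the transposed patterns are written out in rounds of up to 8 bitplanes, walking each round's planes backwards so a partial last round can reuse the tail of the unrolled streak. A hedged C sketch of that structure (pat[0..7] = transposed patterns, keep = the inverted "set" mask; 0 means overwrite everything):

    while (depth > 0)
    {
        int planes = (depth < 8) ? depth : 8;
        unsigned char *p = addr + planes * _gray_info.plane_size;
        for (int i = planes; i-- > 0; )       /* backwards through planes */
        {
            unsigned data;
            p -= _gray_info.plane_size;
            data = pat[i] & 0xff;
            if (keep != 0)                    /* full loop: preserve old bits */
                data |= *p & keep;
            *p = data;
            pat[i] >>= 8;                     /* shift out used-up byte */
        }
        addr += planes * _gray_info.plane_size;
        depth -= planes;
    }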
1117 | #elif defined(CPU_COLDFIRE) | 1488 | #elif defined(CPU_COLDFIRE) |
1118 | asm volatile ( | 1489 | asm volatile ( |
1119 | "move.l (%[cbuf]),%%d0 \n" | 1490 | "move.l (%[cbuf]), %%d0 \n" |
1120 | "move.l (%[bbuf]),%%d1 \n" | 1491 | "move.l (%[bbuf]), %%d1 \n" |
1121 | "eor.l %%d0,%%d1 \n" | 1492 | "eor.l %%d0, %%d1 \n" |
1122 | "move.l (4,%[cbuf]),%%d0 \n" | 1493 | "move.l (4,%[cbuf]), %%d0 \n" |
1123 | "move.l (4,%[bbuf]),%[chg] \n" | 1494 | "move.l (4,%[bbuf]), %[chg] \n" |
1124 | "eor.l %%d0,%[chg] \n" | 1495 | "eor.l %%d0, %[chg] \n" |
1125 | "or.l %%d1,%[chg] \n" | 1496 | "or.l %%d1, %[chg] \n" |
1126 | : /* outputs */ | 1497 | : /* outputs */ |
1127 | [chg] "=&d"(change) | 1498 | [chg] "=&d"(change) |
1128 | : /* inputs */ | 1499 | : /* inputs */ |
@@ -1134,160 +1505,359 @@ void gray_update_rect(int x, int y, int width, int height)
1134 | 1505 | ||
1135 | if (change != 0) | 1506 | if (change != 0) |
1136 | { | 1507 | { |
1137 | unsigned char *addr, *end; | 1508 | unsigned char *addr; |
1138 | unsigned mask, trash; | 1509 | unsigned mask, depth, trash; |
1139 | 1510 | ||
1140 | pat_ptr = &pat_stack[8]; | 1511 | pat_ptr = &pat_stack[8]; |
1141 | 1512 | ||
1142 | /* precalculate the bit patterns with random shifts | 1513 | /* precalculate the bit patterns with random shifts |
1143 | * for all 8 pixels and put them on an extra "stack" */ | 1514 | * for all 8 pixels and put them on an extra "stack" */ |
1144 | asm volatile ( | 1515 | asm volatile |
1145 | "moveq.l #8,%%d3 \n" /* loop count */ | 1516 | ( |
1146 | "clr.l %[mask] \n" | 1517 | "moveq.l #8, %%d3 \n" /* loop count */ |
1147 | 1518 | "clr.l %[mask] \n" | |
1148 | ".ur_pre_loop: \n" | 1519 | |
1149 | "clr.l %%d0 \n" | 1520 | ".ur_pre_loop: \n" |
1150 | "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ | 1521 | "clr.l %%d0 \n" |
1151 | "clr.l %%d1 \n" | 1522 | "move.b (%[cbuf])+, %%d0 \n" /* read current buffer */ |
1152 | "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ | 1523 | "clr.l %%d1 \n" |
1153 | "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ | 1524 | "move.b (%[bbuf]), %%d1 \n" /* read back buffer */ |
1154 | "clr.l %%d2 \n" /* preset for skipped pixel */ | 1525 | "move.b %%d0, (%[bbuf])+ \n" /* update back buffer */ |
1155 | "cmp.l %%d0,%%d1 \n" /* no change? */ | 1526 | "clr.l %%d2 \n" /* preset for skipped pixel */ |
1156 | "beq.b .ur_skip \n" /* -> skip */ | 1527 | "cmp.l %%d0, %%d1 \n" /* no change? */ |
1157 | 1528 | "beq.b .ur_skip \n" /* -> skip */ | |
1158 | "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ | 1529 | |
1159 | 1530 | "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */ | |
1160 | "mulu.w #75,%[rnd] \n" /* multiply by 75 */ | 1531 | |
1161 | "add.l #74,%[rnd] \n" /* add another 74 */ | 1532 | "mulu.w #75, %[rnd] \n" /* multiply by 75 */ |
1162 | /* Since the lower bits are not very random: */ | 1533 | "add.l #74, %[rnd] \n" /* add another 74 */ |
1163 | "move.l %[rnd],%%d1 \n" | 1534 | /* Since the lower bits are not very random: */ |
1164 | "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ | 1535 | "move.l %[rnd], %%d1 \n" |
1165 | "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ | 1536 | "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */ |
1166 | 1537 | "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */ | |
1167 | "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ | 1538 | |
1168 | "blo.b .ur_ntrim \n" | 1539 | "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */ |
1169 | "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ | 1540 | "blo.b .ur_ntrim \n" |
1170 | ".ur_ntrim: \n" | 1541 | "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */ |
1171 | 1542 | ".ur_ntrim: \n" | |
1172 | "move.l %%d2,%%d0 \n" /** rotate pattern **/ | 1543 | |
1173 | "lsl.l %%d1,%%d0 \n" | 1544 | "move.l %%d2, %%d0 \n" /** rotate pattern **/ |
1174 | "sub.l %[dpth],%%d1 \n" | 1545 | "lsl.l %%d1, %%d0 \n" |
1175 | "neg.l %%d1 \n" /* d1 = depth - d1 */ | 1546 | "sub.l %[dpth], %%d1 \n" |
1176 | "lsr.l %%d1,%%d2 \n" | 1547 | "neg.l %%d1 \n" /* d1 = depth - d1 */ |
1177 | "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ | 1548 | "lsr.l %%d1, %%d2 \n" |
1178 | 1549 | "or.l %%d0, %%d2 \n" /* rotated_pattern = d2 | d0 */ | |
1179 | "or.l #0x0100,%[mask] \n" /* set mask bit */ | 1550 | |
1180 | 1551 | "or.l #0x0100, %[mask] \n" /* set mask bit */ | |
1181 | ".ur_skip: \n" | 1552 | |
1182 | "lsr.l #1,%[mask] \n" /* shift mask */ | 1553 | ".ur_skip: \n" |
1183 | "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ | 1554 | "lsr.l #1, %[mask] \n" /* shift mask */ |
1184 | 1555 | "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */ | |
1185 | "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ | 1556 | |
1186 | "bne.b .ur_pre_loop \n" | 1557 | "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */ |
1187 | : /* outputs */ | 1558 | "bne.b .ur_pre_loop \n" |
1188 | [cbuf]"+a"(cbuf), | 1559 | : /* outputs */ |
1189 | [bbuf]"+a"(bbuf), | 1560 | [cbuf]"+a"(cbuf), |
1190 | [patp]"+a"(pat_ptr), | 1561 | [bbuf]"+a"(bbuf), |
1191 | [rnd] "+d"(_gray_random_buffer), | 1562 | [patp]"+a"(pat_ptr), |
1192 | [mask]"=&d"(mask) | 1563 | [rnd] "+d"(_gray_random_buffer), |
1193 | : /* inputs */ | 1564 | [mask]"=&d"(mask) |
1194 | [bpat]"a"(_gray_info.bitpattern), | 1565 | : /* inputs */ |
1195 | [dpth]"d"(_gray_info.depth), | 1566 | [bpat]"a"(_gray_info.bitpattern), |
1196 | [rmsk]"d"(_gray_info.randmask) | 1567 | [dpth]"d"(_gray_info.depth), |
1197 | : /* clobbers */ | 1568 | [rmsk]"d"(_gray_info.randmask) |
1198 | "d0", "d1", "d2", "d3" | 1569 | : /* clobbers */ |
1570 | "d0", "d1", "d2", "d3" | ||
1199 | ); | 1571 | ); |
1200 | 1572 | ||
1201 | addr = dst_row; | 1573 | addr = dst_row; |
1202 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 1574 | mask = ~mask & 0xff; |
1575 | depth = _gray_info.depth; | ||
1203 | 1576 | ||
1204 | /* set the bits for all 8 pixels in all bytes according to the | 1577 | /* set the bits for all 8 pixels in all bytes according to the |
1205 | * precalculated patterns on the pattern stack */ | 1578 | * precalculated patterns on the pattern stack */ |
1206 | asm volatile ( | 1579 | asm volatile |
1207 | "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" | 1580 | ( |
1208 | /* pop all 8 patterns */ | 1581 | "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */ |
1209 | "not.l %[mask] \n" /* "set" mask -> "keep" mask */ | 1582 | /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */ |
1210 | "and.l #0xFF,%[mask] \n" | 1583 | |
1211 | "beq.b .ur_sstart \n" /* short loop if nothing to keep */ | 1584 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ |
1212 | 1585 | ||
1213 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | 1586 | "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/ |
1214 | "clr.l %%d0 \n" | 1587 | "lsl.l #4, %%d0 \n" |
1215 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1588 | /* move.l %[ax], %%d5 */ /* already in d5 */ |
1216 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1589 | "eor.l %%d5, %%d0 \n" |
1217 | "lsr.l #1,%%d3 \n" | 1590 | "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */ |
1218 | "addx.l %%d0,%%d0 \n" | 1591 | "eor.l %%d0, %%d5 \n" |
1219 | "lsr.l #1,%%d4 \n" | 1592 | "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */ |
1220 | "addx.l %%d0,%%d0 \n" | 1593 | "lsr.l #4, %%d0 \n" |
1221 | "lsr.l #1,%%d5 \n" | 1594 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */ |
1222 | "addx.l %%d0,%%d0 \n" | 1595 | "move.l %%d2, %%d0 \n" |
1223 | "lsr.l #1,%%d6 \n" | 1596 | "lsl.l #4, %%d0 \n" |
1224 | "addx.l %%d0,%%d0 \n" | 1597 | "eor.l %%d6, %%d0 \n" |
1225 | "move.l %%a0,%%d1 \n" | 1598 | "and.l #0xF0F0F0F0, %%d0 \n" |
1226 | "lsr.l #1,%%d1 \n" | 1599 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */ |
1227 | "addx.l %%d0,%%d0 \n" | 1600 | "lsr.l #4, %%d0 \n" |
1228 | "move.l %%d1,%%a0 \n" | 1601 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */ |
1229 | "move.l %%a1,%%d1 \n" | 1602 | "move.l %%d3, %%d0 \n" |
1230 | "lsr.l #1,%%d1 \n" | 1603 | "lsl.l #4, %%d0 \n" |
1231 | "addx.l %%d0,%%d0 \n" | 1604 | "eor.l %%d7, %%d0 \n" |
1232 | "move.l %%d1,%%a1 \n" | 1605 | "and.l #0xF0F0F0F0, %%d0 \n" |
1233 | "move.l %[ax],%%d1 \n" | 1606 | "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */ |
1234 | "lsr.l #1,%%d1 \n" | 1607 | "lsr.l #4, %%d0 \n" |
1235 | "addx.l %%d0,%%d0 \n" | 1608 | "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */ |
1236 | "move.l %%d1,%[ax] \n" | 1609 | "move.l %%d4, %%d0 \n" |
1237 | 1610 | "lsl.l #4, %%d0 \n" | |
1238 | "move.b (%[addr]),%%d1 \n" /* read old value */ | 1611 | "move.l %%a0, %%d5 \n" |
1239 | "and.l %[mask],%%d1 \n" /* mask out replaced bits */ | 1612 | "eor.l %%d5, %%d0 \n" |
1240 | "or.l %%d0,%%d1 \n" /* set new bits */ | 1613 | "and.l #0xF0F0F0F0, %%d0 \n" |
1241 | "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ | 1614 | "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */ |
1242 | 1615 | /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */ | |
1243 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1616 | "lsr.l #4, %%d0 \n" |
1244 | "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ | 1617 | "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */ |
1245 | "bhi.b .ur_floop \n" | 1618 | |
1246 | 1619 | "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/ | |
1247 | "bra.b .ur_end \n" | 1620 | "lsl.l #2, %%d0 \n" |
1248 | 1621 | /* move.l %%a0, %%d5 */ /* still in d5 */ | |
1249 | ".ur_sstart: \n" | 1622 | "eor.l %%d5, %%d0 \n" |
1250 | "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ | 1623 | "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */ |
1251 | 1624 | "eor.l %%d0, %%d5 \n" | |
1252 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | 1625 | "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */ |
1253 | "clr.l %%d0 \n" | 1626 | "lsr.l #2, %%d0 \n" |
1254 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1627 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */ |
1255 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1628 | "move.l %[ax], %%d5 \n" |
1256 | "lsr.l #1,%%d3 \n" | 1629 | "move.l %%d5, %%d0 \n" |
1257 | "addx.l %%d0,%%d0 \n" | 1630 | "lsl.l #2, %%d0 \n" |
1258 | "lsr.l #1,%%d4 \n" | 1631 | "eor.l %%d7, %%d0 \n" |
1259 | "addx.l %%d0,%%d0 \n" | 1632 | "and.l #0xCCCCCCCC, %%d0 \n" |
1260 | "lsr.l #1,%%d5 \n" | 1633 | "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ |
1261 | "addx.l %%d0,%%d0 \n" | 1634 | "lsr.l #2, %%d0 \n" |
1262 | "lsr.l #1,%%d6 \n" | 1635 | "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */ |
1263 | "addx.l %%d0,%%d0 \n" | 1636 | /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */ |
1264 | "lsr.l #1,%[mask] \n" | 1637 | "move.l %%d2, %%d0 \n" |
1265 | "addx.l %%d0,%%d0 \n" | 1638 | "lsl.l #2, %%d0 \n" |
1266 | "move.l %%a1,%%d1 \n" | 1639 | "eor.l %%d4, %%d0 \n" |
1267 | "lsr.l #1,%%d1 \n" | 1640 | "and.l #0xCCCCCCCC, %%d0 \n" |
1268 | "addx.l %%d0,%%d0 \n" | 1641 | "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */ |
1269 | "move.l %%d1,%%a1 \n" | 1642 | "lsr.l #2, %%d0 \n" |
1270 | "move.l %[ax],%%d1 \n" | 1643 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */ |
1271 | "lsr.l #1,%%d1 \n" | 1644 | "move.l %%d1, %%d0 \n" |
1272 | "addx.l %%d0,%%d0 \n" | 1645 | "lsl.l #2, %%d0 \n" |
1273 | "move.l %%d1,%[ax] \n" | 1646 | "eor.l %%d3, %%d0 \n" |
1274 | 1647 | "and.l #0xCCCCCCCC, %%d0 \n" | |
1275 | "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ | 1648 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */ |
1276 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1649 | "lsr.l #2, %%d0 \n" |
1277 | "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ | 1650 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */ |
1278 | "bhi.b .ur_sloop \n" | 1651 | |
1279 | 1652 | "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/ | |
1280 | ".ur_end: \n" | 1653 | "lsl.l #1, %%d0 \n" |
1281 | : /* outputs */ | 1654 | "eor.l %%d2, %%d0 \n" |
1282 | [addr]"+a"(addr), | 1655 | "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */ |
1283 | [mask]"+d"(mask), | 1656 | "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */ |
1284 | [ax] "=&a"(trash) | 1657 | "lsr.l #1, %%d0 \n" |
1285 | : /* inputs */ | 1658 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */ |
1286 | [psiz]"a"(_gray_info.plane_size), | 1659 | "move.l %%d3, %%d0 \n" |
1287 | [end] "a"(end), | 1660 | "lsl.l #1, %%d0 \n" |
1288 | [patp]"[ax]"(pat_ptr) | 1661 | "eor.l %%d4, %%d0 \n" |
1289 | : /* clobbers */ | 1662 | "and.l #0xAAAAAAAA, %%d0 \n" |
1290 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" | 1663 | "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */ |
1664 | "lsr.l #1, %%d0 \n" | ||
1665 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */ | ||
1666 | /* move.l %[ax], %%d5 */ /* still in d5 */ | ||
1667 | "move.l %%d5, %%d0 \n" | ||
1668 | "lsl.l #1, %%d0 \n" | ||
1669 | "eor.l %%d6, %%d0 \n" | ||
1670 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1671 | "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */ | ||
1672 | "lsr.l #1, %%d0 \n" | ||
1673 | "eor.l %%d0, %%d5 \n" | ||
1674 | "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */ | ||
1675 | "move.l %%d7, %%d0 \n" | ||
1676 | "lsl.l #1, %%d0 \n" | ||
1677 | "move.l %%a0, %%d5 \n" | ||
1678 | "eor.l %%d5, %%d0 \n" | ||
1679 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1680 | "eor.l %%d0, %%d5 \n" | ||
1681 | "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ | ||
1682 | "lsr.l #1, %%d0 \n" | ||
1683 | "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ | ||
1684 | |||
1685 | "tst.l %[mask] \n" | ||
1686 | "jeq .ur_sloop \n" /* short loop if nothing to keep */ | ||
1687 | |||
1688 | "move.l %[mask], %%d5 \n" /* need mask in data reg. */ | ||
1689 | "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ | ||
1690 | |||
1691 | ".ur_floop: \n" /** full loop (there are bits to keep)**/ | ||
1692 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1693 | "bhs.s .ur_f8 \n" | ||
1694 | |||
1695 | "move.l %[psiz], %%d0 \n" | ||
1696 | "move.l %[dpth], %%d1 \n" | ||
1697 | "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ | ||
1698 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1699 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1700 | "bra.s .ur_f1 \n" /* dpth == 0 should never happen */ | ||
1701 | "bra.s .ur_f2 \n" | ||
1702 | "bra.s .ur_f3 \n" | ||
1703 | "bra.s .ur_f4 \n" | ||
1704 | "bra.s .ur_f5 \n" | ||
1705 | "bra.s .ur_f6 \n" | ||
1706 | "bra.s .ur_f7 \n" | ||
1707 | |||
1708 | ".ur_f8: \n" | ||
1709 | "move.l %[psiz], %%d0 \n" | ||
1710 | "lsl.l #3, %%d0 \n" | ||
1711 | "add.l %%d0, %[addr] \n" | ||
1712 | /* Point behind the last plane for this round. Note: We're using the | ||
1713 | * registers backwards in order to reuse the streak for the last round. | ||
1714 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
1715 | * the bit order would be destroyed which results in more flicker. */ | ||
1716 | "sub.l %[psiz], %[addr] \n" | ||
1717 | "move.b (%[addr]), %%d0 \n" /* load old byte */ | ||
1718 | "and.l %%d5, %%d0 \n" /* mask out replaced bits */ | ||
1719 | "move.l %[mask], %%d1 \n" | ||
1720 | "or.l %%d1, %%d0 \n" /* set new bits */ | ||
1721 | "move.b %%d0, (%[addr]) \n" /* store byte */ | ||
1722 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1723 | "move.l %%d1, %[mask] \n" | ||
1724 | ".ur_f7: \n" | ||
1725 | "sub.l %[psiz], %[addr] \n" | ||
1726 | "move.b (%[addr]), %%d0 \n" | ||
1727 | "and.l %%d5, %%d0 \n" | ||
1728 | "or.l %%d2, %%d0 \n" | ||
1729 | "move.b %%d0, (%[addr]) \n" | ||
1730 | "lsr.l #8, %%d2 \n" | ||
1731 | ".ur_f6: \n" | ||
1732 | "sub.l %[psiz], %[addr] \n" | ||
1733 | "move.b (%[addr]), %%d0 \n" | ||
1734 | "and.l %%d5, %%d0 \n" | ||
1735 | "or.l %%d3, %%d0 \n" | ||
1736 | "move.b %%d0, (%[addr]) \n" | ||
1737 | "lsr.l #8, %%d3 \n" | ||
1738 | ".ur_f5: \n" | ||
1739 | "sub.l %[psiz], %[addr] \n" | ||
1740 | "move.b (%[addr]), %%d0 \n" | ||
1741 | "and.l %%d5, %%d0 \n" | ||
1742 | "or.l %%d4, %%d0 \n" | ||
1743 | "move.b %%d0, (%[addr]) \n" | ||
1744 | "lsr.l #8, %%d4 \n" | ||
1745 | ".ur_f4: \n" | ||
1746 | "sub.l %[psiz], %[addr] \n" | ||
1747 | "move.b (%[addr]), %%d0 \n" | ||
1748 | "and.l %%d5, %%d0 \n" | ||
1749 | "move.l %[ax], %%d1 \n" | ||
1750 | "or.l %%d1, %%d0 \n" | ||
1751 | "move.b %%d0, (%[addr]) \n" | ||
1752 | "lsr.l #8, %%d1 \n" | ||
1753 | "move.l %%d1, %[ax] \n" | ||
1754 | ".ur_f3: \n" | ||
1755 | "sub.l %[psiz], %[addr] \n" | ||
1756 | "move.b (%[addr]), %%d0 \n" | ||
1757 | "and.l %%d5, %%d0 \n" | ||
1758 | "or.l %%d6, %%d0 \n" | ||
1759 | "move.b %%d0, (%[addr]) \n" | ||
1760 | "lsr.l #8, %%d6 \n" | ||
1761 | ".ur_f2: \n" | ||
1762 | "sub.l %[psiz], %[addr] \n" | ||
1763 | "move.b (%[addr]), %%d0 \n" | ||
1764 | "and.l %%d5, %%d0 \n" | ||
1765 | "or.l %%d7, %%d0 \n" | ||
1766 | "move.b %%d0, (%[addr]) \n" | ||
1767 | "lsr.l #8, %%d7 \n" | ||
1768 | ".ur_f1: \n" | ||
1769 | "sub.l %[psiz], %[addr] \n" | ||
1770 | "move.b (%[addr]), %%d0 \n" | ||
1771 | "and.l %%d5, %%d0 \n" | ||
1772 | "move.l %%a0, %%d1 \n" | ||
1773 | "or.l %%d1, %%d0 \n" | ||
1774 | "move.b %%d0, (%[addr]) \n" | ||
1775 | "lsr.l #8, %%d1 \n" | ||
1776 | "move.l %%d1, %%a0 \n" | ||
1777 | |||
1778 | "move.l %[psiz], %%d0 \n" | ||
1779 | "lsl.l #3, %%d0 \n" | ||
1780 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
1781 | "subq.l #8, %[dpth] \n" | ||
1782 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
1783 | "jgt .ur_floop \n" /* next round if anything left */ | ||
1784 | |||
1785 | "jra .ur_end \n" | ||
1786 | |||
1787 | ".ur_sloop: \n" /** short loop (nothing to keep) **/ | ||
1788 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1789 | "bhs.s .ur_s8 \n" | ||
1790 | |||
1791 | "move.l %[psiz], %%d0 \n" | ||
1792 | "move.l %[dpth], %%d5 \n" | ||
1793 | "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ | ||
1794 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1795 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1796 | "bra.s .ur_s1 \n" /* dpth == 0 should never happen */ | ||
1797 | "bra.s .ur_s2 \n" | ||
1798 | "bra.s .ur_s3 \n" | ||
1799 | "bra.s .ur_s4 \n" | ||
1800 | "bra.s .ur_s5 \n" | ||
1801 | "bra.s .ur_s6 \n" | ||
1802 | "bra.s .ur_s7 \n" | ||
1803 | |||
1804 | ".ur_s8: \n" | ||
1805 | "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ | ||
1806 | "lsl.l #3, %%d0 \n" /* for this round. */ | ||
1807 | "add.l %%d0, %[addr] \n" /* See above. */ | ||
1808 | |||
1809 | "sub.l %[psiz], %[addr] \n" | ||
1810 | "move.b %%d1, (%[addr]) \n" /* store byte */ | ||
1811 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1812 | ".ur_s7: \n" | ||
1813 | "sub.l %[psiz], %[addr] \n" | ||
1814 | "move.b %%d2, (%[addr]) \n" | ||
1815 | "lsr.l #8, %%d2 \n" | ||
1816 | ".ur_s6: \n" | ||
1817 | "sub.l %[psiz], %[addr] \n" | ||
1818 | "move.b %%d3, (%[addr]) \n" | ||
1819 | "lsr.l #8, %%d3 \n" | ||
1820 | ".ur_s5: \n" | ||
1821 | "sub.l %[psiz], %[addr] \n" | ||
1822 | "move.b %%d4, (%[addr]) \n" | ||
1823 | "lsr.l #8, %%d4 \n" | ||
1824 | ".ur_s4: \n" | ||
1825 | "sub.l %[psiz], %[addr] \n" | ||
1826 | "move.l %[ax], %%d5 \n" | ||
1827 | "move.b %%d5, (%[addr]) \n" | ||
1828 | "lsr.l #8, %%d5 \n" | ||
1829 | "move.l %%d5, %[ax] \n" | ||
1830 | ".ur_s3: \n" | ||
1831 | "sub.l %[psiz], %[addr] \n" | ||
1832 | "move.b %%d6, (%[addr]) \n" | ||
1833 | "lsr.l #8, %%d6 \n" | ||
1834 | ".ur_s2: \n" | ||
1835 | "sub.l %[psiz], %[addr] \n" | ||
1836 | "move.b %%d7, (%[addr]) \n" | ||
1837 | "lsr.l #8, %%d7 \n" | ||
1838 | ".ur_s1: \n" | ||
1839 | "sub.l %[psiz], %[addr] \n" | ||
1840 | "move.l %%a0, %%d5 \n" | ||
1841 | "move.b %%d5, (%[addr]) \n" | ||
1842 | "lsr.l #8, %%d5 \n" | ||
1843 | "move.l %%d5, %%a0 \n" | ||
1844 | |||
1845 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
1846 | "subq.l #8, %[dpth] \n" | ||
1847 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
1848 | "jgt .ur_sloop \n" /* next round if anything left */ | ||
1849 | |||
1850 | ".ur_end: \n" | ||
1851 | : /* outputs */ | ||
1852 | [addr]"+a"(addr), | ||
1853 | [dpth]"+a"(depth), | ||
1854 | [mask]"+a"(mask), | ||
1855 | [ax] "=&a"(trash) | ||
1856 | : /* inputs */ | ||
1857 | [psiz]"a"(_gray_info.plane_size), | ||
1858 | [patp]"[ax]"(pat_ptr) | ||
1859 | : /* clobbers */ | ||
1860 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0" | ||
1291 | ); | 1861 | ); |
1292 | } | 1862 | } |
1293 | #else /* C version, for reference*/ | 1863 | #else /* C version, for reference*/ |
@@ -1680,4 +2250,3 @@ static void gray_screendump_hook(int fd) | |||
1680 | } | 2250 | } |
1681 | 2251 | ||
1682 | #endif /* HAVE_LCD_BITMAP */ | 2252 | #endif /* HAVE_LCD_BITMAP */ |
1683 | |||
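The restructured write loops in gray_update_rect() above, and in _writearray() below, share one idea: instead of iterating once per bitplane, they unroll eight plane writes into a "streak" and jump into the middle of it for the remainder, Duff's-device style. A minimal C sketch of the short loop (nothing to keep), with stand-in names for the plane size and pattern words (the real code keeps the eight words in registers throughout):

/* Sketch: distribute the transposed pattern words across 'depth'
 * bitplanes, eight planes per round, remainder via fall-through.
 * Planes are walked backwards so a partial round can reuse the tail
 * of the same streak without disturbing the bit order. */
static void write_planes_sketch(unsigned char *addr, long plane_size,
                                int depth, unsigned long pat[8])
{
    while (depth > 0)
    {
        int n = (depth < 8) ? depth : 8;
        unsigned char *a = addr + n * plane_size; /* behind last plane */

        switch (n)        /* "jump into streak"; cases fall through */
        {
        case 8: a -= plane_size; *a = (unsigned char)pat[7]; pat[7] >>= 8;
        case 7: a -= plane_size; *a = (unsigned char)pat[6]; pat[6] >>= 8;
        case 6: a -= plane_size; *a = (unsigned char)pat[5]; pat[5] >>= 8;
        case 5: a -= plane_size; *a = (unsigned char)pat[4]; pat[4] >>= 8;
        case 4: a -= plane_size; *a = (unsigned char)pat[3]; pat[3] >>= 8;
        case 3: a -= plane_size; *a = (unsigned char)pat[2]; pat[2] >>= 8;
        case 2: a -= plane_size; *a = (unsigned char)pat[1]; pat[1] >>= 8;
        case 1: a -= plane_size; *a = (unsigned char)pat[0]; pat[0] >>= 8;
        }
        addr += 8 * plane_size;   /* correct address for next round */
        depth -= 8;
    }
}

Each pattern word contributes one byte per round (up to four rounds, i.e. 32 planes), which is why every register shifts right by 8 once its byte has been stored.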
diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c index 9406664ea2..dcc65bdd09 100644 --- a/apps/plugins/lib/gray_draw.c +++ b/apps/plugins/lib/gray_draw.c | |||
@@ -868,24 +868,24 @@ void gray_ub_clear_display(void) | |||
868 | 868 | ||
869 | /* Write a pixel block, defined by their brightnesses in a greymap. | 869 | /* Write a pixel block, defined by their brightnesses in a greymap. |
870 | Address is the byte in the first bitplane, src is the greymap start address, | 870 | Address is the byte in the first bitplane, src is the greymap start address, |
871 | stride is the increment for the greymap to get to the next pixel, mask | 871 | mask determines which pixels of the destination block are changed. */ |
872 | determines which pixels of the destination block are changed. */ | ||
873 | static void _writearray(unsigned char *address, const unsigned char *src, | 872 | static void _writearray(unsigned char *address, const unsigned char *src, |
874 | unsigned mask) | 873 | unsigned mask) |
875 | { | 874 | { |
876 | unsigned long pat_stack[8]; | 875 | unsigned long pat_stack[8]; |
877 | unsigned long *pat_ptr = &pat_stack[8]; | 876 | unsigned long *pat_ptr = &pat_stack[8]; |
878 | unsigned char *addr, *end; | 877 | unsigned char *addr; |
879 | #ifdef CPU_ARM | 878 | #ifdef CPU_ARM |
880 | const unsigned char *_src; | 879 | const unsigned char *_src; |
881 | unsigned _mask, trash; | 880 | unsigned _mask, depth, trash; |
882 | 881 | ||
883 | _mask = mask; | 882 | _mask = mask; |
884 | _src = src; | 883 | _src = src; |
885 | 884 | ||
886 | /* precalculate the bit patterns with random shifts | 885 | /* precalculate the bit patterns with random shifts |
887 | for all 8 pixels and put them on an extra "stack" */ | 886 | for all 8 pixels and put them on an extra "stack" */ |
888 | asm volatile ( | 887 | asm volatile |
888 | ( | ||
889 | "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ | 889 | "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ |
890 | "mov r3, #8 \n" /* loop count */ | 890 | "mov r3, #8 \n" /* loop count */ |
891 | 891 | ||
@@ -932,83 +932,228 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
932 | ); | 932 | ); |
933 | 933 | ||
934 | addr = address; | 934 | addr = address; |
935 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | ||
936 | _mask = mask; | 935 | _mask = mask; |
936 | depth = _gray_info.depth; | ||
937 | 937 | ||
938 | /* set the bits for all 8 pixels in all bytes according to the | 938 | /* set the bits for all 8 pixels in all bytes according to the |
939 | * precalculated patterns on the pattern stack */ | 939 | * precalculated patterns on the pattern stack */ |
940 | asm volatile ( | 940 | asm volatile |
941 | "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ | 941 | ( |
942 | 942 | "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */ | |
943 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | 943 | |
944 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ | ||
945 | |||
946 | "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/ | ||
947 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
948 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */ | ||
949 | "eor r0, r1, r5, lsl #4 \n" | ||
950 | "and r0, r0, %[rx] \n" | ||
951 | "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ | ||
952 | "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ | ||
953 | "eor r0, r2, r6, lsl #4 \n" | ||
954 | "and r0, r0, %[rx] \n" | ||
955 | "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ | ||
956 | "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */ | ||
957 | "eor r0, r3, r7, lsl #4 \n" | ||
958 | "and r0, r0, %[rx] \n" | ||
959 | "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ | ||
960 | "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ | ||
961 | "eor r0, r4, r8, lsl #4 \n" | ||
962 | "and r0, r0, %[rx] \n" | ||
963 | "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ | ||
964 | "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ | ||
965 | |||
966 | "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/ | ||
967 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
968 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */ | ||
969 | "eor r0, r1, r3, lsl #2 \n" | ||
970 | "and r0, r0, %[rx] \n" | ||
971 | "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ | ||
972 | "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ | ||
973 | "eor r0, r2, r4, lsl #2 \n" | ||
974 | "and r0, r0, %[rx] \n" | ||
975 | "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ | ||
976 | "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ | ||
977 | "eor r0, r5, r7, lsl #2 \n" | ||
978 | "and r0, r0, %[rx] \n" | ||
979 | "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | ||
980 | "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ | ||
981 | "eor r0, r6, r8, lsl #2 \n" | ||
982 | "and r0, r0, %[rx] \n" | ||
983 | "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ | ||
984 | "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ | ||
985 | |||
986 | "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/ | ||
987 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
988 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */ | ||
989 | "eor r0, r1, r2, lsl #1 \n" | ||
990 | "and r0, r0, %[rx] \n" | ||
991 | "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
992 | "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
993 | "eor r0, r3, r4, lsl #1 \n" | ||
994 | "and r0, r0, %[rx] \n" | ||
995 | "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
996 | "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
997 | "eor r0, r5, r6, lsl #1 \n" | ||
998 | "and r0, r0, %[rx] \n" | ||
999 | "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
1000 | "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
1001 | "eor r0, r7, r8, lsl #1 \n" | ||
1002 | "and r0, r0, %[rx] \n" | ||
1003 | "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
1004 | "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
1005 | |||
1006 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
944 | "ands %[mask], %[mask], #0xff \n" | 1007 | "ands %[mask], %[mask], #0xff \n" |
945 | "beq .wa_sloop \n" /* short loop if nothing to keep */ | 1008 | "beq .wa_sloop \n" /* short loop if no bits to keep */ |
946 | 1009 | ||
947 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1010 | ".wa_floop: \n" /** full loop (bits to keep)**/ |
948 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 1011 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ |
949 | "adc r0, r0, r0 \n" /* put bit into LSB of byte */ | 1012 | "bhs .wa_f8 \n" |
950 | "movs r8, r8, lsr #1 \n" | 1013 | |
951 | "adc r0, r0, r0 \n" | 1014 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ |
952 | "movs r7, r7, lsr #1 \n" | 1015 | "add %[addr], %[addr], r0 \n" /* for this round */ |
953 | "adc r0, r0, r0 \n" | 1016 | |
954 | "movs r6, r6, lsr #1 \n" | 1017 | |
955 | "adc r0, r0, r0 \n" | 1018 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ |
956 | "movs r5, r5, lsr #1 \n" | 1019 | "add pc, pc, r0 \n" |
957 | "adc r0, r0, r0 \n" | 1020 | ".wa_ftable: \n" |
958 | "movs r4, r4, lsr #1 \n" | 1021 | ".byte .wa_f0 - .wa_ftable - 4 \n" /* [offsets are relative to PC, which reads 8 bytes ahead] */ |
959 | "adc r0, r0, r0 \n" | 1022 | ".byte .wa_f1 - .wa_ftable - 4 \n" |
960 | "movs r3, r3, lsr #1 \n" | 1023 | ".byte .wa_f2 - .wa_ftable - 4 \n" |
961 | "adc r0, r0, r0 \n" | 1024 | ".byte .wa_f3 - .wa_ftable - 4 \n" |
962 | "movs r2, r2, lsr #1 \n" | 1025 | ".byte .wa_f4 - .wa_ftable - 4 \n" |
963 | "adc r0, r0, r0 \n" | 1026 | ".byte .wa_f5 - .wa_ftable - 4 \n" |
964 | 1027 | ".byte .wa_f6 - .wa_ftable - 4 \n" | |
965 | "ldrb r1, [%[addr]] \n" /* read old value */ | 1028 | ".byte .wa_f7 - .wa_ftable - 4 \n" |
966 | "and r1, r1, %[mask] \n" /* mask out replaced bits */ | 1029 | |
967 | "orr r1, r1, r0 \n" /* set new bits */ | 1030 | ".wa_f8: \n" |
968 | "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ | 1031 | "add %[addr], %[addr], %[psiz], lsl #3 \n" |
969 | 1032 | /* Point behind the last plane for this round. Note: We're using the | |
970 | "cmp %[end], %[addr] \n" /* loop through all bitplanes */ | 1033 | * registers backwards in order to reuse the streak for the last round. |
971 | "bne .wa_floop \n" | 1034 | * Therefore we need to go through the bitplanes backwards too, otherwise |
972 | 1035 | * the bit order would be destroyed, which results in more flicker. */ |
1036 | "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ | ||
1037 | "and r0, r0, %[mask] \n" /* mask out replaced bits */ | ||
1038 | "orr r0, r0, r8 \n" /* set new bits */ | ||
1039 | "strb r0, [%[addr]] \n" /* store byte */ | ||
1040 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ | ||
1041 | ".wa_f7: \n" | ||
1042 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1043 | "and r0, r0, %[mask] \n" | ||
1044 | "orr r0, r0, r7 \n" | ||
1045 | "strb r0, [%[addr]] \n" | ||
1046 | "mov r7, r7, lsr #8 \n" | ||
1047 | ".wa_f6: \n" | ||
1048 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1049 | "and r0, r0, %[mask] \n" | ||
1050 | "orr r0, r0, r6 \n" | ||
1051 | "strb r0, [%[addr]] \n" | ||
1052 | "mov r6, r6, lsr #8 \n" | ||
1053 | ".wa_f5: \n" | ||
1054 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1055 | "and r0, r0, %[mask] \n" | ||
1056 | "orr r0, r0, r5 \n" | ||
1057 | "strb r0, [%[addr]] \n" | ||
1058 | "mov r5, r5, lsr #8 \n" | ||
1059 | ".wa_f4: \n" | ||
1060 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1061 | "and r0, r0, %[mask] \n" | ||
1062 | "orr r0, r0, r4 \n" | ||
1063 | "strb r0, [%[addr]] \n" | ||
1064 | "mov r4, r4, lsr #8 \n" | ||
1065 | ".wa_f3: \n" | ||
1066 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1067 | "and r0, r0, %[mask] \n" | ||
1068 | "orr r0, r0, r3 \n" | ||
1069 | "strb r0, [%[addr]] \n" | ||
1070 | "mov r3, r3, lsr #8 \n" | ||
1071 | ".wa_f2: \n" | ||
1072 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1073 | "and r0, r0, %[mask] \n" | ||
1074 | "orr r0, r0, r2 \n" | ||
1075 | "strb r0, [%[addr]] \n" | ||
1076 | "mov r2, r2, lsr #8 \n" | ||
1077 | ".wa_f1: \n" | ||
1078 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1079 | "and r0, r0, %[mask] \n" | ||
1080 | "orr r0, r0, r1 \n" | ||
1081 | "strb r0, [%[addr]] \n" | ||
1082 | "mov r1, r1, lsr #8 \n" | ||
1083 | ".wa_f0: \n" | ||
1084 | |||
1085 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ | ||
1086 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ | ||
1087 | "bhi .wa_floop \n" | ||
1088 | |||
973 | "b .wa_end \n" | 1089 | "b .wa_end \n" |
974 | 1090 | ||
975 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1091 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ |
976 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 1092 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ |
977 | "adc r0, r0, r0 \n" /* put bit into LSB of byte */ | 1093 | "bhs .wa_s8 \n" |
978 | "movs r8, r8, lsr #1 \n" | ||
979 | "adc r0, r0, r0 \n" | ||
980 | "movs r7, r7, lsr #1 \n" | ||
981 | "adc r0, r0, r0 \n" | ||
982 | "movs r6, r6, lsr #1 \n" | ||
983 | "adc r0, r0, r0 \n" | ||
984 | "movs r5, r5, lsr #1 \n" | ||
985 | "adc r0, r0, r0 \n" | ||
986 | "movs r4, r4, lsr #1 \n" | ||
987 | "adc r0, r0, r0 \n" | ||
988 | "movs r3, r3, lsr #1 \n" | ||
989 | "adc r0, r0, r0 \n" | ||
990 | "movs r2, r2, lsr #1 \n" | ||
991 | "adc r0, r0, r0 \n" | ||
992 | |||
993 | "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ | ||
994 | 1094 | ||
995 | "cmp %[end], %[addr] \n" /* loop through all bitplanes */ | 1095 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ |
996 | "bne .wa_sloop \n" | 1096 | "add %[addr], %[addr], r0 \n" /* for this round */ |
997 | 1097 | ||
1098 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ | ||
1099 | "add pc, pc, r0 \n" | ||
1100 | ".wa_stable: \n" | ||
1101 | ".byte .wa_s0 - .wa_stable - 4 \n" | ||
1102 | ".byte .wa_s1 - .wa_stable - 4 \n" | ||
1103 | ".byte .wa_s2 - .wa_stable - 4 \n" | ||
1104 | ".byte .wa_s3 - .wa_stable - 4 \n" | ||
1105 | ".byte .wa_s4 - .wa_stable - 4 \n" | ||
1106 | ".byte .wa_s5 - .wa_stable - 4 \n" | ||
1107 | ".byte .wa_s6 - .wa_stable - 4 \n" | ||
1108 | ".byte .wa_s7 - .wa_stable - 4 \n" | ||
1109 | |||
1110 | ".wa_s8: \n" | ||
1111 | "add %[addr], %[addr], %[psiz], lsl #3 \n" | ||
1112 | /* Point behind the last plane for this round. See above. */ | ||
1113 | "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ | ||
1114 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ | ||
1115 | ".wa_s7: \n" | ||
1116 | "strb r7, [%[addr], -%[psiz]]! \n" | ||
1117 | "mov r7, r7, lsr #8 \n" | ||
1118 | ".wa_s6: \n" | ||
1119 | "strb r6, [%[addr], -%[psiz]]! \n" | ||
1120 | "mov r6, r6, lsr #8 \n" | ||
1121 | ".wa_s5: \n" | ||
1122 | "strb r5, [%[addr], -%[psiz]]! \n" | ||
1123 | "mov r5, r5, lsr #8 \n" | ||
1124 | ".wa_s4: \n" | ||
1125 | "strb r4, [%[addr], -%[psiz]]! \n" | ||
1126 | "mov r4, r4, lsr #8 \n" | ||
1127 | ".wa_s3: \n" | ||
1128 | "strb r3, [%[addr], -%[psiz]]! \n" | ||
1129 | "mov r3, r3, lsr #8 \n" | ||
1130 | ".wa_s2: \n" | ||
1131 | "strb r2, [%[addr], -%[psiz]]! \n" | ||
1132 | "mov r2, r2, lsr #8 \n" | ||
1133 | ".wa_s1: \n" | ||
1134 | "strb r1, [%[addr], -%[psiz]]! \n" | ||
1135 | "mov r1, r1, lsr #8 \n" | ||
1136 | ".wa_s0: \n" | ||
1137 | |||
1138 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ | ||
1139 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ | ||
1140 | "bhi .wa_sloop \n" | ||
1141 | |||
998 | ".wa_end: \n" | 1142 | ".wa_end: \n" |
999 | : /* outputs */ | 1143 | : /* outputs */ |
1000 | [addr]"+r"(addr), | 1144 | [addr]"+r"(addr), |
1001 | [mask]"+r"(_mask), | 1145 | [mask]"+r"(_mask), |
1146 | [dpth]"+r"(depth), | ||
1002 | [rx] "=&r"(trash) | 1147 | [rx] "=&r"(trash) |
1003 | : /* inputs */ | 1148 | : /* inputs */ |
1004 | [psiz]"r"(_gray_info.plane_size), | 1149 | [psiz]"r"(_gray_info.plane_size), |
1005 | [end] "r"(end), | ||
1006 | [patp]"[rx]"(pat_ptr) | 1150 | [patp]"[rx]"(pat_ptr) |
1007 | : /* clobbers */ | 1151 | : /* clobbers */ |
1008 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | 1152 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" |
1009 | ); | 1153 | ); |
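The three "comb" stages in the asm above implement a standard 8x8 bit-matrix transpose, four matrices in parallel (one per byte lane of the 32-bit registers): going in, each register holds one pixel's bit-per-plane pattern; coming out, byte 0 of register n holds bit n of all eight pixels, i.e. one complete plane byte. The same exchange written as a C sketch, using the masks from the asm:

/* Swap the bit field selected by 'mask' between *lo (in place) and
 * *hi (shifted down by 'shift'); the classic delta-swap step. */
static void bit_exchange(unsigned long *lo, unsigned long *hi,
                         int shift, unsigned long mask)
{
    unsigned long t = (*lo ^ (*hi << shift)) & mask;
    *lo ^= t;
    *hi ^= t >> shift;
}

static void transpose_8x8_blocks(unsigned long p[8])
{
    int i;
    for (i = 0; i < 4; i++)                 /* stage 1: 4 bit comb */
        bit_exchange(&p[i], &p[i + 4], 4, 0xF0F0F0F0UL);
    for (i = 0; i < 6; i++)                 /* stage 2: 2 bit comb */
        if (!(i & 2))
            bit_exchange(&p[i], &p[i + 2], 2, 0xCCCCCCCCUL);
    for (i = 0; i < 8; i += 2)              /* stage 3: 1 bit comb */
        bit_exchange(&p[i], &p[i + 1], 1, 0xAAAAAAAAUL);
}

With p[0]..p[7] standing for r1..r8, this reproduces the register comments above: p[0] ends up as ...h0g0f0e0d0c0b0a0, and so on.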
1010 | #else /* C version, for reference*/ | 1154 | #else /* C version, for reference*/ |
1011 | #warning C version of _writearray() used | 1155 | #warning C version of _writearray() used |
1156 | unsigned char *end; | ||
1012 | unsigned test = 0x80; | 1157 | unsigned test = 0x80; |
1013 | int i; | 1158 | int i; |
1014 | 1159 | ||
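The only difference between the full loop and the short loop is what happens per plane byte: the full loop must preserve the pixels excluded by the mask, so it reads the old byte back, clears the replaced bits with the inverted "keep" mask, and ORs the new bits in. As a one-line C sketch, where keep is ~mask & 0xff (the ColdFire version further down now computes this in C):

/* Masked update of one bitplane byte. The patterns of skipped pixels
 * were preset to 0 in the precalc stage, so a plain OR is safe. */
static void plane_byte_update(unsigned char *addr, unsigned char bits,
                              unsigned char keep)
{
    *addr = (unsigned char)((*addr & keep) | bits);
}

When the mask covers all eight pixels, keep is 0 and the read-back becomes pointless, which is exactly the case the branch to the short loop detects.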
@@ -1143,67 +1288,70 @@ void gray_ub_gray_bitmap_part(const unsigned char *src, int src_x, int src_y, | |||
1143 | stride is the increment for the greymap to get to the next pixel, mask | 1288 | stride is the increment for the greymap to get to the next pixel, mask |
1144 | determines which pixels of the destination block are changed. */ | 1289 | determines which pixels of the destination block are changed. */ |
1145 | static void _writearray(unsigned char *address, const unsigned char *src, | 1290 | static void _writearray(unsigned char *address, const unsigned char *src, |
1291 | int stride, unsigned mask) __attribute__((noinline)); | ||
1292 | static void _writearray(unsigned char *address, const unsigned char *src, | ||
1146 | int stride, unsigned mask) | 1293 | int stride, unsigned mask) |
1147 | { | 1294 | { |
1148 | unsigned long pat_stack[8]; | 1295 | unsigned long pat_stack[8]; |
1149 | unsigned long *pat_ptr = &pat_stack[8]; | 1296 | unsigned long *pat_ptr = &pat_stack[8]; |
1150 | unsigned char *addr, *end; | 1297 | unsigned char *addr; |
1151 | #if CONFIG_CPU == SH7034 | 1298 | #if CONFIG_CPU == SH7034 |
1152 | const unsigned char *_src; | 1299 | const unsigned char *_src; |
1153 | unsigned _mask, trash; | 1300 | unsigned _mask, depth, trash; |
1154 | 1301 | ||
1155 | _mask = mask; | 1302 | _mask = mask; |
1156 | _src = src; | 1303 | _src = src; |
1157 | 1304 | ||
1158 | /* precalculate the bit patterns with random shifts | 1305 | /* precalculate the bit patterns with random shifts |
1159 | for all 8 pixels and put them on an extra "stack" */ | 1306 | for all 8 pixels and put them on an extra "stack" */ |
1160 | asm volatile ( | 1307 | asm volatile |
1161 | "mov #8,r3 \n" /* loop count */ | 1308 | ( |
1162 | 1309 | "mov #8, r3 \n" /* loop count */ | |
1163 | ".wa_loop: \n" /** load pattern for pixel **/ | 1310 | |
1164 | "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ | 1311 | ".wa_loop: \n" /** load pattern for pixel **/ |
1165 | "shlr %[mask] \n" /* shift out lsb of mask */ | 1312 | "mov #0, r0 \n" /* pattern for skipped pixel must be 0 */ |
1166 | "bf .wa_skip \n" /* skip this pixel */ | 1313 | "shlr %[mask] \n" /* shift out lsb of mask */ |
1167 | 1314 | "bf .wa_skip \n" /* skip this pixel */ | |
1168 | "mov.b @%[src],r0 \n" /* load src byte */ | 1315 | |
1169 | "extu.b r0,r0 \n" /* extend unsigned */ | 1316 | "mov.b @%[src], r0 \n" /* load src byte */ |
1170 | "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ | 1317 | "extu.b r0, r0 \n" /* extend unsigned */ |
1171 | "extu.b r0,r0 \n" /* extend unsigned */ | 1318 | "mov.b @(r0,%[trns]), r0 \n" /* idxtable into pattern index */ |
1172 | "shll2 r0 \n" | 1319 | "extu.b r0, r0 \n" /* extend unsigned */ |
1173 | "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ | 1320 | "shll2 r0 \n" |
1174 | 1321 | "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */ | |
1175 | "mov #75,r0 \n" | 1322 | |
1176 | "mulu r0,%[rnd] \n" /* multiply by 75 */ | 1323 | "mov #75, r0 \n" |
1177 | "sts macl,%[rnd] \n" | 1324 | "mulu r0, %[rnd] \n" /* multiply by 75 */ |
1178 | "add #74,%[rnd] \n" /* add another 74 */ | 1325 | "sts macl, %[rnd] \n" |
1326 | "add #74, %[rnd] \n" /* add another 74 */ | ||
1179 | /* Since the lower bits are not very random: */ | 1327 | /* Since the lower bits are not very random: */ |
1180 | "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ | 1328 | "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */ |
1181 | "and %[rmsk],r1 \n" /* mask out unneeded bits */ | 1329 | "and %[rmsk], r1 \n" /* mask out unneeded bits */ |
1182 | 1330 | ||
1183 | "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ | 1331 | "cmp/hs %[dpth], r1 \n" /* random >= depth ? */ |
1184 | "bf .wa_ntrim \n" | 1332 | "bf .wa_ntrim \n" |
1185 | "sub %[dpth],r1 \n" /* yes: random -= depth; */ | 1333 | "sub %[dpth], r1 \n" /* yes: random -= depth; */ |
1186 | ".wa_ntrim: \n" | 1334 | ".wa_ntrim: \n" |
1187 | 1335 | ||
1188 | "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ | 1336 | "mov.l .ashlsi3, r0 \n" /** rotate pattern **/ |
1189 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ | 1337 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ |
1190 | "mov r1,r5 \n" | 1338 | "mov r1, r5 \n" |
1191 | 1339 | ||
1192 | "mov %[dpth],r5 \n" | 1340 | "mov %[dpth], r5 \n" |
1193 | "sub r1,r5 \n" /* r5 = depth - r1 */ | 1341 | "sub r1, r5 \n" /* r5 = depth - r1 */ |
1194 | "mov.l .lshrsi3,r1 \n" | 1342 | "mov.l .lshrsi3, r1 \n" |
1195 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ | 1343 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ |
1196 | "mov r0,r1 \n" /* store previous result in r1 */ | 1344 | "mov r0, r1 \n" /* store previous result in r1 */ |
1197 | 1345 | ||
1198 | "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ | 1346 | "or r1, r0 \n" /* rotated_pattern = r0 | r1 */ |
1199 | 1347 | ||
1200 | ".wa_skip: \n" | 1348 | ".wa_skip: \n" |
1201 | "mov.l r0,@-%[patp] \n" /* push on pattern stack */ | 1349 | "mov.l r0, @-%[patp] \n" /* push on pattern stack */ |
1202 | 1350 | ||
1203 | "add %[stri],%[src] \n" /* src += stride; */ | 1351 | "add %[stri], %[src] \n" /* src += stride; */ |
1204 | "add #-1,r3 \n" /* loop 8 times (pixel block) */ | 1352 | "add #-1, r3 \n" /* loop 8 times (pixel block) */ |
1205 | "cmp/pl r3 \n" | 1353 | "cmp/pl r3 \n" |
1206 | "bt .wa_loop \n" | 1354 | "bt .wa_loop \n" |
1207 | : /* outputs */ | 1355 | : /* outputs */ |
1208 | [src] "+r"(_src), | 1356 | [src] "+r"(_src), |
1209 | [rnd] "+r"(_gray_random_buffer), | 1357 | [rnd] "+r"(_gray_random_buffer), |
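The precalculation loop above works the same way on all three CPUs: each greymap byte selects a bit-per-plane pattern (via the index table on SH and ColdFire), which is rotated by a pseudo-random amount so that neighbouring pixels don't flicker in phase, then pushed onto the small pattern stack. A C sketch of one pixel, with the table lookup already done; _gray_random_buffer (in the constraint list above) is the shared LCG state:

static unsigned long precalc_pattern_sketch(unsigned long pat, int depth,
                                            unsigned *rnd, unsigned rnd_mask)
{
    unsigned r;

    *rnd = 75 * *rnd + 74;        /* the LCG from the asm */
    r = (*rnd >> 8) & rnd_mask;   /* lower bits aren't very random */
    if (r >= (unsigned)depth)     /* cheap "modulo": one conditional sub */
        r -= depth;

    /* rotate the depth-bit pattern left by r; the r == 0 special case
     * avoids an undefined full-width shift in C */
    return r ? (pat << r) | (pat >> (depth - r)) : pat;
}

Since SH1 lacks a variable-count shift instruction, the two variable shifts go through the ___ashlsi3/___lshrsi3 helper routines referenced below.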
@@ -1220,143 +1368,369 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1220 | ); | 1368 | ); |
1221 | 1369 | ||
1222 | addr = address; | 1370 | addr = address; |
1223 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | ||
1224 | _mask = mask; | 1371 | _mask = mask; |
1372 | depth = _gray_info.depth; | ||
1225 | 1373 | ||
1226 | /* set the bits for all 8 pixels in all bytes according to the | 1374 | /* set the bits for all 8 pixels in all bytes according to the |
1227 | * precalculated patterns on the pattern stack */ | 1375 | * precalculated patterns on the pattern stack */ |
1228 | asm volatile ( | 1376 | asm volatile |
1229 | "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ | 1377 | ( |
1230 | "mov.l @%[patp]+,r2 \n" | 1378 | "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */ |
1231 | "mov.l @%[patp]+,r3 \n" | 1379 | "mov.l @%[patp]+, r7 \n" |
1232 | "mov.l @%[patp]+,r6 \n" | 1380 | "mov.l @%[patp]+, r6 \n" |
1233 | "mov.l @%[patp]+,r7 \n" | 1381 | "mov.l @%[patp]+, r5 \n" |
1234 | "mov.l @%[patp]+,r8 \n" | 1382 | "mov.l @%[patp]+, r4 \n" |
1235 | "mov.l @%[patp]+,r9 \n" | 1383 | "mov.l @%[patp]+, r3 \n" |
1236 | "mov.l @%[patp],r10 \n" | 1384 | "mov.l @%[patp]+, r2 \n" |
1237 | 1385 | "mov.l @%[patp], r1 \n" | |
1238 | "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ | 1386 | |
1239 | "extu.b %[mask],%[mask] \n" /* mask out high bits */ | 1387 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ |
1240 | "tst %[mask],%[mask] \n" | 1388 | |
1241 | "bt .wa_sloop \n" /* short loop if nothing to keep */ | 1389 | "mov.l .wa_mask4, %[rx] \n" /* bitmask = ...11110000 */ |
1242 | 1390 | "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/ | |
1243 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1391 | "shll2 r0 \n" |
1244 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1392 | "shll2 r0 \n" |
1245 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1393 | "xor r1, r0 \n" |
1246 | "shlr r2 \n" | 1394 | "and %[rx], r0 \n" |
1247 | "rotcl r0 \n" | 1395 | "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ |
1248 | "shlr r3 \n" | 1396 | "shlr2 r0 \n" |
1249 | "rotcl r0 \n" | 1397 | "shlr2 r0 \n" |
1250 | "shlr r6 \n" | 1398 | "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ |
1251 | "rotcl r0 \n" | 1399 | "mov r6, r0 \n" |
1252 | "shlr r7 \n" | 1400 | "shll2 r0 \n" |
1253 | "rotcl r0 \n" | 1401 | "shll2 r0 \n" |
1254 | "shlr r8 \n" | 1402 | "xor r2, r0 \n" |
1255 | "rotcl r0 \n" | 1403 | "and %[rx], r0 \n" |
1256 | "shlr r9 \n" | 1404 | "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ |
1257 | "rotcl r0 \n" | 1405 | "shlr2 r0 \n" |
1258 | "shlr r10 \n" | 1406 | "shlr2 r0 \n" |
1259 | "mov.b @%[addr],%[rx] \n" /* read old value */ | 1407 | "xor r0, r6 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */ |
1260 | "rotcl r0 \n" | 1408 | "mov r7, r0 \n" |
1261 | "and %[mask],%[rx] \n" /* mask out replaced bits */ | 1409 | "shll2 r0 \n" |
1262 | "or %[rx],r0 \n" /* set new bits */ | 1410 | "shll2 r0 \n" |
1263 | "mov.b r0,@%[addr] \n" /* store value to bitplane */ | 1411 | "xor r3, r0 \n" |
1264 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1412 | "and %[rx], r0 \n" |
1265 | "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ | 1413 | "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ |
1266 | "bt .wa_floop \n" | 1414 | "shlr2 r0 \n" |
1267 | 1415 | "shlr2 r0 \n" | |
1268 | "bra .wa_end \n" | 1416 | "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ |
1269 | "nop \n" | 1417 | "mov r8, r0 \n" |
1418 | "shll2 r0 \n" | ||
1419 | "shll2 r0 \n" | ||
1420 | "xor r4, r0 \n" | ||
1421 | "and %[rx], r0 \n" | ||
1422 | "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ | ||
1423 | "shlr2 r0 \n" | ||
1424 | "shlr2 r0 \n" | ||
1425 | "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ | ||
1426 | |||
1427 | "mov.l .wa_mask2, %[rx] \n" /* bitmask = ...11001100 */ | ||
1428 | "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/ | ||
1429 | "shll2 r0 \n" | ||
1430 | "xor r1, r0 \n" | ||
1431 | "and %[rx], r0 \n" | ||
1432 | "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ | ||
1433 | "shlr2 r0 \n" | ||
1434 | "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ | ||
1435 | "mov r4, r0 \n" | ||
1436 | "shll2 r0 \n" | ||
1437 | "xor r2, r0 \n" | ||
1438 | "and %[rx], r0 \n" | ||
1439 | "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ | ||
1440 | "shlr2 r0 \n" | ||
1441 | "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ | ||
1442 | "mov r7, r0 \n" | ||
1443 | "shll2 r0 \n" | ||
1444 | "xor r5, r0 \n" | ||
1445 | "and %[rx], r0 \n" | ||
1446 | "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | ||
1447 | "shlr2 r0 \n" | ||
1448 | "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ | ||
1449 | "mov r8, r0 \n" | ||
1450 | "shll2 r0 \n" | ||
1451 | "xor r6, r0 \n" | ||
1452 | "and %[rx], r0 \n" | ||
1453 | "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ | ||
1454 | "shlr2 r0 \n" | ||
1455 | "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ | ||
1456 | |||
1457 | "mov.l .wa_mask1, %[rx] \n" /* bitmask = ...10101010 */ | ||
1458 | "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/ | ||
1459 | "shll r0 \n" | ||
1460 | "xor r1, r0 \n" | ||
1461 | "and %[rx], r0 \n" | ||
1462 | "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
1463 | "shlr r0 \n" | ||
1464 | "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
1465 | "mov r4, r0 \n" | ||
1466 | "shll r0 \n" | ||
1467 | "xor r3, r0 \n" | ||
1468 | "and %[rx], r0 \n" | ||
1469 | "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
1470 | "shlr r0 \n" | ||
1471 | "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
1472 | "mov r6, r0 \n" | ||
1473 | "shll r0 \n" | ||
1474 | "xor r5, r0 \n" | ||
1475 | "and %[rx], r0 \n" | ||
1476 | "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
1477 | "shlr r0 \n" | ||
1478 | "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
1479 | "mov r8, r0 \n" | ||
1480 | "shll r0 \n" | ||
1481 | "xor r7, r0 \n" | ||
1482 | "and %[rx], r0 \n" | ||
1483 | "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
1484 | "shlr r0 \n" | ||
1485 | "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
1486 | |||
1487 | "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
1488 | "extu.b %[mask], %[mask] \n" /* mask out high bits */ | ||
1489 | "tst %[mask], %[mask] \n" | ||
1490 | "bt .wa_sloop \n" /* short loop if nothing to keep */ | ||
1491 | |||
1492 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | ||
1493 | "mov #8, r0 \n" | ||
1494 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ | ||
1495 | "bt .wa_f8 \n" | ||
1496 | |||
1497 | "mulu %[psiz], %[dpth] \n" | ||
1498 | "mova .wa_ftable, r0 \n" | ||
1499 | "mov.b @(r0, %[dpth]), %[rx] \n" | ||
1500 | "add %[rx], r0 \n" | ||
1501 | "sts macl, %[rx] \n" /* point behind the last plane.. */ | ||
1502 | "jmp @r0 \n" /* jump into streak */ | ||
1503 | "add %[rx], %[addr] \n" /* ..for this round */ | ||
1504 | |||
1505 | ".align 2 \n" | ||
1506 | ".wa_ftable: \n" | ||
1507 | ".byte .wa_f0 - .wa_ftable \n" | ||
1508 | ".byte .wa_f1 - .wa_ftable \n" | ||
1509 | ".byte .wa_f2 - .wa_ftable \n" | ||
1510 | ".byte .wa_f3 - .wa_ftable \n" | ||
1511 | ".byte .wa_f4 - .wa_ftable \n" | ||
1512 | ".byte .wa_f5 - .wa_ftable \n" | ||
1513 | ".byte .wa_f6 - .wa_ftable \n" | ||
1514 | ".byte .wa_f7 - .wa_ftable \n" | ||
1515 | |||
1516 | ".wa_f8: \n" | ||
1517 | "mov %[psiz], %[rx] \n" | ||
1518 | "shll2 %[rx] \n" | ||
1519 | "add %[rx], %[rx] \n" | ||
1520 | "add %[rx], %[addr] \n" | ||
1521 | /* Point behind the last plane for this round. Note: We're using the | ||
1522 | * registers backwards in order to reuse the streak for the last round. | ||
1523 | * Therefore we need to go through the bitplanes backwards too, otherwise | ||
1524 | * the bit order would be destroyed, which results in more flicker. */ | ||
1525 | "sub %[psiz], %[addr] \n" | ||
1526 | "mov.b @%[addr], r0 \n" /* load old byte */ | ||
1527 | "and %[mask], r0 \n" /* mask out replaced bits */ | ||
1528 | "or r8, r0 \n" /* set new bits */ | ||
1529 | "mov.b r0, @%[addr] \n" /* store byte */ | ||
1530 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1531 | ".wa_f7: \n" | ||
1532 | "sub %[psiz], %[addr] \n" | ||
1533 | "mov.b @%[addr], r0 \n" | ||
1534 | "and %[mask], r0 \n" | ||
1535 | "or r7, r0 \n" | ||
1536 | "mov.b r0, @%[addr] \n" | ||
1537 | "shlr8 r7 \n" | ||
1538 | ".wa_f6: \n" | ||
1539 | "sub %[psiz], %[addr] \n" | ||
1540 | "mov.b @%[addr], r0 \n" | ||
1541 | "and %[mask], r0 \n" | ||
1542 | "or r6, r0 \n" | ||
1543 | "mov.b r0, @%[addr] \n" | ||
1544 | "shlr8 r6 \n" | ||
1545 | ".wa_f5: \n" | ||
1546 | "sub %[psiz], %[addr] \n" | ||
1547 | "mov.b @%[addr], r0 \n" | ||
1548 | "and %[mask], r0 \n" | ||
1549 | "or r5, r0 \n" | ||
1550 | "mov.b r0, @%[addr] \n" | ||
1551 | "shlr8 r5 \n" | ||
1552 | ".wa_f4: \n" | ||
1553 | "sub %[psiz], %[addr] \n" | ||
1554 | "mov.b @%[addr], r0 \n" | ||
1555 | "and %[mask], r0 \n" | ||
1556 | "or r4, r0 \n" | ||
1557 | "mov.b r0, @%[addr] \n" | ||
1558 | "shlr8 r4 \n" | ||
1559 | ".wa_f3: \n" | ||
1560 | "sub %[psiz], %[addr] \n" | ||
1561 | "mov.b @%[addr], r0 \n" | ||
1562 | "and %[mask], r0 \n" | ||
1563 | "or r3, r0 \n" | ||
1564 | "mov.b r0, @%[addr] \n" | ||
1565 | "shlr8 r3 \n" | ||
1566 | ".wa_f2: \n" | ||
1567 | "sub %[psiz], %[addr] \n" | ||
1568 | "mov.b @%[addr], r0 \n" | ||
1569 | "and %[mask], r0 \n" | ||
1570 | "or r2, r0 \n" | ||
1571 | "mov.b r0, @%[addr] \n" | ||
1572 | "shlr8 r2 \n" | ||
1573 | ".wa_f1: \n" | ||
1574 | "sub %[psiz], %[addr] \n" | ||
1575 | "mov.b @%[addr], r0 \n" | ||
1576 | "and %[mask], r0 \n" | ||
1577 | "or r1, r0 \n" | ||
1578 | "mov.b r0, @%[addr] \n" | ||
1579 | "shlr8 r1 \n" | ||
1580 | ".wa_f0: \n" | ||
1581 | |||
1582 | "add %[rx], %[addr] \n" /* correct address */ | ||
1583 | "add #-8, %[dpth] \n" | ||
1584 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1585 | "bt .wa_floop \n" | ||
1586 | |||
1587 | "bra .wa_end \n" | ||
1588 | "nop \n" | ||
1270 | 1589 | ||
1271 | /* References to C library routines used in the precalc block */ | 1590 | /* References to C library routines used in the precalc block */ |
1272 | ".align 2 \n" | 1591 | ".align 2 \n" |
1273 | ".ashlsi3: \n" /* C library routine: */ | 1592 | ".ashlsi3: \n" /* C library routine: */ |
1274 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ | 1593 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ |
1275 | ".lshrsi3: \n" /* C library routine: */ | 1594 | ".lshrsi3: \n" /* C library routine: */ |
1276 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ | 1595 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ |
1277 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | 1596 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ |
1278 | 1597 | ||
1279 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1598 | /* Bitmasks for the bit block rotation */ |
1280 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1599 | ".wa_mask4: \n" |
1281 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1600 | ".long 0xF0F0F0F0 \n" |
1282 | "shlr r2 \n" | 1601 | ".wa_mask2: \n" |
1283 | "rotcl r0 \n" | 1602 | ".long 0xCCCCCCCC \n" |
1284 | "shlr r3 \n" | 1603 | ".wa_mask1: \n" |
1285 | "rotcl r0 \n" | 1604 | ".long 0xAAAAAAAA \n" |
1286 | "shlr r6 \n" | 1605 | |
1287 | "rotcl r0 \n" | 1606 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ |
1288 | "shlr r7 \n" | 1607 | "mov #8, r0 \n" |
1289 | "rotcl r0 \n" | 1608 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ |
1290 | "shlr r8 \n" | 1609 | "bt .wa_s8 \n" |
1291 | "rotcl r0 \n" | 1610 | |
1292 | "shlr r9 \n" | 1611 | "mulu %[psiz], %[dpth] \n" |
1293 | "rotcl r0 \n" | 1612 | "mova .wa_stable, r0 \n" |
1294 | "shlr r10 \n" | 1613 | "mov.b @(r0, %[dpth]), %[rx] \n" |
1295 | "rotcl r0 \n" | 1614 | "add %[rx], r0 \n" |
1296 | "mov.b r0,@%[addr] \n" /* store byte to bitplane */ | 1615 | "sts macl, %[rx] \n" /* point behind the last plane.. */ |
1297 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1616 | "jmp @r0 \n" /* jump into streak */ |
1298 | "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ | 1617 | "add %[rx], %[addr] \n" /* ..for this round */ |
1299 | "bt .wa_sloop \n" | 1618 | |
1300 | 1619 | ".align 2 \n" | |
1301 | ".wa_end: \n" | 1620 | ".wa_stable: \n" |
1621 | ".byte .wa_s0 - .wa_stable \n" | ||
1622 | ".byte .wa_s1 - .wa_stable \n" | ||
1623 | ".byte .wa_s2 - .wa_stable \n" | ||
1624 | ".byte .wa_s3 - .wa_stable \n" | ||
1625 | ".byte .wa_s4 - .wa_stable \n" | ||
1626 | ".byte .wa_s5 - .wa_stable \n" | ||
1627 | ".byte .wa_s6 - .wa_stable \n" | ||
1628 | ".byte .wa_s7 - .wa_stable \n" | ||
1629 | |||
1630 | ".wa_s8: \n" | ||
1631 | "mov %[psiz], %[rx] \n" /* Point behind the last plane */ | ||
1632 | "shll2 %[rx] \n" /* for this round. */ | ||
1633 | "add %[rx], %[rx] \n" /* See above. */ | ||
1634 | "add %[rx], %[addr] \n" | ||
1635 | |||
1636 | "sub %[psiz], %[addr] \n" | ||
1637 | "mov.b r8, @%[addr] \n" /* store byte */ | ||
1638 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1639 | ".wa_s7: \n" | ||
1640 | "sub %[psiz], %[addr] \n" | ||
1641 | "mov.b r7, @%[addr] \n" | ||
1642 | "shlr8 r7 \n" | ||
1643 | ".wa_s6: \n" | ||
1644 | "sub %[psiz], %[addr] \n" | ||
1645 | "mov.b r6, @%[addr] \n" | ||
1646 | "shlr8 r6 \n" | ||
1647 | ".wa_s5: \n" | ||
1648 | "sub %[psiz], %[addr] \n" | ||
1649 | "mov.b r5, @%[addr] \n" | ||
1650 | "shlr8 r5 \n" | ||
1651 | ".wa_s4: \n" | ||
1652 | "sub %[psiz], %[addr] \n" | ||
1653 | "mov.b r4, @%[addr] \n" | ||
1654 | "shlr8 r4 \n" | ||
1655 | ".wa_s3: \n" | ||
1656 | "sub %[psiz], %[addr] \n" | ||
1657 | "mov.b r3, @%[addr] \n" | ||
1658 | "shlr8 r3 \n" | ||
1659 | ".wa_s2: \n" | ||
1660 | "sub %[psiz], %[addr] \n" | ||
1661 | "mov.b r2, @%[addr] \n" | ||
1662 | "shlr8 r2 \n" | ||
1663 | ".wa_s1: \n" | ||
1664 | "sub %[psiz], %[addr] \n" | ||
1665 | "mov.b r1, @%[addr] \n" | ||
1666 | "shlr8 r1 \n" | ||
1667 | ".wa_s0: \n" | ||
1668 | |||
1669 | "add %[rx], %[addr] \n" /* correct address */ | ||
1670 | "add #-8, %[dpth] \n" | ||
1671 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1672 | "bt .wa_sloop \n" | ||
1673 | |||
1674 | ".wa_end: \n" | ||
1302 | : /* outputs */ | 1675 | : /* outputs */ |
1303 | [addr]"+r"(addr), | 1676 | [addr]"+r"(addr), |
1304 | [mask]"+r"(_mask), | 1677 | [mask]"+r"(_mask), |
1678 | [dpth]"+r"(depth), | ||
1305 | [rx] "=&r"(trash) | 1679 | [rx] "=&r"(trash) |
1306 | : /* inputs */ | 1680 | : /* inputs */ |
1307 | [psiz]"r"(_gray_info.plane_size), | 1681 | [psiz]"r"(_gray_info.plane_size), |
1308 | [end] "r"(end), | ||
1309 | [patp]"[rx]"(pat_ptr) | 1682 | [patp]"[rx]"(pat_ptr) |
1310 | : /* clobbers */ | 1683 | : /* clobbers */ |
1311 | "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" | 1684 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl" |
1312 | ); | 1685 | ); |
1313 | #elif defined(CPU_COLDFIRE) | 1686 | #elif defined(CPU_COLDFIRE) |
1314 | const unsigned char *_src; | 1687 | const unsigned char *_src; |
1315 | unsigned _mask, trash; | 1688 | unsigned _mask, depth, trash; |
1316 | 1689 | ||
1317 | _mask = mask; | 1690 | _mask = mask; |
1318 | _src = src; | 1691 | _src = src; |
1319 | 1692 | ||
1320 | /* precalculate the bit patterns with random shifts | 1693 | /* precalculate the bit patterns with random shifts |
1321 | for all 8 pixels and put them on an extra "stack" */ | 1694 | for all 8 pixels and put them on an extra "stack" */ |
1322 | asm volatile ( | 1695 | asm volatile |
1323 | "moveq.l #8,%%d3 \n" /* loop count */ | 1696 | ( |
1324 | 1697 | "moveq.l #8, %%d3 \n" /* loop count */ | |
1325 | ".wa_loop: \n" /** load pattern for pixel **/ | 1698 | |
1326 | "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ | 1699 | ".wa_loop: \n" /** load pattern for pixel **/ |
1327 | "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ | 1700 | "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ |
1328 | "bcc.b .wa_skip \n" /* skip this pixel */ | 1701 | "lsr.l #1, %[mask] \n" /* shift out lsb of mask */ |
1329 | 1702 | "bcc.b .wa_skip \n" /* skip this pixel */ | |
1330 | "clr.l %%d0 \n" | 1703 | |
1331 | "move.b (%[src]),%%d0 \n" /* load src byte */ | 1704 | "clr.l %%d0 \n" |
1332 | "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ | 1705 | "move.b (%[src]), %%d0 \n" /* load src byte */ |
1333 | "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ | 1706 | "move.b (%%d0:l:1, %[trns]), %%d0 \n" /* idxtable into pattern index */ |
1334 | 1707 | "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */ | |
1335 | "mulu.w #75,%[rnd] \n" /* multiply by 75 */ | 1708 | |
1336 | "add.l #74,%[rnd] \n" /* add another 74 */ | 1709 | "mulu.w #75, %[rnd] \n" /* multiply by 75 */ |
1710 | "add.l #74, %[rnd] \n" /* add another 74 */ | ||
1337 | /* Since the lower bits are not very random: */ | 1711 | /* Since the lower bits are not very random: */ |
1338 | "move.l %[rnd],%%d1 \n" | 1712 | "move.l %[rnd], %%d1 \n" |
1339 | "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ | 1713 | "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */ |
1340 | "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ | 1714 | "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */ |
1341 | 1715 | ||
1342 | "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ | 1716 | "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */ |
1343 | "blo.b .wa_ntrim \n" | 1717 | "blo.b .wa_ntrim \n" |
1344 | "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ | 1718 | "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */ |
1345 | ".wa_ntrim: \n" | 1719 | ".wa_ntrim: \n" |
1346 | 1720 | ||
1347 | "move.l %%d2,%%d0 \n" /** rotate pattern **/ | 1721 | "move.l %%d2, %%d0 \n" /** rotate pattern **/ |
1348 | "lsl.l %%d1,%%d0 \n" | 1722 | "lsl.l %%d1, %%d0 \n" |
1349 | "sub.l %[dpth],%%d1 \n" | 1723 | "sub.l %[dpth], %%d1 \n" |
1350 | "neg.l %%d1 \n" /* d1 = depth - d1 */ | 1724 | "neg.l %%d1 \n" /* d1 = depth - d1 */ |
1351 | "lsr.l %%d1,%%d2 \n" | 1725 | "lsr.l %%d1, %%d2 \n" |
1352 | "or.l %%d0,%%d2 \n" | 1726 | "or.l %%d0, %%d2 \n" |
1353 | 1727 | ||
1354 | ".wa_skip: \n" | 1728 | ".wa_skip: \n" |
1355 | "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ | 1729 | "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */ |
1356 | 1730 | ||
1357 | "add.l %[stri],%[src] \n" /* src += stride; */ | 1731 | "add.l %[stri], %[src] \n" /* src += stride; */ |
1358 | "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ | 1732 | "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */ |
1359 | "bne.b .wa_loop \n" | 1733 | "bne.b .wa_loop \n" |
1360 | : /* outputs */ | 1734 | : /* outputs */ |
1361 | [src] "+a"(_src), | 1735 | [src] "+a"(_src), |
1362 | [patp]"+a"(pat_ptr), | 1736 | [patp]"+a"(pat_ptr), |
@@ -1373,97 +1747,297 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1373 | ); | 1747 | ); |
1374 | 1748 | ||
1375 | addr = address; | 1749 | addr = address; |
1376 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 1750 | _mask = ~mask & 0xff; |
1377 | _mask = mask; | 1751 | depth = _gray_info.depth; |
1378 | 1752 | ||
1379 | /* set the bits for all 8 pixels in all bytes according to the | 1753 | /* set the bits for all 8 pixels in all bytes according to the |
1380 | * precalculated patterns on the pattern stack */ | 1754 | * precalculated patterns on the pattern stack */ |
1381 | asm volatile ( | 1755 | asm volatile |
1382 | "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" | 1756 | ( |
1383 | /* pop all 8 patterns */ | 1757 | "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */ |
1384 | "not.l %[mask] \n" /* "set" mask -> "keep" mask */ | 1758 | /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */ |
1385 | "and.l #0xFF,%[mask] \n" | 1759 | |
1386 | "beq.b .wa_sstart \n" /* short loop if nothing to keep */ | 1760 | /** Rotate the four 8x8 bit "blocks" within d1..d7/a0 **/ |
1387 | 1761 | ||
1388 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1762 | "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/ |
1389 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1763 | "lsl.l #4, %%d0 \n" |
1390 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1764 | /* move.l %[ax], %%d5 */ /* already in d5 */ |
1391 | "lsr.l #1,%%d3 \n" | 1765 | "eor.l %%d5, %%d0 \n" |
1392 | "addx.l %%d0,%%d0 \n" | 1766 | "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */ |
1393 | "lsr.l #1,%%d4 \n" | 1767 | "eor.l %%d0, %%d5 \n" |
1394 | "addx.l %%d0,%%d0 \n" | 1768 | "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */ |
1395 | "lsr.l #1,%%d5 \n" | 1769 | "lsr.l #4, %%d0 \n" |
1396 | "addx.l %%d0,%%d0 \n" | 1770 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */ |
1397 | "lsr.l #1,%%d6 \n" | 1771 | "move.l %%d2, %%d0 \n" |
1398 | "addx.l %%d0,%%d0 \n" | 1772 | "lsl.l #4, %%d0 \n" |
1399 | "move.l %%a0,%%d1 \n" | 1773 | "eor.l %%d6, %%d0 \n" |
1400 | "lsr.l #1,%%d1 \n" | 1774 | "and.l #0xF0F0F0F0, %%d0 \n" |
1401 | "addx.l %%d0,%%d0 \n" | 1775 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */ |
1402 | "move.l %%d1,%%a0 \n" | 1776 | "lsr.l #4, %%d0 \n" |
1403 | "move.l %%a1,%%d1 \n" | 1777 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */ |
1404 | "lsr.l #1,%%d1 \n" | 1778 | "move.l %%d3, %%d0 \n" |
1405 | "addx.l %%d0,%%d0 \n" | 1779 | "lsl.l #4, %%d0 \n" |
1406 | "move.l %%d1,%%a1 \n" | 1780 | "eor.l %%d7, %%d0 \n" |
1407 | "move.l %[ax],%%d1 \n" | 1781 | "and.l #0xF0F0F0F0, %%d0 \n" |
1408 | "lsr.l #1,%%d1 \n" | 1782 | "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */ |
1409 | "addx.l %%d0,%%d0 \n" | 1783 | "lsr.l #4, %%d0 \n" |
1410 | "move.l %%d1,%[ax] \n" | 1784 | "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4b7b6b5b4 */ |
1411 | 1785 | "move.l %%d4, %%d0 \n" | |
1412 | "move.b (%[addr]),%%d1 \n" /* read old value */ | 1786 | "lsl.l #4, %%d0 \n" |
1413 | "and.l %[mask],%%d1 \n" /* mask out replaced bits */ | 1787 | "move.l %%a0, %%d5 \n" |
1414 | "or.l %%d0,%%d1 \n" /* set new bits */ | 1788 | "eor.l %%d5, %%d0 \n" |
1415 | "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ | 1789 | "and.l #0xF0F0F0F0, %%d0 \n" |
1416 | 1790 | "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */ | |
1417 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1791 | /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */ |
1418 | "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ | 1792 | "lsr.l #4, %%d0 \n" |
1419 | "bhi.b .wa_floop \n" | 1793 | "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */ |
1420 | 1794 | ||
1421 | "bra.b .wa_end \n" | 1795 | "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/ |
1422 | 1796 | "lsl.l #2, %%d0 \n" | |
1423 | ".wa_sstart: \n" | 1797 | /* move.l %%a0, %%d5 */ /* still in d5 */ |
1424 | "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ | 1798 | "eor.l %%d5, %%d0 \n" |
1425 | 1799 | "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */ | |
1426 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1800 | "eor.l %%d0, %%d5 \n" |
1427 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1801 | "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */ |
1428 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1802 | "lsr.l #2, %%d0 \n" |
1429 | "lsr.l #1,%%d3 \n" | 1803 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */ |
1430 | "addx.l %%d0,%%d0 \n" | 1804 | "move.l %[ax], %%d5 \n" |
1431 | "lsr.l #1,%%d4 \n" | 1805 | "move.l %%d5, %%d0 \n" |
1432 | "addx.l %%d0,%%d0 \n" | 1806 | "lsl.l #2, %%d0 \n" |
1433 | "lsr.l #1,%%d5 \n" | 1807 | "eor.l %%d7, %%d0 \n" |
1434 | "addx.l %%d0,%%d0 \n" | 1808 | "and.l #0xCCCCCCCC, %%d0 \n" |
1435 | "lsr.l #1,%%d6 \n" | 1809 | "eor.l %%d0, %%d7 \n" /* d7 = ...h1h0f1f0d1d0b1b0 */ |
1436 | "addx.l %%d0,%%d0 \n" | 1810 | "lsr.l #2, %%d0 \n" |
1437 | "lsr.l #1,%[mask] \n" | 1811 | "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */ |
1438 | "addx.l %%d0,%%d0 \n" | 1812 | /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */ |
1439 | "move.l %%a1,%%d1 \n" | 1813 | "move.l %%d2, %%d0 \n" |
1440 | "lsr.l #1,%%d1 \n" | 1814 | "lsl.l #2, %%d0 \n" |
1441 | "addx.l %%d0,%%d0 \n" | 1815 | "eor.l %%d4, %%d0 \n" |
1442 | "move.l %%d1,%%a1 \n" | 1816 | "and.l #0xCCCCCCCC, %%d0 \n" |
1443 | "move.l %[ax],%%d1 \n" | 1817 | "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */ |
1444 | "lsr.l #1,%%d1 \n" | 1818 | "lsr.l #2, %%d0 \n" |
1445 | "addx.l %%d0,%%d0 \n" | 1819 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */ |
1446 | "move.l %%d1,%[ax] \n" | 1820 | "move.l %%d1, %%d0 \n" |
1447 | 1821 | "lsl.l #2, %%d0 \n" | |
1448 | "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ | 1822 | "eor.l %%d3, %%d0 \n" |
1449 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1823 | "and.l #0xCCCCCCCC, %%d0 \n" |
1450 | "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ | 1824 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */ |
1451 | "bhi.b .wa_sloop \n" | 1825 | "lsr.l #2, %%d0 \n" |
1452 | 1826 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */ | |
1453 | ".wa_end: \n" | 1827 | |
1828 | "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/ | ||
1829 | "lsl.l #1, %%d0 \n" | ||
1830 | "eor.l %%d2, %%d0 \n" | ||
1831 | "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */ | ||
1832 | "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */ | ||
1833 | "lsr.l #1, %%d0 \n" | ||
1834 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */ | ||
1835 | "move.l %%d3, %%d0 \n" | ||
1836 | "lsl.l #1, %%d0 \n" | ||
1837 | "eor.l %%d4, %%d0 \n" | ||
1838 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1839 | "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */ | ||
1840 | "lsr.l #1, %%d0 \n" | ||
1841 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */ | ||
1842 | /* move.l %[ax], %%d5 */ /* still in d5 */ | ||
1843 | "move.l %%d5, %%d0 \n" | ||
1844 | "lsl.l #1, %%d0 \n" | ||
1845 | "eor.l %%d6, %%d0 \n" | ||
1846 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1847 | "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */ | ||
1848 | "lsr.l #1, %%d0 \n" | ||
1849 | "eor.l %%d0, %%d5 \n" | ||
1850 | "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */ | ||
1851 | "move.l %%d7, %%d0 \n" | ||
1852 | "lsl.l #1, %%d0 \n" | ||
1853 | "move.l %%a0, %%d5 \n" | ||
1854 | "eor.l %%d5, %%d0 \n" | ||
1855 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1856 | "eor.l %%d0, %%d5 \n" | ||
1857 | "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ | ||
1858 | "lsr.l #1, %%d0 \n" | ||
1859 | "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ | ||
1860 | |||
1861 | "tst.l %[mask] \n" | ||
1862 | "jeq .wa_sloop \n" /* short loop if nothing to keep */ | ||
1863 | |||
1864 | "move.l %[mask], %%d5 \n" /* need mask in data reg. */ | ||
1865 | "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ | ||
1866 | |||
1867 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | ||
1868 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1869 | "bhs.s .wa_f8 \n" | ||
1870 | |||
1871 | "move.l %[psiz], %%d0 \n" | ||
1872 | "move.l %[dpth], %%d1 \n" | ||
1873 | "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ | ||
1874 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1875 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1876 | "bra.s .wa_f1 \n" /* dpth == 0 should never happen */ | ||
1877 | "bra.s .wa_f2 \n" | ||
1878 | "bra.s .wa_f3 \n" | ||
1879 | "bra.s .wa_f4 \n" | ||
1880 | "bra.s .wa_f5 \n" | ||
1881 | "bra.s .wa_f6 \n" | ||
1882 | "bra.s .wa_f7 \n" | ||
1883 | |||
1884 | ".wa_f8: \n" | ||
1885 | "move.l %[psiz], %%d0 \n" | ||
1886 | "lsl.l #3, %%d0 \n" | ||
1887 | "add.l %%d0, %[addr] \n" | ||
1888 | /* Point behind the last plane for this round. Note: We're using the | ||
1889 | * registers backwards in order to reuse the streak for the last round. | ||
1890 | * Therefore we need to go through the bitplanes backwards too, otherwise | ||
1891 | * the bit order would be destroyed, which results in more flicker. */ | ||
1892 | "sub.l %[psiz], %[addr] \n" | ||
1893 | "move.b (%[addr]), %%d0 \n" /* load old byte */ | ||
1894 | "and.l %%d5, %%d0 \n" /* mask out replaced bits */ | ||
1895 | "move.l %[mask], %%d1 \n" | ||
1896 | "or.l %%d1, %%d0 \n" /* set new bits */ | ||
1897 | "move.b %%d0, (%[addr]) \n" /* store byte */ | ||
1898 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1899 | "move.l %%d1, %[mask] \n" | ||
1900 | ".wa_f7: \n" | ||
1901 | "sub.l %[psiz], %[addr] \n" | ||
1902 | "move.b (%[addr]), %%d0 \n" | ||
1903 | "and.l %%d5, %%d0 \n" | ||
1904 | "or.l %%d2, %%d0 \n" | ||
1905 | "move.b %%d0, (%[addr]) \n" | ||
1906 | "lsr.l #8, %%d2 \n" | ||
1907 | ".wa_f6: \n" | ||
1908 | "sub.l %[psiz], %[addr] \n" | ||
1909 | "move.b (%[addr]), %%d0 \n" | ||
1910 | "and.l %%d5, %%d0 \n" | ||
1911 | "or.l %%d3, %%d0 \n" | ||
1912 | "move.b %%d0, (%[addr]) \n" | ||
1913 | "lsr.l #8, %%d3 \n" | ||
1914 | ".wa_f5: \n" | ||
1915 | "sub.l %[psiz], %[addr] \n" | ||
1916 | "move.b (%[addr]), %%d0 \n" | ||
1917 | "and.l %%d5, %%d0 \n" | ||
1918 | "or.l %%d4, %%d0 \n" | ||
1919 | "move.b %%d0, (%[addr]) \n" | ||
1920 | "lsr.l #8, %%d4 \n" | ||
1921 | ".wa_f4: \n" | ||
1922 | "sub.l %[psiz], %[addr] \n" | ||
1923 | "move.b (%[addr]), %%d0 \n" | ||
1924 | "and.l %%d5, %%d0 \n" | ||
1925 | "move.l %[ax], %%d1 \n" | ||
1926 | "or.l %%d1, %%d0 \n" | ||
1927 | "move.b %%d0, (%[addr]) \n" | ||
1928 | "lsr.l #8, %%d1 \n" | ||
1929 | "move.l %%d1, %[ax] \n" | ||
1930 | ".wa_f3: \n" | ||
1931 | "sub.l %[psiz], %[addr] \n" | ||
1932 | "move.b (%[addr]), %%d0 \n" | ||
1933 | "and.l %%d5, %%d0 \n" | ||
1934 | "or.l %%d6, %%d0 \n" | ||
1935 | "move.b %%d0, (%[addr]) \n" | ||
1936 | "lsr.l #8, %%d6 \n" | ||
1937 | ".wa_f2: \n" | ||
1938 | "sub.l %[psiz], %[addr] \n" | ||
1939 | "move.b (%[addr]), %%d0 \n" | ||
1940 | "and.l %%d5, %%d0 \n" | ||
1941 | "or.l %%d7, %%d0 \n" | ||
1942 | "move.b %%d0, (%[addr]) \n" | ||
1943 | "lsr.l #8, %%d7 \n" | ||
1944 | ".wa_f1: \n" | ||
1945 | "sub.l %[psiz], %[addr] \n" | ||
1946 | "move.b (%[addr]), %%d0 \n" | ||
1947 | "and.l %%d5, %%d0 \n" | ||
1948 | "move.l %%a0, %%d1 \n" | ||
1949 | "or.l %%d1, %%d0 \n" | ||
1950 | "move.b %%d0, (%[addr]) \n" | ||
1951 | "lsr.l #8, %%d1 \n" | ||
1952 | "move.l %%d1, %%a0 \n" | ||
1953 | |||
1954 | "move.l %[psiz], %%d0 \n" | ||
1955 | "lsl.l #3, %%d0 \n" | ||
1956 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
1957 | "subq.l #8, %[dpth] \n" | ||
1958 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
1959 | "jgt .wa_floop \n" /* next round if anything left */ | ||
1960 | |||
1961 | "jra .wa_end \n" | ||
1962 | |||
1963 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | ||
1964 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1965 | "bhs.s .wa_s8 \n" | ||
1966 | |||
1967 | "move.l %[psiz], %%d0 \n" | ||
1968 | "move.l %[dpth], %%d5 \n" | ||
1969 | "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ | ||
1970 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1971 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1972 | "bra.s .wa_s1 \n" /* dpth == 0 should never happen */ | ||
1973 | "bra.s .wa_s2 \n" | ||
1974 | "bra.s .wa_s3 \n" | ||
1975 | "bra.s .wa_s4 \n" | ||
1976 | "bra.s .wa_s5 \n" | ||
1977 | "bra.s .wa_s6 \n" | ||
1978 | "bra.s .wa_s7 \n" | ||
1979 | |||
1980 | ".wa_s8: \n" | ||
1981 | "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ | ||
1982 | "lsl.l #3, %%d0 \n" /* for this round. */ | ||
1983 | "add.l %%d0, %[addr] \n" /* See above. */ | ||
1984 | |||
1985 | "sub.l %[psiz], %[addr] \n" | ||
1986 | "move.b %%d1, (%[addr]) \n" /* store byte */ | ||
1987 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1988 | ".wa_s7: \n" | ||
1989 | "sub.l %[psiz], %[addr] \n" | ||
1990 | "move.b %%d2, (%[addr]) \n" | ||
1991 | "lsr.l #8, %%d2 \n" | ||
1992 | ".wa_s6: \n" | ||
1993 | "sub.l %[psiz], %[addr] \n" | ||
1994 | "move.b %%d3, (%[addr]) \n" | ||
1995 | "lsr.l #8, %%d3 \n" | ||
1996 | ".wa_s5: \n" | ||
1997 | "sub.l %[psiz], %[addr] \n" | ||
1998 | "move.b %%d4, (%[addr]) \n" | ||
1999 | "lsr.l #8, %%d4 \n" | ||
2000 | ".wa_s4: \n" | ||
2001 | "sub.l %[psiz], %[addr] \n" | ||
2002 | "move.l %[ax], %%d5 \n" | ||
2003 | "move.b %%d5, (%[addr]) \n" | ||
2004 | "lsr.l #8, %%d5 \n" | ||
2005 | "move.l %%d5, %[ax] \n" | ||
2006 | ".wa_s3: \n" | ||
2007 | "sub.l %[psiz], %[addr] \n" | ||
2008 | "move.b %%d6, (%[addr]) \n" | ||
2009 | "lsr.l #8, %%d6 \n" | ||
2010 | ".wa_s2: \n" | ||
2011 | "sub.l %[psiz], %[addr] \n" | ||
2012 | "move.b %%d7, (%[addr]) \n" | ||
2013 | "lsr.l #8, %%d7 \n" | ||
2014 | ".wa_s1: \n" | ||
2015 | "sub.l %[psiz], %[addr] \n" | ||
2016 | "move.l %%a0, %%d5 \n" | ||
2017 | "move.b %%d5, (%[addr]) \n" | ||
2018 | "lsr.l #8, %%d5 \n" | ||
2019 | "move.l %%d5, %%a0 \n" | ||
2020 | |||
2021 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
2022 | "subq.l #8, %[dpth] \n" | ||
2023 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
2024 | "jgt .wa_sloop \n" /* next round if anything left */ | ||
2025 | |||
2026 | ".wa_end: \n" | ||
1454 | : /* outputs */ | 2027 | : /* outputs */ |
1455 | [addr]"+a"(addr), | 2028 | [addr]"+a"(addr), |
1456 | [mask]"+d"(_mask), | 2029 | [dpth]"+a"(depth), |
2030 | [mask]"+a"(_mask), | ||
1457 | [ax] "=&a"(trash) | 2031 | [ax] "=&a"(trash) |
1458 | : /* inputs */ | 2032 | : /* inputs */ |
1459 | [psiz]"a"(_gray_info.plane_size), | 2033 | [psiz]"a"(_gray_info.plane_size), |
1460 | [end] "a"(end), | ||
1461 | [patp]"[ax]"(pat_ptr) | 2034 | [patp]"[ax]"(pat_ptr) |
1462 | : /* clobbers */ | 2035 | : /* clobbers */ |
1463 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" | 2036 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0" |
1464 | ); | 2037 | ); |
1465 | #else /* C version, for reference*/ | 2038 | #else /* C version, for reference*/ |
1466 | #warning C version of _writearray() used | 2039 | #warning C version of _writearray() used |
2040 | unsigned char *end; | ||
1467 | unsigned test = 1; | 2041 | unsigned test = 1; |
1468 | int i; | 2042 | int i; |
1469 | 2043 | ||