author     Jens Arnold <amiconn@rockbox.org>   2006-08-11 14:13:01 +0000
committer  Jens Arnold <amiconn@rockbox.org>   2006-08-11 14:13:01 +0000
commit     71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 (patch)
tree       b9a97081ec04d4d311a7b45747393e68837912a2 /apps/plugins/lib
parent     bcd94a9b01d19d87a437cd8158a758f206b30825 (diff)
download   rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.tar.gz
           rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.zip
New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins/lib')
-rw-r--r--  apps/plugins/lib/gray_core.c  1417
-rw-r--r--  apps/plugins/lib/gray_draw.c  1156
2 files changed, 1858 insertions, 715 deletions
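
The core of the new algorithm is visible in the "Stage 1/2/3 comb" comments in the diff below: the eight pattern words popped from the pattern stack are treated as four 8x8 bit blocks and rotated (transposed) by three masked XOR-swap passes, so that afterwards each byte of a pattern word holds the bits of all eight pixels for one bitplane, and a group of up to eight planes can be written with one byte store per plane. A minimal C sketch of that rotation follows, assuming 32-bit pattern words; the names swap_bits(), rotate_blocks() and pat[] are illustrative only, not identifiers from the source.

#include <stdint.h>

/* Exchange the bit groups selected by 'mask' between *a and (*b << shift). */
static inline void swap_bits(uint32_t *a, uint32_t *b,
                             unsigned shift, uint32_t mask)
{
    uint32_t t = (*a ^ (*b << shift)) & mask;
    *a ^= t;
    *b ^= t >> shift;
}

/* Rotate the four 8x8 bit blocks within pat[0..7] (pixels a..h). */
static void rotate_blocks(uint32_t pat[8])
{
    /* Stage 1: 4 bit "comb", bitmask ...11110000 */
    swap_bits(&pat[0], &pat[4], 4, 0xF0F0F0F0u);
    swap_bits(&pat[1], &pat[5], 4, 0xF0F0F0F0u);
    swap_bits(&pat[2], &pat[6], 4, 0xF0F0F0F0u);
    swap_bits(&pat[3], &pat[7], 4, 0xF0F0F0F0u);
    /* Stage 2: 2 bit "comb", bitmask ...11001100 */
    swap_bits(&pat[0], &pat[2], 2, 0xCCCCCCCCu);
    swap_bits(&pat[1], &pat[3], 2, 0xCCCCCCCCu);
    swap_bits(&pat[4], &pat[6], 2, 0xCCCCCCCCu);
    swap_bits(&pat[5], &pat[7], 2, 0xCCCCCCCCu);
    /* Stage 3: 1 bit "comb", bitmask ...10101010 */
    swap_bits(&pat[0], &pat[1], 1, 0xAAAAAAAAu);
    swap_bits(&pat[2], &pat[3], 1, 0xAAAAAAAAu);
    swap_bits(&pat[4], &pat[5], 1, 0xAAAAAAAAu);
    swap_bits(&pat[6], &pat[7], 1, 0xAAAAAAAAu);
}

After the rotation, byte n of pat[j] is the output byte for bitplane 8*n + j, which is why the write streaks in the diff shift each register right by 8 after storing a plane.
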
diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c
index e65a7f259e..809e88dba1 100644
--- a/apps/plugins/lib/gray_core.c
+++ b/apps/plugins/lib/gray_core.c
@@ -649,7 +649,8 @@ void gray_update_rect(int x, int y, int width, int height)
649 bbuf = _gray_info.back_buffer + srcofs_row; 649 bbuf = _gray_info.back_buffer + srcofs_row;
650 650
651#ifdef CPU_ARM 651#ifdef CPU_ARM
652 asm volatile ( 652 asm volatile
653 (
653 "ldr r0, [%[cbuf]] \n" 654 "ldr r0, [%[cbuf]] \n"
654 "ldr r1, [%[bbuf]] \n" 655 "ldr r1, [%[bbuf]] \n"
655 "eor r1, r0, r1 \n" 656 "eor r1, r0, r1 \n"
@@ -668,137 +669,281 @@ void gray_update_rect(int x, int y, int width, int height)
668 669
669 if (change != 0) 670 if (change != 0)
670 { 671 {
671 unsigned char *addr, *end; 672 unsigned char *addr;
672 unsigned mask, trash; 673 unsigned mask, depth, trash;
673 674
674 pat_ptr = &pat_stack[8]; 675 pat_ptr = &pat_stack[8];
675 676
676 /* precalculate the bit patterns with random shifts 677 /* precalculate the bit patterns with random shifts
677 * for all 8 pixels and put them on an extra "stack" */ 678 * for all 8 pixels and put them on an extra "stack" */
678 asm volatile ( 679 asm volatile
679 "mov r3, #8 \n" /* loop count */ 680 (
680 "mov %[mask], #0 \n" 681 "mov r3, #8 \n" /* loop count */
681 682 "mov %[mask], #0 \n"
682 ".ur_pre_loop: \n" 683
683 "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ 684 ".ur_pre_loop: \n"
684 "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ 685 "mov %[mask], %[mask], lsl #1 \n" /* shift mask */
685 "ldrb r1, [%[bbuf]] \n" /* read back buffer */ 686 "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */
686 "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ 687 "ldrb r1, [%[bbuf]] \n" /* read back buffer */
687 "mov r2, #0 \n" /* preset for skipped pixel */ 688 "strb r0, [%[bbuf]], #1 \n" /* update back buffer */
688 "cmp r0, r1 \n" /* no change? */ 689 "mov r2, #0 \n" /* preset for skipped pixel */
689 "beq .ur_skip \n" /* -> skip */ 690 "cmp r0, r1 \n" /* no change? */
690 691 "beq .ur_skip \n" /* -> skip */
691 "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ 692
692 693 "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
693 "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ 694
694 "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" 695 "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */
695 "add %[rnd], %[rnd], #74 \n" /* add another 74 */ 696 "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n"
696 /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ 697 "add %[rnd], %[rnd], #74 \n" /* add another 74 */
697 "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ 698 /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
698 699 "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
699 "cmp r1, %[dpth] \n" /* random >= depth ? */ 700
700 "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ 701 "cmp r1, %[dpth] \n" /* random >= depth ? */
701 702 "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
702 "mov r0, r2, lsl r1 \n" /** rotate pattern **/ 703
703 "sub r1, %[dpth], r1 \n" 704 "mov r0, r2, lsl r1 \n" /** rotate pattern **/
704 "orr r2, r0, r2, lsr r1 \n" 705 "sub r1, %[dpth], r1 \n"
705 706 "orr r2, r0, r2, lsr r1 \n"
706 "orr %[mask], %[mask], #1 \n" /* set mask bit */ 707
708 "orr %[mask], %[mask], #1 \n" /* set mask bit */
707 709
708 ".ur_skip: \n" 710 ".ur_skip: \n"
709 "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ 711 "str r2, [%[patp], #-4]! \n" /* push on pattern stack */
710 712
711 "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ 713 "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
712 "bne .ur_pre_loop \n" 714 "bne .ur_pre_loop \n"
713 : /* outputs */ 715 : /* outputs */
714 [cbuf]"+r"(cbuf), 716 [cbuf]"+r"(cbuf),
715 [bbuf]"+r"(bbuf), 717 [bbuf]"+r"(bbuf),
716 [patp]"+r"(pat_ptr), 718 [patp]"+r"(pat_ptr),
717 [rnd] "+r"(_gray_random_buffer), 719 [rnd] "+r"(_gray_random_buffer),
718 [mask]"=&r"(mask) 720 [mask]"=&r"(mask)
719 : /* inputs */ 721 : /* inputs */
720 [bpat]"r"(_gray_info.bitpattern), 722 [bpat]"r"(_gray_info.bitpattern),
721 [dpth]"r"(_gray_info.depth), 723 [dpth]"r"(_gray_info.depth),
722 [rmsk]"r"(_gray_info.randmask) 724 [rmsk]"r"(_gray_info.randmask)
723 : /* clobbers */ 725 : /* clobbers */
724 "r0", "r1", "r2", "r3" 726 "r0", "r1", "r2", "r3"
725 ); 727 );
726 728
727 addr = dst_row; 729 addr = dst_row;
728 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 730 depth = _gray_info.depth;
729 731
730 /* set the bits for all 8 pixels in all bytes according to the 732 /* set the bits for all 8 pixels in all bytes according to the
731 * precalculated patterns on the pattern stack */ 733 * precalculated patterns on the pattern stack */
732 asm volatile ( 734 asm volatile
733 "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ 735 (
736 "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */
737
738 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
739
740 "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/
741 "orr %[rx], %[rx], %[rx], lsl #8 \n"
742 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */
743 "eor r0, r1, r5, lsl #4 \n"
744 "and r0, r0, %[rx] \n"
745 "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
746 "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
747 "eor r0, r2, r6, lsl #4 \n"
748 "and r0, r0, %[rx] \n"
749 "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
750 "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
751 "eor r0, r3, r7, lsl #4 \n"
752 "and r0, r0, %[rx] \n"
753 "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
754 "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
755 "eor r0, r4, r8, lsl #4 \n"
756 "and r0, r0, %[rx] \n"
757 "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
758 "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
759
760 "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/
761 "orr %[rx], %[rx], %[rx], lsl #8 \n"
762 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */
763 "eor r0, r1, r3, lsl #2 \n"
764 "and r0, r0, %[rx] \n"
765 "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
766 "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
767 "eor r0, r2, r4, lsl #2 \n"
768 "and r0, r0, %[rx] \n"
769 "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
770 "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
771 "eor r0, r5, r7, lsl #2 \n"
772 "and r0, r0, %[rx] \n"
773 "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
774 "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
775 "eor r0, r6, r8, lsl #2 \n"
776 "and r0, r0, %[rx] \n"
777 "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
778 "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
779
780 "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/
781 "orr %[rx], %[rx], %[rx], lsl #8 \n"
782 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */
783 "eor r0, r1, r2, lsl #1 \n"
784 "and r0, r0, %[rx] \n"
785 "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
786 "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
787 "eor r0, r3, r4, lsl #1 \n"
788 "and r0, r0, %[rx] \n"
789 "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
790 "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
791 "eor r0, r5, r6, lsl #1 \n"
792 "and r0, r0, %[rx] \n"
793 "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
794 "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
795 "eor r0, r7, r8, lsl #1 \n"
796 "and r0, r0, %[rx] \n"
797 "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
798 "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
799
800 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
801 "ands %[mask], %[mask], #0xff \n"
802 "beq .ur_sloop \n" /* short loop if no bits to keep */
803
804 ".ur_floop: \n" /** full loop (bits to keep)**/
805 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
806 "bhs .ur_f8 \n"
807
808 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
809 "add %[addr], %[addr], r0 \n" /* for this round */
810
811 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
812 "add pc, pc, r0 \n"
813 ".ur_ftable: \n"
814 ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */
815 ".byte .ur_f1 - .ur_ftable - 4 \n"
816 ".byte .ur_f2 - .ur_ftable - 4 \n"
817 ".byte .ur_f3 - .ur_ftable - 4 \n"
818 ".byte .ur_f4 - .ur_ftable - 4 \n"
819 ".byte .ur_f5 - .ur_ftable - 4 \n"
820 ".byte .ur_f6 - .ur_ftable - 4 \n"
821 ".byte .ur_f7 - .ur_ftable - 4 \n"
822
823 ".ur_f8: \n"
824 "add %[addr], %[addr], %[psiz], lsl #3 \n"
825 /* Point behind the last plane for this round. Note: We're using the
826 * registers backwards in order to reuse the streak for the last round.
827 * Therefore we need to go thru the bitplanes backwards too, otherwise
828 * the bit order would be destroyed which results in more flicker. */
829 "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */
830 "and r0, r0, %[mask] \n" /* mask out replaced bits */
831 "orr r0, r0, r8 \n" /* set new bits */
832 "strb r0, [%[addr]] \n" /* store byte */
833 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
834 ".ur_f7: \n"
835 "ldrb r0, [%[addr], -%[psiz]]! \n"
836 "and r0, r0, %[mask] \n"
837 "orr r0, r0, r7 \n"
838 "strb r0, [%[addr]] \n"
839 "mov r7, r7, lsr #8 \n"
840 ".ur_f6: \n"
841 "ldrb r0, [%[addr], -%[psiz]]! \n"
842 "and r0, r0, %[mask] \n"
843 "orr r0, r0, r6 \n"
844 "strb r0, [%[addr]] \n"
845 "mov r6, r6, lsr #8 \n"
846 ".ur_f5: \n"
847 "ldrb r0, [%[addr], -%[psiz]]! \n"
848 "and r0, r0, %[mask] \n"
849 "orr r0, r0, r5 \n"
850 "strb r0, [%[addr]] \n"
851 "mov r5, r5, lsr #8 \n"
852 ".ur_f4: \n"
853 "ldrb r0, [%[addr], -%[psiz]]! \n"
854 "and r0, r0, %[mask] \n"
855 "orr r0, r0, r4 \n"
856 "strb r0, [%[addr]] \n"
857 "mov r4, r4, lsr #8 \n"
858 ".ur_f3: \n"
859 "ldrb r0, [%[addr], -%[psiz]]! \n"
860 "and r0, r0, %[mask] \n"
861 "orr r0, r0, r3 \n"
862 "strb r0, [%[addr]] \n"
863 "mov r3, r3, lsr #8 \n"
864 ".ur_f2: \n"
865 "ldrb r0, [%[addr], -%[psiz]]! \n"
866 "and r0, r0, %[mask] \n"
867 "orr r0, r0, r2 \n"
868 "strb r0, [%[addr]] \n"
869 "mov r2, r2, lsr #8 \n"
870 ".ur_f1: \n"
871 "ldrb r0, [%[addr], -%[psiz]]! \n"
872 "and r0, r0, %[mask] \n"
873 "orr r0, r0, r1 \n"
874 "strb r0, [%[addr]] \n"
875 "mov r1, r1, lsr #8 \n"
876 ".ur_f0: \n"
877
878 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
879 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
880 "bhi .ur_floop \n"
881
882 "b .ur_end \n"
883
884 ".ur_sloop: \n" /** short loop (nothing to keep) **/
885 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
886 "bhs .ur_s8 \n"
887
888 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
889 "add %[addr], %[addr], r0 \n" /* for this round */
734 890
735 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ 891 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
736 "ands %[mask], %[mask], #0xff \n" 892 "add pc, pc, r0 \n"
737 "beq .ur_sloop \n" /* short loop if nothing to keep */ 893 ".ur_stable: \n"
738 894 ".byte .ur_s0 - .ur_stable - 4 \n"
739 ".ur_floop: \n" /** full loop (there are bits to keep)**/ 895 ".byte .ur_s1 - .ur_stable - 4 \n"
740 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 896 ".byte .ur_s2 - .ur_stable - 4 \n"
741 "adc r0, r0, r0 \n" /* put bit into LSB for byte */ 897 ".byte .ur_s3 - .ur_stable - 4 \n"
742 "movs r8, r8, lsr #1 \n" 898 ".byte .ur_s4 - .ur_stable - 4 \n"
743 "adc r0, r0, r0 \n" 899 ".byte .ur_s5 - .ur_stable - 4 \n"
744 "movs r7, r7, lsr #1 \n" 900 ".byte .ur_s6 - .ur_stable - 4 \n"
745 "adc r0, r0, r0 \n" 901 ".byte .ur_s7 - .ur_stable - 4 \n"
746 "movs r6, r6, lsr #1 \n" 902
747 "adc r0, r0, r0 \n" 903 ".ur_s8: \n"
748 "movs r5, r5, lsr #1 \n" 904 "add %[addr], %[addr], %[psiz], lsl #3 \n"
749 "adc r0, r0, r0 \n" 905 /* Point behind the last plane for this round. See above. */
750 "movs r4, r4, lsr #1 \n" 906 "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */
751 "adc r0, r0, r0 \n" 907 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
752 "movs r3, r3, lsr #1 \n" 908 ".ur_s7: \n"
753 "adc r0, r0, r0 \n" 909 "strb r7, [%[addr], -%[psiz]]! \n"
754 "movs r2, r2, lsr #1 \n" 910 "mov r7, r7, lsr #8 \n"
755 "adc r0, r0, r0 \n" 911 ".ur_s6: \n"
756 912 "strb r6, [%[addr], -%[psiz]]! \n"
757 "ldrb r1, [%[addr]] \n" /* read old value */ 913 "mov r6, r6, lsr #8 \n"
758 "and r1, r1, %[mask] \n" /* mask out replaced bits */ 914 ".ur_s5: \n"
759 "orr r1, r1, r0 \n" /* set new bits */ 915 "strb r5, [%[addr], -%[psiz]]! \n"
760 "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ 916 "mov r5, r5, lsr #8 \n"
761 917 ".ur_s4: \n"
762 "cmp %[end], %[addr] \n" /* loop for all bitplanes */ 918 "strb r4, [%[addr], -%[psiz]]! \n"
763 "bne .ur_floop \n" 919 "mov r4, r4, lsr #8 \n"
764 920 ".ur_s3: \n"
765 "b .ur_end \n" 921 "strb r3, [%[addr], -%[psiz]]! \n"
766 922 "mov r3, r3, lsr #8 \n"
767 ".ur_sloop: \n" /** short loop (nothing to keep) **/ 923 ".ur_s2: \n"
768 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 924 "strb r2, [%[addr], -%[psiz]]! \n"
769 "adc r0, r0, r0 \n" /* put bit into LSB for byte */ 925 "mov r2, r2, lsr #8 \n"
770 "movs r8, r8, lsr #1 \n" 926 ".ur_s1: \n"
771 "adc r0, r0, r0 \n" 927 "strb r1, [%[addr], -%[psiz]]! \n"
772 "movs r7, r7, lsr #1 \n" 928 "mov r1, r1, lsr #8 \n"
773 "adc r0, r0, r0 \n" 929 ".ur_s0: \n"
774 "movs r6, r6, lsr #1 \n" 930
775 "adc r0, r0, r0 \n" 931 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
776 "movs r5, r5, lsr #1 \n" 932 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
777 "adc r0, r0, r0 \n" 933 "bhi .ur_sloop \n"
778 "movs r4, r4, lsr #1 \n" 934
779 "adc r0, r0, r0 \n" 935 ".ur_end: \n"
780 "movs r3, r3, lsr #1 \n" 936 : /* outputs */
781 "adc r0, r0, r0 \n" 937 [addr]"+r"(addr),
782 "movs r2, r2, lsr #1 \n" 938 [mask]"+r"(mask),
783 "adc r0, r0, r0 \n" 939 [dpth]"+r"(depth),
784 940 [rx] "=&r"(trash)
785 "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ 941 : /* inputs */
786 942 [psiz]"r"(_gray_info.plane_size),
787 "cmp %[end], %[addr] \n" /* loop for all bitplanes */ 943 [patp]"[rx]"(pat_ptr)
788 "bne .ur_sloop \n" 944 : /* clobbers */
789 945 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
790 ".ur_end: \n" 946 );
791 : /* outputs */
792 [addr]"+r"(addr),
793 [mask]"+r"(mask),
794 [rx] "=&r"(trash)
795 : /* inputs */
796 [psiz]"r"(_gray_info.plane_size),
797 [end] "r"(end),
798 [patp]"[rx]"(pat_ptr)
799 : /* clobbers */
800 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
801 );
802 } 947 }
803#else /* C version, for reference*/ 948#else /* C version, for reference*/
804#warning C version of gray_update_rect() used 949#warning C version of gray_update_rect() used
@@ -873,7 +1018,7 @@ void gray_update_rect(int x, int y, int width, int height)
873 1018
874 for (i = 7; i >= 0; i--) 1019 for (i = 7; i >= 0; i--)
875 data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); 1020 data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
876 1021
877 *addr = (*addr & mask) | data; 1022 *addr = (*addr & mask) | data;
878 addr += _gray_info.plane_size; 1023 addr += _gray_info.plane_size;
879 test <<= 1; 1024 test <<= 1;
@@ -935,13 +1080,13 @@ void gray_update_rect(int x, int y, int width, int height)
935 1080
936#if CONFIG_CPU == SH7034 1081#if CONFIG_CPU == SH7034
937 asm volatile ( 1082 asm volatile (
938 "mov.l @%[cbuf],r1 \n" 1083 "mov.l @%[cbuf], r1 \n"
939 "mov.l @%[bbuf],r2 \n" 1084 "mov.l @%[bbuf], r2 \n"
940 "xor r1,r2 \n" 1085 "xor r1, r2 \n"
941 "mov.l @(4,%[cbuf]),r1 \n" 1086 "mov.l @(4,%[cbuf]), r1 \n"
942 "mov.l @(4,%[bbuf]),%[chg] \n" 1087 "mov.l @(4,%[bbuf]), %[chg]\n"
943 "xor r1,%[chg] \n" 1088 "xor r1, %[chg] \n"
944 "or r2,%[chg] \n" 1089 "or r2, %[chg] \n"
945 : /* outputs */ 1090 : /* outputs */
946 [chg] "=r"(change) 1091 [chg] "=r"(change)
947 : /* inputs */ 1092 : /* inputs */
@@ -953,176 +1098,402 @@ void gray_update_rect(int x, int y, int width, int height)
953 1098
954 if (change != 0) 1099 if (change != 0)
955 { 1100 {
956 unsigned char *addr, *end; 1101 unsigned char *addr;
957 unsigned mask, trash; 1102 unsigned mask, depth, trash;
958 1103
959 pat_ptr = &pat_stack[8]; 1104 pat_ptr = &pat_stack[8];
960 1105
961 /* precalculate the bit patterns with random shifts 1106 /* precalculate the bit patterns with random shifts
962 * for all 8 pixels and put them on an extra "stack" */ 1107 * for all 8 pixels and put them on an extra "stack" */
963 asm volatile ( 1108 asm volatile
964 "mov #8,r3 \n" /* loop count */ 1109 (
965 1110 "mov #8, r3 \n" /* loop count */
966 ".ur_pre_loop: \n" 1111
967 "mov.b @%[cbuf]+,r0\n" /* read current buffer */ 1112 ".ur_pre_loop: \n"
968 "mov.b @%[bbuf],r1 \n" /* read back buffer */ 1113 "mov.b @%[cbuf]+, r0 \n" /* read current buffer */
969 "mov #0,r2 \n" /* preset for skipped pixel */ 1114 "mov.b @%[bbuf], r1 \n" /* read back buffer */
970 "mov.b r0,@%[bbuf] \n" /* update back buffer */ 1115 "mov #0, r2 \n" /* preset for skipped pixel */
971 "add #1,%[bbuf] \n" 1116 "mov.b r0, @%[bbuf] \n" /* update back buffer */
972 "cmp/eq r0,r1 \n" /* no change? */ 1117 "add #1, %[bbuf] \n"
973 "bt .ur_skip \n" /* -> skip */ 1118 "cmp/eq r0, r1 \n" /* no change? */
974 1119 "bt .ur_skip \n" /* -> skip */
975 "shll2 r0 \n" /* pixel value -> pattern offset */ 1120
976 "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ 1121 "shll2 r0 \n" /* pixel value -> pattern offset */
977 1122 "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */
978 "mov #75,r0 \n" 1123
979 "mulu r0,%[rnd] \n" /* multiply by 75 */ 1124 "mov #75, r0 \n"
980 "sts macl,%[rnd] \n" 1125 "mulu r0, %[rnd] \n" /* multiply by 75 */
981 "add #74,%[rnd] \n" /* add another 74 */ 1126 "sts macl, %[rnd] \n"
982 /* Since the lower bits are not very random: */ 1127 "add #74, %[rnd] \n" /* add another 74 */
983 "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ 1128 /* Since the lower bits are not very random: */
984 "and %[rmsk],r1 \n" /* mask out unneeded bits */ 1129 "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */
985 1130 "and %[rmsk], r1 \n" /* mask out unneeded bits */
986 "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ 1131
987 "bf .ur_ntrim \n" 1132 "cmp/hs %[dpth], r1 \n" /* random >= depth ? */
988 "sub %[dpth],r1 \n" /* yes: random -= depth; */ 1133 "bf .ur_ntrim \n"
989 ".ur_ntrim: \n" 1134 "sub %[dpth], r1 \n" /* yes: random -= depth; */
1135 ".ur_ntrim: \n"
990 1136
991 "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ 1137 "mov.l .ashlsi3, r0 \n" /** rotate pattern **/
992 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ 1138 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
993 "mov r1,r5 \n" 1139 "mov r1, r5 \n"
994 1140
995 "mov %[dpth],r5 \n" 1141 "mov %[dpth], r5 \n"
996 "sub r1,r5 \n" /* r5 = depth - r1 */ 1142 "sub r1, r5 \n" /* r5 = depth - r1 */
997 "mov.l .lshrsi3,r1 \n" 1143 "mov.l .lshrsi3, r1 \n"
998 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ 1144 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
999 "mov r0,r2 \n" /* store previous result in r2 */ 1145 "mov r0, r2 \n" /* store previous result in r2 */
1000 1146
1001 "or r0,r2 \n" /* rotated_pattern = r2 | r0 */ 1147 "or r0, r2 \n" /* rotated_pattern = r2 | r0 */
1002 "clrt \n" /* mask bit = 0 (replace) */ 1148 "clrt \n" /* mask bit = 0 (replace) */
1003 1149
1004 ".ur_skip: \n" /* T == 1 if skipped */ 1150 ".ur_skip: \n" /* T == 1 if skipped */
1005 "rotcr %[mask] \n" /* get mask bit */ 1151 "rotcr %[mask] \n" /* get mask bit */
1006 "mov.l r2,@-%[patp]\n" /* push on pattern stack */ 1152 "mov.l r2, @-%[patp] \n" /* push on pattern stack */
1007 1153
1008 "add #-1,r3 \n" /* loop 8 times (pixel block) */ 1154 "add #-1, r3 \n" /* loop 8 times (pixel block) */
1009 "cmp/pl r3 \n" 1155 "cmp/pl r3 \n"
1010 "bt .ur_pre_loop\n" 1156 "bt .ur_pre_loop \n"
1011 1157
1012 "shlr8 %[mask] \n" /* shift mask to low byte */ 1158 "shlr8 %[mask] \n" /* shift mask to low byte */
1013 "shlr16 %[mask] \n" 1159 "shlr16 %[mask] \n"
1014 : /* outputs */ 1160 : /* outputs */
1015 [cbuf]"+r"(cbuf), 1161 [cbuf]"+r"(cbuf),
1016 [bbuf]"+r"(bbuf), 1162 [bbuf]"+r"(bbuf),
1017 [rnd] "+r"(_gray_random_buffer), 1163 [rnd] "+r"(_gray_random_buffer),
1018 [patp]"+r"(pat_ptr), 1164 [patp]"+r"(pat_ptr),
1019 [mask]"=&r"(mask) 1165 [mask]"=&r"(mask)
1020 : /* inputs */ 1166 : /* inputs */
1021 [dpth]"r"(_gray_info.depth), 1167 [dpth]"r"(_gray_info.depth),
1022 [bpat]"r"(_gray_info.bitpattern), 1168 [bpat]"r"(_gray_info.bitpattern),
1023 [rmsk]"r"(_gray_info.randmask) 1169 [rmsk]"r"(_gray_info.randmask)
1024 : /* clobbers */ 1170 : /* clobbers */
1025 "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" 1171 "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr"
1026 ); 1172 );
1027 1173
1028 addr = dst_row; 1174 addr = dst_row;
1029 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 1175 depth = _gray_info.depth;
1030 1176
1031 /* set the bits for all 8 pixels in all bytes according to the 1177 /* set the bits for all 8 pixels in all bytes according to the
1032 * precalculated patterns on the pattern stack */ 1178 * precalculated patterns on the pattern stack */
1033 asm volatile ( 1179 asm volatile
1034 "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ 1180 (
1035 "mov.l @%[patp]+,r2 \n" 1181 "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */
1036 "mov.l @%[patp]+,r3 \n" 1182 "mov.l @%[patp]+, r7 \n"
1037 "mov.l @%[patp]+,r6 \n" 1183 "mov.l @%[patp]+, r6 \n"
1038 "mov.l @%[patp]+,r7 \n" 1184 "mov.l @%[patp]+, r5 \n"
1039 "mov.l @%[patp]+,r8 \n" 1185 "mov.l @%[patp]+, r4 \n"
1040 "mov.l @%[patp]+,r9 \n" 1186 "mov.l @%[patp]+, r3 \n"
1041 "mov.l @%[patp],r10 \n" 1187 "mov.l @%[patp]+, r2 \n"
1042 1188 "mov.l @%[patp], r1 \n"
1043 "tst %[mask],%[mask] \n" 1189
1044 "bt .ur_sloop \n" /* short loop if nothing to keep */ 1190 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1045 1191
1046 ".ur_floop: \n" /** full loop (there are bits to keep)**/ 1192 "mov.l .ur_mask4, %[rx] \n" /* bitmask = ...11110000 */
1047 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1193 "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/
1048 "rotcl r0 \n" /* rotate t bit into r0 */ 1194 "shll2 r0 \n"
1049 "shlr r2 \n" 1195 "shll2 r0 \n"
1050 "rotcl r0 \n" 1196 "xor r1, r0 \n"
1051 "shlr r3 \n" 1197 "and %[rx], r0 \n"
1052 "rotcl r0 \n" 1198 "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
1053 "shlr r6 \n" 1199 "shlr2 r0 \n"
1054 "rotcl r0 \n" 1200 "shlr2 r0 \n"
1055 "shlr r7 \n" 1201 "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
1056 "rotcl r0 \n" 1202 "mov r6, r0 \n"
1057 "shlr r8 \n" 1203 "shll2 r0 \n"
1058 "rotcl r0 \n" 1204 "shll2 r0 \n"
1059 "shlr r9 \n" 1205 "xor r2, r0 \n"
1060 "rotcl r0 \n" 1206 "and %[rx], r0 \n"
1061 "shlr r10 \n" 1207 "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
1062 "mov.b @%[addr],%[rx] \n" /* read old value */ 1208 "shlr2 r0 \n"
1063 "rotcl r0 \n" 1209 "shlr2 r0 \n"
1064 "and %[mask],%[rx] \n" /* mask out replaced bits */ 1210 "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
1065 "or %[rx],r0 \n" /* set new bits */ 1211 "mov r7, r0 \n"
1066 "mov.b r0,@%[addr] \n" /* store value to bitplane */ 1212 "shll2 r0 \n"
1067 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1213 "shll2 r0 \n"
1068 "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ 1214 "xor r3, r0 \n"
1069 "bt .ur_floop \n" 1215 "and %[rx], r0 \n"
1070 1216 "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
1071 "bra .ur_end \n" 1217 "shlr2 r0 \n"
1072 "nop \n" 1218 "shlr2 r0 \n"
1073 1219 "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
1074 /* References to C library routines used in the precalc block */ 1220 "mov r8, r0 \n"
1075 ".align 2 \n" 1221 "shll2 r0 \n"
1076 ".ashlsi3: \n" /* C library routine: */ 1222 "shll2 r0 \n"
1077 ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ 1223 "xor r4, r0 \n"
1078 ".lshrsi3: \n" /* C library routine: */ 1224 "and %[rx], r0 \n"
1079 ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ 1225 "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
1080 /* both routines preserve r4, destroy r5 and take ~16 cycles */ 1226 "shlr2 r0 \n"
1081 1227 "shlr2 r0 \n"
1082 ".ur_sloop: \n" /** short loop (nothing to keep) **/ 1228 "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
1083 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1229
1084 "rotcl r0 \n" /* rotate t bit into r0 */ 1230 "mov.l .ur_mask2, %[rx] \n" /* bitmask = ...11001100 */
1085 "shlr r2 \n" 1231 "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/
1086 "rotcl r0 \n" 1232 "shll2 r0 \n"
1087 "shlr r3 \n" 1233 "xor r1, r0 \n"
1088 "rotcl r0 \n" 1234 "and %[rx], r0 \n"
1089 "shlr r6 \n" 1235 "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
1090 "rotcl r0 \n" 1236 "shlr2 r0 \n"
1091 "shlr r7 \n" 1237 "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
1092 "rotcl r0 \n" 1238 "mov r4, r0 \n"
1093 "shlr r8 \n" 1239 "shll2 r0 \n"
1094 "rotcl r0 \n" 1240 "xor r2, r0 \n"
1095 "shlr r9 \n" 1241 "and %[rx], r0 \n"
1096 "rotcl r0 \n" 1242 "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1097 "shlr r10 \n" 1243 "shlr2 r0 \n"
1098 "rotcl r0 \n" 1244 "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
1099 "mov.b r0,@%[addr] \n" /* store byte to bitplane */ 1245 "mov r7, r0 \n"
1100 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1246 "shll2 r0 \n"
1101 "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ 1247 "xor r5, r0 \n"
1102 "bt .ur_sloop \n" 1248 "and %[rx], r0 \n"
1103 1249 "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
1104 ".ur_end: \n" 1250 "shlr2 r0 \n"
1105 : /* outputs */ 1251 "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
1106 [addr]"+r"(addr), 1252 "mov r8, r0 \n"
1107 [mask]"+r"(mask), 1253 "shll2 r0 \n"
1108 [rx] "=&r"(trash) 1254 "xor r6, r0 \n"
1109 : /* inputs */ 1255 "and %[rx], r0 \n"
1110 [psiz]"r"(_gray_info.plane_size), 1256 "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
1111 [end] "r"(end), 1257 "shlr2 r0 \n"
1112 [patp]"[rx]"(pat_ptr) 1258 "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
1113 : /* clobbers */ 1259
1114 "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" 1260 "mov.l .ur_mask1, %[rx] \n" /* bitmask = ...10101010 */
1261 "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/
1262 "shll r0 \n"
1263 "xor r1, r0 \n"
1264 "and %[rx], r0 \n"
1265 "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
1266 "shlr r0 \n"
1267 "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
1268 "mov r4, r0 \n"
1269 "shll r0 \n"
1270 "xor r3, r0 \n"
1271 "and %[rx], r0 \n"
1272 "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
1273 "shlr r0 \n"
1274 "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
1275 "mov r6, r0 \n"
1276 "shll r0 \n"
1277 "xor r5, r0 \n"
1278 "and %[rx], r0 \n"
1279 "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
1280 "shlr r0 \n"
1281 "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
1282 "mov r8, r0 \n"
1283 "shll r0 \n"
1284 "xor r7, r0 \n"
1285 "and %[rx], r0 \n"
1286 "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
1287 "shlr r0 \n"
1288 "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
1289
1290 "tst %[mask], %[mask] \n"
1291 "bt .ur_sloop \n" /* short loop if nothing to keep */
1292
1293 ".ur_floop: \n" /** full loop (there are bits to keep)**/
1294 "mov #8, r0 \n"
1295 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1296 "bt .ur_f8 \n"
1297
1298 "mulu %[psiz], %[dpth] \n"
1299 "mova .ur_ftable, r0 \n"
1300 "mov.b @(r0, %[dpth]), %[rx] \n"
1301 "add %[rx], r0 \n"
1302 "sts macl, %[rx] \n" /* point behind the last plane.. */
1303 "jmp @r0 \n" /* jump into streak */
1304 "add %[rx], %[addr] \n" /* ..for this round */
1305
1306 ".align 2 \n"
1307 ".ur_ftable: \n"
1308 ".byte .ur_f0 - .ur_ftable \n"
1309 ".byte .ur_f1 - .ur_ftable \n"
1310 ".byte .ur_f2 - .ur_ftable \n"
1311 ".byte .ur_f3 - .ur_ftable \n"
1312 ".byte .ur_f4 - .ur_ftable \n"
1313 ".byte .ur_f5 - .ur_ftable \n"
1314 ".byte .ur_f6 - .ur_ftable \n"
1315 ".byte .ur_f7 - .ur_ftable \n"
1316
1317 ".ur_f8: \n"
1318 "mov %[psiz], %[rx] \n"
1319 "shll2 %[rx] \n"
1320 "add %[rx], %[rx] \n"
1321 "add %[rx], %[addr] \n"
1322 /* Point behind the last plane for this round. Note: We're using the
1323 * registers backwards in order to reuse the streak for the last round.
1324 * Therefore we need to go thru the bitplanes backwards too, otherwise
1325 * the bit order would be destroyed which results in more flicker. */
1326 "sub %[psiz], %[addr] \n"
1327 "mov.b @%[addr], r0 \n" /* load old byte */
1328 "and %[mask], r0 \n" /* mask out replaced bits */
1329 "or r8, r0 \n" /* set new bits */
1330 "mov.b r0, @%[addr] \n" /* store byte */
1331 "shlr8 r8 \n" /* shift out used-up byte */
1332 ".ur_f7: \n"
1333 "sub %[psiz], %[addr] \n"
1334 "mov.b @%[addr], r0 \n"
1335 "and %[mask], r0 \n"
1336 "or r7, r0 \n"
1337 "mov.b r0, @%[addr] \n"
1338 "shlr8 r7 \n"
1339 ".ur_f6: \n"
1340 "sub %[psiz], %[addr] \n"
1341 "mov.b @%[addr], r0 \n"
1342 "and %[mask], r0 \n"
1343 "or r6, r0 \n"
1344 "mov.b r0, @%[addr] \n"
1345 "shlr8 r6 \n"
1346 ".ur_f5: \n"
1347 "sub %[psiz], %[addr] \n"
1348 "mov.b @%[addr], r0 \n"
1349 "and %[mask], r0 \n"
1350 "or r5, r0 \n"
1351 "mov.b r0, @%[addr] \n"
1352 "shlr8 r5 \n"
1353 ".ur_f4: \n"
1354 "sub %[psiz], %[addr] \n"
1355 "mov.b @%[addr], r0 \n"
1356 "and %[mask], r0 \n"
1357 "or r4, r0 \n"
1358 "mov.b r0, @%[addr] \n"
1359 "shlr8 r4 \n"
1360 ".ur_f3: \n"
1361 "sub %[psiz], %[addr] \n"
1362 "mov.b @%[addr], r0 \n"
1363 "and %[mask], r0 \n"
1364 "or r3, r0 \n"
1365 "mov.b r0, @%[addr] \n"
1366 "shlr8 r3 \n"
1367 ".ur_f2: \n"
1368 "sub %[psiz], %[addr] \n"
1369 "mov.b @%[addr], r0 \n"
1370 "and %[mask], r0 \n"
1371 "or r2, r0 \n"
1372 "mov.b r0, @%[addr] \n"
1373 "shlr8 r2 \n"
1374 ".ur_f1: \n"
1375 "sub %[psiz], %[addr] \n"
1376 "mov.b @%[addr], r0 \n"
1377 "and %[mask], r0 \n"
1378 "or r1, r0 \n"
1379 "mov.b r0, @%[addr] \n"
1380 "shlr8 r1 \n"
1381 ".ur_f0: \n"
1382
1383 "add %[rx], %[addr] \n" /* correct address */
1384 "add #-8, %[dpth] \n"
1385 "cmp/pl %[dpth] \n" /* next round if anything left */
1386 "bt .ur_floop \n"
1387
1388 "bra .ur_end \n"
1389 "nop \n"
1390
1391 /* References to C library routines used in the precalc block */
1392 ".align 2 \n"
1393 ".ashlsi3: \n" /* C library routine: */
1394 ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
1395 ".lshrsi3: \n" /* C library routine: */
1396 ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
1397 /* both routines preserve r4, destroy r5 and take ~16 cycles */
1398
1399 /* Bitmasks for the bit block rotation */
1400 ".ur_mask4: \n"
1401 ".long 0xF0F0F0F0 \n"
1402 ".ur_mask2: \n"
1403 ".long 0xCCCCCCCC \n"
1404 ".ur_mask1: \n"
1405 ".long 0xAAAAAAAA \n"
1406
1407 ".ur_sloop: \n" /** short loop (nothing to keep) **/
1408 "mov #8, r0 \n"
1409 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1410 "bt .ur_s8 \n"
1411
1412 "mulu %[psiz], %[dpth] \n"
1413 "mova .ur_stable, r0 \n"
1414 "mov.b @(r0, %[dpth]), %[rx] \n"
1415 "add %[rx], r0 \n"
1416 "sts macl, %[rx] \n" /* point behind the last plane.. */
1417 "jmp @r0 \n" /* jump into streak */
1418 "add %[rx], %[addr] \n" /* ..for this round */
1419
1420 ".align 2 \n"
1421 ".ur_stable: \n"
1422 ".byte .ur_s0 - .ur_stable \n"
1423 ".byte .ur_s1 - .ur_stable \n"
1424 ".byte .ur_s2 - .ur_stable \n"
1425 ".byte .ur_s3 - .ur_stable \n"
1426 ".byte .ur_s4 - .ur_stable \n"
1427 ".byte .ur_s5 - .ur_stable \n"
1428 ".byte .ur_s6 - .ur_stable \n"
1429 ".byte .ur_s7 - .ur_stable \n"
1430
1431 ".ur_s8: \n"
1432 "mov %[psiz], %[rx] \n" /* Point behind the last plane */
1433 "shll2 %[rx] \n" /* for this round. */
1434 "add %[rx], %[rx] \n" /* See above. */
1435 "add %[rx], %[addr] \n"
1436
1437 "sub %[psiz], %[addr] \n"
1438 "mov.b r8, @%[addr] \n" /* store byte */
1439 "shlr8 r8 \n" /* shift out used-up byte */
1440 ".ur_s7: \n"
1441 "sub %[psiz], %[addr] \n"
1442 "mov.b r7, @%[addr] \n"
1443 "shlr8 r7 \n"
1444 ".ur_s6: \n"
1445 "sub %[psiz], %[addr] \n"
1446 "mov.b r6, @%[addr] \n"
1447 "shlr8 r6 \n"
1448 ".ur_s5: \n"
1449 "sub %[psiz], %[addr] \n"
1450 "mov.b r5, @%[addr] \n"
1451 "shlr8 r5 \n"
1452 ".ur_s4: \n"
1453 "sub %[psiz], %[addr] \n"
1454 "mov.b r4, @%[addr] \n"
1455 "shlr8 r4 \n"
1456 ".ur_s3: \n"
1457 "sub %[psiz], %[addr] \n"
1458 "mov.b r3, @%[addr] \n"
1459 "shlr8 r3 \n"
1460 ".ur_s2: \n"
1461 "sub %[psiz], %[addr] \n"
1462 "mov.b r2, @%[addr] \n"
1463 "shlr8 r2 \n"
1464 ".ur_s1: \n"
1465 "sub %[psiz], %[addr] \n"
1466 "mov.b r1, @%[addr] \n"
1467 "shlr8 r1 \n"
1468 ".ur_s0: \n"
1469
1470 "add %[rx], %[addr] \n" /* correct address */
1471 "add #-8, %[dpth] \n"
1472 "cmp/pl %[dpth] \n" /* next round if anything left */
1473 "bt .ur_sloop \n"
1474
1475 ".ur_end: \n"
1476 : /* outputs */
1477 [addr]"+r"(addr),
1478 [dpth]"+r"(depth),
1479 [rx] "=&r"(trash)
1480 : /* inputs */
1481 [mask]"r"(mask),
1482 [psiz]"r"(_gray_info.plane_size),
1483 [patp]"[rx]"(pat_ptr)
1484 : /* clobbers */
1485 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
1115 ); 1486 );
1116 } 1487 }
1117#elif defined(CPU_COLDFIRE) 1488#elif defined(CPU_COLDFIRE)
1118 asm volatile ( 1489 asm volatile (
1119 "move.l (%[cbuf]),%%d0 \n" 1490 "move.l (%[cbuf]), %%d0 \n"
1120 "move.l (%[bbuf]),%%d1 \n" 1491 "move.l (%[bbuf]), %%d1 \n"
1121 "eor.l %%d0,%%d1 \n" 1492 "eor.l %%d0, %%d1 \n"
1122 "move.l (4,%[cbuf]),%%d0 \n" 1493 "move.l (4,%[cbuf]), %%d0 \n"
1123 "move.l (4,%[bbuf]),%[chg] \n" 1494 "move.l (4,%[bbuf]), %[chg] \n"
1124 "eor.l %%d0,%[chg] \n" 1495 "eor.l %%d0, %[chg] \n"
1125 "or.l %%d1,%[chg] \n" 1496 "or.l %%d1, %[chg] \n"
1126 : /* outputs */ 1497 : /* outputs */
1127 [chg] "=&d"(change) 1498 [chg] "=&d"(change)
1128 : /* inputs */ 1499 : /* inputs */
@@ -1134,160 +1505,359 @@ void gray_update_rect(int x, int y, int width, int height)
1134 1505
1135 if (change != 0) 1506 if (change != 0)
1136 { 1507 {
1137 unsigned char *addr, *end; 1508 unsigned char *addr;
1138 unsigned mask, trash; 1509 unsigned mask, depth, trash;
1139 1510
1140 pat_ptr = &pat_stack[8]; 1511 pat_ptr = &pat_stack[8];
1141 1512
1142 /* precalculate the bit patterns with random shifts 1513 /* precalculate the bit patterns with random shifts
1143 * for all 8 pixels and put them on an extra "stack" */ 1514 * for all 8 pixels and put them on an extra "stack" */
1144 asm volatile ( 1515 asm volatile
1145 "moveq.l #8,%%d3 \n" /* loop count */ 1516 (
1146 "clr.l %[mask] \n" 1517 "moveq.l #8, %%d3 \n" /* loop count */
1147 1518 "clr.l %[mask] \n"
1148 ".ur_pre_loop: \n" 1519
1149 "clr.l %%d0 \n" 1520 ".ur_pre_loop: \n"
1150 "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ 1521 "clr.l %%d0 \n"
1151 "clr.l %%d1 \n" 1522 "move.b (%[cbuf])+, %%d0 \n" /* read current buffer */
1152 "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ 1523 "clr.l %%d1 \n"
1153 "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ 1524 "move.b (%[bbuf]), %%d1 \n" /* read back buffer */
1154 "clr.l %%d2 \n" /* preset for skipped pixel */ 1525 "move.b %%d0, (%[bbuf])+ \n" /* update back buffer */
1155 "cmp.l %%d0,%%d1 \n" /* no change? */ 1526 "clr.l %%d2 \n" /* preset for skipped pixel */
1156 "beq.b .ur_skip \n" /* -> skip */ 1527 "cmp.l %%d0, %%d1 \n" /* no change? */
1157 1528 "beq.b .ur_skip \n" /* -> skip */
1158 "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ 1529
1159 1530 "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */
1160 "mulu.w #75,%[rnd] \n" /* multiply by 75 */ 1531
1161 "add.l #74,%[rnd] \n" /* add another 74 */ 1532 "mulu.w #75, %[rnd] \n" /* multiply by 75 */
1162 /* Since the lower bits are not very random: */ 1533 "add.l #74, %[rnd] \n" /* add another 74 */
1163 "move.l %[rnd],%%d1 \n" 1534 /* Since the lower bits are not very random: */
1164 "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ 1535 "move.l %[rnd], %%d1 \n"
1165 "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ 1536 "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */
1166 1537 "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */
1167 "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ 1538
1168 "blo.b .ur_ntrim \n" 1539 "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */
1169 "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ 1540 "blo.b .ur_ntrim \n"
1170 ".ur_ntrim: \n" 1541 "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */
1171 1542 ".ur_ntrim: \n"
1172 "move.l %%d2,%%d0 \n" /** rotate pattern **/ 1543
1173 "lsl.l %%d1,%%d0 \n" 1544 "move.l %%d2, %%d0 \n" /** rotate pattern **/
1174 "sub.l %[dpth],%%d1 \n" 1545 "lsl.l %%d1, %%d0 \n"
1175 "neg.l %%d1 \n" /* d1 = depth - d1 */ 1546 "sub.l %[dpth], %%d1 \n"
1176 "lsr.l %%d1,%%d2 \n" 1547 "neg.l %%d1 \n" /* d1 = depth - d1 */
1177 "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ 1548 "lsr.l %%d1, %%d2 \n"
1178 1549 "or.l %%d0, %%d2 \n" /* rotated_pattern = d2 | d0 */
1179 "or.l #0x0100,%[mask] \n" /* set mask bit */ 1550
1180 1551 "or.l #0x0100, %[mask] \n" /* set mask bit */
1181 ".ur_skip: \n" 1552
1182 "lsr.l #1,%[mask] \n" /* shift mask */ 1553 ".ur_skip: \n"
1183 "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ 1554 "lsr.l #1, %[mask] \n" /* shift mask */
1184 1555 "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */
1185 "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ 1556
1186 "bne.b .ur_pre_loop \n" 1557 "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */
1187 : /* outputs */ 1558 "bne.b .ur_pre_loop \n"
1188 [cbuf]"+a"(cbuf), 1559 : /* outputs */
1189 [bbuf]"+a"(bbuf), 1560 [cbuf]"+a"(cbuf),
1190 [patp]"+a"(pat_ptr), 1561 [bbuf]"+a"(bbuf),
1191 [rnd] "+d"(_gray_random_buffer), 1562 [patp]"+a"(pat_ptr),
1192 [mask]"=&d"(mask) 1563 [rnd] "+d"(_gray_random_buffer),
1193 : /* inputs */ 1564 [mask]"=&d"(mask)
1194 [bpat]"a"(_gray_info.bitpattern), 1565 : /* inputs */
1195 [dpth]"d"(_gray_info.depth), 1566 [bpat]"a"(_gray_info.bitpattern),
1196 [rmsk]"d"(_gray_info.randmask) 1567 [dpth]"d"(_gray_info.depth),
1197 : /* clobbers */ 1568 [rmsk]"d"(_gray_info.randmask)
1198 "d0", "d1", "d2", "d3" 1569 : /* clobbers */
1570 "d0", "d1", "d2", "d3"
1199 ); 1571 );
1200 1572
1201 addr = dst_row; 1573 addr = dst_row;
1202 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 1574 mask = ~mask & 0xff;
1575 depth = _gray_info.depth;
1203 1576
1204 /* set the bits for all 8 pixels in all bytes according to the 1577 /* set the bits for all 8 pixels in all bytes according to the
1205 * precalculated patterns on the pattern stack */ 1578 * precalculated patterns on the pattern stack */
1206 asm volatile ( 1579 asm volatile
1207 "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" 1580 (
1208 /* pop all 8 patterns */ 1581 "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */
1209 "not.l %[mask] \n" /* "set" mask -> "keep" mask */ 1582 /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */
1210 "and.l #0xFF,%[mask] \n" 1583
1211 "beq.b .ur_sstart \n" /* short loop if nothing to keep */ 1584 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1212 1585
1213 ".ur_floop: \n" /** full loop (there are bits to keep)**/ 1586 "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/
1214 "clr.l %%d0 \n" 1587 "lsl.l #4, %%d0 \n"
1215 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1588 /* move.l %[ax], %%d5 */ /* already in d5 */
1216 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1589 "eor.l %%d5, %%d0 \n"
1217 "lsr.l #1,%%d3 \n" 1590 "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */
1218 "addx.l %%d0,%%d0 \n" 1591 "eor.l %%d0, %%d5 \n"
1219 "lsr.l #1,%%d4 \n" 1592 "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */
1220 "addx.l %%d0,%%d0 \n" 1593 "lsr.l #4, %%d0 \n"
1221 "lsr.l #1,%%d5 \n" 1594 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */
1222 "addx.l %%d0,%%d0 \n" 1595 "move.l %%d2, %%d0 \n"
1223 "lsr.l #1,%%d6 \n" 1596 "lsl.l #4, %%d0 \n"
1224 "addx.l %%d0,%%d0 \n" 1597 "eor.l %%d6, %%d0 \n"
1225 "move.l %%a0,%%d1 \n" 1598 "and.l #0xF0F0F0F0, %%d0 \n"
1226 "lsr.l #1,%%d1 \n" 1599 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */
1227 "addx.l %%d0,%%d0 \n" 1600 "lsr.l #4, %%d0 \n"
1228 "move.l %%d1,%%a0 \n" 1601 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */
1229 "move.l %%a1,%%d1 \n" 1602 "move.l %%d3, %%d0 \n"
1230 "lsr.l #1,%%d1 \n" 1603 "lsl.l #4, %%d0 \n"
1231 "addx.l %%d0,%%d0 \n" 1604 "eor.l %%d7, %%d0 \n"
1232 "move.l %%d1,%%a1 \n" 1605 "and.l #0xF0F0F0F0, %%d0 \n"
1233 "move.l %[ax],%%d1 \n" 1606 "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */
1234 "lsr.l #1,%%d1 \n" 1607 "lsr.l #4, %%d0 \n"
1235 "addx.l %%d0,%%d0 \n" 1608 "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */
1236 "move.l %%d1,%[ax] \n" 1609 "move.l %%d4, %%d0 \n"
1237 1610 "lsl.l #4, %%d0 \n"
1238 "move.b (%[addr]),%%d1 \n" /* read old value */ 1611 "move.l %%a0, %%d5 \n"
1239 "and.l %[mask],%%d1 \n" /* mask out replaced bits */ 1612 "eor.l %%d5, %%d0 \n"
1240 "or.l %%d0,%%d1 \n" /* set new bits */ 1613 "and.l #0xF0F0F0F0, %%d0 \n"
1241 "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ 1614 "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */
1242 1615 /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */
1243 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1616 "lsr.l #4, %%d0 \n"
1244 "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ 1617 "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */
1245 "bhi.b .ur_floop \n" 1618
1246 1619 "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/
1247 "bra.b .ur_end \n" 1620 "lsl.l #2, %%d0 \n"
1248 1621 /* move.l %%a0, %%d5 */ /* still in d5 */
1249 ".ur_sstart: \n" 1622 "eor.l %%d5, %%d0 \n"
1250 "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ 1623 "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */
1251 1624 "eor.l %%d0, %%d5 \n"
1252 ".ur_sloop: \n" /** short loop (nothing to keep) **/ 1625 "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */
1253 "clr.l %%d0 \n" 1626 "lsr.l #2, %%d0 \n"
1254 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1627 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */
1255 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1628 "move.l %[ax], %%d5 \n"
1256 "lsr.l #1,%%d3 \n" 1629 "move.l %%d5, %%d0 \n"
1257 "addx.l %%d0,%%d0 \n" 1630 "lsl.l #2, %%d0 \n"
1258 "lsr.l #1,%%d4 \n" 1631 "eor.l %%d7, %%d0 \n"
1259 "addx.l %%d0,%%d0 \n" 1632 "and.l #0xCCCCCCCC, %%d0 \n"
1260 "lsr.l #1,%%d5 \n" 1633 "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1261 "addx.l %%d0,%%d0 \n" 1634 "lsr.l #2, %%d0 \n"
1262 "lsr.l #1,%%d6 \n" 1635 "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */
1263 "addx.l %%d0,%%d0 \n" 1636 /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */
1264 "lsr.l #1,%[mask] \n" 1637 "move.l %%d2, %%d0 \n"
1265 "addx.l %%d0,%%d0 \n" 1638 "lsl.l #2, %%d0 \n"
1266 "move.l %%a1,%%d1 \n" 1639 "eor.l %%d4, %%d0 \n"
1267 "lsr.l #1,%%d1 \n" 1640 "and.l #0xCCCCCCCC, %%d0 \n"
1268 "addx.l %%d0,%%d0 \n" 1641 "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */
1269 "move.l %%d1,%%a1 \n" 1642 "lsr.l #2, %%d0 \n"
1270 "move.l %[ax],%%d1 \n" 1643 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */
1271 "lsr.l #1,%%d1 \n" 1644 "move.l %%d1, %%d0 \n"
1272 "addx.l %%d0,%%d0 \n" 1645 "lsl.l #2, %%d0 \n"
1273 "move.l %%d1,%[ax] \n" 1646 "eor.l %%d3, %%d0 \n"
1274 1647 "and.l #0xCCCCCCCC, %%d0 \n"
1275 "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ 1648 "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */
1276 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1649 "lsr.l #2, %%d0 \n"
1277 "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ 1650 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */
1278 "bhi.b .ur_sloop \n" 1651
1279 1652 "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/
1280 ".ur_end: \n" 1653 "lsl.l #1, %%d0 \n"
1281 : /* outputs */ 1654 "eor.l %%d2, %%d0 \n"
1282 [addr]"+a"(addr), 1655 "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */
1283 [mask]"+d"(mask), 1656 "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */
1284 [ax] "=&a"(trash) 1657 "lsr.l #1, %%d0 \n"
1285 : /* inputs */ 1658 "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */
1286 [psiz]"a"(_gray_info.plane_size), 1659 "move.l %%d3, %%d0 \n"
1287 [end] "a"(end), 1660 "lsl.l #1, %%d0 \n"
1288 [patp]"[ax]"(pat_ptr) 1661 "eor.l %%d4, %%d0 \n"
1289 : /* clobbers */ 1662 "and.l #0xAAAAAAAA, %%d0 \n"
1290 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" 1663 "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */
1664 "lsr.l #1, %%d0 \n"
1665 "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */
1666 /* move.l %[ax], %%d5 */ /* still in d5 */
1667 "move.l %%d5, %%d0 \n"
1668 "lsl.l #1, %%d0 \n"
1669 "eor.l %%d6, %%d0 \n"
1670 "and.l #0xAAAAAAAA, %%d0 \n"
1671 "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */
1672 "lsr.l #1, %%d0 \n"
1673 "eor.l %%d0, %%d5 \n"
1674 "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */
1675 "move.l %%d7, %%d0 \n"
1676 "lsl.l #1, %%d0 \n"
1677 "move.l %%a0, %%d5 \n"
1678 "eor.l %%d5, %%d0 \n"
1679 "and.l #0xAAAAAAAA, %%d0 \n"
1680 "eor.l %%d0, %%d5 \n"
1681 "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */
1682 "lsr.l #1, %%d0 \n"
1683 "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */
1684
1685 "tst.l %[mask] \n"
1686 "jeq .ur_sloop \n" /* short loop if nothing to keep */
1687
1688 "move.l %[mask], %%d5 \n" /* need mask in data reg. */
1689 "move.l %%d1, %[mask] \n" /* free d1 as working reg. */
1690
1691 ".ur_floop: \n" /** full loop (there are bits to keep)**/
1692 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1693 "bhs.s .ur_f8 \n"
1694
1695 "move.l %[psiz], %%d0 \n"
1696 "move.l %[dpth], %%d1 \n"
1697 "mulu.w %%d1, %%d0 \n" /* point behind the last plane */
1698 "add.l %%d0, %[addr] \n" /* for this round */
1699 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1700 "bra.s .ur_f1 \n" /* dpth == 0 should never happen */
1701 "bra.s .ur_f2 \n"
1702 "bra.s .ur_f3 \n"
1703 "bra.s .ur_f4 \n"
1704 "bra.s .ur_f5 \n"
1705 "bra.s .ur_f6 \n"
1706 "bra.s .ur_f7 \n"
1707
1708 ".ur_f8: \n"
1709 "move.l %[psiz], %%d0 \n"
1710 "lsl.l #3, %%d0 \n"
1711 "add.l %%d0, %[addr] \n"
1712 /* Point behind the last plane for this round. Note: We're using the
1713 * registers backwards in order to reuse the streak for the last round.
1714 * Therefore we need to go thru the bitplanes backwards too, otherwise
1715 * the bit order would be destroyed which results in more flicker. */
1716 "sub.l %[psiz], %[addr] \n"
1717 "move.b (%[addr]), %%d0 \n" /* load old byte */
1718 "and.l %%d5, %%d0 \n" /* mask out replaced bits */
1719 "move.l %[mask], %%d1 \n"
1720 "or.l %%d1, %%d0 \n" /* set new bits */
1721 "move.b %%d0, (%[addr]) \n" /* store byte */
1722 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1723 "move.l %%d1, %[mask] \n"
1724 ".ur_f7: \n"
1725 "sub.l %[psiz], %[addr] \n"
1726 "move.b (%[addr]), %%d0 \n"
1727 "and.l %%d5, %%d0 \n"
1728 "or.l %%d2, %%d0 \n"
1729 "move.b %%d0, (%[addr]) \n"
1730 "lsr.l #8, %%d2 \n"
1731 ".ur_f6: \n"
1732 "sub.l %[psiz], %[addr] \n"
1733 "move.b (%[addr]), %%d0 \n"
1734 "and.l %%d5, %%d0 \n"
1735 "or.l %%d3, %%d0 \n"
1736 "move.b %%d0, (%[addr]) \n"
1737 "lsr.l #8, %%d3 \n"
1738 ".ur_f5: \n"
1739 "sub.l %[psiz], %[addr] \n"
1740 "move.b (%[addr]), %%d0 \n"
1741 "and.l %%d5, %%d0 \n"
1742 "or.l %%d4, %%d0 \n"
1743 "move.b %%d0, (%[addr]) \n"
1744 "lsr.l #8, %%d4 \n"
1745 ".ur_f4: \n"
1746 "sub.l %[psiz], %[addr] \n"
1747 "move.b (%[addr]), %%d0 \n"
1748 "and.l %%d5, %%d0 \n"
1749 "move.l %[ax], %%d1 \n"
1750 "or.l %%d1, %%d0 \n"
1751 "move.b %%d0, (%[addr]) \n"
1752 "lsr.l #8, %%d1 \n"
1753 "move.l %%d1, %[ax] \n"
1754 ".ur_f3: \n"
1755 "sub.l %[psiz], %[addr] \n"
1756 "move.b (%[addr]), %%d0 \n"
1757 "and.l %%d5, %%d0 \n"
1758 "or.l %%d6, %%d0 \n"
1759 "move.b %%d0, (%[addr]) \n"
1760 "lsr.l #8, %%d6 \n"
1761 ".ur_f2: \n"
1762 "sub.l %[psiz], %[addr] \n"
1763 "move.b (%[addr]), %%d0 \n"
1764 "and.l %%d5, %%d0 \n"
1765 "or.l %%d7, %%d0 \n"
1766 "move.b %%d0, (%[addr]) \n"
1767 "lsr.l #8, %%d7 \n"
1768 ".ur_f1: \n"
1769 "sub.l %[psiz], %[addr] \n"
1770 "move.b (%[addr]), %%d0 \n"
1771 "and.l %%d5, %%d0 \n"
1772 "move.l %%a0, %%d1 \n"
1773 "or.l %%d1, %%d0 \n"
1774 "move.b %%d0, (%[addr]) \n"
1775 "lsr.l #8, %%d1 \n"
1776 "move.l %%d1, %%a0 \n"
1777
1778 "move.l %[psiz], %%d0 \n"
1779 "lsl.l #3, %%d0 \n"
1780 "add.l %%d0, %[addr] \n" /* correct address */
1781 "subq.l #8, %[dpth] \n"
1782 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
1783 "jgt .ur_floop \n" /* next round if anything left */
1784
1785 "jra .ur_end \n"
1786
1787 ".ur_sloop: \n" /** short loop (nothing to keep) **/
1788 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1789 "bhs.s .ur_s8 \n"
1790
1791 "move.l %[psiz], %%d0 \n"
1792 "move.l %[dpth], %%d5 \n"
1793 "mulu.w %%d5, %%d0 \n" /* point behind the last plane */
1794 "add.l %%d0, %[addr] \n" /* for this round */
1795 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1796 "bra.s .ur_s1 \n" /* dpth == 0 should never happen */
1797 "bra.s .ur_s2 \n"
1798 "bra.s .ur_s3 \n"
1799 "bra.s .ur_s4 \n"
1800 "bra.s .ur_s5 \n"
1801 "bra.s .ur_s6 \n"
1802 "bra.s .ur_s7 \n"
1803
1804 ".ur_s8: \n"
1805 "move.l %[psiz], %%d0 \n" /* Point behind the last plane */
1806 "lsl.l #3, %%d0 \n" /* for this round. */
1807 "add.l %%d0, %[addr] \n" /* See above. */
1808
1809 "sub.l %[psiz], %[addr] \n"
1810 "move.b %%d1, (%[addr]) \n" /* store byte */
1811 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1812 ".ur_s7: \n"
1813 "sub.l %[psiz], %[addr] \n"
1814 "move.b %%d2, (%[addr]) \n"
1815 "lsr.l #8, %%d2 \n"
1816 ".ur_s6: \n"
1817 "sub.l %[psiz], %[addr] \n"
1818 "move.b %%d3, (%[addr]) \n"
1819 "lsr.l #8, %%d3 \n"
1820 ".ur_s5: \n"
1821 "sub.l %[psiz], %[addr] \n"
1822 "move.b %%d4, (%[addr]) \n"
1823 "lsr.l #8, %%d4 \n"
1824 ".ur_s4: \n"
1825 "sub.l %[psiz], %[addr] \n"
1826 "move.l %[ax], %%d5 \n"
1827 "move.b %%d5, (%[addr]) \n"
1828 "lsr.l #8, %%d5 \n"
1829 "move.l %%d5, %[ax] \n"
1830 ".ur_s3: \n"
1831 "sub.l %[psiz], %[addr] \n"
1832 "move.b %%d6, (%[addr]) \n"
1833 "lsr.l #8, %%d6 \n"
1834 ".ur_s2: \n"
1835 "sub.l %[psiz], %[addr] \n"
1836 "move.b %%d7, (%[addr]) \n"
1837 "lsr.l #8, %%d7 \n"
1838 ".ur_s1: \n"
1839 "sub.l %[psiz], %[addr] \n"
1840 "move.l %%a0, %%d5 \n"
1841 "move.b %%d5, (%[addr]) \n"
1842 "lsr.l #8, %%d5 \n"
1843 "move.l %%d5, %%a0 \n"
1844
1845 "add.l %%d0, %[addr] \n" /* correct address */
1846 "subq.l #8, %[dpth] \n"
1847 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
1848 "jgt .ur_sloop \n" /* next round if anything left */
1849
1850 ".ur_end: \n"
1851 : /* outputs */
1852 [addr]"+a"(addr),
1853 [dpth]"+a"(depth),
1854 [mask]"+a"(mask),
1855 [ax] "=&a"(trash)
1856 : /* inputs */
1857 [psiz]"a"(_gray_info.plane_size),
1858 [patp]"[ax]"(pat_ptr)
1859 : /* clobbers */
1860 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
1291 ); 1861 );
1292 } 1862 }
1293#else /* C version, for reference*/ 1863#else /* C version, for reference*/
@@ -1680,4 +2250,3 @@ static void gray_screendump_hook(int fd)
1680} 2250}
1681 2251
1682#endif /* HAVE_LCD_BITMAP */ 2252#endif /* HAVE_LCD_BITMAP */
1683
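
The write-out replaces the old loop, which assembled one output byte per bitplane by shifting a single bit out of each of the eight pattern registers, with streaks that store a whole byte per plane and enter the streak through a jump table for the partial last group. A rough C model of that structure, under the assumptions stated in the comments above (planes of each group walked backwards, registers reused for the last round), could look like the sketch below; write_planes() and its parameters are illustrative only.

#include <stdint.h>

static void write_planes(unsigned char *addr, uint32_t pat[8],
                         unsigned depth, unsigned keep, long plane_size)
{
    while (depth > 0)
    {
        unsigned planes = (depth < 8) ? depth : 8;     /* planes this round */
        unsigned char *p = addr + planes * plane_size; /* behind last plane */
        unsigned i;

        /* walk the group backwards, like the .ur_f8/.ur_s8 streaks */
        for (i = planes; i-- > 0; )
        {
            p -= plane_size;
            unsigned data = pat[i] & 0xFF;             /* byte for this plane */
            if (keep)
                data |= *p & keep;                     /* merge with kept bits */
            *p = (unsigned char)data;
            pat[i] >>= 8;                              /* shift out used byte */
        }
        addr += 8 * plane_size;                        /* next group of planes */
        depth -= planes;
    }
}

When keep is zero (nothing to preserve), the read-modify-write degenerates into a plain store, which is what the separate ".ur_sloop"/".wa_sloop" streaks implement.
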
diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c
index 9406664ea2..dcc65bdd09 100644
--- a/apps/plugins/lib/gray_draw.c
+++ b/apps/plugins/lib/gray_draw.c
@@ -868,24 +868,24 @@ void gray_ub_clear_display(void)
868 868
869/* Write a pixel block, defined by their brightnesses in a greymap. 869/* Write a pixel block, defined by their brightnesses in a greymap.
870 Address is the byte in the first bitplane, src is the greymap start address, 870 Address is the byte in the first bitplane, src is the greymap start address,
871 stride is the increment for the greymap to get to the next pixel, mask 871 mask determines which pixels of the destination block are changed. */
872 determines which pixels of the destination block are changed. */
873static void _writearray(unsigned char *address, const unsigned char *src, 872static void _writearray(unsigned char *address, const unsigned char *src,
874 unsigned mask) 873 unsigned mask)
875{ 874{
876 unsigned long pat_stack[8]; 875 unsigned long pat_stack[8];
877 unsigned long *pat_ptr = &pat_stack[8]; 876 unsigned long *pat_ptr = &pat_stack[8];
878 unsigned char *addr, *end; 877 unsigned char *addr;
879#ifdef CPU_ARM 878#ifdef CPU_ARM
880 const unsigned char *_src; 879 const unsigned char *_src;
881 unsigned _mask, trash; 880 unsigned _mask, depth, trash;
882 881
883 _mask = mask; 882 _mask = mask;
884 _src = src; 883 _src = src;
885 884
886 /* precalculate the bit patterns with random shifts 885 /* precalculate the bit patterns with random shifts
887 for all 8 pixels and put them on an extra "stack" */ 886 for all 8 pixels and put them on an extra "stack" */
888 asm volatile ( 887 asm volatile
888 (
889 "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ 889 "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */
890 "mov r3, #8 \n" /* loop count */ 890 "mov r3, #8 \n" /* loop count */
891 891
@@ -932,83 +932,228 @@ static void _writearray(unsigned char *address, const unsigned char *src,
932 ); 932 );
933 933
934 addr = address; 934 addr = address;
935 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
936 _mask = mask; 935 _mask = mask;
936 depth = _gray_info.depth;
937 937
938 /* set the bits for all 8 pixels in all bytes according to the 938 /* set the bits for all 8 pixels in all bytes according to the
939 * precalculated patterns on the pattern stack */ 939 * precalculated patterns on the pattern stack */
940 asm volatile ( 940 asm volatile
941 "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ 941 (
942 942 "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */
943 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ 943
944 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
945
946 "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/
947 "orr %[rx], %[rx], %[rx], lsl #8 \n"
948 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */
949 "eor r0, r1, r5, lsl #4 \n"
950 "and r0, r0, %[rx] \n"
951 "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
952 "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
953 "eor r0, r2, r6, lsl #4 \n"
954 "and r0, r0, %[rx] \n"
955 "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
956 "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
957 "eor r0, r3, r7, lsl #4 \n"
958 "and r0, r0, %[rx] \n"
959 "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
960 "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
961 "eor r0, r4, r8, lsl #4 \n"
962 "and r0, r0, %[rx] \n"
963 "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
964 "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
965
966 "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/
967 "orr %[rx], %[rx], %[rx], lsl #8 \n"
968 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */
969 "eor r0, r1, r3, lsl #2 \n"
970 "and r0, r0, %[rx] \n"
971 "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
972 "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
973 "eor r0, r2, r4, lsl #2 \n"
974 "and r0, r0, %[rx] \n"
975 "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
976 "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
977 "eor r0, r5, r7, lsl #2 \n"
978 "and r0, r0, %[rx] \n"
979 "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
980 "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
981 "eor r0, r6, r8, lsl #2 \n"
982 "and r0, r0, %[rx] \n"
983 "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
984 "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
985
986 "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/
987 "orr %[rx], %[rx], %[rx], lsl #8 \n"
988 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */
989 "eor r0, r1, r2, lsl #1 \n"
990 "and r0, r0, %[rx] \n"
991 "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
992 "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
993 "eor r0, r3, r4, lsl #1 \n"
994 "and r0, r0, %[rx] \n"
995 "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
996 "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
997 "eor r0, r5, r6, lsl #1 \n"
998 "and r0, r0, %[rx] \n"
999 "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
1000 "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
1001 "eor r0, r7, r8, lsl #1 \n"
1002 "and r0, r0, %[rx] \n"
1003 "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
1004 "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
1005
1006 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
944 "ands %[mask], %[mask], #0xff \n" 1007 "ands %[mask], %[mask], #0xff \n"
945 "beq .wa_sloop \n" /* short loop if nothing to keep */ 1008 "beq .wa_sloop \n" /* short loop if no bits to keep */
946 1009
947 ".wa_floop: \n" /** full loop (there are bits to keep)**/ 1010 ".wa_floop: \n" /** full loop (bits to keep)**/
948 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 1011 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
949 "adc r0, r0, r0 \n" /* put bit into LSB of byte */ 1012 "bhs .wa_f8 \n"
950 "movs r8, r8, lsr #1 \n" 1013
951 "adc r0, r0, r0 \n" 1014 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
952 "movs r7, r7, lsr #1 \n" 1015 "add %[addr], %[addr], r0 \n" /* for this round */
953 "adc r0, r0, r0 \n" 1016
954 "movs r6, r6, lsr #1 \n" 1017
955 "adc r0, r0, r0 \n" 1018 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
956 "movs r5, r5, lsr #1 \n" 1019 "add pc, pc, r0 \n"
957 "adc r0, r0, r0 \n" 1020 ".wa_ftable: \n"
958 "movs r4, r4, lsr #1 \n" 1021 ".byte .wa_f0 - .wa_ftable - 4 \n" /* [jump tables are tricky] */
959 "adc r0, r0, r0 \n" 1022 ".byte .wa_f1 - .wa_ftable - 4 \n"
960 "movs r3, r3, lsr #1 \n" 1023 ".byte .wa_f2 - .wa_ftable - 4 \n"
961 "adc r0, r0, r0 \n" 1024 ".byte .wa_f3 - .wa_ftable - 4 \n"
962 "movs r2, r2, lsr #1 \n" 1025 ".byte .wa_f4 - .wa_ftable - 4 \n"
963 "adc r0, r0, r0 \n" 1026 ".byte .wa_f5 - .wa_ftable - 4 \n"
964 1027 ".byte .wa_f6 - .wa_ftable - 4 \n"
965 "ldrb r1, [%[addr]] \n" /* read old value */ 1028 ".byte .wa_f7 - .wa_ftable - 4 \n"
966 "and r1, r1, %[mask] \n" /* mask out replaced bits */ 1029
967 "orr r1, r1, r0 \n" /* set new bits */ 1030 ".wa_f8: \n"
968 "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ 1031 "add %[addr], %[addr], %[psiz], lsl #3 \n"
969 1032 /* Point behind the last plane for this round. Note: We're using the
970 "cmp %[end], %[addr] \n" /* loop through all bitplanes */ 1033 * registers backwards in order to reuse the streak for the last round.
971 "bne .wa_floop \n" 1034 * Therefore we need to go thru the bitplanes backwards too, otherwise
972 1035 * the bit order would be destroyed which results in more flicker. */
1036 "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */
1037 "and r0, r0, %[mask] \n" /* mask out replaced bits */
1038 "orr r0, r0, r8 \n" /* set new bits */
1039 "strb r0, [%[addr]] \n" /* store byte */
1040 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
1041 ".wa_f7: \n"
1042 "ldrb r0, [%[addr], -%[psiz]]! \n"
1043 "and r0, r0, %[mask] \n"
1044 "orr r0, r0, r7 \n"
1045 "strb r0, [%[addr]] \n"
1046 "mov r7, r7, lsr #8 \n"
1047 ".wa_f6: \n"
1048 "ldrb r0, [%[addr], -%[psiz]]! \n"
1049 "and r0, r0, %[mask] \n"
1050 "orr r0, r0, r6 \n"
1051 "strb r0, [%[addr]] \n"
1052 "mov r6, r6, lsr #8 \n"
1053 ".wa_f5: \n"
1054 "ldrb r0, [%[addr], -%[psiz]]! \n"
1055 "and r0, r0, %[mask] \n"
1056 "orr r0, r0, r5 \n"
1057 "strb r0, [%[addr]] \n"
1058 "mov r5, r5, lsr #8 \n"
1059 ".wa_f4: \n"
1060 "ldrb r0, [%[addr], -%[psiz]]! \n"
1061 "and r0, r0, %[mask] \n"
1062 "orr r0, r0, r4 \n"
1063 "strb r0, [%[addr]] \n"
1064 "mov r4, r4, lsr #8 \n"
1065 ".wa_f3: \n"
1066 "ldrb r0, [%[addr], -%[psiz]]! \n"
1067 "and r0, r0, %[mask] \n"
1068 "orr r0, r0, r3 \n"
1069 "strb r0, [%[addr]] \n"
1070 "mov r3, r3, lsr #8 \n"
1071 ".wa_f2: \n"
1072 "ldrb r0, [%[addr], -%[psiz]]! \n"
1073 "and r0, r0, %[mask] \n"
1074 "orr r0, r0, r2 \n"
1075 "strb r0, [%[addr]] \n"
1076 "mov r2, r2, lsr #8 \n"
1077 ".wa_f1: \n"
1078 "ldrb r0, [%[addr], -%[psiz]]! \n"
1079 "and r0, r0, %[mask] \n"
1080 "orr r0, r0, r1 \n"
1081 "strb r0, [%[addr]] \n"
1082 "mov r1, r1, lsr #8 \n"
1083 ".wa_f0: \n"
1084
1085 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
1086 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
1087 "bhi .wa_floop \n"
1088
973 "b .wa_end \n" 1089 "b .wa_end \n"
974 1090
975 ".wa_sloop: \n" /** short loop (nothing to keep) **/ 1091 ".wa_sloop: \n" /** short loop (nothing to keep) **/
976 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 1092 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
977 "adc r0, r0, r0 \n" /* put bit into LSB of byte */ 1093 "bhs .wa_s8 \n"
978 "movs r8, r8, lsr #1 \n"
979 "adc r0, r0, r0 \n"
980 "movs r7, r7, lsr #1 \n"
981 "adc r0, r0, r0 \n"
982 "movs r6, r6, lsr #1 \n"
983 "adc r0, r0, r0 \n"
984 "movs r5, r5, lsr #1 \n"
985 "adc r0, r0, r0 \n"
986 "movs r4, r4, lsr #1 \n"
987 "adc r0, r0, r0 \n"
988 "movs r3, r3, lsr #1 \n"
989 "adc r0, r0, r0 \n"
990 "movs r2, r2, lsr #1 \n"
991 "adc r0, r0, r0 \n"
992
993 "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
994 1094
995 "cmp %[end], %[addr] \n" /* loop through all bitplanes */ 1095 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
996 "bne .wa_sloop \n" 1096 "add %[addr], %[addr], r0 \n" /* for this round */
997 1097
1098 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
1099 "add pc, pc, r0 \n"
1100 ".wa_stable: \n"
1101 ".byte .wa_s0 - .wa_stable - 4 \n"
1102 ".byte .wa_s1 - .wa_stable - 4 \n"
1103 ".byte .wa_s2 - .wa_stable - 4 \n"
1104 ".byte .wa_s3 - .wa_stable - 4 \n"
1105 ".byte .wa_s4 - .wa_stable - 4 \n"
1106 ".byte .wa_s5 - .wa_stable - 4 \n"
1107 ".byte .wa_s6 - .wa_stable - 4 \n"
1108 ".byte .wa_s7 - .wa_stable - 4 \n"
1109
1110 ".wa_s8: \n"
1111 "add %[addr], %[addr], %[psiz], lsl #3 \n"
1112 /* Point behind the last plane for this round. See above. */
1113 "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */
1114 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
1115 ".wa_s7: \n"
1116 "strb r7, [%[addr], -%[psiz]]! \n"
1117 "mov r7, r7, lsr #8 \n"
1118 ".wa_s6: \n"
1119 "strb r6, [%[addr], -%[psiz]]! \n"
1120 "mov r6, r6, lsr #8 \n"
1121 ".wa_s5: \n"
1122 "strb r5, [%[addr], -%[psiz]]! \n"
1123 "mov r5, r5, lsr #8 \n"
1124 ".wa_s4: \n"
1125 "strb r4, [%[addr], -%[psiz]]! \n"
1126 "mov r4, r4, lsr #8 \n"
1127 ".wa_s3: \n"
1128 "strb r3, [%[addr], -%[psiz]]! \n"
1129 "mov r3, r3, lsr #8 \n"
1130 ".wa_s2: \n"
1131 "strb r2, [%[addr], -%[psiz]]! \n"
1132 "mov r2, r2, lsr #8 \n"
1133 ".wa_s1: \n"
1134 "strb r1, [%[addr], -%[psiz]]! \n"
1135 "mov r1, r1, lsr #8 \n"
1136 ".wa_s0: \n"
1137
1138 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
1139 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
1140 "bhi .wa_sloop \n"
1141
998 ".wa_end: \n" 1142 ".wa_end: \n"
999 : /* outputs */ 1143 : /* outputs */
1000 [addr]"+r"(addr), 1144 [addr]"+r"(addr),
1001 [mask]"+r"(_mask), 1145 [mask]"+r"(_mask),
1146 [dpth]"+r"(depth),
1002 [rx] "=&r"(trash) 1147 [rx] "=&r"(trash)
1003 : /* inputs */ 1148 : /* inputs */
1004 [psiz]"r"(_gray_info.plane_size), 1149 [psiz]"r"(_gray_info.plane_size),
1005 [end] "r"(end),
1006 [patp]"[rx]"(pat_ptr) 1150 [patp]"[rx]"(pat_ptr)
1007 : /* clobbers */ 1151 : /* clobbers */
1008 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" 1152 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
1009 ); 1153 );
1010#else /* C version, for reference*/ 1154#else /* C version, for reference*/
1011#warning C version of _writearray() used 1155#warning C version of _writearray() used
1156 unsigned char *end;
1012 unsigned test = 0x80; 1157 unsigned test = 0x80;
1013 int i; 1158 int i;
1014 1159
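For reference, the three masked "comb" stages in the ARM block above amount to transposing four 8x8 bit blocks: afterwards each byte of a pattern word holds, for one bitplane, the corresponding bit of all eight pixel patterns, so a plane can be filled with a single byte store. A minimal C sketch of that rotation (illustrative only; r[0..7] stands for the eight words popped from the pattern stack):

/* Transpose the four 8x8 bit blocks spread across r[0..7].  After the three
 * stages, byte m of r[k] carries bit (8*m + k) of every pixel pattern, i.e.
 * the byte to be written to bitplane 8*m + k.  Sketch, not the patch code. */
static void gray_transpose8_sketch(unsigned long r[8])
{
    unsigned long t;
    int i, j;

    for (i = 0; i < 4; i++)                 /* stage 1: 4 bit "comb" */
    {
        t = (r[i] ^ (r[i+4] << 4)) & 0xF0F0F0F0UL;
        r[i]   ^= t;
        r[i+4] ^= t >> 4;
    }
    for (i = 0; i < 8; i += 4)              /* stage 2: 2 bit "comb" */
        for (j = i; j < i + 2; j++)
        {
            t = (r[j] ^ (r[j+2] << 2)) & 0xCCCCCCCCUL;
            r[j]   ^= t;
            r[j+2] ^= t >> 2;
        }
    for (i = 0; i < 8; i += 2)              /* stage 3: 1 bit "comb" */
    {
        t = (r[i] ^ (r[i+1] << 1)) & 0xAAAAAAAAUL;
        r[i]   ^= t;
        r[i+1] ^= t >> 1;
    }
}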
@@ -1143,67 +1288,70 @@ void gray_ub_gray_bitmap_part(const unsigned char *src, int src_x, int src_y,
1143 stride is the increment for the greymap to get to the next pixel, mask 1288 stride is the increment for the greymap to get to the next pixel, mask
1144 determines which pixels of the destination block are changed. */ 1289 determines which pixels of the destination block are changed. */
1145static void _writearray(unsigned char *address, const unsigned char *src, 1290static void _writearray(unsigned char *address, const unsigned char *src,
1291 int stride, unsigned mask) __attribute__((noinline));
1292static void _writearray(unsigned char *address, const unsigned char *src,
1146 int stride, unsigned mask) 1293 int stride, unsigned mask)
1147{ 1294{
1148 unsigned long pat_stack[8]; 1295 unsigned long pat_stack[8];
1149 unsigned long *pat_ptr = &pat_stack[8]; 1296 unsigned long *pat_ptr = &pat_stack[8];
1150 unsigned char *addr, *end; 1297 unsigned char *addr;
1151#if CONFIG_CPU == SH7034 1298#if CONFIG_CPU == SH7034
1152 const unsigned char *_src; 1299 const unsigned char *_src;
1153 unsigned _mask, trash; 1300 unsigned _mask, depth, trash;
1154 1301
1155 _mask = mask; 1302 _mask = mask;
1156 _src = src; 1303 _src = src;
1157 1304
1158 /* precalculate the bit patterns with random shifts 1305 /* precalculate the bit patterns with random shifts
1159 for all 8 pixels and put them on an extra "stack" */ 1306 for all 8 pixels and put them on an extra "stack" */
1160 asm volatile ( 1307 asm volatile
1161 "mov #8,r3 \n" /* loop count */ 1308 (
1162 1309 "mov #8, r3 \n" /* loop count */
1163 ".wa_loop: \n" /** load pattern for pixel **/ 1310
1164 "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ 1311 ".wa_loop: \n" /** load pattern for pixel **/
1165 "shlr %[mask] \n" /* shift out lsb of mask */ 1312 "mov #0, r0 \n" /* pattern for skipped pixel must be 0 */
1166 "bf .wa_skip \n" /* skip this pixel */ 1313 "shlr %[mask] \n" /* shift out lsb of mask */
1167 1314 "bf .wa_skip \n" /* skip this pixel */
1168 "mov.b @%[src],r0 \n" /* load src byte */ 1315
1169 "extu.b r0,r0 \n" /* extend unsigned */ 1316 "mov.b @%[src], r0 \n" /* load src byte */
1170 "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ 1317 "extu.b r0, r0 \n" /* extend unsigned */
1171 "extu.b r0,r0 \n" /* extend unsigned */ 1318 "mov.b @(r0,%[trns]), r0 \n" /* idxtable into pattern index */
1172 "shll2 r0 \n" 1319 "extu.b r0, r0 \n" /* extend unsigned */
1173 "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ 1320 "shll2 r0 \n"
1174 1321 "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */
1175 "mov #75,r0 \n" 1322
1176 "mulu r0,%[rnd] \n" /* multiply by 75 */ 1323 "mov #75, r0 \n"
1177 "sts macl,%[rnd] \n" 1324 "mulu r0, %[rnd] \n" /* multiply by 75 */
1178 "add #74,%[rnd] \n" /* add another 74 */ 1325 "sts macl, %[rnd] \n"
1326 "add #74, %[rnd] \n" /* add another 74 */
1179 /* Since the lower bits are not very random: */ 1327 /* Since the lower bits are not very random: */
1180 "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ 1328 "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */
1181 "and %[rmsk],r1 \n" /* mask out unneeded bits */ 1329 "and %[rmsk], r1 \n" /* mask out unneeded bits */
1182 1330
1183 "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ 1331 "cmp/hs %[dpth], r1 \n" /* random >= depth ? */
1184 "bf .wa_ntrim \n" 1332 "bf .wa_ntrim \n"
1185 "sub %[dpth],r1 \n" /* yes: random -= depth; */ 1333 "sub %[dpth], r1 \n" /* yes: random -= depth; */
1186 ".wa_ntrim: \n" 1334 ".wa_ntrim: \n"
1187 1335
1188 "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ 1336 "mov.l .ashlsi3, r0 \n" /** rotate pattern **/
1189 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ 1337 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
1190 "mov r1,r5 \n" 1338 "mov r1, r5 \n"
1191 1339
1192 "mov %[dpth],r5 \n" 1340 "mov %[dpth], r5 \n"
1193 "sub r1,r5 \n" /* r5 = depth - r1 */ 1341 "sub r1, r5 \n" /* r5 = depth - r1 */
1194 "mov.l .lshrsi3,r1 \n" 1342 "mov.l .lshrsi3, r1 \n"
1195 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ 1343 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
1196 "mov r0,r1 \n" /* store previous result in r1 */ 1344 "mov r0, r1 \n" /* store previous result in r1 */
1197 1345
1198 "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ 1346 "or r1, r0 \n" /* rotated_pattern = r0 | r1 */
1199 1347
1200 ".wa_skip: \n" 1348 ".wa_skip: \n"
1201 "mov.l r0,@-%[patp] \n" /* push on pattern stack */ 1349 "mov.l r0, @-%[patp] \n" /* push on pattern stack */
1202 1350
1203 "add %[stri],%[src] \n" /* src += stride; */ 1351 "add %[stri], %[src] \n" /* src += stride; */
1204 "add #-1,r3 \n" /* loop 8 times (pixel block) */ 1352 "add #-1, r3 \n" /* loop 8 times (pixel block) */
1205 "cmp/pl r3 \n" 1353 "cmp/pl r3 \n"
1206 "bt .wa_loop \n" 1354 "bt .wa_loop \n"
1207 : /* outputs */ 1355 : /* outputs */
1208 [src] "+r"(_src), 1356 [src] "+r"(_src),
1209 [rnd] "+r"(_gray_random_buffer), 1357 [rnd] "+r"(_gray_random_buffer),
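The pixel loop above (.wa_loop, and its ARM/ColdFire counterparts) performs the same per-pixel pattern setup; roughly, in C (sketch only, with the bit-pattern and index tables passed in explicitly because their bindings to %[bpat] and %[trns] lie outside this hunk):

/* Look up the pixel's bit pattern and rotate it within the lower 'depth'
 * bits by a cheap pseudo-random amount, as the asm loop does.  Sketch only. */
static unsigned long pattern_for_pixel(unsigned char value,
                                       const unsigned char *idxtable,
                                       const unsigned long *bitpattern,
                                       unsigned *rnd, unsigned depth,
                                       unsigned rmask)
{
    unsigned long pat = bitpattern[idxtable[value]];
    unsigned shift;

    *rnd = *rnd * 75 + 74;          /* simple pseudo-random sequence */
    shift = (*rnd >> 8) & rmask;    /* the lower bits are not very random */
    if (shift >= depth)
        shift -= depth;             /* keep the shift below 'depth' */

    /* rotate within the 'depth' significant bits */
    return shift ? (pat << shift) | (pat >> (depth - shift)) : pat;
}

Pixels whose mask bit is clear push a zero pattern instead, so they set no bits when the planes are written.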
@@ -1220,143 +1368,369 @@ static void _writearray(unsigned char *address, const unsigned char *src,
1220 ); 1368 );
1221 1369
1222 addr = address; 1370 addr = address;
1223 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
1224 _mask = mask; 1371 _mask = mask;
1372 depth = _gray_info.depth;
1225 1373
1226 /* set the bits for all 8 pixels in all bytes according to the 1374 /* set the bits for all 8 pixels in all bytes according to the
1227 * precalculated patterns on the pattern stack */ 1375 * precalculated patterns on the pattern stack */
1228 asm volatile ( 1376 asm volatile
1229 "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ 1377 (
1230 "mov.l @%[patp]+,r2 \n" 1378 "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */
1231 "mov.l @%[patp]+,r3 \n" 1379 "mov.l @%[patp]+, r7 \n"
1232 "mov.l @%[patp]+,r6 \n" 1380 "mov.l @%[patp]+, r6 \n"
1233 "mov.l @%[patp]+,r7 \n" 1381 "mov.l @%[patp]+, r5 \n"
1234 "mov.l @%[patp]+,r8 \n" 1382 "mov.l @%[patp]+, r4 \n"
1235 "mov.l @%[patp]+,r9 \n" 1383 "mov.l @%[patp]+, r3 \n"
1236 "mov.l @%[patp],r10 \n" 1384 "mov.l @%[patp]+, r2 \n"
1237 1385 "mov.l @%[patp], r1 \n"
1238 "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ 1386
1239 "extu.b %[mask],%[mask] \n" /* mask out high bits */ 1387 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1240 "tst %[mask],%[mask] \n" 1388
1241 "bt .wa_sloop \n" /* short loop if nothing to keep */ 1389 "mov.l .wa_mask4, %[rx] \n" /* bitmask = ...11110000 */
1242 1390 "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/
1243 ".wa_floop: \n" /** full loop (there are bits to keep)**/ 1391 "shll2 r0 \n"
1244 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1392 "shll2 r0 \n"
1245 "rotcl r0 \n" /* rotate t bit into r0 */ 1393 "xor r1, r0 \n"
1246 "shlr r2 \n" 1394 "and %[rx], r0 \n"
1247 "rotcl r0 \n" 1395 "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
1248 "shlr r3 \n" 1396 "shlr2 r0 \n"
1249 "rotcl r0 \n" 1397 "shlr2 r0 \n"
1250 "shlr r6 \n" 1398 "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
1251 "rotcl r0 \n" 1399 "mov r6, r0 \n"
1252 "shlr r7 \n" 1400 "shll2 r0 \n"
1253 "rotcl r0 \n" 1401 "shll2 r0 \n"
1254 "shlr r8 \n" 1402 "xor r2, r0 \n"
1255 "rotcl r0 \n" 1403 "and %[rx], r0 \n"
1256 "shlr r9 \n" 1404 "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
1257 "rotcl r0 \n" 1405 "shlr2 r0 \n"
1258 "shlr r10 \n" 1406 "shlr2 r0 \n"
1259 "mov.b @%[addr],%[rx] \n" /* read old value */ 1407 "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
1260 "rotcl r0 \n" 1408 "mov r7, r0 \n"
1261 "and %[mask],%[rx] \n" /* mask out replaced bits */ 1409 "shll2 r0 \n"
1262 "or %[rx],r0 \n" /* set new bits */ 1410 "shll2 r0 \n"
1263 "mov.b r0,@%[addr] \n" /* store value to bitplane */ 1411 "xor r3, r0 \n"
1264 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1412 "and %[rx], r0 \n"
1265 "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ 1413 "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
1266 "bt .wa_floop \n" 1414 "shlr2 r0 \n"
1267 1415 "shlr2 r0 \n"
1268 "bra .wa_end \n" 1416 "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
1269 "nop \n" 1417 "mov r8, r0 \n"
1418 "shll2 r0 \n"
1419 "shll2 r0 \n"
1420 "xor r4, r0 \n"
1421 "and %[rx], r0 \n"
1422 "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
1423 "shlr2 r0 \n"
1424 "shlr2 r0 \n"
1425 "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
1426
1427 "mov.l .wa_mask2, %[rx] \n" /* bitmask = ...11001100 */
1428 "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/
1429 "shll2 r0 \n"
1430 "xor r1, r0 \n"
1431 "and %[rx], r0 \n"
1432 "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
1433 "shlr2 r0 \n"
1434 "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
1435 "mov r4, r0 \n"
1436 "shll2 r0 \n"
1437 "xor r2, r0 \n"
1438 "and %[rx], r0 \n"
1439 "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1440 "shlr2 r0 \n"
1441 "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
1442 "mov r7, r0 \n"
1443 "shll2 r0 \n"
1444 "xor r5, r0 \n"
1445 "and %[rx], r0 \n"
1446 "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
1447 "shlr2 r0 \n"
1448 "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
1449 "mov r8, r0 \n"
1450 "shll2 r0 \n"
1451 "xor r6, r0 \n"
1452 "and %[rx], r0 \n"
1453 "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
1454 "shlr2 r0 \n"
1455 "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
1456
1457 "mov.l .wa_mask1, %[rx] \n" /* bitmask = ...10101010 */
1458 "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/
1459 "shll r0 \n"
1460 "xor r1, r0 \n"
1461 "and %[rx], r0 \n"
1462 "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
1463 "shlr r0 \n"
1464 "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
1465 "mov r4, r0 \n"
1466 "shll r0 \n"
1467 "xor r3, r0 \n"
1468 "and %[rx], r0 \n"
1469 "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
1470 "shlr r0 \n"
1471 "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
1472 "mov r6, r0 \n"
1473 "shll r0 \n"
1474 "xor r5, r0 \n"
1475 "and %[rx], r0 \n"
1476 "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
1477 "shlr r0 \n"
1478 "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
1479 "mov r8, r0 \n"
1480 "shll r0 \n"
1481 "xor r7, r0 \n"
1482 "and %[rx], r0 \n"
1483 "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
1484 "shlr r0 \n"
1485 "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
1486
1487 "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
1488 "extu.b %[mask], %[mask] \n" /* mask out high bits */
1489 "tst %[mask], %[mask] \n"
1490 "bt .wa_sloop \n" /* short loop if nothing to keep */
1491
1492 ".wa_floop: \n" /** full loop (there are bits to keep)**/
1493 "mov #8, r0 \n"
1494 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1495 "bt .wa_f8 \n"
1496
1497 "mulu %[psiz], %[dpth] \n"
1498 "mova .wa_ftable, r0 \n"
1499 "mov.b @(r0, %[dpth]), %[rx] \n"
1500 "add %[rx], r0 \n"
1501 "sts macl, %[rx] \n" /* point behind the last plane.. */
1502 "jmp @r0 \n" /* jump into streak */
1503 "add %[rx], %[addr] \n" /* ..for this round */
1504
1505 ".align 2 \n"
1506 ".wa_ftable: \n"
1507 ".byte .wa_f0 - .wa_ftable \n"
1508 ".byte .wa_f1 - .wa_ftable \n"
1509 ".byte .wa_f2 - .wa_ftable \n"
1510 ".byte .wa_f3 - .wa_ftable \n"
1511 ".byte .wa_f4 - .wa_ftable \n"
1512 ".byte .wa_f5 - .wa_ftable \n"
1513 ".byte .wa_f6 - .wa_ftable \n"
1514 ".byte .wa_f7 - .wa_ftable \n"
1515
1516 ".wa_f8: \n"
1517 "mov %[psiz], %[rx] \n"
1518 "shll2 %[rx] \n"
1519 "add %[rx], %[rx] \n"
1520 "add %[rx], %[addr] \n"
1521 /* Point behind the last plane for this round. Note: We're using the
1522 * registers backwards in order to reuse the streak for the last round.
1523 * Therefore we need to go thru the bitplanes backwards too, otherwise
1524 * the bit order would be destroyed which results in more flicker. */
1525 "sub %[psiz], %[addr] \n"
1526 "mov.b @%[addr], r0 \n" /* load old byte */
1527 "and %[mask], r0 \n" /* mask out replaced bits */
1528 "or r8, r0 \n" /* set new bits */
1529 "mov.b r0, @%[addr] \n" /* store byte */
1530 "shlr8 r8 \n" /* shift out used-up byte */
1531 ".wa_f7: \n"
1532 "sub %[psiz], %[addr] \n"
1533 "mov.b @%[addr], r0 \n"
1534 "and %[mask], r0 \n"
1535 "or r7, r0 \n"
1536 "mov.b r0, @%[addr] \n"
1537 "shlr8 r7 \n"
1538 ".wa_f6: \n"
1539 "sub %[psiz], %[addr] \n"
1540 "mov.b @%[addr], r0 \n"
1541 "and %[mask], r0 \n"
1542 "or r6, r0 \n"
1543 "mov.b r0, @%[addr] \n"
1544 "shlr8 r6 \n"
1545 ".wa_f5: \n"
1546 "sub %[psiz], %[addr] \n"
1547 "mov.b @%[addr], r0 \n"
1548 "and %[mask], r0 \n"
1549 "or r5, r0 \n"
1550 "mov.b r0, @%[addr] \n"
1551 "shlr8 r5 \n"
1552 ".wa_f4: \n"
1553 "sub %[psiz], %[addr] \n"
1554 "mov.b @%[addr], r0 \n"
1555 "and %[mask], r0 \n"
1556 "or r4, r0 \n"
1557 "mov.b r0, @%[addr] \n"
1558 "shlr8 r4 \n"
1559 ".wa_f3: \n"
1560 "sub %[psiz], %[addr] \n"
1561 "mov.b @%[addr], r0 \n"
1562 "and %[mask], r0 \n"
1563 "or r3, r0 \n"
1564 "mov.b r0, @%[addr] \n"
1565 "shlr8 r3 \n"
1566 ".wa_f2: \n"
1567 "sub %[psiz], %[addr] \n"
1568 "mov.b @%[addr], r0 \n"
1569 "and %[mask], r0 \n"
1570 "or r2, r0 \n"
1571 "mov.b r0, @%[addr] \n"
1572 "shlr8 r2 \n"
1573 ".wa_f1: \n"
1574 "sub %[psiz], %[addr] \n"
1575 "mov.b @%[addr], r0 \n"
1576 "and %[mask], r0 \n"
1577 "or r1, r0 \n"
1578 "mov.b r0, @%[addr] \n"
1579 "shlr8 r1 \n"
1580 ".wa_f0: \n"
1581
1582 "add %[rx], %[addr] \n" /* correct address */
1583 "add #-8, %[dpth] \n"
1584 "cmp/pl %[dpth] \n" /* next round if anything left */
1585 "bt .wa_floop \n"
1586
1587 "bra .wa_end \n"
1588 "nop \n"
1270 1589
1271 /* References to C library routines used in the precalc block */ 1590 /* References to C library routines used in the precalc block */
1272 ".align 2 \n" 1591 ".align 2 \n"
1273 ".ashlsi3: \n" /* C library routine: */ 1592 ".ashlsi3: \n" /* C library routine: */
1274 ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ 1593 ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
1275 ".lshrsi3: \n" /* C library routine: */ 1594 ".lshrsi3: \n" /* C library routine: */
1276 ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ 1595 ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
1277 /* both routines preserve r4, destroy r5 and take ~16 cycles */ 1596 /* both routines preserve r4, destroy r5 and take ~16 cycles */
1278 1597
1279 ".wa_sloop: \n" /** short loop (nothing to keep) **/ 1598 /* Bitmasks for the bit block rotation */
1280 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1599 ".wa_mask4: \n"
1281 "rotcl r0 \n" /* rotate t bit into r0 */ 1600 ".long 0xF0F0F0F0 \n"
1282 "shlr r2 \n" 1601 ".wa_mask2: \n"
1283 "rotcl r0 \n" 1602 ".long 0xCCCCCCCC \n"
1284 "shlr r3 \n" 1603 ".wa_mask1: \n"
1285 "rotcl r0 \n" 1604 ".long 0xAAAAAAAA \n"
1286 "shlr r6 \n" 1605
1287 "rotcl r0 \n" 1606 ".wa_sloop: \n" /** short loop (nothing to keep) **/
1288 "shlr r7 \n" 1607 "mov #8, r0 \n"
1289 "rotcl r0 \n" 1608 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1290 "shlr r8 \n" 1609 "bt .wa_s8 \n"
1291 "rotcl r0 \n" 1610
1292 "shlr r9 \n" 1611 "mulu %[psiz], %[dpth] \n"
1293 "rotcl r0 \n" 1612 "mova .wa_stable, r0 \n"
1294 "shlr r10 \n" 1613 "mov.b @(r0, %[dpth]), %[rx] \n"
1295 "rotcl r0 \n" 1614 "add %[rx], r0 \n"
1296 "mov.b r0,@%[addr] \n" /* store byte to bitplane */ 1615 "sts macl, %[rx] \n" /* point behind the last plane.. */
1297 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1616 "jmp @r0 \n" /* jump into streak */
1298 "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ 1617 "add %[rx], %[addr] \n" /* ..for this round */
1299 "bt .wa_sloop \n" 1618
1300 1619 ".align 2 \n"
1301 ".wa_end: \n" 1620 ".wa_stable: \n"
1621 ".byte .wa_s0 - .wa_stable \n"
1622 ".byte .wa_s1 - .wa_stable \n"
1623 ".byte .wa_s2 - .wa_stable \n"
1624 ".byte .wa_s3 - .wa_stable \n"
1625 ".byte .wa_s4 - .wa_stable \n"
1626 ".byte .wa_s5 - .wa_stable \n"
1627 ".byte .wa_s6 - .wa_stable \n"
1628 ".byte .wa_s7 - .wa_stable \n"
1629
1630 ".wa_s8: \n"
1631 "mov %[psiz], %[rx] \n" /* Point behind the last plane */
1632 "shll2 %[rx] \n" /* for this round. */
1633 "add %[rx], %[rx] \n" /* See above. */
1634 "add %[rx], %[addr] \n"
1635
1636 "sub %[psiz], %[addr] \n"
1637 "mov.b r8, @%[addr] \n" /* store byte */
1638 "shlr8 r8 \n" /* shift out used-up byte */
1639 ".wa_s7: \n"
1640 "sub %[psiz], %[addr] \n"
1641 "mov.b r7, @%[addr] \n"
1642 "shlr8 r7 \n"
1643 ".wa_s6: \n"
1644 "sub %[psiz], %[addr] \n"
1645 "mov.b r6, @%[addr] \n"
1646 "shlr8 r6 \n"
1647 ".wa_s5: \n"
1648 "sub %[psiz], %[addr] \n"
1649 "mov.b r5, @%[addr] \n"
1650 "shlr8 r5 \n"
1651 ".wa_s4: \n"
1652 "sub %[psiz], %[addr] \n"
1653 "mov.b r4, @%[addr] \n"
1654 "shlr8 r4 \n"
1655 ".wa_s3: \n"
1656 "sub %[psiz], %[addr] \n"
1657 "mov.b r3, @%[addr] \n"
1658 "shlr8 r3 \n"
1659 ".wa_s2: \n"
1660 "sub %[psiz], %[addr] \n"
1661 "mov.b r2, @%[addr] \n"
1662 "shlr8 r2 \n"
1663 ".wa_s1: \n"
1664 "sub %[psiz], %[addr] \n"
1665 "mov.b r1, @%[addr] \n"
1666 "shlr8 r1 \n"
1667 ".wa_s0: \n"
1668
1669 "add %[rx], %[addr] \n" /* correct address */
1670 "add #-8, %[dpth] \n"
1671 "cmp/pl %[dpth] \n" /* next round if anything left */
1672 "bt .wa_sloop \n"
1673
1674 ".wa_end: \n"
1302 : /* outputs */ 1675 : /* outputs */
1303 [addr]"+r"(addr), 1676 [addr]"+r"(addr),
1304 [mask]"+r"(_mask), 1677 [mask]"+r"(_mask),
1678 [dpth]"+r"(depth),
1305 [rx] "=&r"(trash) 1679 [rx] "=&r"(trash)
1306 : /* inputs */ 1680 : /* inputs */
1307 [psiz]"r"(_gray_info.plane_size), 1681 [psiz]"r"(_gray_info.plane_size),
1308 [end] "r"(end),
1309 [patp]"[rx]"(pat_ptr) 1682 [patp]"[rx]"(pat_ptr)
1310 : /* clobbers */ 1683 : /* clobbers */
1311 "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" 1684 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
1312 ); 1685 );
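Stripped of the jump tables and register juggling, the write part above (both the "full" and the "short" variant) reduces to the following round-based loop; a C sketch assuming p[0..7] are the transposed pattern words and keep_mask is the inverted pixel mask (zero in the short case):

/* Write up to eight bitplanes per round from the low bytes of p[0..7],
 * walking the planes of a round backwards so the byte order of the patterns
 * is preserved for the following rounds.  Sketch, not the patch code. */
static void write_planes_sketch(unsigned char *addr, unsigned long p[8],
                                unsigned keep_mask, int depth, int plane_size)
{
    while (depth > 0)
    {
        int planes = (depth < 8) ? depth : 8;           /* planes this round */
        unsigned char *a = addr + planes * plane_size;  /* behind last plane */
        int i;

        for (i = planes; i > 0; i--)                    /* backwards streak */
        {
            unsigned data = p[i-1] & 0xFF;
            a -= plane_size;
            if (keep_mask)                  /* full loop: merge with old */
                data |= *a & keep_mask;
            *a = data;
            p[i-1] >>= 8;                   /* expose the next round's byte */
        }
        addr += 8 * plane_size;             /* advance to the next round */
        depth -= 8;
    }
}

Because each plane byte now comes straight from one register instead of being collected bit by bit across all eight pattern words, the cost per plane drops, which is where the gain for larger depths comes from.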
1313#elif defined(CPU_COLDFIRE) 1686#elif defined(CPU_COLDFIRE)
1314 const unsigned char *_src; 1687 const unsigned char *_src;
1315 unsigned _mask, trash; 1688 unsigned _mask, depth, trash;
1316 1689
1317 _mask = mask; 1690 _mask = mask;
1318 _src = src; 1691 _src = src;
1319 1692
1320 /* precalculate the bit patterns with random shifts 1693 /* precalculate the bit patterns with random shifts
1321 for all 8 pixels and put them on an extra "stack" */ 1694 for all 8 pixels and put them on an extra "stack" */
1322 asm volatile ( 1695 asm volatile
1323 "moveq.l #8,%%d3 \n" /* loop count */ 1696 (
1324 1697 "moveq.l #8, %%d3 \n" /* loop count */
1325 ".wa_loop: \n" /** load pattern for pixel **/ 1698
1326 "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ 1699 ".wa_loop: \n" /** load pattern for pixel **/
1327 "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ 1700 "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
1328 "bcc.b .wa_skip \n" /* skip this pixel */ 1701 "lsr.l #1, %[mask] \n" /* shift out lsb of mask */
1329 1702 "bcc.b .wa_skip \n" /* skip this pixel */
1330 "clr.l %%d0 \n" 1703
1331 "move.b (%[src]),%%d0 \n" /* load src byte */ 1704 "clr.l %%d0 \n"
1332 "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ 1705 "move.b (%[src]), %%d0 \n" /* load src byte */
1333 "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ 1706 "move.b (%%d0:l:1, %[trns]), %%d0 \n" /* idxtable into pattern index */
1334 1707 "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */
1335 "mulu.w #75,%[rnd] \n" /* multiply by 75 */ 1708
1336 "add.l #74,%[rnd] \n" /* add another 74 */ 1709 "mulu.w #75, %[rnd] \n" /* multiply by 75 */
1710 "add.l #74, %[rnd] \n" /* add another 74 */
1337 /* Since the lower bits are not very random: */ 1711 /* Since the lower bits are not very random: */
1338 "move.l %[rnd],%%d1 \n" 1712 "move.l %[rnd], %%d1 \n"
1339 "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ 1713 "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */
1340 "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ 1714 "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */
1341 1715
1342 "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ 1716 "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */
1343 "blo.b .wa_ntrim \n" 1717 "blo.b .wa_ntrim \n"
1344 "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ 1718 "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */
1345 ".wa_ntrim: \n" 1719 ".wa_ntrim: \n"
1346 1720
1347 "move.l %%d2,%%d0 \n" /** rotate pattern **/ 1721 "move.l %%d2, %%d0 \n" /** rotate pattern **/
1348 "lsl.l %%d1,%%d0 \n" 1722 "lsl.l %%d1, %%d0 \n"
1349 "sub.l %[dpth],%%d1 \n" 1723 "sub.l %[dpth], %%d1 \n"
1350 "neg.l %%d1 \n" /* d1 = depth - d1 */ 1724 "neg.l %%d1 \n" /* d1 = depth - d1 */
1351 "lsr.l %%d1,%%d2 \n" 1725 "lsr.l %%d1, %%d2 \n"
1352 "or.l %%d0,%%d2 \n" 1726 "or.l %%d0, %%d2 \n"
1353 1727
1354 ".wa_skip: \n" 1728 ".wa_skip: \n"
1355 "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ 1729 "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */
1356 1730
1357 "add.l %[stri],%[src] \n" /* src += stride; */ 1731 "add.l %[stri], %[src] \n" /* src += stride; */
1358 "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ 1732 "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */
1359 "bne.b .wa_loop \n" 1733 "bne.b .wa_loop \n"
1360 : /* outputs */ 1734 : /* outputs */
1361 [src] "+a"(_src), 1735 [src] "+a"(_src),
1362 [patp]"+a"(pat_ptr), 1736 [patp]"+a"(pat_ptr),
@@ -1373,97 +1747,297 @@ static void _writearray(unsigned char *address, const unsigned char *src,
1373 ); 1747 );
1374 1748
1375 addr = address; 1749 addr = address;
1376 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 1750 _mask = ~mask & 0xff;
1377 _mask = mask; 1751 depth = _gray_info.depth;
1378 1752
1379 /* set the bits for all 8 pixels in all bytes according to the 1753 /* set the bits for all 8 pixels in all bytes according to the
1380 * precalculated patterns on the pattern stack */ 1754 * precalculated patterns on the pattern stack */
1381 asm volatile ( 1755 asm volatile
1382 "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" 1756 (
1383 /* pop all 8 patterns */ 1757 "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */
1384 "not.l %[mask] \n" /* "set" mask -> "keep" mask */ 1758 /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */
1385 "and.l #0xFF,%[mask] \n" 1759
1386 "beq.b .wa_sstart \n" /* short loop if nothing to keep */ 1760 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1387 1761
1388 ".wa_floop: \n" /** full loop (there are bits to keep)**/ 1762 "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/
1389 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1763 "lsl.l #4, %%d0 \n"
1390 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1764 /* move.l %[ax], %%d5 */ /* already in d5 */
1391 "lsr.l #1,%%d3 \n" 1765 "eor.l %%d5, %%d0 \n"
1392 "addx.l %%d0,%%d0 \n" 1766 "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */
1393 "lsr.l #1,%%d4 \n" 1767 "eor.l %%d0, %%d5 \n"
1394 "addx.l %%d0,%%d0 \n" 1768 "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */
1395 "lsr.l #1,%%d5 \n" 1769 "lsr.l #4, %%d0 \n"
1396 "addx.l %%d0,%%d0 \n" 1770 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */
1397 "lsr.l #1,%%d6 \n" 1771 "move.l %%d2, %%d0 \n"
1398 "addx.l %%d0,%%d0 \n" 1772 "lsl.l #4, %%d0 \n"
1399 "move.l %%a0,%%d1 \n" 1773 "eor.l %%d6, %%d0 \n"
1400 "lsr.l #1,%%d1 \n" 1774 "and.l #0xF0F0F0F0, %%d0 \n"
1401 "addx.l %%d0,%%d0 \n" 1775 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */
1402 "move.l %%d1,%%a0 \n" 1776 "lsr.l #4, %%d0 \n"
1403 "move.l %%a1,%%d1 \n" 1777 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */
1404 "lsr.l #1,%%d1 \n" 1778 "move.l %%d3, %%d0 \n"
1405 "addx.l %%d0,%%d0 \n" 1779 "lsl.l #4, %%d0 \n"
1406 "move.l %%d1,%%a1 \n" 1780 "eor.l %%d7, %%d0 \n"
1407 "move.l %[ax],%%d1 \n" 1781 "and.l #0xF0F0F0F0, %%d0 \n"
1408 "lsr.l #1,%%d1 \n" 1782 "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */
1409 "addx.l %%d0,%%d0 \n" 1783 "lsr.l #4, %%d0 \n"
1410 "move.l %%d1,%[ax] \n" 1784 "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */
1411 1785 "move.l %%d4, %%d0 \n"
1412 "move.b (%[addr]),%%d1 \n" /* read old value */ 1786 "lsl.l #4, %%d0 \n"
1413 "and.l %[mask],%%d1 \n" /* mask out replaced bits */ 1787 "move.l %%a0, %%d5 \n"
1414 "or.l %%d0,%%d1 \n" /* set new bits */ 1788 "eor.l %%d5, %%d0 \n"
1415 "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ 1789 "and.l #0xF0F0F0F0, %%d0 \n"
1416 1790 "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */
1417 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1791 /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */
1418 "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ 1792 "lsr.l #4, %%d0 \n"
1419 "bhi.b .wa_floop \n" 1793 "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */
1420 1794
1421 "bra.b .wa_end \n" 1795 "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/
1422 1796 "lsl.l #2, %%d0 \n"
1423 ".wa_sstart: \n" 1797 /* move.l %%a0, %%d5 */ /* still in d5 */
1424 "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ 1798 "eor.l %%d5, %%d0 \n"
1425 1799 "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */
1426 ".wa_sloop: \n" /** short loop (nothing to keep) **/ 1800 "eor.l %%d0, %%d5 \n"
1427 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1801 "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */
1428 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1802 "lsr.l #2, %%d0 \n"
1429 "lsr.l #1,%%d3 \n" 1803 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */
1430 "addx.l %%d0,%%d0 \n" 1804 "move.l %[ax], %%d5 \n"
1431 "lsr.l #1,%%d4 \n" 1805 "move.l %%d5, %%d0 \n"
1432 "addx.l %%d0,%%d0 \n" 1806 "lsl.l #2, %%d0 \n"
1433 "lsr.l #1,%%d5 \n" 1807 "eor.l %%d7, %%d0 \n"
1434 "addx.l %%d0,%%d0 \n" 1808 "and.l #0xCCCCCCCC, %%d0 \n"
1435 "lsr.l #1,%%d6 \n" 1809 "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1436 "addx.l %%d0,%%d0 \n" 1810 "lsr.l #2, %%d0 \n"
1437 "lsr.l #1,%[mask] \n" 1811 "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */
1438 "addx.l %%d0,%%d0 \n" 1812 /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */
1439 "move.l %%a1,%%d1 \n" 1813 "move.l %%d2, %%d0 \n"
1440 "lsr.l #1,%%d1 \n" 1814 "lsl.l #2, %%d0 \n"
1441 "addx.l %%d0,%%d0 \n" 1815 "eor.l %%d4, %%d0 \n"
1442 "move.l %%d1,%%a1 \n" 1816 "and.l #0xCCCCCCCC, %%d0 \n"
1443 "move.l %[ax],%%d1 \n" 1817 "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */
1444 "lsr.l #1,%%d1 \n" 1818 "lsr.l #2, %%d0 \n"
1445 "addx.l %%d0,%%d0 \n" 1819 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */
1446 "move.l %%d1,%[ax] \n" 1820 "move.l %%d1, %%d0 \n"
1447 1821 "lsl.l #2, %%d0 \n"
1448 "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ 1822 "eor.l %%d3, %%d0 \n"
1449 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1823 "and.l #0xCCCCCCCC, %%d0 \n"
1450 "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ 1824 "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */
1451 "bhi.b .wa_sloop \n" 1825 "lsr.l #2, %%d0 \n"
1452 1826 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */
1453 ".wa_end: \n" 1827
1828 "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/
1829 "lsl.l #1, %%d0 \n"
1830 "eor.l %%d2, %%d0 \n"
1831 "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */
1832 "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */
1833 "lsr.l #1, %%d0 \n"
1834 "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */
1835 "move.l %%d3, %%d0 \n"
1836 "lsl.l #1, %%d0 \n"
1837 "eor.l %%d4, %%d0 \n"
1838 "and.l #0xAAAAAAAA, %%d0 \n"
1839 "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */
1840 "lsr.l #1, %%d0 \n"
1841 "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */
1842 /* move.l %[ax], %%d5 */ /* still in d5 */
1843 "move.l %%d5, %%d0 \n"
1844 "lsl.l #1, %%d0 \n"
1845 "eor.l %%d6, %%d0 \n"
1846 "and.l #0xAAAAAAAA, %%d0 \n"
1847 "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */
1848 "lsr.l #1, %%d0 \n"
1849 "eor.l %%d0, %%d5 \n"
1850 "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */
1851 "move.l %%d7, %%d0 \n"
1852 "lsl.l #1, %%d0 \n"
1853 "move.l %%a0, %%d5 \n"
1854 "eor.l %%d5, %%d0 \n"
1855 "and.l #0xAAAAAAAA, %%d0 \n"
1856 "eor.l %%d0, %%d5 \n"
1857 "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */
1858 "lsr.l #1, %%d0 \n"
1859 "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */
1860
1861 "tst.l %[mask] \n"
1862 "jeq .wa_sloop \n" /* short loop if nothing to keep */
1863
1864 "move.l %[mask], %%d5 \n" /* need mask in data reg. */
1865 "move.l %%d1, %[mask] \n" /* free d1 as working reg. */
1866
1867 ".wa_floop: \n" /** full loop (there are bits to keep)**/
1868 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1869 "bhs.s .wa_f8 \n"
1870
1871 "move.l %[psiz], %%d0 \n"
1872 "move.l %[dpth], %%d1 \n"
1873 "mulu.w %%d1, %%d0 \n" /* point behind the last plane */
1874 "add.l %%d0, %[addr] \n" /* for this round */
1875 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1876 "bra.s .wa_f1 \n" /* dpth == 0 should never happen */
1877 "bra.s .wa_f2 \n"
1878 "bra.s .wa_f3 \n"
1879 "bra.s .wa_f4 \n"
1880 "bra.s .wa_f5 \n"
1881 "bra.s .wa_f6 \n"
1882 "bra.s .wa_f7 \n"
1883
1884 ".wa_f8: \n"
1885 "move.l %[psiz], %%d0 \n"
1886 "lsl.l #3, %%d0 \n"
1887 "add.l %%d0, %[addr] \n"
1888 /* Point behind the last plane for this round. Note: We're using the
1889 * registers backwards in order to reuse the streak for the last round.
1890 * Therefore we need to go thru the bitplanes backwards too, otherwise
1891 * the bit order would be destroyed which results in more flicker. */
1892 "sub.l %[psiz], %[addr] \n"
1893 "move.b (%[addr]), %%d0 \n" /* load old byte */
1894 "and.l %%d5, %%d0 \n" /* mask out replaced bits */
1895 "move.l %[mask], %%d1 \n"
1896 "or.l %%d1, %%d0 \n" /* set new bits */
1897 "move.b %%d0, (%[addr]) \n" /* store byte */
1898 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1899 "move.l %%d1, %[mask] \n"
1900 ".wa_f7: \n"
1901 "sub.l %[psiz], %[addr] \n"
1902 "move.b (%[addr]), %%d0 \n"
1903 "and.l %%d5, %%d0 \n"
1904 "or.l %%d2, %%d0 \n"
1905 "move.b %%d0, (%[addr]) \n"
1906 "lsr.l #8, %%d2 \n"
1907 ".wa_f6: \n"
1908 "sub.l %[psiz], %[addr] \n"
1909 "move.b (%[addr]), %%d0 \n"
1910 "and.l %%d5, %%d0 \n"
1911 "or.l %%d3, %%d0 \n"
1912 "move.b %%d0, (%[addr]) \n"
1913 "lsr.l #8, %%d3 \n"
1914 ".wa_f5: \n"
1915 "sub.l %[psiz], %[addr] \n"
1916 "move.b (%[addr]), %%d0 \n"
1917 "and.l %%d5, %%d0 \n"
1918 "or.l %%d4, %%d0 \n"
1919 "move.b %%d0, (%[addr]) \n"
1920 "lsr.l #8, %%d4 \n"
1921 ".wa_f4: \n"
1922 "sub.l %[psiz], %[addr] \n"
1923 "move.b (%[addr]), %%d0 \n"
1924 "and.l %%d5, %%d0 \n"
1925 "move.l %[ax], %%d1 \n"
1926 "or.l %%d1, %%d0 \n"
1927 "move.b %%d0, (%[addr]) \n"
1928 "lsr.l #8, %%d1 \n"
1929 "move.l %%d1, %[ax] \n"
1930 ".wa_f3: \n"
1931 "sub.l %[psiz], %[addr] \n"
1932 "move.b (%[addr]), %%d0 \n"
1933 "and.l %%d5, %%d0 \n"
1934 "or.l %%d6, %%d0 \n"
1935 "move.b %%d0, (%[addr]) \n"
1936 "lsr.l #8, %%d6 \n"
1937 ".wa_f2: \n"
1938 "sub.l %[psiz], %[addr] \n"
1939 "move.b (%[addr]), %%d0 \n"
1940 "and.l %%d5, %%d0 \n"
1941 "or.l %%d7, %%d0 \n"
1942 "move.b %%d0, (%[addr]) \n"
1943 "lsr.l #8, %%d7 \n"
1944 ".wa_f1: \n"
1945 "sub.l %[psiz], %[addr] \n"
1946 "move.b (%[addr]), %%d0 \n"
1947 "and.l %%d5, %%d0 \n"
1948 "move.l %%a0, %%d1 \n"
1949 "or.l %%d1, %%d0 \n"
1950 "move.b %%d0, (%[addr]) \n"
1951 "lsr.l #8, %%d1 \n"
1952 "move.l %%d1, %%a0 \n"
1953
1954 "move.l %[psiz], %%d0 \n"
1955 "lsl.l #3, %%d0 \n"
1956 "add.l %%d0, %[addr] \n" /* correct address */
1957 "subq.l #8, %[dpth] \n"
1958 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
1959 "jgt .wa_floop \n" /* next round if anything left */
1960
1961 "jra .wa_end \n"
1962
1963 ".wa_sloop: \n" /** short loop (nothing to keep) **/
1964 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1965 "bhs.s .wa_s8 \n"
1966
1967 "move.l %[psiz], %%d0 \n"
1968 "move.l %[dpth], %%d5 \n"
1969 "mulu.w %%d5, %%d0 \n" /* point behind the last plane */
1970 "add.l %%d0, %[addr] \n" /* for this round */
1971 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1972 "bra.s .wa_s1 \n" /* dpth == 0 should never happen */
1973 "bra.s .wa_s2 \n"
1974 "bra.s .wa_s3 \n"
1975 "bra.s .wa_s4 \n"
1976 "bra.s .wa_s5 \n"
1977 "bra.s .wa_s6 \n"
1978 "bra.s .wa_s7 \n"
1979
1980 ".wa_s8: \n"
1981 "move.l %[psiz], %%d0 \n" /* Point behind the last plane */
1982 "lsl.l #3, %%d0 \n" /* for this round. */
1983 "add.l %%d0, %[addr] \n" /* See above. */
1984
1985 "sub.l %[psiz], %[addr] \n"
1986 "move.b %%d1, (%[addr]) \n" /* store byte */
1987 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1988 ".wa_s7: \n"
1989 "sub.l %[psiz], %[addr] \n"
1990 "move.b %%d2, (%[addr]) \n"
1991 "lsr.l #8, %%d2 \n"
1992 ".wa_s6: \n"
1993 "sub.l %[psiz], %[addr] \n"
1994 "move.b %%d3, (%[addr]) \n"
1995 "lsr.l #8, %%d3 \n"
1996 ".wa_s5: \n"
1997 "sub.l %[psiz], %[addr] \n"
1998 "move.b %%d4, (%[addr]) \n"
1999 "lsr.l #8, %%d4 \n"
2000 ".wa_s4: \n"
2001 "sub.l %[psiz], %[addr] \n"
2002 "move.l %[ax], %%d5 \n"
2003 "move.b %%d5, (%[addr]) \n"
2004 "lsr.l #8, %%d5 \n"
2005 "move.l %%d5, %[ax] \n"
2006 ".wa_s3: \n"
2007 "sub.l %[psiz], %[addr] \n"
2008 "move.b %%d6, (%[addr]) \n"
2009 "lsr.l #8, %%d6 \n"
2010 ".wa_s2: \n"
2011 "sub.l %[psiz], %[addr] \n"
2012 "move.b %%d7, (%[addr]) \n"
2013 "lsr.l #8, %%d7 \n"
2014 ".wa_s1: \n"
2015 "sub.l %[psiz], %[addr] \n"
2016 "move.l %%a0, %%d5 \n"
2017 "move.b %%d5, (%[addr]) \n"
2018 "lsr.l #8, %%d5 \n"
2019 "move.l %%d5, %%a0 \n"
2020
2021 "add.l %%d0, %[addr] \n" /* correct address */
2022 "subq.l #8, %[dpth] \n"
2023 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
2024 "jgt .wa_sloop \n" /* next round if anything left */
2025
2026 ".wa_end: \n"
1454 : /* outputs */ 2027 : /* outputs */
1455 [addr]"+a"(addr), 2028 [addr]"+a"(addr),
1456 [mask]"+d"(_mask), 2029 [dpth]"+a"(depth),
2030 [mask]"+a"(_mask),
1457 [ax] "=&a"(trash) 2031 [ax] "=&a"(trash)
1458 : /* inputs */ 2032 : /* inputs */
1459 [psiz]"a"(_gray_info.plane_size), 2033 [psiz]"a"(_gray_info.plane_size),
1460 [end] "a"(end),
1461 [patp]"[ax]"(pat_ptr) 2034 [patp]"[ax]"(pat_ptr)
1462 : /* clobbers */ 2035 : /* clobbers */
1463 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" 2036 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
1464 ); 2037 );
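The computed jumps into the store streak (.wa_ftable / .wa_stable above, and the bra.s ladder in the ColdFire version) correspond to a fall-through switch in C; a minimal sketch of the mask-less last partial round, using the same hypothetical p[0..7] layout as above:

/* Enter the unrolled streak part-way, as the jump tables do; deliberate
 * fall-through, shown for the "short loop" (no bits to keep).  Sketch only. */
static void write_last_round_sketch(unsigned char *addr, unsigned long p[8],
                                    int planes, int plane_size)
{
    unsigned char *a = addr + planes * plane_size;  /* behind the last plane */

    switch (planes)                         /* intentional fall-through */
    {
    case 8: a -= plane_size; *a = (unsigned char)p[7]; p[7] >>= 8;
    case 7: a -= plane_size; *a = (unsigned char)p[6]; p[6] >>= 8;
    case 6: a -= plane_size; *a = (unsigned char)p[5]; p[5] >>= 8;
    case 5: a -= plane_size; *a = (unsigned char)p[4]; p[4] >>= 8;
    case 4: a -= plane_size; *a = (unsigned char)p[3]; p[3] >>= 8;
    case 3: a -= plane_size; *a = (unsigned char)p[2]; p[2] >>= 8;
    case 2: a -= plane_size; *a = (unsigned char)p[1]; p[1] >>= 8;
    case 1: a -= plane_size; *a = (unsigned char)p[0]; p[0] >>= 8;
    }
}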
1465#else /* C version, for reference*/ 2038#else /* C version, for reference*/
1466#warning C version of _writearray() used 2039#warning C version of _writearray() used
2040 unsigned char *end;
1467 unsigned test = 1; 2041 unsigned test = 1;
1468 int i; 2042 int i;
1469 2043