author     Jens Arnold <amiconn@rockbox.org>    2006-08-11 14:13:01 +0000
committer  Jens Arnold <amiconn@rockbox.org>    2006-08-11 14:13:01 +0000
commit     71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 (patch)
tree       b9a97081ec04d4d311a7b45747393e68837912a2 /apps/plugins/lib/gray_core.c
parent     bcd94a9b01d19d87a437cd8158a758f206b30825 (diff)
New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657
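
The core of the change is how the precalculated pixel patterns are written into the bitplanes. The old inner loop made one pass per bitplane, shifting a single bit out of each of the eight pattern words on every pass. The new code first transposes the eight pattern words and then writes up to eight bitplanes per round, so the per-plane work shrinks to a load/mask/or/store (or a plain store when nothing has to be kept). A rough C sketch of that idea follows; update_column_sketch, transpose_8x8_bit_blocks, block[] and keep_mask are illustrative names, not the identifiers used in gray_core.c (a sketch of the transpose itself appears further down, next to the ARM hunk).

    /* Hypothetical sketch of the new round-based update, not the shipped code.
     * pat[0..7]  : one dither pattern word per pixel (one bit per bitplane)
     * depth      : number of bitplanes (up to 32)
     * plane_size : distance between consecutive bitplanes in bytes
     * keep_mask  : pixels whose old value must be preserved (0 = replace all)
     */
    static void update_column_sketch(unsigned char *addr, unsigned long pat[8],
                                     int depth, long plane_size, unsigned keep_mask)
    {
        unsigned long block[8];
        int p, n;

        /* After the transpose, the low byte of block[p] is the byte for
         * bitplane p of the first round, byte 1 the byte for plane p of the
         * second round, and so on. */
        transpose_8x8_bit_blocks(pat, block);

        while (depth > 0)
        {
            n = (depth < 8) ? depth : 8;          /* planes in this round */

            for (p = 0; p < n; p++)
            {
                unsigned char *plane = addr + p * plane_size;
                *plane = (*plane & keep_mask) | (unsigned char)block[p];
                block[p] >>= 8;                   /* consume this round's byte */
            }
            addr  += n * plane_size;
            depth -= n;
        }
    }
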
Diffstat (limited to 'apps/plugins/lib/gray_core.c')
-rw-r--r--  apps/plugins/lib/gray_core.c  1417
1 file changed, 993 insertions, 424 deletions
diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c
index e65a7f259e..809e88dba1 100644
--- a/apps/plugins/lib/gray_core.c
+++ b/apps/plugins/lib/gray_core.c
@@ -649,7 +649,8 @@ void gray_update_rect(int x, int y, int width, int height)
649 bbuf = _gray_info.back_buffer + srcofs_row; 649 bbuf = _gray_info.back_buffer + srcofs_row;
650 650
651#ifdef CPU_ARM 651#ifdef CPU_ARM
652 asm volatile ( 652 asm volatile
653 (
653 "ldr r0, [%[cbuf]] \n" 654 "ldr r0, [%[cbuf]] \n"
654 "ldr r1, [%[bbuf]] \n" 655 "ldr r1, [%[bbuf]] \n"
655 "eor r1, r0, r1 \n" 656 "eor r1, r0, r1 \n"
@@ -668,137 +669,281 @@ void gray_update_rect(int x, int y, int width, int height)
668 669
669 if (change != 0) 670 if (change != 0)
670 { 671 {
671 unsigned char *addr, *end; 672 unsigned char *addr;
672 unsigned mask, trash; 673 unsigned mask, depth, trash;
673 674
674 pat_ptr = &pat_stack[8]; 675 pat_ptr = &pat_stack[8];
675 676
676 /* precalculate the bit patterns with random shifts 677 /* precalculate the bit patterns with random shifts
677 * for all 8 pixels and put them on an extra "stack" */ 678 * for all 8 pixels and put them on an extra "stack" */
678 asm volatile ( 679 asm volatile
679 "mov r3, #8 \n" /* loop count */ 680 (
680 "mov %[mask], #0 \n" 681 "mov r3, #8 \n" /* loop count */
681 682 "mov %[mask], #0 \n"
682 ".ur_pre_loop: \n" 683
683 "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ 684 ".ur_pre_loop: \n"
684 "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ 685 "mov %[mask], %[mask], lsl #1 \n" /* shift mask */
685 "ldrb r1, [%[bbuf]] \n" /* read back buffer */ 686 "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */
686 "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ 687 "ldrb r1, [%[bbuf]] \n" /* read back buffer */
687 "mov r2, #0 \n" /* preset for skipped pixel */ 688 "strb r0, [%[bbuf]], #1 \n" /* update back buffer */
688 "cmp r0, r1 \n" /* no change? */ 689 "mov r2, #0 \n" /* preset for skipped pixel */
689 "beq .ur_skip \n" /* -> skip */ 690 "cmp r0, r1 \n" /* no change? */
690 691 "beq .ur_skip \n" /* -> skip */
691 "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ 692
692 693 "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
693 "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ 694
694 "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" 695 "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */
695 "add %[rnd], %[rnd], #74 \n" /* add another 74 */ 696 "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n"
696 /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ 697 "add %[rnd], %[rnd], #74 \n" /* add another 74 */
697 "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ 698 /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
698 699 "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
699 "cmp r1, %[dpth] \n" /* random >= depth ? */ 700
700 "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ 701 "cmp r1, %[dpth] \n" /* random >= depth ? */
701 702 "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
702 "mov r0, r2, lsl r1 \n" /** rotate pattern **/ 703
703 "sub r1, %[dpth], r1 \n" 704 "mov r0, r2, lsl r1 \n" /** rotate pattern **/
704 "orr r2, r0, r2, lsr r1 \n" 705 "sub r1, %[dpth], r1 \n"
705 706 "orr r2, r0, r2, lsr r1 \n"
706 "orr %[mask], %[mask], #1 \n" /* set mask bit */ 707
708 "orr %[mask], %[mask], #1 \n" /* set mask bit */
707 709
708 ".ur_skip: \n" 710 ".ur_skip: \n"
709 "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ 711 "str r2, [%[patp], #-4]! \n" /* push on pattern stack */
710 712
711 "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ 713 "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
712 "bne .ur_pre_loop \n" 714 "bne .ur_pre_loop \n"
713 : /* outputs */ 715 : /* outputs */
714 [cbuf]"+r"(cbuf), 716 [cbuf]"+r"(cbuf),
715 [bbuf]"+r"(bbuf), 717 [bbuf]"+r"(bbuf),
716 [patp]"+r"(pat_ptr), 718 [patp]"+r"(pat_ptr),
717 [rnd] "+r"(_gray_random_buffer), 719 [rnd] "+r"(_gray_random_buffer),
718 [mask]"=&r"(mask) 720 [mask]"=&r"(mask)
719 : /* inputs */ 721 : /* inputs */
720 [bpat]"r"(_gray_info.bitpattern), 722 [bpat]"r"(_gray_info.bitpattern),
721 [dpth]"r"(_gray_info.depth), 723 [dpth]"r"(_gray_info.depth),
722 [rmsk]"r"(_gray_info.randmask) 724 [rmsk]"r"(_gray_info.randmask)
723 : /* clobbers */ 725 : /* clobbers */
724 "r0", "r1", "r2", "r3" 726 "r0", "r1", "r2", "r3"
725 ); 727 );
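
The precalculation block above (and its SH and Coldfire counterparts further down) is unchanged in substance: for each of the eight pixels it compares the current buffer against the back buffer, and for changed pixels fetches the dither bit pattern for the pixel value and rotates it by a pseudo-random amount so that neighbouring pixels do not flicker in step. A hedged C equivalent, reusing the surrounding function's cbuf, bbuf, pat_stack, depth and randmask, with rnd standing in for _gray_random_buffer:

    /* C sketch of the pattern pre-calculation for one 8-pixel column. */
    unsigned mask = 0;
    unsigned long *pp = &pat_stack[8];
    int i;

    for (i = 0; i < 8; i++)
    {
        unsigned char cur = *cbuf++;
        unsigned char old = *bbuf;
        unsigned long pat = 0;
        unsigned r;

        *bbuf++ = cur;                      /* update back buffer */
        mask <<= 1;

        if (cur != old)                     /* pixel changed? */
        {
            rnd = rnd * 75 + 74;            /* cheap LCG step */
            r = (rnd >> 8) & randmask;      /* bits 8..15 are "more random" */
            if (r >= depth)
                r -= depth;                 /* now 0 <= r < depth */

            pat = _gray_info.bitpattern[cur];
            if (r)                          /* rotate left by r within depth bits */
                pat = (pat << r) | (pat >> (depth - r));

            mask |= 1;                      /* this pixel gets new bits */
        }
        *--pp = pat;                        /* push onto the pattern "stack" */
    }
    /* afterwards ~mask & 0xff is the "keep" mask used by the writing loop */
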
726 728
727 addr = dst_row; 729 addr = dst_row;
728 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 730 depth = _gray_info.depth;
729 731
730 /* set the bits for all 8 pixels in all bytes according to the 732 /* set the bits for all 8 pixels in all bytes according to the
731 * precalculated patterns on the pattern stack */ 733 * precalculated patterns on the pattern stack */
732 asm volatile ( 734 asm volatile
733 "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ 735 (
736 "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */
737
738 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
739
740 "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/
741 "orr %[rx], %[rx], %[rx], lsl #8 \n"
742 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */
743 "eor r0, r1, r5, lsl #4 \n"
744 "and r0, r0, %[rx] \n"
745 "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
746 "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
747 "eor r0, r2, r6, lsl #4 \n"
748 "and r0, r0, %[rx] \n"
749 "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
750 "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */
751 "eor r0, r3, r7, lsl #4 \n"
752 "and r0, r0, %[rx] \n"
753 "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
754 "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
755 "eor r0, r4, r8, lsl #4 \n"
756 "and r0, r0, %[rx] \n"
757 "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
758 "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
759
760 "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/
761 "orr %[rx], %[rx], %[rx], lsl #8 \n"
762 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */
763 "eor r0, r1, r3, lsl #2 \n"
764 "and r0, r0, %[rx] \n"
765 "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
766 "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
767 "eor r0, r2, r4, lsl #2 \n"
768 "and r0, r0, %[rx] \n"
769 "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
770 "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
771 "eor r0, r5, r7, lsl #2 \n"
772 "and r0, r0, %[rx] \n"
773 "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
774 "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
775 "eor r0, r6, r8, lsl #2 \n"
776 "and r0, r0, %[rx] \n"
777 "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
778 "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
779
780 "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/
781 "orr %[rx], %[rx], %[rx], lsl #8 \n"
782 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */
783 "eor r0, r1, r2, lsl #1 \n"
784 "and r0, r0, %[rx] \n"
785 "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
786 "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
787 "eor r0, r3, r4, lsl #1 \n"
788 "and r0, r0, %[rx] \n"
789 "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
790 "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
791 "eor r0, r5, r6, lsl #1 \n"
792 "and r0, r0, %[rx] \n"
793 "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
794 "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
795 "eor r0, r7, r8, lsl #1 \n"
796 "and r0, r0, %[rx] \n"
797 "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
798 "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
799
800 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
801 "ands %[mask], %[mask], #0xff \n"
802 "beq .ur_sloop \n" /* short loop if no bits to keep */
803
804 ".ur_floop: \n" /** full loop (bits to keep)**/
805 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
806 "bhs .ur_f8 \n"
807
808 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
809 "add %[addr], %[addr], r0 \n" /* for this round */
810
811 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
812 "add pc, pc, r0 \n"
813 ".ur_ftable: \n"
814 ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */
815 ".byte .ur_f1 - .ur_ftable - 4 \n"
816 ".byte .ur_f2 - .ur_ftable - 4 \n"
817 ".byte .ur_f3 - .ur_ftable - 4 \n"
818 ".byte .ur_f4 - .ur_ftable - 4 \n"
819 ".byte .ur_f5 - .ur_ftable - 4 \n"
820 ".byte .ur_f6 - .ur_ftable - 4 \n"
821 ".byte .ur_f7 - .ur_ftable - 4 \n"
822
823 ".ur_f8: \n"
824 "add %[addr], %[addr], %[psiz], lsl #3 \n"
825 /* Point behind the last plane for this round. Note: We're using the
826 * registers backwards in order to reuse the streak for the last round.
827 * Therefore we need to go thru the bitplanes backwards too, otherwise
828 * the bit order would be destroyed which results in more flicker. */
829 "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */
830 "and r0, r0, %[mask] \n" /* mask out replaced bits */
831 "orr r0, r0, r8 \n" /* set new bits */
832 "strb r0, [%[addr]] \n" /* store byte */
833 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
834 ".ur_f7: \n"
835 "ldrb r0, [%[addr], -%[psiz]]! \n"
836 "and r0, r0, %[mask] \n"
837 "orr r0, r0, r7 \n"
838 "strb r0, [%[addr]] \n"
839 "mov r7, r7, lsr #8 \n"
840 ".ur_f6: \n"
841 "ldrb r0, [%[addr], -%[psiz]]! \n"
842 "and r0, r0, %[mask] \n"
843 "orr r0, r0, r6 \n"
844 "strb r0, [%[addr]] \n"
845 "mov r6, r6, lsr #8 \n"
846 ".ur_f5: \n"
847 "ldrb r0, [%[addr], -%[psiz]]! \n"
848 "and r0, r0, %[mask] \n"
849 "orr r0, r0, r5 \n"
850 "strb r0, [%[addr]] \n"
851 "mov r5, r5, lsr #8 \n"
852 ".ur_f4: \n"
853 "ldrb r0, [%[addr], -%[psiz]]! \n"
854 "and r0, r0, %[mask] \n"
855 "orr r0, r0, r4 \n"
856 "strb r0, [%[addr]] \n"
857 "mov r4, r4, lsr #8 \n"
858 ".ur_f3: \n"
859 "ldrb r0, [%[addr], -%[psiz]]! \n"
860 "and r0, r0, %[mask] \n"
861 "orr r0, r0, r3 \n"
862 "strb r0, [%[addr]] \n"
863 "mov r3, r3, lsr #8 \n"
864 ".ur_f2: \n"
865 "ldrb r0, [%[addr], -%[psiz]]! \n"
866 "and r0, r0, %[mask] \n"
867 "orr r0, r0, r2 \n"
868 "strb r0, [%[addr]] \n"
869 "mov r2, r2, lsr #8 \n"
870 ".ur_f1: \n"
871 "ldrb r0, [%[addr], -%[psiz]]! \n"
872 "and r0, r0, %[mask] \n"
873 "orr r0, r0, r1 \n"
874 "strb r0, [%[addr]] \n"
875 "mov r1, r1, lsr #8 \n"
876 ".ur_f0: \n"
877
878 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
879 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
880 "bhi .ur_floop \n"
881
882 "b .ur_end \n"
883
884 ".ur_sloop: \n" /** short loop (nothing to keep) **/
885 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
886 "bhs .ur_s8 \n"
887
888 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
889 "add %[addr], %[addr], r0 \n" /* for this round */
734 890
735 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ 891 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
736 "ands %[mask], %[mask], #0xff \n" 892 "add pc, pc, r0 \n"
737 "beq .ur_sloop \n" /* short loop if nothing to keep */ 893 ".ur_stable: \n"
738 894 ".byte .ur_s0 - .ur_stable - 4 \n"
739 ".ur_floop: \n" /** full loop (there are bits to keep)**/ 895 ".byte .ur_s1 - .ur_stable - 4 \n"
740 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 896 ".byte .ur_s2 - .ur_stable - 4 \n"
741 "adc r0, r0, r0 \n" /* put bit into LSB for byte */ 897 ".byte .ur_s3 - .ur_stable - 4 \n"
742 "movs r8, r8, lsr #1 \n" 898 ".byte .ur_s4 - .ur_stable - 4 \n"
743 "adc r0, r0, r0 \n" 899 ".byte .ur_s5 - .ur_stable - 4 \n"
744 "movs r7, r7, lsr #1 \n" 900 ".byte .ur_s6 - .ur_stable - 4 \n"
745 "adc r0, r0, r0 \n" 901 ".byte .ur_s7 - .ur_stable - 4 \n"
746 "movs r6, r6, lsr #1 \n" 902
747 "adc r0, r0, r0 \n" 903 ".ur_s8: \n"
748 "movs r5, r5, lsr #1 \n" 904 "add %[addr], %[addr], %[psiz], lsl #3 \n"
749 "adc r0, r0, r0 \n" 905 /* Point behind the last plane for this round. See above. */
750 "movs r4, r4, lsr #1 \n" 906 "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */
751 "adc r0, r0, r0 \n" 907 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
752 "movs r3, r3, lsr #1 \n" 908 ".ur_s7: \n"
753 "adc r0, r0, r0 \n" 909 "strb r7, [%[addr], -%[psiz]]! \n"
754 "movs r2, r2, lsr #1 \n" 910 "mov r7, r7, lsr #8 \n"
755 "adc r0, r0, r0 \n" 911 ".ur_s6: \n"
756 912 "strb r6, [%[addr], -%[psiz]]! \n"
757 "ldrb r1, [%[addr]] \n" /* read old value */ 913 "mov r6, r6, lsr #8 \n"
758 "and r1, r1, %[mask] \n" /* mask out replaced bits */ 914 ".ur_s5: \n"
759 "orr r1, r1, r0 \n" /* set new bits */ 915 "strb r5, [%[addr], -%[psiz]]! \n"
760 "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ 916 "mov r5, r5, lsr #8 \n"
761 917 ".ur_s4: \n"
762 "cmp %[end], %[addr] \n" /* loop for all bitplanes */ 918 "strb r4, [%[addr], -%[psiz]]! \n"
763 "bne .ur_floop \n" 919 "mov r4, r4, lsr #8 \n"
764 920 ".ur_s3: \n"
765 "b .ur_end \n" 921 "strb r3, [%[addr], -%[psiz]]! \n"
766 922 "mov r3, r3, lsr #8 \n"
767 ".ur_sloop: \n" /** short loop (nothing to keep) **/ 923 ".ur_s2: \n"
768 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 924 "strb r2, [%[addr], -%[psiz]]! \n"
769 "adc r0, r0, r0 \n" /* put bit into LSB for byte */ 925 "mov r2, r2, lsr #8 \n"
770 "movs r8, r8, lsr #1 \n" 926 ".ur_s1: \n"
771 "adc r0, r0, r0 \n" 927 "strb r1, [%[addr], -%[psiz]]! \n"
772 "movs r7, r7, lsr #1 \n" 928 "mov r1, r1, lsr #8 \n"
773 "adc r0, r0, r0 \n" 929 ".ur_s0: \n"
774 "movs r6, r6, lsr #1 \n" 930
775 "adc r0, r0, r0 \n" 931 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
776 "movs r5, r5, lsr #1 \n" 932 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
777 "adc r0, r0, r0 \n" 933 "bhi .ur_sloop \n"
778 "movs r4, r4, lsr #1 \n" 934
779 "adc r0, r0, r0 \n" 935 ".ur_end: \n"
780 "movs r3, r3, lsr #1 \n" 936 : /* outputs */
781 "adc r0, r0, r0 \n" 937 [addr]"+r"(addr),
782 "movs r2, r2, lsr #1 \n" 938 [mask]"+r"(mask),
783 "adc r0, r0, r0 \n" 939 [dpth]"+r"(depth),
784 940 [rx] "=&r"(trash)
785 "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ 941 : /* inputs */
786 942 [psiz]"r"(_gray_info.plane_size),
787 "cmp %[end], %[addr] \n" /* loop for all bitplanes */ 943 [patp]"[rx]"(pat_ptr)
788 "bne .ur_sloop \n" 944 : /* clobbers */
789 945 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
790 ".ur_end: \n" 946 );
791 : /* outputs */
792 [addr]"+r"(addr),
793 [mask]"+r"(mask),
794 [rx] "=&r"(trash)
795 : /* inputs */
796 [psiz]"r"(_gray_info.plane_size),
797 [end] "r"(end),
798 [patp]"[rx]"(pat_ptr)
799 : /* clobbers */
800 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
801 );
802 } 947 }
803#else /* C version, for reference*/ 948#else /* C version, for reference*/
804#warning C version of gray_update_rect() used 949#warning C version of gray_update_rect() used
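
The three "comb" stages in the new ARM block are the classic 8x8 bit-matrix transpose (cf. transpose8 in Hacker's Delight), applied to four 8x8 blocks at once because every register carries four bytes. A hedged C equivalent of just that step, assuming 32-bit unsigned long as on the targets, with p[i] corresponding to r<i+1> in the ARM code:

    /* Transpose four 8x8 bit blocks held in p[0..7] into q[0..7].
     * Afterwards, bit i of byte k in q[j] equals bit (8*k + j) of p[i], so
     * the low byte of q[j] is the byte destined for bitplane j of the first
     * 8-plane round, byte 1 for the second round, and so on. */
    static void transpose_8x8_bit_blocks(const unsigned long p[8], unsigned long q[8])
    {
        unsigned long t;
        int i;

        for (i = 0; i < 8; i++)
            q[i] = p[i];

        for (i = 0; i < 4; i++)            /* Stage 1: 4 bit "comb" */
        {
            t = (q[i] ^ (q[i + 4] << 4)) & 0xF0F0F0F0UL;
            q[i]     ^= t;
            q[i + 4] ^= t >> 4;
        }
        for (i = 0; i < 2; i++)            /* Stage 2: 2 bit "comb" */
        {
            t = (q[i] ^ (q[i + 2] << 2)) & 0xCCCCCCCCUL;
            q[i]     ^= t;
            q[i + 2] ^= t >> 2;
            t = (q[i + 4] ^ (q[i + 6] << 2)) & 0xCCCCCCCCUL;
            q[i + 4] ^= t;
            q[i + 6] ^= t >> 2;
        }
        for (i = 0; i < 8; i += 2)         /* Stage 3: 1 bit "comb" */
        {
            t = (q[i] ^ (q[i + 1] << 1)) & 0xAAAAAAAAUL;
            q[i]     ^= t;
            q[i + 1] ^= t >> 1;
        }
    }
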
@@ -873,7 +1018,7 @@ void gray_update_rect(int x, int y, int width, int height)
873 1018
874 for (i = 7; i >= 0; i--) 1019 for (i = 7; i >= 0; i--)
875 data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); 1020 data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
876 1021
877 *addr = (*addr & mask) | data; 1022 *addr = (*addr & mask) | data;
878 addr += _gray_info.plane_size; 1023 addr += _gray_info.plane_size;
879 test <<= 1; 1024 test <<= 1;
@@ -935,13 +1080,13 @@ void gray_update_rect(int x, int y, int width, int height)
935 1080
936#if CONFIG_CPU == SH7034 1081#if CONFIG_CPU == SH7034
937 asm volatile ( 1082 asm volatile (
938 "mov.l @%[cbuf],r1 \n" 1083 "mov.l @%[cbuf], r1 \n"
939 "mov.l @%[bbuf],r2 \n" 1084 "mov.l @%[bbuf], r2 \n"
940 "xor r1,r2 \n" 1085 "xor r1, r2 \n"
941 "mov.l @(4,%[cbuf]),r1 \n" 1086 "mov.l @(4,%[cbuf]), r1 \n"
942 "mov.l @(4,%[bbuf]),%[chg] \n" 1087 "mov.l @(4,%[bbuf]), %[chg]\n"
943 "xor r1,%[chg] \n" 1088 "xor r1, %[chg] \n"
944 "or r2,%[chg] \n" 1089 "or r2, %[chg] \n"
945 : /* outputs */ 1090 : /* outputs */
946 [chg] "=r"(change) 1091 [chg] "=r"(change)
947 : /* inputs */ 1092 : /* inputs */
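
On all three CPUs the change test that guards the whole block stays the same: the eight pixels of the current column are compared against the back buffer with two 32-bit loads per buffer instead of eight byte compares. In C terms, roughly (assuming the word alignment the asm relies on):

    const unsigned long *c32 = (const unsigned long *)cbuf;
    const unsigned long *b32 = (const unsigned long *)bbuf;

    change = (c32[0] ^ b32[0]) | (c32[1] ^ b32[1]);

    if (change != 0)
    {
        /* at least one of the 8 pixels differs -> redither this column */
    }
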
@@ -953,176 +1098,402 @@ void gray_update_rect(int x, int y, int width, int height)
953 1098
954 if (change != 0) 1099 if (change != 0)
955 { 1100 {
956 unsigned char *addr, *end; 1101 unsigned char *addr;
957 unsigned mask, trash; 1102 unsigned mask, depth, trash;
958 1103
959 pat_ptr = &pat_stack[8]; 1104 pat_ptr = &pat_stack[8];
960 1105
961 /* precalculate the bit patterns with random shifts 1106 /* precalculate the bit patterns with random shifts
962 * for all 8 pixels and put them on an extra "stack" */ 1107 * for all 8 pixels and put them on an extra "stack" */
963 asm volatile ( 1108 asm volatile
964 "mov #8,r3 \n" /* loop count */ 1109 (
965 1110 "mov #8, r3 \n" /* loop count */
966 ".ur_pre_loop: \n" 1111
967 "mov.b @%[cbuf]+,r0\n" /* read current buffer */ 1112 ".ur_pre_loop: \n"
968 "mov.b @%[bbuf],r1 \n" /* read back buffer */ 1113 "mov.b @%[cbuf]+, r0 \n" /* read current buffer */
969 "mov #0,r2 \n" /* preset for skipped pixel */ 1114 "mov.b @%[bbuf], r1 \n" /* read back buffer */
970 "mov.b r0,@%[bbuf] \n" /* update back buffer */ 1115 "mov #0, r2 \n" /* preset for skipped pixel */
971 "add #1,%[bbuf] \n" 1116 "mov.b r0, @%[bbuf] \n" /* update back buffer */
972 "cmp/eq r0,r1 \n" /* no change? */ 1117 "add #1, %[bbuf] \n"
973 "bt .ur_skip \n" /* -> skip */ 1118 "cmp/eq r0, r1 \n" /* no change? */
974 1119 "bt .ur_skip \n" /* -> skip */
975 "shll2 r0 \n" /* pixel value -> pattern offset */ 1120
976 "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ 1121 "shll2 r0 \n" /* pixel value -> pattern offset */
977 1122 "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */
978 "mov #75,r0 \n" 1123
979 "mulu r0,%[rnd] \n" /* multiply by 75 */ 1124 "mov #75, r0 \n"
980 "sts macl,%[rnd] \n" 1125 "mulu r0, %[rnd] \n" /* multiply by 75 */
981 "add #74,%[rnd] \n" /* add another 74 */ 1126 "sts macl, %[rnd] \n"
982 /* Since the lower bits are not very random: */ 1127 "add #74, %[rnd] \n" /* add another 74 */
983 "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ 1128 /* Since the lower bits are not very random: */
984 "and %[rmsk],r1 \n" /* mask out unneeded bits */ 1129 "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */
985 1130 "and %[rmsk], r1 \n" /* mask out unneeded bits */
986 "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ 1131
987 "bf .ur_ntrim \n" 1132 "cmp/hs %[dpth], r1 \n" /* random >= depth ? */
988 "sub %[dpth],r1 \n" /* yes: random -= depth; */ 1133 "bf .ur_ntrim \n"
989 ".ur_ntrim: \n" 1134 "sub %[dpth], r1 \n" /* yes: random -= depth; */
1135 ".ur_ntrim: \n"
990 1136
991 "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ 1137 "mov.l .ashlsi3, r0 \n" /** rotate pattern **/
992 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ 1138 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
993 "mov r1,r5 \n" 1139 "mov r1, r5 \n"
994 1140
995 "mov %[dpth],r5 \n" 1141 "mov %[dpth], r5 \n"
996 "sub r1,r5 \n" /* r5 = depth - r1 */ 1142 "sub r1, r5 \n" /* r5 = depth - r1 */
997 "mov.l .lshrsi3,r1 \n" 1143 "mov.l .lshrsi3, r1 \n"
998 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ 1144 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
999 "mov r0,r2 \n" /* store previous result in r2 */ 1145 "mov r0, r2 \n" /* store previous result in r2 */
1000 1146
1001 "or r0,r2 \n" /* rotated_pattern = r2 | r0 */ 1147 "or r0, r2 \n" /* rotated_pattern = r2 | r0 */
1002 "clrt \n" /* mask bit = 0 (replace) */ 1148 "clrt \n" /* mask bit = 0 (replace) */
1003 1149
1004 ".ur_skip: \n" /* T == 1 if skipped */ 1150 ".ur_skip: \n" /* T == 1 if skipped */
1005 "rotcr %[mask] \n" /* get mask bit */ 1151 "rotcr %[mask] \n" /* get mask bit */
1006 "mov.l r2,@-%[patp]\n" /* push on pattern stack */ 1152 "mov.l r2, @-%[patp] \n" /* push on pattern stack */
1007 1153
1008 "add #-1,r3 \n" /* loop 8 times (pixel block) */ 1154 "add #-1, r3 \n" /* loop 8 times (pixel block) */
1009 "cmp/pl r3 \n" 1155 "cmp/pl r3 \n"
1010 "bt .ur_pre_loop\n" 1156 "bt .ur_pre_loop \n"
1011 1157
1012 "shlr8 %[mask] \n" /* shift mask to low byte */ 1158 "shlr8 %[mask] \n" /* shift mask to low byte */
1013 "shlr16 %[mask] \n" 1159 "shlr16 %[mask] \n"
1014 : /* outputs */ 1160 : /* outputs */
1015 [cbuf]"+r"(cbuf), 1161 [cbuf]"+r"(cbuf),
1016 [bbuf]"+r"(bbuf), 1162 [bbuf]"+r"(bbuf),
1017 [rnd] "+r"(_gray_random_buffer), 1163 [rnd] "+r"(_gray_random_buffer),
1018 [patp]"+r"(pat_ptr), 1164 [patp]"+r"(pat_ptr),
1019 [mask]"=&r"(mask) 1165 [mask]"=&r"(mask)
1020 : /* inputs */ 1166 : /* inputs */
1021 [dpth]"r"(_gray_info.depth), 1167 [dpth]"r"(_gray_info.depth),
1022 [bpat]"r"(_gray_info.bitpattern), 1168 [bpat]"r"(_gray_info.bitpattern),
1023 [rmsk]"r"(_gray_info.randmask) 1169 [rmsk]"r"(_gray_info.randmask)
1024 : /* clobbers */ 1170 : /* clobbers */
1025 "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" 1171 "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr"
1026 ); 1172 );
1027 1173
1028 addr = dst_row; 1174 addr = dst_row;
1029 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 1175 depth = _gray_info.depth;
1030 1176
1031 /* set the bits for all 8 pixels in all bytes according to the 1177 /* set the bits for all 8 pixels in all bytes according to the
1032 * precalculated patterns on the pattern stack */ 1178 * precalculated patterns on the pattern stack */
1033 asm volatile ( 1179 asm volatile
1034 "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ 1180 (
1035 "mov.l @%[patp]+,r2 \n" 1181 "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */
1036 "mov.l @%[patp]+,r3 \n" 1182 "mov.l @%[patp]+, r7 \n"
1037 "mov.l @%[patp]+,r6 \n" 1183 "mov.l @%[patp]+, r6 \n"
1038 "mov.l @%[patp]+,r7 \n" 1184 "mov.l @%[patp]+, r5 \n"
1039 "mov.l @%[patp]+,r8 \n" 1185 "mov.l @%[patp]+, r4 \n"
1040 "mov.l @%[patp]+,r9 \n" 1186 "mov.l @%[patp]+, r3 \n"
1041 "mov.l @%[patp],r10 \n" 1187 "mov.l @%[patp]+, r2 \n"
1042 1188 "mov.l @%[patp], r1 \n"
1043 "tst %[mask],%[mask] \n" 1189
1044 "bt .ur_sloop \n" /* short loop if nothing to keep */ 1190 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1045 1191
1046 ".ur_floop: \n" /** full loop (there are bits to keep)**/ 1192 "mov.l .ur_mask4, %[rx] \n" /* bitmask = ...11110000 */
1047 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1193 "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/
1048 "rotcl r0 \n" /* rotate t bit into r0 */ 1194 "shll2 r0 \n"
1049 "shlr r2 \n" 1195 "shll2 r0 \n"
1050 "rotcl r0 \n" 1196 "xor r1, r0 \n"
1051 "shlr r3 \n" 1197 "and %[rx], r0 \n"
1052 "rotcl r0 \n" 1198 "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
1053 "shlr r6 \n" 1199 "shlr2 r0 \n"
1054 "rotcl r0 \n" 1200 "shlr2 r0 \n"
1055 "shlr r7 \n" 1201 "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
1056 "rotcl r0 \n" 1202 "mov r6, r0 \n"
1057 "shlr r8 \n" 1203 "shll2 r0 \n"
1058 "rotcl r0 \n" 1204 "shll2 r0 \n"
1059 "shlr r9 \n" 1205 "xor r2, r0 \n"
1060 "rotcl r0 \n" 1206 "and %[rx], r0 \n"
1061 "shlr r10 \n" 1207 "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
1062 "mov.b @%[addr],%[rx] \n" /* read old value */ 1208 "shlr2 r0 \n"
1063 "rotcl r0 \n" 1209 "shlr2 r0 \n"
1064 "and %[mask],%[rx] \n" /* mask out replaced bits */ 1210 "xor r0, r6 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */
1065 "or %[rx],r0 \n" /* set new bits */ 1211 "mov r7, r0 \n"
1066 "mov.b r0,@%[addr] \n" /* store value to bitplane */ 1212 "shll2 r0 \n"
1067 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1213 "shll2 r0 \n"
1068 "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ 1214 "xor r3, r0 \n"
1069 "bt .ur_floop \n" 1215 "and %[rx], r0 \n"
1070 1216 "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
1071 "bra .ur_end \n" 1217 "shlr2 r0 \n"
1072 "nop \n" 1218 "shlr2 r0 \n"
1073 1219 "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
1074 /* References to C library routines used in the precalc block */ 1220 "mov r8, r0 \n"
1075 ".align 2 \n" 1221 "shll2 r0 \n"
1076 ".ashlsi3: \n" /* C library routine: */ 1222 "shll2 r0 \n"
1077 ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ 1223 "xor r4, r0 \n"
1078 ".lshrsi3: \n" /* C library routine: */ 1224 "and %[rx], r0 \n"
1079 ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ 1225 "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
1080 /* both routines preserve r4, destroy r5 and take ~16 cycles */ 1226 "shlr2 r0 \n"
1081 1227 "shlr2 r0 \n"
1082 ".ur_sloop: \n" /** short loop (nothing to keep) **/ 1228 "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
1083 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1229
1084 "rotcl r0 \n" /* rotate t bit into r0 */ 1230 "mov.l .ur_mask2, %[rx] \n" /* bitmask = ...11001100 */
1085 "shlr r2 \n" 1231 "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/
1086 "rotcl r0 \n" 1232 "shll2 r0 \n"
1087 "shlr r3 \n" 1233 "xor r1, r0 \n"
1088 "rotcl r0 \n" 1234 "and %[rx], r0 \n"
1089 "shlr r6 \n" 1235 "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
1090 "rotcl r0 \n" 1236 "shlr2 r0 \n"
1091 "shlr r7 \n" 1237 "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
1092 "rotcl r0 \n" 1238 "mov r4, r0 \n"
1093 "shlr r8 \n" 1239 "shll2 r0 \n"
1094 "rotcl r0 \n" 1240 "xor r2, r0 \n"
1095 "shlr r9 \n" 1241 "and %[rx], r0 \n"
1096 "rotcl r0 \n" 1242 "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1097 "shlr r10 \n" 1243 "shlr2 r0 \n"
1098 "rotcl r0 \n" 1244 "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
1099 "mov.b r0,@%[addr] \n" /* store byte to bitplane */ 1245 "mov r7, r0 \n"
1100 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1246 "shll2 r0 \n"
1101 "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ 1247 "xor r5, r0 \n"
1102 "bt .ur_sloop \n" 1248 "and %[rx], r0 \n"
1103 1249 "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
1104 ".ur_end: \n" 1250 "shlr2 r0 \n"
1105 : /* outputs */ 1251 "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
1106 [addr]"+r"(addr), 1252 "mov r8, r0 \n"
1107 [mask]"+r"(mask), 1253 "shll2 r0 \n"
1108 [rx] "=&r"(trash) 1254 "xor r6, r0 \n"
1109 : /* inputs */ 1255 "and %[rx], r0 \n"
1110 [psiz]"r"(_gray_info.plane_size), 1256 "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
1111 [end] "r"(end), 1257 "shlr2 r0 \n"
1112 [patp]"[rx]"(pat_ptr) 1258 "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
1113 : /* clobbers */ 1259
1114 "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" 1260 "mov.l .ur_mask1, %[rx] \n" /* bitmask = ...10101010 */
1261 "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/
1262 "shll r0 \n"
1263 "xor r1, r0 \n"
1264 "and %[rx], r0 \n"
1265 "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
1266 "shlr r0 \n"
1267 "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
1268 "mov r4, r0 \n"
1269 "shll r0 \n"
1270 "xor r3, r0 \n"
1271 "and %[rx], r0 \n"
1272 "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
1273 "shlr r0 \n"
1274 "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
1275 "mov r6, r0 \n"
1276 "shll r0 \n"
1277 "xor r5, r0 \n"
1278 "and %[rx], r0 \n"
1279 "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
1280 "shlr r0 \n"
1281 "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
1282 "mov r8, r0 \n"
1283 "shll r0 \n"
1284 "xor r7, r0 \n"
1285 "and %[rx], r0 \n"
1286 "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
1287 "shlr r0 \n"
1288 "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
1289
1290 "tst %[mask], %[mask] \n"
1291 "bt .ur_sloop \n" /* short loop if nothing to keep */
1292
1293 ".ur_floop: \n" /** full loop (there are bits to keep)**/
1294 "mov #8, r0 \n"
1295 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1296 "bt .ur_f8 \n"
1297
1298 "mulu %[psiz], %[dpth] \n"
1299 "mova .ur_ftable, r0 \n"
1300 "mov.b @(r0, %[dpth]), %[rx] \n"
1301 "add %[rx], r0 \n"
1302 "sts macl, %[rx] \n" /* point behind the last plane.. */
1303 "jmp @r0 \n" /* jump into streak */
1304 "add %[rx], %[addr] \n" /* ..for this round */
1305
1306 ".align 2 \n"
1307 ".ur_ftable: \n"
1308 ".byte .ur_f0 - .ur_ftable \n"
1309 ".byte .ur_f1 - .ur_ftable \n"
1310 ".byte .ur_f2 - .ur_ftable \n"
1311 ".byte .ur_f3 - .ur_ftable \n"
1312 ".byte .ur_f4 - .ur_ftable \n"
1313 ".byte .ur_f5 - .ur_ftable \n"
1314 ".byte .ur_f6 - .ur_ftable \n"
1315 ".byte .ur_f7 - .ur_ftable \n"
1316
1317 ".ur_f8: \n"
1318 "mov %[psiz], %[rx] \n"
1319 "shll2 %[rx] \n"
1320 "add %[rx], %[rx] \n"
1321 "add %[rx], %[addr] \n"
1322 /* Point behind the last plane for this round. Note: We're using the
1323 * registers backwards in order to reuse the streak for the last round.
1324 * Therefore we need to go thru the bitplanes backwards too, otherwise
1325 * the bit order would be destroyed which results in more flicker. */
1326 "sub %[psiz], %[addr] \n"
1327 "mov.b @%[addr], r0 \n" /* load old byte */
1328 "and %[mask], r0 \n" /* mask out replaced bits */
1329 "or r8, r0 \n" /* set new bits */
1330 "mov.b r0, @%[addr] \n" /* store byte */
1331 "shlr8 r8 \n" /* shift out used-up byte */
1332 ".ur_f7: \n"
1333 "sub %[psiz], %[addr] \n"
1334 "mov.b @%[addr], r0 \n"
1335 "and %[mask], r0 \n"
1336 "or r7, r0 \n"
1337 "mov.b r0, @%[addr] \n"
1338 "shlr8 r7 \n"
1339 ".ur_f6: \n"
1340 "sub %[psiz], %[addr] \n"
1341 "mov.b @%[addr], r0 \n"
1342 "and %[mask], r0 \n"
1343 "or r6, r0 \n"
1344 "mov.b r0, @%[addr] \n"
1345 "shlr8 r6 \n"
1346 ".ur_f5: \n"
1347 "sub %[psiz], %[addr] \n"
1348 "mov.b @%[addr], r0 \n"
1349 "and %[mask], r0 \n"
1350 "or r5, r0 \n"
1351 "mov.b r0, @%[addr] \n"
1352 "shlr8 r5 \n"
1353 ".ur_f4: \n"
1354 "sub %[psiz], %[addr] \n"
1355 "mov.b @%[addr], r0 \n"
1356 "and %[mask], r0 \n"
1357 "or r4, r0 \n"
1358 "mov.b r0, @%[addr] \n"
1359 "shlr8 r4 \n"
1360 ".ur_f3: \n"
1361 "sub %[psiz], %[addr] \n"
1362 "mov.b @%[addr], r0 \n"
1363 "and %[mask], r0 \n"
1364 "or r3, r0 \n"
1365 "mov.b r0, @%[addr] \n"
1366 "shlr8 r3 \n"
1367 ".ur_f2: \n"
1368 "sub %[psiz], %[addr] \n"
1369 "mov.b @%[addr], r0 \n"
1370 "and %[mask], r0 \n"
1371 "or r2, r0 \n"
1372 "mov.b r0, @%[addr] \n"
1373 "shlr8 r2 \n"
1374 ".ur_f1: \n"
1375 "sub %[psiz], %[addr] \n"
1376 "mov.b @%[addr], r0 \n"
1377 "and %[mask], r0 \n"
1378 "or r1, r0 \n"
1379 "mov.b r0, @%[addr] \n"
1380 "shlr8 r1 \n"
1381 ".ur_f0: \n"
1382
1383 "add %[rx], %[addr] \n" /* correct address */
1384 "add #-8, %[dpth] \n"
1385 "cmp/pl %[dpth] \n" /* next round if anything left */
1386 "bt .ur_floop \n"
1387
1388 "bra .ur_end \n"
1389 "nop \n"
1390
1391 /* References to C library routines used in the precalc block */
1392 ".align 2 \n"
1393 ".ashlsi3: \n" /* C library routine: */
1394 ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
1395 ".lshrsi3: \n" /* C library routine: */
1396 ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
1397 /* both routines preserve r4, destroy r5 and take ~16 cycles */
1398
1399 /* Bitmasks for the bit block rotation */
1400 ".ur_mask4: \n"
1401 ".long 0xF0F0F0F0 \n"
1402 ".ur_mask2: \n"
1403 ".long 0xCCCCCCCC \n"
1404 ".ur_mask1: \n"
1405 ".long 0xAAAAAAAA \n"
1406
1407 ".ur_sloop: \n" /** short loop (nothing to keep) **/
1408 "mov #8, r0 \n"
1409 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1410 "bt .ur_s8 \n"
1411
1412 "mulu %[psiz], %[dpth] \n"
1413 "mova .ur_stable, r0 \n"
1414 "mov.b @(r0, %[dpth]), %[rx] \n"
1415 "add %[rx], r0 \n"
1416 "sts macl, %[rx] \n" /* point behind the last plane.. */
1417 "jmp @r0 \n" /* jump into streak */
1418 "add %[rx], %[addr] \n" /* ..for this round */
1419
1420 ".align 2 \n"
1421 ".ur_stable: \n"
1422 ".byte .ur_s0 - .ur_stable \n"
1423 ".byte .ur_s1 - .ur_stable \n"
1424 ".byte .ur_s2 - .ur_stable \n"
1425 ".byte .ur_s3 - .ur_stable \n"
1426 ".byte .ur_s4 - .ur_stable \n"
1427 ".byte .ur_s5 - .ur_stable \n"
1428 ".byte .ur_s6 - .ur_stable \n"
1429 ".byte .ur_s7 - .ur_stable \n"
1430
1431 ".ur_s8: \n"
1432 "mov %[psiz], %[rx] \n" /* Point behind the last plane */
1433 "shll2 %[rx] \n" /* for this round. */
1434 "add %[rx], %[rx] \n" /* See above. */
1435 "add %[rx], %[addr] \n"
1436
1437 "sub %[psiz], %[addr] \n"
1438 "mov.b r8, @%[addr] \n" /* store byte */
1439 "shlr8 r8 \n" /* shift out used-up byte */
1440 ".ur_s7: \n"
1441 "sub %[psiz], %[addr] \n"
1442 "mov.b r7, @%[addr] \n"
1443 "shlr8 r7 \n"
1444 ".ur_s6: \n"
1445 "sub %[psiz], %[addr] \n"
1446 "mov.b r6, @%[addr] \n"
1447 "shlr8 r6 \n"
1448 ".ur_s5: \n"
1449 "sub %[psiz], %[addr] \n"
1450 "mov.b r5, @%[addr] \n"
1451 "shlr8 r5 \n"
1452 ".ur_s4: \n"
1453 "sub %[psiz], %[addr] \n"
1454 "mov.b r4, @%[addr] \n"
1455 "shlr8 r4 \n"
1456 ".ur_s3: \n"
1457 "sub %[psiz], %[addr] \n"
1458 "mov.b r3, @%[addr] \n"
1459 "shlr8 r3 \n"
1460 ".ur_s2: \n"
1461 "sub %[psiz], %[addr] \n"
1462 "mov.b r2, @%[addr] \n"
1463 "shlr8 r2 \n"
1464 ".ur_s1: \n"
1465 "sub %[psiz], %[addr] \n"
1466 "mov.b r1, @%[addr] \n"
1467 "shlr8 r1 \n"
1468 ".ur_s0: \n"
1469
1470 "add %[rx], %[addr] \n" /* correct address */
1471 "add #-8, %[dpth] \n"
1472 "cmp/pl %[dpth] \n" /* next round if anything left */
1473 "bt .ur_sloop \n"
1474
1475 ".ur_end: \n"
1476 : /* outputs */
1477 [addr]"+r"(addr),
1478 [dpth]"+r"(depth),
1479 [rx] "=&r"(trash)
1480 : /* inputs */
1481 [mask]"r"(mask),
1482 [psiz]"r"(_gray_info.plane_size),
1483 [patp]"[rx]"(pat_ptr)
1484 : /* clobbers */
1485 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
1115 ); 1486 );
1116 } 1487 }
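
The "jump into streak" trick is the same on all three CPUs, only the addressing differs: ARM indexes a pc-relative byte table and adds the offset to pc, SH uses mova/jmp, and Coldfire scales the remaining depth directly into jmp (%pc, dpth:l:2). In C the equivalent structure is a switch that falls through, i.e. Duff's device. A hedged sketch of the short loop (nothing to keep), with q[] standing for the eight transposed pattern words and the other names merely illustrative:

    /* Hypothetical C rendering of the short-loop streak: per round, store the
     * low byte of each transposed word into its bitplane, entering the
     * unrolled sequence part-way when fewer than 8 planes remain. */
    static void write_planes_sketch(unsigned char *addr, unsigned long q[8],
                                    int depth, long plane_size)
    {
        while (depth > 0)
        {
            int n = (depth < 8) ? depth : 8;
            unsigned char *p = addr + n * plane_size;  /* behind this round's last plane */

            switch (n)   /* jump into the streak, then fall through */
            {
            case 8: p -= plane_size; *p = q[7]; q[7] >>= 8;
            case 7: p -= plane_size; *p = q[6]; q[6] >>= 8;
            case 6: p -= plane_size; *p = q[5]; q[5] >>= 8;
            case 5: p -= plane_size; *p = q[4]; q[4] >>= 8;
            case 4: p -= plane_size; *p = q[3]; q[3] >>= 8;
            case 3: p -= plane_size; *p = q[2]; q[2] >>= 8;
            case 2: p -= plane_size; *p = q[1]; q[1] >>= 8;
            case 1: p -= plane_size; *p = q[0]; q[0] >>= 8;
            }
            addr  += n * plane_size;
            depth -= n;
        }
    }
    /* The full loop is identical except that each store becomes
     * load / and-with-keep-mask / or / store. */
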
1117#elif defined(CPU_COLDFIRE) 1488#elif defined(CPU_COLDFIRE)
1118 asm volatile ( 1489 asm volatile (
1119 "move.l (%[cbuf]),%%d0 \n" 1490 "move.l (%[cbuf]), %%d0 \n"
1120 "move.l (%[bbuf]),%%d1 \n" 1491 "move.l (%[bbuf]), %%d1 \n"
1121 "eor.l %%d0,%%d1 \n" 1492 "eor.l %%d0, %%d1 \n"
1122 "move.l (4,%[cbuf]),%%d0 \n" 1493 "move.l (4,%[cbuf]), %%d0 \n"
1123 "move.l (4,%[bbuf]),%[chg] \n" 1494 "move.l (4,%[bbuf]), %[chg] \n"
1124 "eor.l %%d0,%[chg] \n" 1495 "eor.l %%d0, %[chg] \n"
1125 "or.l %%d1,%[chg] \n" 1496 "or.l %%d1, %[chg] \n"
1126 : /* outputs */ 1497 : /* outputs */
1127 [chg] "=&d"(change) 1498 [chg] "=&d"(change)
1128 : /* inputs */ 1499 : /* inputs */
@@ -1134,160 +1505,359 @@ void gray_update_rect(int x, int y, int width, int height)
1134 1505
1135 if (change != 0) 1506 if (change != 0)
1136 { 1507 {
1137 unsigned char *addr, *end; 1508 unsigned char *addr;
1138 unsigned mask, trash; 1509 unsigned mask, depth, trash;
1139 1510
1140 pat_ptr = &pat_stack[8]; 1511 pat_ptr = &pat_stack[8];
1141 1512
1142 /* precalculate the bit patterns with random shifts 1513 /* precalculate the bit patterns with random shifts
1143 * for all 8 pixels and put them on an extra "stack" */ 1514 * for all 8 pixels and put them on an extra "stack" */
1144 asm volatile ( 1515 asm volatile
1145 "moveq.l #8,%%d3 \n" /* loop count */ 1516 (
1146 "clr.l %[mask] \n" 1517 "moveq.l #8, %%d3 \n" /* loop count */
1147 1518 "clr.l %[mask] \n"
1148 ".ur_pre_loop: \n" 1519
1149 "clr.l %%d0 \n" 1520 ".ur_pre_loop: \n"
1150 "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ 1521 "clr.l %%d0 \n"
1151 "clr.l %%d1 \n" 1522 "move.b (%[cbuf])+, %%d0 \n" /* read current buffer */
1152 "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ 1523 "clr.l %%d1 \n"
1153 "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ 1524 "move.b (%[bbuf]), %%d1 \n" /* read back buffer */
1154 "clr.l %%d2 \n" /* preset for skipped pixel */ 1525 "move.b %%d0, (%[bbuf])+ \n" /* update back buffer */
1155 "cmp.l %%d0,%%d1 \n" /* no change? */ 1526 "clr.l %%d2 \n" /* preset for skipped pixel */
1156 "beq.b .ur_skip \n" /* -> skip */ 1527 "cmp.l %%d0, %%d1 \n" /* no change? */
1157 1528 "beq.b .ur_skip \n" /* -> skip */
1158 "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ 1529
1159 1530 "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */
1160 "mulu.w #75,%[rnd] \n" /* multiply by 75 */ 1531
1161 "add.l #74,%[rnd] \n" /* add another 74 */ 1532 "mulu.w #75, %[rnd] \n" /* multiply by 75 */
1162 /* Since the lower bits are not very random: */ 1533 "add.l #74, %[rnd] \n" /* add another 74 */
1163 "move.l %[rnd],%%d1 \n" 1534 /* Since the lower bits are not very random: */
1164 "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ 1535 "move.l %[rnd], %%d1 \n"
1165 "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ 1536 "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */
1166 1537 "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */
1167 "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ 1538
1168 "blo.b .ur_ntrim \n" 1539 "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */
1169 "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ 1540 "blo.b .ur_ntrim \n"
1170 ".ur_ntrim: \n" 1541 "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */
1171 1542 ".ur_ntrim: \n"
1172 "move.l %%d2,%%d0 \n" /** rotate pattern **/ 1543
1173 "lsl.l %%d1,%%d0 \n" 1544 "move.l %%d2, %%d0 \n" /** rotate pattern **/
1174 "sub.l %[dpth],%%d1 \n" 1545 "lsl.l %%d1, %%d0 \n"
1175 "neg.l %%d1 \n" /* d1 = depth - d1 */ 1546 "sub.l %[dpth], %%d1 \n"
1176 "lsr.l %%d1,%%d2 \n" 1547 "neg.l %%d1 \n" /* d1 = depth - d1 */
1177 "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ 1548 "lsr.l %%d1, %%d2 \n"
1178 1549 "or.l %%d0, %%d2 \n" /* rotated_pattern = d2 | d0 */
1179 "or.l #0x0100,%[mask] \n" /* set mask bit */ 1550
1180 1551 "or.l #0x0100, %[mask] \n" /* set mask bit */
1181 ".ur_skip: \n" 1552
1182 "lsr.l #1,%[mask] \n" /* shift mask */ 1553 ".ur_skip: \n"
1183 "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ 1554 "lsr.l #1, %[mask] \n" /* shift mask */
1184 1555 "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */
1185 "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ 1556
1186 "bne.b .ur_pre_loop \n" 1557 "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */
1187 : /* outputs */ 1558 "bne.b .ur_pre_loop \n"
1188 [cbuf]"+a"(cbuf), 1559 : /* outputs */
1189 [bbuf]"+a"(bbuf), 1560 [cbuf]"+a"(cbuf),
1190 [patp]"+a"(pat_ptr), 1561 [bbuf]"+a"(bbuf),
1191 [rnd] "+d"(_gray_random_buffer), 1562 [patp]"+a"(pat_ptr),
1192 [mask]"=&d"(mask) 1563 [rnd] "+d"(_gray_random_buffer),
1193 : /* inputs */ 1564 [mask]"=&d"(mask)
1194 [bpat]"a"(_gray_info.bitpattern), 1565 : /* inputs */
1195 [dpth]"d"(_gray_info.depth), 1566 [bpat]"a"(_gray_info.bitpattern),
1196 [rmsk]"d"(_gray_info.randmask) 1567 [dpth]"d"(_gray_info.depth),
1197 : /* clobbers */ 1568 [rmsk]"d"(_gray_info.randmask)
1198 "d0", "d1", "d2", "d3" 1569 : /* clobbers */
1570 "d0", "d1", "d2", "d3"
1199 ); 1571 );
1200 1572
1201 addr = dst_row; 1573 addr = dst_row;
1202 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 1574 mask = ~mask & 0xff;
1575 depth = _gray_info.depth;
1203 1576
1204 /* set the bits for all 8 pixels in all bytes according to the 1577 /* set the bits for all 8 pixels in all bytes according to the
1205 * precalculated patterns on the pattern stack */ 1578 * precalculated patterns on the pattern stack */
1206 asm volatile ( 1579 asm volatile
1207 "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" 1580 (
1208 /* pop all 8 patterns */ 1581 "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */
1209 "not.l %[mask] \n" /* "set" mask -> "keep" mask */ 1582 /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */
1210 "and.l #0xFF,%[mask] \n" 1583
1211 "beq.b .ur_sstart \n" /* short loop if nothing to keep */ 1584 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1212 1585
1213 ".ur_floop: \n" /** full loop (there are bits to keep)**/ 1586 "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/
1214 "clr.l %%d0 \n" 1587 "lsl.l #4, %%d0 \n"
1215 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1588 /* move.l %[ax], %%d5 */ /* already in d5 */
1216 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1589 "eor.l %%d5, %%d0 \n"
1217 "lsr.l #1,%%d3 \n" 1590 "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */
1218 "addx.l %%d0,%%d0 \n" 1591 "eor.l %%d0, %%d5 \n"
1219 "lsr.l #1,%%d4 \n" 1592 "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */
1220 "addx.l %%d0,%%d0 \n" 1593 "lsr.l #4, %%d0 \n"
1221 "lsr.l #1,%%d5 \n" 1594 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */
1222 "addx.l %%d0,%%d0 \n" 1595 "move.l %%d2, %%d0 \n"
1223 "lsr.l #1,%%d6 \n" 1596 "lsl.l #4, %%d0 \n"
1224 "addx.l %%d0,%%d0 \n" 1597 "eor.l %%d6, %%d0 \n"
1225 "move.l %%a0,%%d1 \n" 1598 "and.l #0xF0F0F0F0, %%d0 \n"
1226 "lsr.l #1,%%d1 \n" 1599 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */
1227 "addx.l %%d0,%%d0 \n" 1600 "lsr.l #4, %%d0 \n"
1228 "move.l %%d1,%%a0 \n" 1601 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */
1229 "move.l %%a1,%%d1 \n" 1602 "move.l %%d3, %%d0 \n"
1230 "lsr.l #1,%%d1 \n" 1603 "lsl.l #4, %%d0 \n"
1231 "addx.l %%d0,%%d0 \n" 1604 "eor.l %%d7, %%d0 \n"
1232 "move.l %%d1,%%a1 \n" 1605 "and.l #0xF0F0F0F0, %%d0 \n"
1233 "move.l %[ax],%%d1 \n" 1606 "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */
1234 "lsr.l #1,%%d1 \n" 1607 "lsr.l #4, %%d0 \n"
1235 "addx.l %%d0,%%d0 \n" 1608 "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4b7b6b5b4 */
1236 "move.l %%d1,%[ax] \n" 1609 "move.l %%d4, %%d0 \n"
1237 1610 "lsl.l #4, %%d0 \n"
1238 "move.b (%[addr]),%%d1 \n" /* read old value */ 1611 "move.l %%a0, %%d5 \n"
1239 "and.l %[mask],%%d1 \n" /* mask out replaced bits */ 1612 "eor.l %%d5, %%d0 \n"
1240 "or.l %%d0,%%d1 \n" /* set new bits */ 1613 "and.l #0xF0F0F0F0, %%d0 \n"
1241 "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ 1614 "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */
1242 1615 /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */
1243 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1616 "lsr.l #4, %%d0 \n"
1244 "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ 1617 "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */
1245 "bhi.b .ur_floop \n" 1618
1246 1619 "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/
1247 "bra.b .ur_end \n" 1620 "lsl.l #2, %%d0 \n"
1248 1621 /* move.l %%a0, %%d5 */ /* still in d5 */
1249 ".ur_sstart: \n" 1622 "eor.l %%d5, %%d0 \n"
1250 "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ 1623 "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */
1251 1624 "eor.l %%d0, %%d5 \n"
1252 ".ur_sloop: \n" /** short loop (nothing to keep) **/ 1625 "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */
1253 "clr.l %%d0 \n" 1626 "lsr.l #2, %%d0 \n"
1254 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1627 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */
1255 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1628 "move.l %[ax], %%d5 \n"
1256 "lsr.l #1,%%d3 \n" 1629 "move.l %%d5, %%d0 \n"
1257 "addx.l %%d0,%%d0 \n" 1630 "lsl.l #2, %%d0 \n"
1258 "lsr.l #1,%%d4 \n" 1631 "eor.l %%d7, %%d0 \n"
1259 "addx.l %%d0,%%d0 \n" 1632 "and.l #0xCCCCCCCC, %%d0 \n"
1260 "lsr.l #1,%[mask] \n" 1633 "eor.l %%d0, %%d7 \n" /* d7 = ...h1h0f1f0d1d0b1b0 */
1261 "addx.l %%d0,%%d0 \n" 1634 "lsr.l #2, %%d0 \n"
1262 "lsr.l #1,%%d6 \n" 1635 "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */
1263 "addx.l %%d0,%%d0 \n" 1636 /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */
1264 "lsr.l #1,%[mask] \n" 1637 "move.l %%d2, %%d0 \n"
1265 "addx.l %%d0,%%d0 \n" 1638 "lsl.l #2, %%d0 \n"
1266 "move.l %%a1,%%d1 \n" 1639 "eor.l %%d4, %%d0 \n"
1267 "lsr.l #1,%%d1 \n" 1640 "and.l #0xCCCCCCCC, %%d0 \n"
1268 "addx.l %%d0,%%d0 \n" 1641 "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */
1269 "move.l %%d1,%%a1 \n" 1642 "lsr.l #2, %%d0 \n"
1270 "move.l %[ax],%%d1 \n" 1643 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */
1271 "lsr.l #1,%%d1 \n" 1644 "move.l %%d1, %%d0 \n"
1272 "addx.l %%d0,%%d0 \n" 1645 "lsl.l #2, %%d0 \n"
1273 "move.l %%d1,%[ax] \n" 1646 "eor.l %%d3, %%d0 \n"
1274 1647 "and.l #0xCCCCCCCC, %%d0 \n"
1275 "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ 1648 "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */
1276 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1649 "lsr.l #2, %%d0 \n"
1277 "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ 1650 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */
1278 "bhi.b .ur_sloop \n" 1651
1279 1652 "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/
1280 ".ur_end: \n" 1653 "lsl.l #1, %%d0 \n"
1281 : /* outputs */ 1654 "eor.l %%d2, %%d0 \n"
1282 [addr]"+a"(addr), 1655 "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */
1283 [mask]"+d"(mask), 1656 "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */
1284 [ax] "=&a"(trash) 1657 "lsr.l #1, %%d0 \n"
1285 : /* inputs */ 1658 "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */
1286 [psiz]"a"(_gray_info.plane_size), 1659 "move.l %%d3, %%d0 \n"
1287 [end] "a"(end), 1660 "lsl.l #1, %%d0 \n"
1288 [patp]"[ax]"(pat_ptr) 1661 "eor.l %%d4, %%d0 \n"
1289 : /* clobbers */ 1662 "and.l #0xAAAAAAAA, %%d0 \n"
1290 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" 1663 "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */
1664 "lsr.l #1, %%d0 \n"
1665 "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */
1666 /* move.l %[ax], %%d5 */ /* still in d5 */
1667 "move.l %%d5, %%d0 \n"
1668 "lsl.l #1, %%d0 \n"
1669 "eor.l %%d6, %%d0 \n"
1670 "and.l #0xAAAAAAAA, %%d0 \n"
1671 "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */
1672 "lsr.l #1, %%d0 \n"
1673 "eor.l %%d0, %%d5 \n"
1674 "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */
1675 "move.l %%d7, %%d0 \n"
1676 "lsl.l #1, %%d0 \n"
1677 "move.l %%a0, %%d5 \n"
1678 "eor.l %%d5, %%d0 \n"
1679 "and.l #0xAAAAAAAA, %%d0 \n"
1680 "eor.l %%d0, %%d5 \n"
1681 "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */
1682 "lsr.l #1, %%d0 \n"
1683 "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */
1684
1685 "tst.l %[mask] \n"
1686 "jeq .ur_sloop \n" /* short loop if nothing to keep */
1687
1688 "move.l %[mask], %%d5 \n" /* need mask in data reg. */
1689 "move.l %%d1, %[mask] \n" /* free d1 as working reg. */
1690
1691 ".ur_floop: \n" /** full loop (there are bits to keep)**/
1692 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1693 "bhs.s .ur_f8 \n"
1694
1695 "move.l %[psiz], %%d0 \n"
1696 "move.l %[dpth], %%d1 \n"
1697 "mulu.w %%d1, %%d0 \n" /* point behind the last plane */
1698 "add.l %%d0, %[addr] \n" /* for this round */
1699 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1700 "bra.s .ur_f1 \n" /* dpth == 0 should never happen */
1701 "bra.s .ur_f2 \n"
1702 "bra.s .ur_f3 \n"
1703 "bra.s .ur_f4 \n"
1704 "bra.s .ur_f5 \n"
1705 "bra.s .ur_f6 \n"
1706 "bra.s .ur_f7 \n"
1707
1708 ".ur_f8: \n"
1709 "move.l %[psiz], %%d0 \n"
1710 "lsl.l #3, %%d0 \n"
1711 "add.l %%d0, %[addr] \n"
1712 /* Point behind the last plane for this round. Note: We're using the
1713 * registers backwards in order to reuse the streak for the last round.
1714 * Therefore we need to go thru the bitplanes backwards too, otherwise
1715 * the bit order would be destroyed which results in more flicker. */
1716 "sub.l %[psiz], %[addr] \n"
1717 "move.b (%[addr]), %%d0 \n" /* load old byte */
1718 "and.l %%d5, %%d0 \n" /* mask out replaced bits */
1719 "move.l %[mask], %%d1 \n"
1720 "or.l %%d1, %%d0 \n" /* set new bits */
1721 "move.b %%d0, (%[addr]) \n" /* store byte */
1722 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1723 "move.l %%d1, %[mask] \n"
1724 ".ur_f7: \n"
1725 "sub.l %[psiz], %[addr] \n"
1726 "move.b (%[addr]), %%d0 \n"
1727 "and.l %%d5, %%d0 \n"
1728 "or.l %%d2, %%d0 \n"
1729 "move.b %%d0, (%[addr]) \n"
1730 "lsr.l #8, %%d2 \n"
1731 ".ur_f6: \n"
1732 "sub.l %[psiz], %[addr] \n"
1733 "move.b (%[addr]), %%d0 \n"
1734 "and.l %%d5, %%d0 \n"
1735 "or.l %%d3, %%d0 \n"
1736 "move.b %%d0, (%[addr]) \n"
1737 "lsr.l #8, %%d3 \n"
1738 ".ur_f5: \n"
1739 "sub.l %[psiz], %[addr] \n"
1740 "move.b (%[addr]), %%d0 \n"
1741 "and.l %%d5, %%d0 \n"
1742 "or.l %%d4, %%d0 \n"
1743 "move.b %%d0, (%[addr]) \n"
1744 "lsr.l #8, %%d4 \n"
1745 ".ur_f4: \n"
1746 "sub.l %[psiz], %[addr] \n"
1747 "move.b (%[addr]), %%d0 \n"
1748 "and.l %%d5, %%d0 \n"
1749 "move.l %[ax], %%d1 \n"
1750 "or.l %%d1, %%d0 \n"
1751 "move.b %%d0, (%[addr]) \n"
1752 "lsr.l #8, %%d1 \n"
1753 "move.l %%d1, %[ax] \n"
1754 ".ur_f3: \n"
1755 "sub.l %[psiz], %[addr] \n"
1756 "move.b (%[addr]), %%d0 \n"
1757 "and.l %%d5, %%d0 \n"
1758 "or.l %%d6, %%d0 \n"
1759 "move.b %%d0, (%[addr]) \n"
1760 "lsr.l #8, %%d6 \n"
1761 ".ur_f2: \n"
1762 "sub.l %[psiz], %[addr] \n"
1763 "move.b (%[addr]), %%d0 \n"
1764 "and.l %%d5, %%d0 \n"
1765 "or.l %%d7, %%d0 \n"
1766 "move.b %%d0, (%[addr]) \n"
1767 "lsr.l #8, %%d7 \n"
1768 ".ur_f1: \n"
1769 "sub.l %[psiz], %[addr] \n"
1770 "move.b (%[addr]), %%d0 \n"
1771 "and.l %%d5, %%d0 \n"
1772 "move.l %%a0, %%d1 \n"
1773 "or.l %%d1, %%d0 \n"
1774 "move.b %%d0, (%[addr]) \n"
1775 "lsr.l #8, %%d1 \n"
1776 "move.l %%d1, %%a0 \n"
1777
1778 "move.l %[psiz], %%d0 \n"
1779 "lsl.l #3, %%d0 \n"
1780 "add.l %%d0, %[addr] \n" /* correct address */
1781 "subq.l #8, %[dpth] \n"
1782 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
1783 "jgt .ur_floop \n" /* next round if anything left */
1784
1785 "jra .ur_end \n"
1786
1787 ".ur_sloop: \n" /** short loop (nothing to keep) **/
1788 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1789 "bhs.s .ur_s8 \n"
1790
1791 "move.l %[psiz], %%d0 \n"
1792 "move.l %[dpth], %%d5 \n"
1793 "mulu.w %%d5, %%d0 \n" /* point behind the last plane */
1794 "add.l %%d0, %[addr] \n" /* for this round */
1795 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1796 "bra.s .ur_s1 \n" /* dpth == 0 should never happen */
1797 "bra.s .ur_s2 \n"
1798 "bra.s .ur_s3 \n"
1799 "bra.s .ur_s4 \n"
1800 "bra.s .ur_s5 \n"
1801 "bra.s .ur_s6 \n"
1802 "bra.s .ur_s7 \n"
1803
1804 ".ur_s8: \n"
1805 "move.l %[psiz], %%d0 \n" /* Point behind the last plane */
1806 "lsl.l #3, %%d0 \n" /* for this round. */
1807 "add.l %%d0, %[addr] \n" /* See above. */
1808
1809 "sub.l %[psiz], %[addr] \n"
1810 "move.b %%d1, (%[addr]) \n" /* store byte */
1811 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1812 ".ur_s7: \n"
1813 "sub.l %[psiz], %[addr] \n"
1814 "move.b %%d2, (%[addr]) \n"
1815 "lsr.l #8, %%d2 \n"
1816 ".ur_s6: \n"
1817 "sub.l %[psiz], %[addr] \n"
1818 "move.b %%d3, (%[addr]) \n"
1819 "lsr.l #8, %%d3 \n"
1820 ".ur_s5: \n"
1821 "sub.l %[psiz], %[addr] \n"
1822 "move.b %%d4, (%[addr]) \n"
1823 "lsr.l #8, %%d4 \n"
1824 ".ur_s4: \n"
1825 "sub.l %[psiz], %[addr] \n"
1826 "move.l %[ax], %%d5 \n"
1827 "move.b %%d5, (%[addr]) \n"
1828 "lsr.l #8, %%d5 \n"
1829 "move.l %%d5, %[ax] \n"
1830 ".ur_s3: \n"
1831 "sub.l %[psiz], %[addr] \n"
1832 "move.b %%d6, (%[addr]) \n"
1833 "lsr.l #8, %%d6 \n"
1834 ".ur_s2: \n"
1835 "sub.l %[psiz], %[addr] \n"
1836 "move.b %%d7, (%[addr]) \n"
1837 "lsr.l #8, %%d7 \n"
1838 ".ur_s1: \n"
1839 "sub.l %[psiz], %[addr] \n"
1840 "move.l %%a0, %%d5 \n"
1841 "move.b %%d5, (%[addr]) \n"
1842 "lsr.l #8, %%d5 \n"
1843 "move.l %%d5, %%a0 \n"
1844
1845 "add.l %%d0, %[addr] \n" /* correct address */
1846 "subq.l #8, %[dpth] \n"
1847 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
1848 "jgt .ur_sloop \n" /* next round if anything left */
1849
1850 ".ur_end: \n"
1851 : /* outputs */
1852 [addr]"+a"(addr),
1853 [dpth]"+a"(depth),
1854 [mask]"+a"(mask),
1855 [ax] "=&a"(trash)
1856 : /* inputs */
1857 [psiz]"a"(_gray_info.plane_size),
1858 [patp]"[ax]"(pat_ptr)
1859 : /* clobbers */
1860 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
1291 ); 1861 );
1292 } 1862 }
1293#else /* C version, for reference*/ 1863#else /* C version, for reference*/
@@ -1680,4 +2250,3 @@ static void gray_screendump_hook(int fd)
1680} 2250}
1681 2251
1682#endif /* HAVE_LCD_BITMAP */ 2252#endif /* HAVE_LCD_BITMAP */
1683