diff options
author | Jens Arnold <amiconn@rockbox.org> | 2006-08-11 14:13:01 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2006-08-11 14:13:01 +0000 |
commit | 71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 (patch) | |
tree | b9a97081ec04d4d311a7b45747393e68837912a2 /apps/plugins/lib/gray_draw.c | |
parent | bcd94a9b01d19d87a437cd8158a758f206b30825 (diff) | |
download | rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.tar.gz rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.zip |
New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins/lib/gray_draw.c')
-rw-r--r-- | apps/plugins/lib/gray_draw.c | 1156 |
1 files changed, 865 insertions, 291 deletions
diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c index 9406664ea2..dcc65bdd09 100644 --- a/apps/plugins/lib/gray_draw.c +++ b/apps/plugins/lib/gray_draw.c | |||
@@ -868,24 +868,24 @@ void gray_ub_clear_display(void) | |||
868 | 868 | ||
869 | /* Write a pixel block, defined by their brightnesses in a greymap. | 869 | /* Write a pixel block, defined by their brightnesses in a greymap. |
870 | Address is the byte in the first bitplane, src is the greymap start address, | 870 | Address is the byte in the first bitplane, src is the greymap start address, |
871 | stride is the increment for the greymap to get to the next pixel, mask | 871 | mask determines which pixels of the destination block are changed. */ |
872 | determines which pixels of the destination block are changed. */ | ||
873 | static void _writearray(unsigned char *address, const unsigned char *src, | 872 | static void _writearray(unsigned char *address, const unsigned char *src, |
874 | unsigned mask) | 873 | unsigned mask) |
875 | { | 874 | { |
876 | unsigned long pat_stack[8]; | 875 | unsigned long pat_stack[8]; |
877 | unsigned long *pat_ptr = &pat_stack[8]; | 876 | unsigned long *pat_ptr = &pat_stack[8]; |
878 | unsigned char *addr, *end; | 877 | unsigned char *addr; |
879 | #ifdef CPU_ARM | 878 | #ifdef CPU_ARM |
880 | const unsigned char *_src; | 879 | const unsigned char *_src; |
881 | unsigned _mask, trash; | 880 | unsigned _mask, depth, trash; |
882 | 881 | ||
883 | _mask = mask; | 882 | _mask = mask; |
884 | _src = src; | 883 | _src = src; |
885 | 884 | ||
886 | /* precalculate the bit patterns with random shifts | 885 | /* precalculate the bit patterns with random shifts |
887 | for all 8 pixels and put them on an extra "stack" */ | 886 | for all 8 pixels and put them on an extra "stack" */ |
888 | asm volatile ( | 887 | asm volatile |
888 | ( | ||
889 | "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ | 889 | "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ |
890 | "mov r3, #8 \n" /* loop count */ | 890 | "mov r3, #8 \n" /* loop count */ |
891 | 891 | ||
@@ -932,83 +932,228 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
932 | ); | 932 | ); |
933 | 933 | ||
934 | addr = address; | 934 | addr = address; |
935 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | ||
936 | _mask = mask; | 935 | _mask = mask; |
936 | depth = _gray_info.depth; | ||
937 | 937 | ||
938 | /* set the bits for all 8 pixels in all bytes according to the | 938 | /* set the bits for all 8 pixels in all bytes according to the |
939 | * precalculated patterns on the pattern stack */ | 939 | * precalculated patterns on the pattern stack */ |
940 | asm volatile ( | 940 | asm volatile |
941 | "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ | 941 | ( |
942 | 942 | "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */ | |
943 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | 943 | |
944 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ | ||
945 | |||
946 | "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/ | ||
947 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
948 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */ | ||
949 | "eor r0, r1, r5, lsl #4 \n" | ||
950 | "and r0, r0, %[rx] \n" | ||
951 | "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ | ||
952 | "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ | ||
953 | "eor r0, r2, r6, lsl #4 \n" | ||
954 | "and r0, r0, %[rx] \n" | ||
955 | "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ | ||
956 | "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */ | ||
957 | "eor r0, r3, r7, lsl #4 \n" | ||
958 | "and r0, r0, %[rx] \n" | ||
959 | "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ | ||
960 | "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ | ||
961 | "eor r0, r4, r8, lsl #4 \n" | ||
962 | "and r0, r0, %[rx] \n" | ||
963 | "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ | ||
964 | "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ | ||
965 | |||
966 | "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/ | ||
967 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
968 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */ | ||
969 | "eor r0, r1, r3, lsl #2 \n" | ||
970 | "and r0, r0, %[rx] \n" | ||
971 | "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ | ||
972 | "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ | ||
973 | "eor r0, r2, r4, lsl #2 \n" | ||
974 | "and r0, r0, %[rx] \n" | ||
975 | "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ | ||
976 | "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ | ||
977 | "eor r0, r5, r7, lsl #2 \n" | ||
978 | "and r0, r0, %[rx] \n" | ||
979 | "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | ||
980 | "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ | ||
981 | "eor r0, r6, r8, lsl #2 \n" | ||
982 | "and r0, r0, %[rx] \n" | ||
983 | "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ | ||
984 | "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ | ||
985 | |||
986 | "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/ | ||
987 | "orr %[rx], %[rx], %[rx], lsl #8 \n" | ||
988 | "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */ | ||
989 | "eor r0, r1, r2, lsl #1 \n" | ||
990 | "and r0, r0, %[rx] \n" | ||
991 | "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
992 | "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
993 | "eor r0, r3, r4, lsl #1 \n" | ||
994 | "and r0, r0, %[rx] \n" | ||
995 | "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
996 | "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
997 | "eor r0, r5, r6, lsl #1 \n" | ||
998 | "and r0, r0, %[rx] \n" | ||
999 | "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
1000 | "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
1001 | "eor r0, r7, r8, lsl #1 \n" | ||
1002 | "and r0, r0, %[rx] \n" | ||
1003 | "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
1004 | "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
1005 | |||
1006 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
944 | "ands %[mask], %[mask], #0xff \n" | 1007 | "ands %[mask], %[mask], #0xff \n" |
945 | "beq .wa_sloop \n" /* short loop if nothing to keep */ | 1008 | "beq .wa_sloop \n" /* short loop if no bits to keep */ |
946 | 1009 | ||
947 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1010 | ".wa_floop: \n" /** full loop (bits to keep)**/ |
948 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 1011 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ |
949 | "adc r0, r0, r0 \n" /* put bit into LSB of byte */ | 1012 | "bhs .wa_f8 \n" |
950 | "movs r8, r8, lsr #1 \n" | 1013 | |
951 | "adc r0, r0, r0 \n" | 1014 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ |
952 | "movs r7, r7, lsr #1 \n" | 1015 | "add %[addr], %[addr], r0 \n" /* for this round */ |
953 | "adc r0, r0, r0 \n" | 1016 | |
954 | "movs r6, r6, lsr #1 \n" | 1017 | |
955 | "adc r0, r0, r0 \n" | 1018 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ |
956 | "movs r5, r5, lsr #1 \n" | 1019 | "add pc, pc, r0 \n" |
957 | "adc r0, r0, r0 \n" | 1020 | ".wa_ftable: \n" |
958 | "movs r4, r4, lsr #1 \n" | 1021 | ".byte .wa_f0 - .wa_ftable - 4 \n" /* [jump tables are tricky] */ |
959 | "adc r0, r0, r0 \n" | 1022 | ".byte .wa_f1 - .wa_ftable - 4 \n" |
960 | "movs r3, r3, lsr #1 \n" | 1023 | ".byte .wa_f2 - .wa_ftable - 4 \n" |
961 | "adc r0, r0, r0 \n" | 1024 | ".byte .wa_f3 - .wa_ftable - 4 \n" |
962 | "movs r2, r2, lsr #1 \n" | 1025 | ".byte .wa_f4 - .wa_ftable - 4 \n" |
963 | "adc r0, r0, r0 \n" | 1026 | ".byte .wa_f5 - .wa_ftable - 4 \n" |
964 | 1027 | ".byte .wa_f6 - .wa_ftable - 4 \n" | |
965 | "ldrb r1, [%[addr]] \n" /* read old value */ | 1028 | ".byte .wa_f7 - .wa_ftable - 4 \n" |
966 | "and r1, r1, %[mask] \n" /* mask out replaced bits */ | 1029 | |
967 | "orr r1, r1, r0 \n" /* set new bits */ | 1030 | ".wa_f8: \n" |
968 | "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ | 1031 | "add %[addr], %[addr], %[psiz], lsl #3 \n" |
969 | 1032 | /* Point behind the last plane for this round. Note: We're using the | |
970 | "cmp %[end], %[addr] \n" /* loop through all bitplanes */ | 1033 | * registers backwards in order to reuse the streak for the last round. |
971 | "bne .wa_floop \n" | 1034 | * Therefore we need to go thru the bitplanes backwards too, otherwise |
972 | 1035 | * the bit order would be destroyed which results in more flicker. */ | |
1036 | "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ | ||
1037 | "and r0, r0, %[mask] \n" /* mask out replaced bits */ | ||
1038 | "orr r0, r0, r8 \n" /* set new bits */ | ||
1039 | "strb r0, [%[addr]] \n" /* store byte */ | ||
1040 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ | ||
1041 | ".wa_f7: \n" | ||
1042 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1043 | "and r0, r0, %[mask] \n" | ||
1044 | "orr r0, r0, r7 \n" | ||
1045 | "strb r0, [%[addr]] \n" | ||
1046 | "mov r7, r7, lsr #8 \n" | ||
1047 | ".wa_f6: \n" | ||
1048 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1049 | "and r0, r0, %[mask] \n" | ||
1050 | "orr r0, r0, r6 \n" | ||
1051 | "strb r0, [%[addr]] \n" | ||
1052 | "mov r6, r6, lsr #8 \n" | ||
1053 | ".wa_f5: \n" | ||
1054 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1055 | "and r0, r0, %[mask] \n" | ||
1056 | "orr r0, r0, r5 \n" | ||
1057 | "strb r0, [%[addr]] \n" | ||
1058 | "mov r5, r5, lsr #8 \n" | ||
1059 | ".wa_f4: \n" | ||
1060 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1061 | "and r0, r0, %[mask] \n" | ||
1062 | "orr r0, r0, r4 \n" | ||
1063 | "strb r0, [%[addr]] \n" | ||
1064 | "mov r4, r4, lsr #8 \n" | ||
1065 | ".wa_f3: \n" | ||
1066 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1067 | "and r0, r0, %[mask] \n" | ||
1068 | "orr r0, r0, r3 \n" | ||
1069 | "strb r0, [%[addr]] \n" | ||
1070 | "mov r3, r3, lsr #8 \n" | ||
1071 | ".wa_f2: \n" | ||
1072 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1073 | "and r0, r0, %[mask] \n" | ||
1074 | "orr r0, r0, r2 \n" | ||
1075 | "strb r0, [%[addr]] \n" | ||
1076 | "mov r2, r2, lsr #8 \n" | ||
1077 | ".wa_f1: \n" | ||
1078 | "ldrb r0, [%[addr], -%[psiz]]! \n" | ||
1079 | "and r0, r0, %[mask] \n" | ||
1080 | "orr r0, r0, r1 \n" | ||
1081 | "strb r0, [%[addr]] \n" | ||
1082 | "mov r1, r1, lsr #8 \n" | ||
1083 | ".wa_f0: \n" | ||
1084 | |||
1085 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ | ||
1086 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ | ||
1087 | "bhi .wa_floop \n" | ||
1088 | |||
973 | "b .wa_end \n" | 1089 | "b .wa_end \n" |
974 | 1090 | ||
975 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1091 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ |
976 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | 1092 | "cmp %[dpth], #8 \n" /* 8 planes or more left? */ |
977 | "adc r0, r0, r0 \n" /* put bit into LSB of byte */ | 1093 | "bhs .wa_s8 \n" |
978 | "movs r8, r8, lsr #1 \n" | ||
979 | "adc r0, r0, r0 \n" | ||
980 | "movs r7, r7, lsr #1 \n" | ||
981 | "adc r0, r0, r0 \n" | ||
982 | "movs r6, r6, lsr #1 \n" | ||
983 | "adc r0, r0, r0 \n" | ||
984 | "movs r5, r5, lsr #1 \n" | ||
985 | "adc r0, r0, r0 \n" | ||
986 | "movs r4, r4, lsr #1 \n" | ||
987 | "adc r0, r0, r0 \n" | ||
988 | "movs r3, r3, lsr #1 \n" | ||
989 | "adc r0, r0, r0 \n" | ||
990 | "movs r2, r2, lsr #1 \n" | ||
991 | "adc r0, r0, r0 \n" | ||
992 | |||
993 | "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ | ||
994 | 1094 | ||
995 | "cmp %[end], %[addr] \n" /* loop through all bitplanes */ | 1095 | "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ |
996 | "bne .wa_sloop \n" | 1096 | "add %[addr], %[addr], r0 \n" /* for this round */ |
997 | 1097 | ||
1098 | "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ | ||
1099 | "add pc, pc, r0 \n" | ||
1100 | ".wa_stable: \n" | ||
1101 | ".byte .wa_s0 - .wa_stable - 4 \n" | ||
1102 | ".byte .wa_s1 - .wa_stable - 4 \n" | ||
1103 | ".byte .wa_s2 - .wa_stable - 4 \n" | ||
1104 | ".byte .wa_s3 - .wa_stable - 4 \n" | ||
1105 | ".byte .wa_s4 - .wa_stable - 4 \n" | ||
1106 | ".byte .wa_s5 - .wa_stable - 4 \n" | ||
1107 | ".byte .wa_s6 - .wa_stable - 4 \n" | ||
1108 | ".byte .wa_s7 - .wa_stable - 4 \n" | ||
1109 | |||
1110 | ".wa_s8: \n" | ||
1111 | "add %[addr], %[addr], %[psiz], lsl #3 \n" | ||
1112 | /* Point behind the last plane for this round. See above. */ | ||
1113 | "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ | ||
1114 | "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ | ||
1115 | ".wa_s7: \n" | ||
1116 | "strb r7, [%[addr], -%[psiz]]! \n" | ||
1117 | "mov r7, r7, lsr #8 \n" | ||
1118 | ".wa_s6: \n" | ||
1119 | "strb r6, [%[addr], -%[psiz]]! \n" | ||
1120 | "mov r6, r6, lsr #8 \n" | ||
1121 | ".wa_s5: \n" | ||
1122 | "strb r5, [%[addr], -%[psiz]]! \n" | ||
1123 | "mov r5, r5, lsr #8 \n" | ||
1124 | ".wa_s4: \n" | ||
1125 | "strb r4, [%[addr], -%[psiz]]! \n" | ||
1126 | "mov r4, r4, lsr #8 \n" | ||
1127 | ".wa_s3: \n" | ||
1128 | "strb r3, [%[addr], -%[psiz]]! \n" | ||
1129 | "mov r3, r3, lsr #8 \n" | ||
1130 | ".wa_s2: \n" | ||
1131 | "strb r2, [%[addr], -%[psiz]]! \n" | ||
1132 | "mov r2, r2, lsr #8 \n" | ||
1133 | ".wa_s1: \n" | ||
1134 | "strb r1, [%[addr], -%[psiz]]! \n" | ||
1135 | "mov r1, r1, lsr #8 \n" | ||
1136 | ".wa_s0: \n" | ||
1137 | |||
1138 | "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ | ||
1139 | "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ | ||
1140 | "bhi .wa_sloop \n" | ||
1141 | |||
998 | ".wa_end: \n" | 1142 | ".wa_end: \n" |
999 | : /* outputs */ | 1143 | : /* outputs */ |
1000 | [addr]"+r"(addr), | 1144 | [addr]"+r"(addr), |
1001 | [mask]"+r"(_mask), | 1145 | [mask]"+r"(_mask), |
1146 | [dpth]"+r"(depth), | ||
1002 | [rx] "=&r"(trash) | 1147 | [rx] "=&r"(trash) |
1003 | : /* inputs */ | 1148 | : /* inputs */ |
1004 | [psiz]"r"(_gray_info.plane_size), | 1149 | [psiz]"r"(_gray_info.plane_size), |
1005 | [end] "r"(end), | ||
1006 | [patp]"[rx]"(pat_ptr) | 1150 | [patp]"[rx]"(pat_ptr) |
1007 | : /* clobbers */ | 1151 | : /* clobbers */ |
1008 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | 1152 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" |
1009 | ); | 1153 | ); |
1010 | #else /* C version, for reference*/ | 1154 | #else /* C version, for reference*/ |
1011 | #warning C version of _writearray() used | 1155 | #warning C version of _writearray() used |
1156 | unsigned char *end; | ||
1012 | unsigned test = 0x80; | 1157 | unsigned test = 0x80; |
1013 | int i; | 1158 | int i; |
1014 | 1159 | ||
@@ -1143,67 +1288,70 @@ void gray_ub_gray_bitmap_part(const unsigned char *src, int src_x, int src_y, | |||
1143 | stride is the increment for the greymap to get to the next pixel, mask | 1288 | stride is the increment for the greymap to get to the next pixel, mask |
1144 | determines which pixels of the destination block are changed. */ | 1289 | determines which pixels of the destination block are changed. */ |
1145 | static void _writearray(unsigned char *address, const unsigned char *src, | 1290 | static void _writearray(unsigned char *address, const unsigned char *src, |
1291 | int stride, unsigned mask) __attribute__((noinline)); | ||
1292 | static void _writearray(unsigned char *address, const unsigned char *src, | ||
1146 | int stride, unsigned mask) | 1293 | int stride, unsigned mask) |
1147 | { | 1294 | { |
1148 | unsigned long pat_stack[8]; | 1295 | unsigned long pat_stack[8]; |
1149 | unsigned long *pat_ptr = &pat_stack[8]; | 1296 | unsigned long *pat_ptr = &pat_stack[8]; |
1150 | unsigned char *addr, *end; | 1297 | unsigned char *addr; |
1151 | #if CONFIG_CPU == SH7034 | 1298 | #if CONFIG_CPU == SH7034 |
1152 | const unsigned char *_src; | 1299 | const unsigned char *_src; |
1153 | unsigned _mask, trash; | 1300 | unsigned _mask, depth, trash; |
1154 | 1301 | ||
1155 | _mask = mask; | 1302 | _mask = mask; |
1156 | _src = src; | 1303 | _src = src; |
1157 | 1304 | ||
1158 | /* precalculate the bit patterns with random shifts | 1305 | /* precalculate the bit patterns with random shifts |
1159 | for all 8 pixels and put them on an extra "stack" */ | 1306 | for all 8 pixels and put them on an extra "stack" */ |
1160 | asm volatile ( | 1307 | asm volatile |
1161 | "mov #8,r3 \n" /* loop count */ | 1308 | ( |
1162 | 1309 | "mov #8, r3 \n" /* loop count */ | |
1163 | ".wa_loop: \n" /** load pattern for pixel **/ | 1310 | |
1164 | "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ | 1311 | ".wa_loop: \n" /** load pattern for pixel **/ |
1165 | "shlr %[mask] \n" /* shift out lsb of mask */ | 1312 | "mov #0, r0 \n" /* pattern for skipped pixel must be 0 */ |
1166 | "bf .wa_skip \n" /* skip this pixel */ | 1313 | "shlr %[mask] \n" /* shift out lsb of mask */ |
1167 | 1314 | "bf .wa_skip \n" /* skip this pixel */ | |
1168 | "mov.b @%[src],r0 \n" /* load src byte */ | 1315 | |
1169 | "extu.b r0,r0 \n" /* extend unsigned */ | 1316 | "mov.b @%[src], r0 \n" /* load src byte */ |
1170 | "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ | 1317 | "extu.b r0, r0 \n" /* extend unsigned */ |
1171 | "extu.b r0,r0 \n" /* extend unsigned */ | 1318 | "mov.b @(r0,%[trns]), r0 \n" /* idxtable into pattern index */ |
1172 | "shll2 r0 \n" | 1319 | "extu.b r0, r0 \n" /* extend unsigned */ |
1173 | "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ | 1320 | "shll2 r0 \n" |
1174 | 1321 | "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */ | |
1175 | "mov #75,r0 \n" | 1322 | |
1176 | "mulu r0,%[rnd] \n" /* multiply by 75 */ | 1323 | "mov #75, r0 \n" |
1177 | "sts macl,%[rnd] \n" | 1324 | "mulu r0, %[rnd] \n" /* multiply by 75 */ |
1178 | "add #74,%[rnd] \n" /* add another 74 */ | 1325 | "sts macl, %[rnd] \n" |
1326 | "add #74, %[rnd] \n" /* add another 74 */ | ||
1179 | /* Since the lower bits are not very random: */ | 1327 | /* Since the lower bits are not very random: */ |
1180 | "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ | 1328 | "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */ |
1181 | "and %[rmsk],r1 \n" /* mask out unneeded bits */ | 1329 | "and %[rmsk], r1 \n" /* mask out unneeded bits */ |
1182 | 1330 | ||
1183 | "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ | 1331 | "cmp/hs %[dpth], r1 \n" /* random >= depth ? */ |
1184 | "bf .wa_ntrim \n" | 1332 | "bf .wa_ntrim \n" |
1185 | "sub %[dpth],r1 \n" /* yes: random -= depth; */ | 1333 | "sub %[dpth], r1 \n" /* yes: random -= depth; */ |
1186 | ".wa_ntrim: \n" | 1334 | ".wa_ntrim: \n" |
1187 | 1335 | ||
1188 | "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ | 1336 | "mov.l .ashlsi3, r0 \n" /** rotate pattern **/ |
1189 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ | 1337 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ |
1190 | "mov r1,r5 \n" | 1338 | "mov r1, r5 \n" |
1191 | 1339 | ||
1192 | "mov %[dpth],r5 \n" | 1340 | "mov %[dpth], r5 \n" |
1193 | "sub r1,r5 \n" /* r5 = depth - r1 */ | 1341 | "sub r1, r5 \n" /* r5 = depth - r1 */ |
1194 | "mov.l .lshrsi3,r1 \n" | 1342 | "mov.l .lshrsi3, r1 \n" |
1195 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ | 1343 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ |
1196 | "mov r0,r1 \n" /* store previous result in r1 */ | 1344 | "mov r0, r1 \n" /* store previous result in r1 */ |
1197 | 1345 | ||
1198 | "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ | 1346 | "or r1, r0 \n" /* rotated_pattern = r0 | r1 */ |
1199 | 1347 | ||
1200 | ".wa_skip: \n" | 1348 | ".wa_skip: \n" |
1201 | "mov.l r0,@-%[patp] \n" /* push on pattern stack */ | 1349 | "mov.l r0, @-%[patp] \n" /* push on pattern stack */ |
1202 | 1350 | ||
1203 | "add %[stri],%[src] \n" /* src += stride; */ | 1351 | "add %[stri], %[src] \n" /* src += stride; */ |
1204 | "add #-1,r3 \n" /* loop 8 times (pixel block) */ | 1352 | "add #-1, r3 \n" /* loop 8 times (pixel block) */ |
1205 | "cmp/pl r3 \n" | 1353 | "cmp/pl r3 \n" |
1206 | "bt .wa_loop \n" | 1354 | "bt .wa_loop \n" |
1207 | : /* outputs */ | 1355 | : /* outputs */ |
1208 | [src] "+r"(_src), | 1356 | [src] "+r"(_src), |
1209 | [rnd] "+r"(_gray_random_buffer), | 1357 | [rnd] "+r"(_gray_random_buffer), |
@@ -1220,143 +1368,369 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1220 | ); | 1368 | ); |
1221 | 1369 | ||
1222 | addr = address; | 1370 | addr = address; |
1223 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | ||
1224 | _mask = mask; | 1371 | _mask = mask; |
1372 | depth = _gray_info.depth; | ||
1225 | 1373 | ||
1226 | /* set the bits for all 8 pixels in all bytes according to the | 1374 | /* set the bits for all 8 pixels in all bytes according to the |
1227 | * precalculated patterns on the pattern stack */ | 1375 | * precalculated patterns on the pattern stack */ |
1228 | asm volatile ( | 1376 | asm volatile |
1229 | "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ | 1377 | ( |
1230 | "mov.l @%[patp]+,r2 \n" | 1378 | "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */ |
1231 | "mov.l @%[patp]+,r3 \n" | 1379 | "mov.l @%[patp]+, r7 \n" |
1232 | "mov.l @%[patp]+,r6 \n" | 1380 | "mov.l @%[patp]+, r6 \n" |
1233 | "mov.l @%[patp]+,r7 \n" | 1381 | "mov.l @%[patp]+, r5 \n" |
1234 | "mov.l @%[patp]+,r8 \n" | 1382 | "mov.l @%[patp]+, r4 \n" |
1235 | "mov.l @%[patp]+,r9 \n" | 1383 | "mov.l @%[patp]+, r3 \n" |
1236 | "mov.l @%[patp],r10 \n" | 1384 | "mov.l @%[patp]+, r2 \n" |
1237 | 1385 | "mov.l @%[patp], r1 \n" | |
1238 | "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ | 1386 | |
1239 | "extu.b %[mask],%[mask] \n" /* mask out high bits */ | 1387 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ |
1240 | "tst %[mask],%[mask] \n" | 1388 | |
1241 | "bt .wa_sloop \n" /* short loop if nothing to keep */ | 1389 | "mov.l .wa_mask4, %[rx] \n" /* bitmask = ...11110000 */ |
1242 | 1390 | "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/ | |
1243 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1391 | "shll2 r0 \n" |
1244 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1392 | "shll2 r0 \n" |
1245 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1393 | "xor r1, r0 \n" |
1246 | "shlr r2 \n" | 1394 | "and %[rx], r0 \n" |
1247 | "rotcl r0 \n" | 1395 | "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ |
1248 | "shlr r3 \n" | 1396 | "shlr2 r0 \n" |
1249 | "rotcl r0 \n" | 1397 | "shlr2 r0 \n" |
1250 | "shlr r6 \n" | 1398 | "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ |
1251 | "rotcl r0 \n" | 1399 | "mov r6, r0 \n" |
1252 | "shlr r7 \n" | 1400 | "shll2 r0 \n" |
1253 | "rotcl r0 \n" | 1401 | "shll2 r0 \n" |
1254 | "shlr r8 \n" | 1402 | "xor r2, r0 \n" |
1255 | "rotcl r0 \n" | 1403 | "and %[rx], r0 \n" |
1256 | "shlr r9 \n" | 1404 | "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ |
1257 | "rotcl r0 \n" | 1405 | "shlr2 r0 \n" |
1258 | "shlr r10 \n" | 1406 | "shlr2 r0 \n" |
1259 | "mov.b @%[addr],%[rx] \n" /* read old value */ | 1407 | "xor r0, r6 \n" /* r6 = ...f7f6f5f4b7b6b5b4 */
1260 | "rotcl r0 \n" | 1408 | "mov r7, r0 \n" |
1261 | "and %[mask],%[rx] \n" /* mask out replaced bits */ | 1409 | "shll2 r0 \n" |
1262 | "or %[rx],r0 \n" /* set new bits */ | 1410 | "shll2 r0 \n" |
1263 | "mov.b r0,@%[addr] \n" /* store value to bitplane */ | 1411 | "xor r3, r0 \n" |
1264 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1412 | "and %[rx], r0 \n" |
1265 | "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ | 1413 | "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ |
1266 | "bt .wa_floop \n" | 1414 | "shlr2 r0 \n" |
1267 | 1415 | "shlr2 r0 \n" | |
1268 | "bra .wa_end \n" | 1416 | "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ |
1269 | "nop \n" | 1417 | "mov r8, r0 \n" |
1418 | "shll2 r0 \n" | ||
1419 | "shll2 r0 \n" | ||
1420 | "xor r4, r0 \n" | ||
1421 | "and %[rx], r0 \n" | ||
1422 | "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ | ||
1423 | "shlr2 r0 \n" | ||
1424 | "shlr2 r0 \n" | ||
1425 | "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ | ||
1426 | |||
1427 | "mov.l .wa_mask2, %[rx] \n" /* bitmask = ...11001100 */ | ||
1428 | "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/ | ||
1429 | "shll2 r0 \n" | ||
1430 | "xor r1, r0 \n" | ||
1431 | "and %[rx], r0 \n" | ||
1432 | "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ | ||
1433 | "shlr2 r0 \n" | ||
1434 | "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ | ||
1435 | "mov r4, r0 \n" | ||
1436 | "shll2 r0 \n" | ||
1437 | "xor r2, r0 \n" | ||
1438 | "and %[rx], r0 \n" | ||
1439 | "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ | ||
1440 | "shlr2 r0 \n" | ||
1441 | "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ | ||
1442 | "mov r7, r0 \n" | ||
1443 | "shll2 r0 \n" | ||
1444 | "xor r5, r0 \n" | ||
1445 | "and %[rx], r0 \n" | ||
1446 | "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ | ||
1447 | "shlr2 r0 \n" | ||
1448 | "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ | ||
1449 | "mov r8, r0 \n" | ||
1450 | "shll2 r0 \n" | ||
1451 | "xor r6, r0 \n" | ||
1452 | "and %[rx], r0 \n" | ||
1453 | "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ | ||
1454 | "shlr2 r0 \n" | ||
1455 | "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ | ||
1456 | |||
1457 | "mov.l .wa_mask1, %[rx] \n" /* bitmask = ...10101010 */ | ||
1458 | "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/ | ||
1459 | "shll r0 \n" | ||
1460 | "xor r1, r0 \n" | ||
1461 | "and %[rx], r0 \n" | ||
1462 | "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ | ||
1463 | "shlr r0 \n" | ||
1464 | "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ | ||
1465 | "mov r4, r0 \n" | ||
1466 | "shll r0 \n" | ||
1467 | "xor r3, r0 \n" | ||
1468 | "and %[rx], r0 \n" | ||
1469 | "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ | ||
1470 | "shlr r0 \n" | ||
1471 | "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ | ||
1472 | "mov r6, r0 \n" | ||
1473 | "shll r0 \n" | ||
1474 | "xor r5, r0 \n" | ||
1475 | "and %[rx], r0 \n" | ||
1476 | "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ | ||
1477 | "shlr r0 \n" | ||
1478 | "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ | ||
1479 | "mov r8, r0 \n" | ||
1480 | "shll r0 \n" | ||
1481 | "xor r7, r0 \n" | ||
1482 | "and %[rx], r0 \n" | ||
1483 | "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ | ||
1484 | "shlr r0 \n" | ||
1485 | "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ | ||
1486 | |||
1487 | "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
1488 | "extu.b %[mask], %[mask] \n" /* mask out high bits */ | ||
1489 | "tst %[mask], %[mask] \n" | ||
1490 | "bt .wa_sloop \n" /* short loop if nothing to keep */ | ||
1491 | |||
1492 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | ||
1493 | "mov #8, r0 \n" | ||
1494 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ | ||
1495 | "bt .wa_f8 \n" | ||
1496 | |||
1497 | "mulu %[psiz], %[dpth] \n" | ||
1498 | "mova .wa_ftable, r0 \n" | ||
1499 | "mov.b @(r0, %[dpth]), %[rx] \n" | ||
1500 | "add %[rx], r0 \n" | ||
1501 | "sts macl, %[rx] \n" /* point behind the last plane.. */ | ||
1502 | "jmp @r0 \n" /* jump into streak */ | ||
1503 | "add %[rx], %[addr] \n" /* ..for this round */ | ||
1504 | |||
1505 | ".align 2 \n" | ||
1506 | ".wa_ftable: \n" | ||
1507 | ".byte .wa_f0 - .wa_ftable \n" | ||
1508 | ".byte .wa_f1 - .wa_ftable \n" | ||
1509 | ".byte .wa_f2 - .wa_ftable \n" | ||
1510 | ".byte .wa_f3 - .wa_ftable \n" | ||
1511 | ".byte .wa_f4 - .wa_ftable \n" | ||
1512 | ".byte .wa_f5 - .wa_ftable \n" | ||
1513 | ".byte .wa_f6 - .wa_ftable \n" | ||
1514 | ".byte .wa_f7 - .wa_ftable \n" | ||
1515 | |||
1516 | ".wa_f8: \n" | ||
1517 | "mov %[psiz], %[rx] \n" | ||
1518 | "shll2 %[rx] \n" | ||
1519 | "add %[rx], %[rx] \n" | ||
1520 | "add %[rx], %[addr] \n" | ||
1521 | /* Point behind the last plane for this round. Note: We're using the | ||
1522 | * registers backwards in order to reuse the streak for the last round. | ||
1523 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
1524 | * the bit order would be destroyed which results in more flicker. */ | ||
1525 | "sub %[psiz], %[addr] \n" | ||
1526 | "mov.b @%[addr], r0 \n" /* load old byte */ | ||
1527 | "and %[mask], r0 \n" /* mask out replaced bits */ | ||
1528 | "or r8, r0 \n" /* set new bits */ | ||
1529 | "mov.b r0, @%[addr] \n" /* store byte */ | ||
1530 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1531 | ".wa_f7: \n" | ||
1532 | "sub %[psiz], %[addr] \n" | ||
1533 | "mov.b @%[addr], r0 \n" | ||
1534 | "and %[mask], r0 \n" | ||
1535 | "or r7, r0 \n" | ||
1536 | "mov.b r0, @%[addr] \n" | ||
1537 | "shlr8 r7 \n" | ||
1538 | ".wa_f6: \n" | ||
1539 | "sub %[psiz], %[addr] \n" | ||
1540 | "mov.b @%[addr], r0 \n" | ||
1541 | "and %[mask], r0 \n" | ||
1542 | "or r6, r0 \n" | ||
1543 | "mov.b r0, @%[addr] \n" | ||
1544 | "shlr8 r6 \n" | ||
1545 | ".wa_f5: \n" | ||
1546 | "sub %[psiz], %[addr] \n" | ||
1547 | "mov.b @%[addr], r0 \n" | ||
1548 | "and %[mask], r0 \n" | ||
1549 | "or r5, r0 \n" | ||
1550 | "mov.b r0, @%[addr] \n" | ||
1551 | "shlr8 r5 \n" | ||
1552 | ".wa_f4: \n" | ||
1553 | "sub %[psiz], %[addr] \n" | ||
1554 | "mov.b @%[addr], r0 \n" | ||
1555 | "and %[mask], r0 \n" | ||
1556 | "or r4, r0 \n" | ||
1557 | "mov.b r0, @%[addr] \n" | ||
1558 | "shlr8 r4 \n" | ||
1559 | ".wa_f3: \n" | ||
1560 | "sub %[psiz], %[addr] \n" | ||
1561 | "mov.b @%[addr], r0 \n" | ||
1562 | "and %[mask], r0 \n" | ||
1563 | "or r3, r0 \n" | ||
1564 | "mov.b r0, @%[addr] \n" | ||
1565 | "shlr8 r3 \n" | ||
1566 | ".wa_f2: \n" | ||
1567 | "sub %[psiz], %[addr] \n" | ||
1568 | "mov.b @%[addr], r0 \n" | ||
1569 | "and %[mask], r0 \n" | ||
1570 | "or r2, r0 \n" | ||
1571 | "mov.b r0, @%[addr] \n" | ||
1572 | "shlr8 r2 \n" | ||
1573 | ".wa_f1: \n" | ||
1574 | "sub %[psiz], %[addr] \n" | ||
1575 | "mov.b @%[addr], r0 \n" | ||
1576 | "and %[mask], r0 \n" | ||
1577 | "or r1, r0 \n" | ||
1578 | "mov.b r0, @%[addr] \n" | ||
1579 | "shlr8 r1 \n" | ||
1580 | ".wa_f0: \n" | ||
1581 | |||
1582 | "add %[rx], %[addr] \n" /* correct address */ | ||
1583 | "add #-8, %[dpth] \n" | ||
1584 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1585 | "bt .wa_floop \n" | ||
1586 | |||
1587 | "bra .wa_end \n" | ||
1588 | "nop \n" | ||
1270 | 1589 | ||
1271 | /* References to C library routines used in the precalc block */ | 1590 | /* References to C library routines used in the precalc block */ |
1272 | ".align 2 \n" | 1591 | ".align 2 \n" |
1273 | ".ashlsi3: \n" /* C library routine: */ | 1592 | ".ashlsi3: \n" /* C library routine: */ |
1274 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ | 1593 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ |
1275 | ".lshrsi3: \n" /* C library routine: */ | 1594 | ".lshrsi3: \n" /* C library routine: */ |
1276 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ | 1595 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ |
1277 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | 1596 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ |
1278 | 1597 | ||
1279 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1598 | /* Bitmasks for the bit block rotation */ |
1280 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1599 | ".wa_mask4: \n" |
1281 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1600 | ".long 0xF0F0F0F0 \n" |
1282 | "shlr r2 \n" | 1601 | ".wa_mask2: \n" |
1283 | "rotcl r0 \n" | 1602 | ".long 0xCCCCCCCC \n" |
1284 | "shlr r3 \n" | 1603 | ".wa_mask1: \n" |
1285 | "rotcl r0 \n" | 1604 | ".long 0xAAAAAAAA \n" |
1286 | "shlr r6 \n" | 1605 | |
1287 | "rotcl r0 \n" | 1606 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ |
1288 | "shlr r7 \n" | 1607 | "mov #8, r0 \n" |
1289 | "rotcl r0 \n" | 1608 | "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ |
1290 | "shlr r8 \n" | 1609 | "bt .wa_s8 \n" |
1291 | "rotcl r0 \n" | 1610 | |
1292 | "shlr r9 \n" | 1611 | "mulu %[psiz], %[dpth] \n" |
1293 | "rotcl r0 \n" | 1612 | "mova .wa_stable, r0 \n" |
1294 | "shlr r10 \n" | 1613 | "mov.b @(r0, %[dpth]), %[rx] \n" |
1295 | "rotcl r0 \n" | 1614 | "add %[rx], r0 \n" |
1296 | "mov.b r0,@%[addr] \n" /* store byte to bitplane */ | 1615 | "sts macl, %[rx] \n" /* point behind the last plane.. */ |
1297 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1616 | "jmp @r0 \n" /* jump into streak */ |
1298 | "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ | 1617 | "add %[rx], %[addr] \n" /* ..for this round */ |
1299 | "bt .wa_sloop \n" | 1618 | |
1300 | 1619 | ".align 2 \n" | |
1301 | ".wa_end: \n" | 1620 | ".wa_stable: \n" |
1621 | ".byte .wa_s0 - .wa_stable \n" | ||
1622 | ".byte .wa_s1 - .wa_stable \n" | ||
1623 | ".byte .wa_s2 - .wa_stable \n" | ||
1624 | ".byte .wa_s3 - .wa_stable \n" | ||
1625 | ".byte .wa_s4 - .wa_stable \n" | ||
1626 | ".byte .wa_s5 - .wa_stable \n" | ||
1627 | ".byte .wa_s6 - .wa_stable \n" | ||
1628 | ".byte .wa_s7 - .wa_stable \n" | ||
1629 | |||
1630 | ".wa_s8: \n" | ||
1631 | "mov %[psiz], %[rx] \n" /* Point behind the last plane */ | ||
1632 | "shll2 %[rx] \n" /* for this round. */ | ||
1633 | "add %[rx], %[rx] \n" /* See above. */ | ||
1634 | "add %[rx], %[addr] \n" | ||
1635 | |||
1636 | "sub %[psiz], %[addr] \n" | ||
1637 | "mov.b r8, @%[addr] \n" /* store byte */ | ||
1638 | "shlr8 r8 \n" /* shift out used-up byte */ | ||
1639 | ".wa_s7: \n" | ||
1640 | "sub %[psiz], %[addr] \n" | ||
1641 | "mov.b r7, @%[addr] \n" | ||
1642 | "shlr8 r7 \n" | ||
1643 | ".wa_s6: \n" | ||
1644 | "sub %[psiz], %[addr] \n" | ||
1645 | "mov.b r6, @%[addr] \n" | ||
1646 | "shlr8 r6 \n" | ||
1647 | ".wa_s5: \n" | ||
1648 | "sub %[psiz], %[addr] \n" | ||
1649 | "mov.b r5, @%[addr] \n" | ||
1650 | "shlr8 r5 \n" | ||
1651 | ".wa_s4: \n" | ||
1652 | "sub %[psiz], %[addr] \n" | ||
1653 | "mov.b r4, @%[addr] \n" | ||
1654 | "shlr8 r4 \n" | ||
1655 | ".wa_s3: \n" | ||
1656 | "sub %[psiz], %[addr] \n" | ||
1657 | "mov.b r3, @%[addr] \n" | ||
1658 | "shlr8 r3 \n" | ||
1659 | ".wa_s2: \n" | ||
1660 | "sub %[psiz], %[addr] \n" | ||
1661 | "mov.b r2, @%[addr] \n" | ||
1662 | "shlr8 r2 \n" | ||
1663 | ".wa_s1: \n" | ||
1664 | "sub %[psiz], %[addr] \n" | ||
1665 | "mov.b r1, @%[addr] \n" | ||
1666 | "shlr8 r1 \n" | ||
1667 | ".wa_s0: \n" | ||
1668 | |||
1669 | "add %[rx], %[addr] \n" /* correct address */ | ||
1670 | "add #-8, %[dpth] \n" | ||
1671 | "cmp/pl %[dpth] \n" /* next round if anything left */ | ||
1672 | "bt .wa_sloop \n" | ||
1673 | |||
1674 | ".wa_end: \n" | ||
1302 | : /* outputs */ | 1675 | : /* outputs */ |
1303 | [addr]"+r"(addr), | 1676 | [addr]"+r"(addr), |
1304 | [mask]"+r"(_mask), | 1677 | [mask]"+r"(_mask), |
1678 | [dpth]"+r"(depth), | ||
1305 | [rx] "=&r"(trash) | 1679 | [rx] "=&r"(trash) |
1306 | : /* inputs */ | 1680 | : /* inputs */ |
1307 | [psiz]"r"(_gray_info.plane_size), | 1681 | [psiz]"r"(_gray_info.plane_size), |
1308 | [end] "r"(end), | ||
1309 | [patp]"[rx]"(pat_ptr) | 1682 | [patp]"[rx]"(pat_ptr) |
1310 | : /* clobbers */ | 1683 | : /* clobbers */ |
1311 | "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" | 1684 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl" |
1312 | ); | 1685 | ); |
1313 | #elif defined(CPU_COLDFIRE) | 1686 | #elif defined(CPU_COLDFIRE) |
1314 | const unsigned char *_src; | 1687 | const unsigned char *_src; |
1315 | unsigned _mask, trash; | 1688 | unsigned _mask, depth, trash; |
1316 | 1689 | ||
1317 | _mask = mask; | 1690 | _mask = mask; |
1318 | _src = src; | 1691 | _src = src; |
1319 | 1692 | ||
1320 | /* precalculate the bit patterns with random shifts | 1693 | /* precalculate the bit patterns with random shifts |
1321 | for all 8 pixels and put them on an extra "stack" */ | 1694 | for all 8 pixels and put them on an extra "stack" */ |
1322 | asm volatile ( | 1695 | asm volatile |
1323 | "moveq.l #8,%%d3 \n" /* loop count */ | 1696 | ( |
1324 | 1697 | "moveq.l #8, %%d3 \n" /* loop count */ | |
1325 | ".wa_loop: \n" /** load pattern for pixel **/ | 1698 | |
1326 | "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ | 1699 | ".wa_loop: \n" /** load pattern for pixel **/ |
1327 | "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ | 1700 | "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ |
1328 | "bcc.b .wa_skip \n" /* skip this pixel */ | 1701 | "lsr.l #1, %[mask] \n" /* shift out lsb of mask */ |
1329 | 1702 | "bcc.b .wa_skip \n" /* skip this pixel */ | |
1330 | "clr.l %%d0 \n" | 1703 | |
1331 | "move.b (%[src]),%%d0 \n" /* load src byte */ | 1704 | "clr.l %%d0 \n" |
1332 | "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ | 1705 | "move.b (%[src]), %%d0 \n" /* load src byte */ |
1333 | "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ | 1706 | "move.b (%%d0:l:1, %[trns]), %%d0 \n" /* idxtable into pattern index */ |
1334 | 1707 | "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */ | |
1335 | "mulu.w #75,%[rnd] \n" /* multiply by 75 */ | 1708 | |
1336 | "add.l #74,%[rnd] \n" /* add another 74 */ | 1709 | "mulu.w #75, %[rnd] \n" /* multiply by 75 */ |
1710 | "add.l #74, %[rnd] \n" /* add another 74 */ | ||
1337 | /* Since the lower bits are not very random: */ | 1711 | /* Since the lower bits are not very random: */ |
1338 | "move.l %[rnd],%%d1 \n" | 1712 | "move.l %[rnd], %%d1 \n" |
1339 | "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ | 1713 | "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */ |
1340 | "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ | 1714 | "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */ |
1341 | 1715 | ||
1342 | "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ | 1716 | "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */ |
1343 | "blo.b .wa_ntrim \n" | 1717 | "blo.b .wa_ntrim \n" |
1344 | "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ | 1718 | "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */ |
1345 | ".wa_ntrim: \n" | 1719 | ".wa_ntrim: \n" |
1346 | 1720 | ||
1347 | "move.l %%d2,%%d0 \n" /** rotate pattern **/ | 1721 | "move.l %%d2, %%d0 \n" /** rotate pattern **/ |
1348 | "lsl.l %%d1,%%d0 \n" | 1722 | "lsl.l %%d1, %%d0 \n" |
1349 | "sub.l %[dpth],%%d1 \n" | 1723 | "sub.l %[dpth], %%d1 \n" |
1350 | "neg.l %%d1 \n" /* d1 = depth - d1 */ | 1724 | "neg.l %%d1 \n" /* d1 = depth - d1 */ |
1351 | "lsr.l %%d1,%%d2 \n" | 1725 | "lsr.l %%d1, %%d2 \n" |
1352 | "or.l %%d0,%%d2 \n" | 1726 | "or.l %%d0, %%d2 \n" |
1353 | 1727 | ||
1354 | ".wa_skip: \n" | 1728 | ".wa_skip: \n" |
1355 | "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ | 1729 | "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */ |
1356 | 1730 | ||
1357 | "add.l %[stri],%[src] \n" /* src += stride; */ | 1731 | "add.l %[stri], %[src] \n" /* src += stride; */ |
1358 | "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ | 1732 | "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */ |
1359 | "bne.b .wa_loop \n" | 1733 | "bne.b .wa_loop \n" |
1360 | : /* outputs */ | 1734 | : /* outputs */ |
1361 | [src] "+a"(_src), | 1735 | [src] "+a"(_src), |
1362 | [patp]"+a"(pat_ptr), | 1736 | [patp]"+a"(pat_ptr), |
@@ -1373,97 +1747,297 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1373 | ); | 1747 | ); |
1374 | 1748 | ||
1375 | addr = address; | 1749 | addr = address; |
1376 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | 1750 | _mask = ~mask & 0xff; |
1377 | _mask = mask; | 1751 | depth = _gray_info.depth; |
1378 | 1752 | ||
1379 | /* set the bits for all 8 pixels in all bytes according to the | 1753 | /* set the bits for all 8 pixels in all bytes according to the |
1380 | * precalculated patterns on the pattern stack */ | 1754 | * precalculated patterns on the pattern stack */ |
1381 | asm volatile ( | 1755 | asm volatile |
1382 | "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" | 1756 | ( |
1383 | /* pop all 8 patterns */ | 1757 | "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */ |
1384 | "not.l %[mask] \n" /* "set" mask -> "keep" mask */ | 1758 | /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */ |
1385 | "and.l #0xFF,%[mask] \n" | 1759 | |
1386 | "beq.b .wa_sstart \n" /* short loop if nothing to keep */ | 1760 | /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ |
1387 | 1761 | ||
1388 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1762 | "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/ |
1389 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1763 | "lsl.l #4, %%d0 \n" |
1390 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1764 | /* move.l %[ax], %%d5 */ /* already in d5 */ |
1391 | "lsr.l #1,%%d3 \n" | 1765 | "eor.l %%d5, %%d0 \n" |
1392 | "addx.l %%d0,%%d0 \n" | 1766 | "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */ |
1393 | "lsr.l #1,%%d4 \n" | 1767 | "eor.l %%d0, %%d5 \n" |
1394 | "addx.l %%d0,%%d0 \n" | 1768 | "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */ |
1395 | "lsr.l #1,%%d5 \n" | 1769 | "lsr.l #4, %%d0 \n" |
1396 | "addx.l %%d0,%%d0 \n" | 1770 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */ |
1397 | "lsr.l #1,%%d6 \n" | 1771 | "move.l %%d2, %%d0 \n" |
1398 | "addx.l %%d0,%%d0 \n" | 1772 | "lsl.l #4, %%d0 \n" |
1399 | "move.l %%a0,%%d1 \n" | 1773 | "eor.l %%d6, %%d0 \n" |
1400 | "lsr.l #1,%%d1 \n" | 1774 | "and.l #0xF0F0F0F0, %%d0 \n" |
1401 | "addx.l %%d0,%%d0 \n" | 1775 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */ |
1402 | "move.l %%d1,%%a0 \n" | 1776 | "lsr.l #4, %%d0 \n" |
1403 | "move.l %%a1,%%d1 \n" | 1777 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */ |
1404 | "lsr.l #1,%%d1 \n" | 1778 | "move.l %%d3, %%d0 \n" |
1405 | "addx.l %%d0,%%d0 \n" | 1779 | "lsl.l #4, %%d0 \n" |
1406 | "move.l %%d1,%%a1 \n" | 1780 | "eor.l %%d7, %%d0 \n" |
1407 | "move.l %[ax],%%d1 \n" | 1781 | "and.l #0xF0F0F0F0, %%d0 \n" |
1408 | "lsr.l #1,%%d1 \n" | 1782 | "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */ |
1409 | "addx.l %%d0,%%d0 \n" | 1783 | "lsr.l #4, %%d0 \n" |
1410 | "move.l %%d1,%[ax] \n" | 1784 | "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */ |
1411 | 1785 | "move.l %%d4, %%d0 \n" | |
1412 | "move.b (%[addr]),%%d1 \n" /* read old value */ | 1786 | "lsl.l #4, %%d0 \n" |
1413 | "and.l %[mask],%%d1 \n" /* mask out replaced bits */ | 1787 | "move.l %%a0, %%d5 \n" |
1414 | "or.l %%d0,%%d1 \n" /* set new bits */ | 1788 | "eor.l %%d5, %%d0 \n" |
1415 | "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ | 1789 | "and.l #0xF0F0F0F0, %%d0 \n" |
1416 | 1790 | "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */ | |
1417 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1791 | /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */ |
1418 | "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ | 1792 | "lsr.l #4, %%d0 \n" |
1419 | "bhi.b .wa_floop \n" | 1793 | "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */ |
1420 | 1794 | ||
1421 | "bra.b .wa_end \n" | 1795 | "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/ |
1422 | 1796 | "lsl.l #2, %%d0 \n" | |
1423 | ".wa_sstart: \n" | 1797 | /* move.l %%a0, %%d5 */ /* still in d5 */ |
1424 | "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ | 1798 | "eor.l %%d5, %%d0 \n" |
1425 | 1799 | "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */ | |
1426 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1800 | "eor.l %%d0, %%d5 \n" |
1427 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ | 1801 | "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */ |
1428 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ | 1802 | "lsr.l #2, %%d0 \n" |
1429 | "lsr.l #1,%%d3 \n" | 1803 | "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */ |
1430 | "addx.l %%d0,%%d0 \n" | 1804 | "move.l %[ax], %%d5 \n" |
1431 | "lsr.l #1,%%d4 \n" | 1805 | "move.l %%d5, %%d0 \n" |
1432 | "addx.l %%d0,%%d0 \n" | 1806 | "lsl.l #2, %%d0 \n" |
1433 | "lsr.l #1,%%d5 \n" | 1807 | "eor.l %%d7, %%d0 \n" |
1434 | "addx.l %%d0,%%d0 \n" | 1808 | "and.l #0xCCCCCCCC, %%d0 \n" |
1435 | "lsr.l #1,%%d6 \n" | 1809 | "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ |
1436 | "addx.l %%d0,%%d0 \n" | 1810 | "lsr.l #2, %%d0 \n" |
1437 | "lsr.l #1,%[mask] \n" | 1811 | "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */ |
1438 | "addx.l %%d0,%%d0 \n" | 1812 | /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */ |
1439 | "move.l %%a1,%%d1 \n" | 1813 | "move.l %%d2, %%d0 \n" |
1440 | "lsr.l #1,%%d1 \n" | 1814 | "lsl.l #2, %%d0 \n" |
1441 | "addx.l %%d0,%%d0 \n" | 1815 | "eor.l %%d4, %%d0 \n" |
1442 | "move.l %%d1,%%a1 \n" | 1816 | "and.l #0xCCCCCCCC, %%d0 \n" |
1443 | "move.l %[ax],%%d1 \n" | 1817 | "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */ |
1444 | "lsr.l #1,%%d1 \n" | 1818 | "lsr.l #2, %%d0 \n" |
1445 | "addx.l %%d0,%%d0 \n" | 1819 | "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */ |
1446 | "move.l %%d1,%[ax] \n" | 1820 | "move.l %%d1, %%d0 \n" |
1447 | 1821 | "lsl.l #2, %%d0 \n" | |
1448 | "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ | 1822 | "eor.l %%d3, %%d0 \n" |
1449 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1823 | "and.l #0xCCCCCCCC, %%d0 \n" |
1450 | "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ | 1824 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */ |
1451 | "bhi.b .wa_sloop \n" | 1825 | "lsr.l #2, %%d0 \n" |
1452 | 1826 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */ | |
1453 | ".wa_end: \n" | 1827 | |
1828 | "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/ | ||
1829 | "lsl.l #1, %%d0 \n" | ||
1830 | "eor.l %%d2, %%d0 \n" | ||
1831 | "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */ | ||
1832 | "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */ | ||
1833 | "lsr.l #1, %%d0 \n" | ||
1834 | "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */ | ||
1835 | "move.l %%d3, %%d0 \n" | ||
1836 | "lsl.l #1, %%d0 \n" | ||
1837 | "eor.l %%d4, %%d0 \n" | ||
1838 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1839 | "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */ | ||
1840 | "lsr.l #1, %%d0 \n" | ||
1841 | "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */ | ||
1842 | /* move.l %[ax], %%d5 */ /* still in d5 */ | ||
1843 | "move.l %%d5, %%d0 \n" | ||
1844 | "lsl.l #1, %%d0 \n" | ||
1845 | "eor.l %%d6, %%d0 \n" | ||
1846 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1847 | "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */ | ||
1848 | "lsr.l #1, %%d0 \n" | ||
1849 | "eor.l %%d0, %%d5 \n" | ||
1850 | "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */ | ||
1851 | "move.l %%d7, %%d0 \n" | ||
1852 | "lsl.l #1, %%d0 \n" | ||
1853 | "move.l %%a0, %%d5 \n" | ||
1854 | "eor.l %%d5, %%d0 \n" | ||
1855 | "and.l #0xAAAAAAAA, %%d0 \n" | ||
1856 | "eor.l %%d0, %%d5 \n" | ||
1857 | "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ | ||
1858 | "lsr.l #1, %%d0 \n" | ||
1859 | "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ | ||
1860 | |||
1861 | "tst.l %[mask] \n" | ||
1862 | "jeq .wa_sloop \n" /* short loop if nothing to keep */ | ||
1863 | |||
1864 | "move.l %[mask], %%d5 \n" /* need mask in data reg. */ | ||
1865 | "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ | ||
1866 | |||
1867 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | ||
1868 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1869 | "bhs.s .wa_f8 \n" | ||
1870 | |||
1871 | "move.l %[psiz], %%d0 \n" | ||
1872 | "move.l %[dpth], %%d1 \n" | ||
1873 | "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ | ||
1874 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1875 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1876 | "bra.s .wa_f1 \n" /* dpth == 0 should never happen */ | ||
1877 | "bra.s .wa_f2 \n" | ||
1878 | "bra.s .wa_f3 \n" | ||
1879 | "bra.s .wa_f4 \n" | ||
1880 | "bra.s .wa_f5 \n" | ||
1881 | "bra.s .wa_f6 \n" | ||
1882 | "bra.s .wa_f7 \n" | ||
1883 | |||
1884 | ".wa_f8: \n" | ||
1885 | "move.l %[psiz], %%d0 \n" | ||
1886 | "lsl.l #3, %%d0 \n" | ||
1887 | "add.l %%d0, %[addr] \n" | ||
1888 | /* Point behind the last plane for this round. Note: We're using the | ||
1889 | * registers backwards in order to reuse the streak for the last round. | ||
1890 | * Therefore we need to go thru the bitplanes backwards too, otherwise | ||
1891 | * the bit order would be destroyed which results in more flicker. */ | ||
1892 | "sub.l %[psiz], %[addr] \n" | ||
1893 | "move.b (%[addr]), %%d0 \n" /* load old byte */ | ||
1894 | "and.l %%d5, %%d0 \n" /* mask out replaced bits */ | ||
1895 | "move.l %[mask], %%d1 \n" | ||
1896 | "or.l %%d1, %%d0 \n" /* set new bits */ | ||
1897 | "move.b %%d0, (%[addr]) \n" /* store byte */ | ||
1898 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1899 | "move.l %%d1, %[mask] \n" | ||
1900 | ".wa_f7: \n" | ||
1901 | "sub.l %[psiz], %[addr] \n" | ||
1902 | "move.b (%[addr]), %%d0 \n" | ||
1903 | "and.l %%d5, %%d0 \n" | ||
1904 | "or.l %%d2, %%d0 \n" | ||
1905 | "move.b %%d0, (%[addr]) \n" | ||
1906 | "lsr.l #8, %%d2 \n" | ||
1907 | ".wa_f6: \n" | ||
1908 | "sub.l %[psiz], %[addr] \n" | ||
1909 | "move.b (%[addr]), %%d0 \n" | ||
1910 | "and.l %%d5, %%d0 \n" | ||
1911 | "or.l %%d3, %%d0 \n" | ||
1912 | "move.b %%d0, (%[addr]) \n" | ||
1913 | "lsr.l #8, %%d3 \n" | ||
1914 | ".wa_f5: \n" | ||
1915 | "sub.l %[psiz], %[addr] \n" | ||
1916 | "move.b (%[addr]), %%d0 \n" | ||
1917 | "and.l %%d5, %%d0 \n" | ||
1918 | "or.l %%d4, %%d0 \n" | ||
1919 | "move.b %%d0, (%[addr]) \n" | ||
1920 | "lsr.l #8, %%d4 \n" | ||
1921 | ".wa_f4: \n" | ||
1922 | "sub.l %[psiz], %[addr] \n" | ||
1923 | "move.b (%[addr]), %%d0 \n" | ||
1924 | "and.l %%d5, %%d0 \n" | ||
1925 | "move.l %[ax], %%d1 \n" | ||
1926 | "or.l %%d1, %%d0 \n" | ||
1927 | "move.b %%d0, (%[addr]) \n" | ||
1928 | "lsr.l #8, %%d1 \n" | ||
1929 | "move.l %%d1, %[ax] \n" | ||
1930 | ".wa_f3: \n" | ||
1931 | "sub.l %[psiz], %[addr] \n" | ||
1932 | "move.b (%[addr]), %%d0 \n" | ||
1933 | "and.l %%d5, %%d0 \n" | ||
1934 | "or.l %%d6, %%d0 \n" | ||
1935 | "move.b %%d0, (%[addr]) \n" | ||
1936 | "lsr.l #8, %%d6 \n" | ||
1937 | ".wa_f2: \n" | ||
1938 | "sub.l %[psiz], %[addr] \n" | ||
1939 | "move.b (%[addr]), %%d0 \n" | ||
1940 | "and.l %%d5, %%d0 \n" | ||
1941 | "or.l %%d7, %%d0 \n" | ||
1942 | "move.b %%d0, (%[addr]) \n" | ||
1943 | "lsr.l #8, %%d7 \n" | ||
1944 | ".wa_f1: \n" | ||
1945 | "sub.l %[psiz], %[addr] \n" | ||
1946 | "move.b (%[addr]), %%d0 \n" | ||
1947 | "and.l %%d5, %%d0 \n" | ||
1948 | "move.l %%a0, %%d1 \n" | ||
1949 | "or.l %%d1, %%d0 \n" | ||
1950 | "move.b %%d0, (%[addr]) \n" | ||
1951 | "lsr.l #8, %%d1 \n" | ||
1952 | "move.l %%d1, %%a0 \n" | ||
1953 | |||
1954 | "move.l %[psiz], %%d0 \n" | ||
1955 | "lsl.l #3, %%d0 \n" | ||
1956 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
1957 | "subq.l #8, %[dpth] \n" | ||
1958 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
1959 | "jgt .wa_floop \n" /* next round if anything left */ | ||
1960 | |||
1961 | "jra .wa_end \n" | ||
1962 | |||
1963 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | ||
1964 | "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ | ||
1965 | "bhs.s .wa_s8 \n" | ||
1966 | |||
1967 | "move.l %[psiz], %%d0 \n" | ||
1968 | "move.l %[dpth], %%d5 \n" | ||
1969 | "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ | ||
1970 | "add.l %%d0, %[addr] \n" /* for this round */ | ||
1971 | "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ | ||
1972 | "bra.s .wa_s1 \n" /* dpth == 0 should never happen */ | ||
1973 | "bra.s .wa_s2 \n" | ||
1974 | "bra.s .wa_s3 \n" | ||
1975 | "bra.s .wa_s4 \n" | ||
1976 | "bra.s .wa_s5 \n" | ||
1977 | "bra.s .wa_s6 \n" | ||
1978 | "bra.s .wa_s7 \n" | ||
1979 | |||
1980 | ".wa_s8: \n" | ||
1981 | "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ | ||
1982 | "lsl.l #3, %%d0 \n" /* for this round. */ | ||
1983 | "add.l %%d0, %[addr] \n" /* See above. */ | ||
1984 | |||
1985 | "sub.l %[psiz], %[addr] \n" | ||
1986 | "move.b %%d1, (%[addr]) \n" /* store byte */ | ||
1987 | "lsr.l #8, %%d1 \n" /* shift out used-up byte */ | ||
1988 | ".wa_s7: \n" | ||
1989 | "sub.l %[psiz], %[addr] \n" | ||
1990 | "move.b %%d2, (%[addr]) \n" | ||
1991 | "lsr.l #8, %%d2 \n" | ||
1992 | ".wa_s6: \n" | ||
1993 | "sub.l %[psiz], %[addr] \n" | ||
1994 | "move.b %%d3, (%[addr]) \n" | ||
1995 | "lsr.l #8, %%d3 \n" | ||
1996 | ".wa_s5: \n" | ||
1997 | "sub.l %[psiz], %[addr] \n" | ||
1998 | "move.b %%d4, (%[addr]) \n" | ||
1999 | "lsr.l #8, %%d4 \n" | ||
2000 | ".wa_s4: \n" | ||
2001 | "sub.l %[psiz], %[addr] \n" | ||
2002 | "move.l %[ax], %%d5 \n" | ||
2003 | "move.b %%d5, (%[addr]) \n" | ||
2004 | "lsr.l #8, %%d5 \n" | ||
2005 | "move.l %%d5, %[ax] \n" | ||
2006 | ".wa_s3: \n" | ||
2007 | "sub.l %[psiz], %[addr] \n" | ||
2008 | "move.b %%d6, (%[addr]) \n" | ||
2009 | "lsr.l #8, %%d6 \n" | ||
2010 | ".wa_s2: \n" | ||
2011 | "sub.l %[psiz], %[addr] \n" | ||
2012 | "move.b %%d7, (%[addr]) \n" | ||
2013 | "lsr.l #8, %%d7 \n" | ||
2014 | ".wa_s1: \n" | ||
2015 | "sub.l %[psiz], %[addr] \n" | ||
2016 | "move.l %%a0, %%d5 \n" | ||
2017 | "move.b %%d5, (%[addr]) \n" | ||
2018 | "lsr.l #8, %%d5 \n" | ||
2019 | "move.l %%d5, %%a0 \n" | ||
2020 | |||
2021 | "add.l %%d0, %[addr] \n" /* correct address */ | ||
2022 | "subq.l #8, %[dpth] \n" | ||
2023 | "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ | ||
2024 | "jgt .wa_sloop \n" /* next round if anything left */ | ||
2025 | |||
2026 | ".wa_end: \n" | ||
1454 | : /* outputs */ | 2027 | : /* outputs */ |
1455 | [addr]"+a"(addr), | 2028 | [addr]"+a"(addr), |
1456 | [mask]"+d"(_mask), | 2029 | [dpth]"+a"(depth), |
2030 | [mask]"+a"(_mask), | ||
1457 | [ax] "=&a"(trash) | 2031 | [ax] "=&a"(trash) |
1458 | : /* inputs */ | 2032 | : /* inputs */ |
1459 | [psiz]"a"(_gray_info.plane_size), | 2033 | [psiz]"a"(_gray_info.plane_size), |
1460 | [end] "a"(end), | ||
1461 | [patp]"[ax]"(pat_ptr) | 2034 | [patp]"[ax]"(pat_ptr) |
1462 | : /* clobbers */ | 2035 | : /* clobbers */ |
1463 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" | 2036 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0" |
1464 | ); | 2037 | ); |
1465 | #else /* C version, for reference*/ | 2038 | #else /* C version, for reference*/ |
1466 | #warning C version of _writearray() used | 2039 | #warning C version of _writearray() used |
2040 | unsigned char *end; | ||
1467 | unsigned test = 1; | 2041 | unsigned test = 1; |
1468 | int i; | 2042 | int i; |
1469 | 2043 | ||