diff options
author | Jens Arnold <amiconn@rockbox.org> | 2006-08-07 17:21:38 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2006-08-07 17:21:38 +0000 |
commit | c00d799fa3a568ecb8649b5ce6d40366707b9551 (patch) | |
tree | f3112971b136ec365a3ef24929bf41ab355d4026 /apps/plugins/lib/gray_draw.c | |
parent | 8921b34e4b81f427d19b5c9f263eb893040c2d43 (diff) | |
download | rockbox-c00d799fa3a568ecb8649b5ce6d40366707b9551.tar.gz rockbox-c00d799fa3a568ecb8649b5ce6d40366707b9551.zip |
* Assembler optimised gray_update_rect() and writearray() for arm (greyscale iPods).
* Some slight optimisations for coldfire (H1x0) and SH1 (archos).
* Comment and formatting cleanup.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10473 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins/lib/gray_draw.c')
-rw-r--r-- | apps/plugins/lib/gray_draw.c | 496 |
1 files changed, 313 insertions, 183 deletions
diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c index 396046d1e6..7df3e13c56 100644 --- a/apps/plugins/lib/gray_draw.c +++ b/apps/plugins/lib/gray_draw.c | |||
@@ -876,8 +876,140 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
876 | unsigned long pat_stack[8]; | 876 | unsigned long pat_stack[8]; |
877 | unsigned long *pat_ptr = &pat_stack[8]; | 877 | unsigned long *pat_ptr = &pat_stack[8]; |
878 | unsigned char *addr, *end; | 878 | unsigned char *addr, *end; |
879 | #if 0 /* CPU specific asm versions will go here */ | 879 | #ifdef CPU_ARM |
880 | const unsigned char *_src; | ||
881 | unsigned _mask, trash; | ||
882 | |||
883 | _mask = mask; | ||
884 | _src = src; | ||
885 | |||
886 | /* precalculate the bit patterns with random shifts | ||
887 | for all 8 pixels and put them on an extra "stack" */ | ||
888 | asm volatile ( | ||
889 | "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ | ||
890 | "mov r3, #8 \n" /* loop count */ | ||
891 | |||
892 | ".wa_loop: \n" /** load pattern for pixel **/ | ||
893 | "mov r2, #0 \n" /* pattern for skipped pixel must be 0 */ | ||
894 | "movs %[mask], %[mask], lsl #1 \n" /* shift out msb of mask */ | ||
895 | "bcc .wa_skip \n" /* skip this pixel */ | ||
896 | |||
897 | "ldrb r0, [%[src]] \n" /* load src byte */ | ||
898 | "ldrb r0, [%[trns], r0] \n" /* idxtable into pattern index */ | ||
899 | "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ | ||
900 | |||
901 | "add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */ | ||
902 | "add %[rnd], %[rnd], %[rnd], lsl #1 \n" | ||
903 | "add %[rnd], %[rnd], r0, lsl #3 \n" | ||
904 | "add %[rnd], %[rnd], #74 \n" /* add another 74 */ | ||
905 | /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ | ||
906 | "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ | ||
907 | |||
908 | "cmp r1, %[dpth] \n" /* random >= depth ? */ | ||
909 | "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ | ||
910 | |||
911 | "mov r0, r2, lsl r1 \n" /** rotate pattern **/ | ||
912 | "sub r1, %[dpth], r1 \n" | ||
913 | "orr r2, r0, r2, lsr r1 \n" | ||
914 | |||
915 | ".wa_skip: \n" | ||
916 | "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ | ||
917 | |||
918 | "add %[src], %[src], #1 \n" /* src++; */ | ||
919 | "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ | ||
920 | "bne .wa_loop \n" | ||
921 | : /* outputs */ | ||
922 | [src] "+r"(_src), | ||
923 | [patp]"+r"(pat_ptr), | ||
924 | [rnd] "+r"(_gray_random_buffer), | ||
925 | [mask]"+r"(_mask) | ||
926 | : /* inputs */ | ||
927 | [bpat]"r"(_gray_info.bitpattern), | ||
928 | [trns]"r"(_gray_info.idxtable), | ||
929 | [dpth]"r"(_gray_info.depth), | ||
930 | [rmsk]"r"(_gray_info.randmask) | ||
931 | : /* clobbers */ | ||
932 | "r0", "r1", "r2", "r3" | ||
933 | ); | ||
934 | |||
935 | addr = address; | ||
936 | end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); | ||
937 | _mask = mask; | ||
938 | |||
939 | /* set the bits for all 8 pixels in all bytes according to the | ||
940 | * precalculated patterns on the pattern stack */ | ||
941 | asm volatile ( | ||
942 | "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ | ||
943 | |||
944 | "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ | ||
945 | "ands %[mask], %[mask], #0xff \n" | ||
946 | "beq .wa_sloop \n" /* short loop if nothing to keep */ | ||
947 | |||
948 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | ||
949 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | ||
950 | "adc r0, r0, r0 \n" /* put bit into LSB of byte */ | ||
951 | "movs r8, r8, lsr #1 \n" | ||
952 | "adc r0, r0, r0 \n" | ||
953 | "movs r7, r7, lsr #1 \n" | ||
954 | "adc r0, r0, r0 \n" | ||
955 | "movs r6, r6, lsr #1 \n" | ||
956 | "adc r0, r0, r0 \n" | ||
957 | "movs r5, r5, lsr #1 \n" | ||
958 | "adc r0, r0, r0 \n" | ||
959 | "movs r4, r4, lsr #1 \n" | ||
960 | "adc r0, r0, r0 \n" | ||
961 | "movs r3, r3, lsr #1 \n" | ||
962 | "adc r0, r0, r0 \n" | ||
963 | "movs r2, r2, lsr #1 \n" | ||
964 | "adc r0, r0, r0 \n" | ||
965 | |||
966 | "ldrb r1, [%[addr]] \n" /* read old value */ | ||
967 | "and r1, r1, %[mask] \n" /* mask out replaced bits */ | ||
968 | "orr r1, r1, r0 \n" /* set new bits */ | ||
969 | "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ | ||
970 | |||
971 | "cmp %[end], %[addr] \n" /* loop through all bitplanes */ | ||
972 | "bne .wa_floop \n" | ||
973 | |||
974 | "b .wa_end \n" | ||
975 | |||
976 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | ||
977 | "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ | ||
978 | "adc r0, r0, r0 \n" /* put bit into LSB of byte */ | ||
979 | "movs r8, r8, lsr #1 \n" | ||
980 | "adc r0, r0, r0 \n" | ||
981 | "movs r7, r7, lsr #1 \n" | ||
982 | "adc r0, r0, r0 \n" | ||
983 | "movs r6, r6, lsr #1 \n" | ||
984 | "adc r0, r0, r0 \n" | ||
985 | "movs r5, r5, lsr #1 \n" | ||
986 | "adc r0, r0, r0 \n" | ||
987 | "movs r4, r4, lsr #1 \n" | ||
988 | "adc r0, r0, r0 \n" | ||
989 | "movs r3, r3, lsr #1 \n" | ||
990 | "adc r0, r0, r0 \n" | ||
991 | "movs r2, r2, lsr #1 \n" | ||
992 | "adc r0, r0, r0 \n" | ||
993 | |||
994 | "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ | ||
995 | |||
996 | "cmp %[end], %[addr] \n" /* loop through all bitplanes */ | ||
997 | "bne .wa_sloop \n" | ||
998 | |||
999 | ".wa_end: \n" | ||
1000 | : /* outputs */ | ||
1001 | [addr]"+r"(addr), | ||
1002 | [mask]"+r"(_mask), | ||
1003 | [rx] "=&r"(trash) | ||
1004 | : /* inputs */ | ||
1005 | [psiz]"r"(_gray_info.plane_size), | ||
1006 | [end] "r"(end), | ||
1007 | [patp]"[rx]"(pat_ptr) | ||
1008 | : /* clobbers */ | ||
1009 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" | ||
1010 | ); | ||
880 | #else /* C version, for reference*/ | 1011 | #else /* C version, for reference*/ |
1012 | #warning C version of _writearray() used | ||
881 | unsigned test = 0x80; | 1013 | unsigned test = 0x80; |
882 | int i; | 1014 | int i; |
883 | 1015 | ||
@@ -1027,52 +1159,52 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1027 | /* precalculate the bit patterns with random shifts | 1159 | /* precalculate the bit patterns with random shifts |
1028 | for all 8 pixels and put them on an extra "stack" */ | 1160 | for all 8 pixels and put them on an extra "stack" */ |
1029 | asm volatile ( | 1161 | asm volatile ( |
1030 | "mov #8,r3 \n" /* loop count in r3: 8 pixels */ | 1162 | "mov #8,r3 \n" /* loop count */ |
1031 | 1163 | ||
1032 | ".wa_loop: \n" /** load pattern for pixel **/ | 1164 | ".wa_loop: \n" /** load pattern for pixel **/ |
1033 | "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ | 1165 | "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ |
1034 | "shlr %[mask] \n" /* shift out lsb of mask */ | 1166 | "shlr %[mask] \n" /* shift out lsb of mask */ |
1035 | "bf .wa_skip \n" /* skip this pixel */ | 1167 | "bf .wa_skip \n" /* skip this pixel */ |
1036 | 1168 | ||
1037 | "mov.b @%[src],r0 \n" /* load src byte */ | 1169 | "mov.b @%[src],r0 \n" /* load src byte */ |
1038 | "extu.b r0,r0 \n" /* extend unsigned */ | 1170 | "extu.b r0,r0 \n" /* extend unsigned */ |
1039 | "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ | 1171 | "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ |
1040 | "extu.b r0,r0 \n" /* extend unsigned */ | 1172 | "extu.b r0,r0 \n" /* extend unsigned */ |
1041 | "shll2 r0 \n" | 1173 | "shll2 r0 \n" |
1042 | "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ | 1174 | "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ |
1043 | 1175 | ||
1044 | "mov #75,r0 \n" | 1176 | "mov #75,r0 \n" |
1045 | "mulu r0,%[rnd] \n" /* multiply by 75 */ | 1177 | "mulu r0,%[rnd] \n" /* multiply by 75 */ |
1046 | "sts macl,%[rnd] \n" | 1178 | "sts macl,%[rnd] \n" |
1047 | "add #74,%[rnd] \n" /* add another 74 */ | 1179 | "add #74,%[rnd] \n" /* add another 74 */ |
1048 | /* Since the lower bits are not very random: */ | 1180 | /* Since the lower bits are not very random: */ |
1049 | "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ | 1181 | "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ |
1050 | "and %[rmsk],r1 \n" /* mask out unneeded bits */ | 1182 | "and %[rmsk],r1 \n" /* mask out unneeded bits */ |
1051 | 1183 | ||
1052 | "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ | 1184 | "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ |
1053 | "bf .wa_ntrim \n" | 1185 | "bf .wa_ntrim \n" |
1054 | "sub %[dpth],r1 \n" /* yes: random -= depth; */ | 1186 | "sub %[dpth],r1 \n" /* yes: random -= depth; */ |
1055 | ".wa_ntrim: \n" | 1187 | ".wa_ntrim: \n" |
1056 | 1188 | ||
1057 | "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ | 1189 | "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ |
1058 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ | 1190 | "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ |
1059 | "mov r1,r5 \n" | 1191 | "mov r1,r5 \n" |
1060 | 1192 | ||
1061 | "mov %[dpth],r5 \n" | 1193 | "mov %[dpth],r5 \n" |
1062 | "sub r1,r5 \n" /* r5 = depth - r1 */ | 1194 | "sub r1,r5 \n" /* r5 = depth - r1 */ |
1063 | "mov.l .lshrsi3,r1 \n" | 1195 | "mov.l .lshrsi3,r1 \n" |
1064 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ | 1196 | "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ |
1065 | "mov r0,r1 \n" /* store previous result in r1 */ | 1197 | "mov r0,r1 \n" /* store previous result in r1 */ |
1066 | 1198 | ||
1067 | "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ | 1199 | "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ |
1068 | 1200 | ||
1069 | ".wa_skip: \n" | 1201 | ".wa_skip: \n" |
1070 | "mov.l r0,@-%[patp]\n" /* push on pattern stack */ | 1202 | "mov.l r0,@-%[patp] \n" /* push on pattern stack */ |
1071 | 1203 | ||
1072 | "add %[stri],%[src] \n" /* src += stride; */ | 1204 | "add %[stri],%[src] \n" /* src += stride; */ |
1073 | "add #-1,r3 \n" /* decrease loop count */ | 1205 | "add #-1,r3 \n" /* loop 8 times (pixel block) */ |
1074 | "cmp/pl r3 \n" /* loop count > 0? */ | 1206 | "cmp/pl r3 \n" |
1075 | "bt .wa_loop \n" /* yes: loop */ | 1207 | "bt .wa_loop \n" |
1076 | : /* outputs */ | 1208 | : /* outputs */ |
1077 | [src] "+r"(_src), | 1209 | [src] "+r"(_src), |
1078 | [rnd] "+r"(_gray_random_buffer), | 1210 | [rnd] "+r"(_gray_random_buffer), |
@@ -1095,79 +1227,79 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1095 | /* set the bits for all 8 pixels in all bytes according to the | 1227 | /* set the bits for all 8 pixels in all bytes according to the |
1096 | * precalculated patterns on the pattern stack */ | 1228 | * precalculated patterns on the pattern stack */ |
1097 | asm volatile ( | 1229 | asm volatile ( |
1098 | "mov.l @%[patp]+,r1\n" /* pop all 8 patterns */ | 1230 | "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ |
1099 | "mov.l @%[patp]+,r2\n" | 1231 | "mov.l @%[patp]+,r2 \n" |
1100 | "mov.l @%[patp]+,r3\n" | 1232 | "mov.l @%[patp]+,r3 \n" |
1101 | "mov.l @%[patp]+,r6\n" | 1233 | "mov.l @%[patp]+,r6 \n" |
1102 | "mov.l @%[patp]+,r7\n" | 1234 | "mov.l @%[patp]+,r7 \n" |
1103 | "mov.l @%[patp]+,r8\n" | 1235 | "mov.l @%[patp]+,r8 \n" |
1104 | "mov.l @%[patp]+,r9\n" | 1236 | "mov.l @%[patp]+,r9 \n" |
1105 | "mov.l @%[patp],r10\n" | 1237 | "mov.l @%[patp],r10 \n" |
1106 | 1238 | ||
1107 | "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ | 1239 | "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ |
1108 | "extu.b %[mask],%[mask] \n" /* mask out high bits */ | 1240 | "extu.b %[mask],%[mask] \n" /* mask out high bits */ |
1109 | "tst %[mask],%[mask] \n" /* nothing to keep? */ | 1241 | "tst %[mask],%[mask] \n" |
1110 | "bt .wa_sloop \n" /* yes: jump to short loop */ | 1242 | "bt .wa_sloop \n" /* short loop if nothing to keep */ |
1111 | 1243 | ||
1112 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1244 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ |
1113 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1245 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ |
1114 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1246 | "rotcl r0 \n" /* rotate t bit into r0 */ |
1115 | "shlr r2 \n" | 1247 | "shlr r2 \n" |
1116 | "rotcl r0 \n" | 1248 | "rotcl r0 \n" |
1117 | "shlr r3 \n" | 1249 | "shlr r3 \n" |
1118 | "rotcl r0 \n" | 1250 | "rotcl r0 \n" |
1119 | "shlr r6 \n" | 1251 | "shlr r6 \n" |
1120 | "rotcl r0 \n" | 1252 | "rotcl r0 \n" |
1121 | "shlr r7 \n" | 1253 | "shlr r7 \n" |
1122 | "rotcl r0 \n" | 1254 | "rotcl r0 \n" |
1123 | "shlr r8 \n" | 1255 | "shlr r8 \n" |
1124 | "rotcl r0 \n" | 1256 | "rotcl r0 \n" |
1125 | "shlr r9 \n" | 1257 | "shlr r9 \n" |
1126 | "rotcl r0 \n" | 1258 | "rotcl r0 \n" |
1127 | "shlr r10 \n" | 1259 | "shlr r10 \n" |
1128 | "mov.b @%[addr],%[rx] \n" /* read old value */ | 1260 | "mov.b @%[addr],%[rx] \n" /* read old value */ |
1129 | "rotcl r0 \n" | 1261 | "rotcl r0 \n" |
1130 | "and %[mask],%[rx] \n" /* mask out unneeded bits */ | 1262 | "and %[mask],%[rx] \n" /* mask out replaced bits */ |
1131 | "or %[rx],r0 \n" /* set new bits */ | 1263 | "or %[rx],r0 \n" /* set new bits */ |
1132 | "mov.b r0,@%[addr] \n" /* store value to bitplane */ | 1264 | "mov.b r0,@%[addr] \n" /* store value to bitplane */ |
1133 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1265 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ |
1134 | "cmp/hi %[addr],%[end] \n" /* last bitplane done? */ | 1266 | "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ |
1135 | "bt .wa_floop \n" /* no: loop */ | 1267 | "bt .wa_floop \n" |
1136 | 1268 | ||
1137 | "bra .wa_end \n" | 1269 | "bra .wa_end \n" |
1138 | "nop \n" | 1270 | "nop \n" |
1139 | 1271 | ||
1140 | /* References to C library routines used in the precalc block */ | 1272 | /* References to C library routines used in the precalc block */ |
1141 | ".align 2 \n" | 1273 | ".align 2 \n" |
1142 | ".ashlsi3: \n" /* C library routine: */ | 1274 | ".ashlsi3: \n" /* C library routine: */ |
1143 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ | 1275 | ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ |
1144 | ".lshrsi3: \n" /* C library routine: */ | 1276 | ".lshrsi3: \n" /* C library routine: */ |
1145 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ | 1277 | ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ |
1146 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ | 1278 | /* both routines preserve r4, destroy r5 and take ~16 cycles */ |
1147 | 1279 | ||
1148 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1280 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ |
1149 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ | 1281 | "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ |
1150 | "rotcl r0 \n" /* rotate t bit into r0 */ | 1282 | "rotcl r0 \n" /* rotate t bit into r0 */ |
1151 | "shlr r2 \n" | 1283 | "shlr r2 \n" |
1152 | "rotcl r0 \n" | 1284 | "rotcl r0 \n" |
1153 | "shlr r3 \n" | 1285 | "shlr r3 \n" |
1154 | "rotcl r0 \n" | 1286 | "rotcl r0 \n" |
1155 | "shlr r6 \n" | 1287 | "shlr r6 \n" |
1156 | "rotcl r0 \n" | 1288 | "rotcl r0 \n" |
1157 | "shlr r7 \n" | 1289 | "shlr r7 \n" |
1158 | "rotcl r0 \n" | 1290 | "rotcl r0 \n" |
1159 | "shlr r8 \n" | 1291 | "shlr r8 \n" |
1160 | "rotcl r0 \n" | 1292 | "rotcl r0 \n" |
1161 | "shlr r9 \n" | 1293 | "shlr r9 \n" |
1162 | "rotcl r0 \n" | 1294 | "rotcl r0 \n" |
1163 | "shlr r10 \n" | 1295 | "shlr r10 \n" |
1164 | "rotcl r0 \n" | 1296 | "rotcl r0 \n" |
1165 | "mov.b r0,@%[addr] \n" /* store byte to bitplane */ | 1297 | "mov.b r0,@%[addr] \n" /* store byte to bitplane */ |
1166 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ | 1298 | "add %[psiz],%[addr] \n" /* advance to next bitplane */ |
1167 | "cmp/hi %[addr],%[end] \n" /* last bitplane done? */ | 1299 | "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ |
1168 | "bt .wa_sloop \n" /* no: loop */ | 1300 | "bt .wa_sloop \n" |
1169 | 1301 | ||
1170 | ".wa_end: \n" | 1302 | ".wa_end: \n" |
1171 | : /* outputs */ | 1303 | : /* outputs */ |
1172 | [addr]"+r"(addr), | 1304 | [addr]"+r"(addr), |
1173 | [mask]"+r"(_mask), | 1305 | [mask]"+r"(_mask), |
@@ -1189,43 +1321,43 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1189 | /* precalculate the bit patterns with random shifts | 1321 | /* precalculate the bit patterns with random shifts |
1190 | for all 8 pixels and put them on an extra "stack" */ | 1322 | for all 8 pixels and put them on an extra "stack" */ |
1191 | asm volatile ( | 1323 | asm volatile ( |
1192 | "moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */ | 1324 | "moveq.l #8,%%d3 \n" /* loop count */ |
1193 | 1325 | ||
1194 | ".wa_loop: \n" /** load pattern for pixel **/ | 1326 | ".wa_loop: \n" /** load pattern for pixel **/ |
1195 | "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ | 1327 | "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ |
1196 | "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ | 1328 | "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ |
1197 | "bcc.b .wa_skip \n" /* skip this pixel */ | 1329 | "bcc.b .wa_skip \n" /* skip this pixel */ |
1198 | 1330 | ||
1199 | "clr.l %%d0 \n" | 1331 | "clr.l %%d0 \n" |
1200 | "move.b (%[src]),%%d0 \n" /* load src byte */ | 1332 | "move.b (%[src]),%%d0 \n" /* load src byte */ |
1201 | "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ | 1333 | "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ |
1202 | "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ | 1334 | "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ |
1203 | 1335 | ||
1204 | "mulu.w #75,%[rnd] \n" /* multiply by 75 */ | 1336 | "mulu.w #75,%[rnd] \n" /* multiply by 75 */ |
1205 | "add.l #74,%[rnd] \n" /* add another 74 */ | 1337 | "add.l #74,%[rnd] \n" /* add another 74 */ |
1206 | /* Since the lower bits are not very random: */ | 1338 | /* Since the lower bits are not very random: */ |
1207 | "move.l %[rnd],%%d1 \n" | 1339 | "move.l %[rnd],%%d1 \n" |
1208 | "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ | 1340 | "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ |
1209 | "and.l %[rmsk],%%d1\n" /* mask out unneeded bits */ | 1341 | "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ |
1210 | 1342 | ||
1211 | "cmp.l %[dpth],%%d1\n" /* random >= depth ? */ | 1343 | "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ |
1212 | "blo.b .wa_ntrim \n" | 1344 | "blo.b .wa_ntrim \n" |
1213 | "sub.l %[dpth],%%d1\n" /* yes: random -= depth; */ | 1345 | "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ |
1214 | ".wa_ntrim: \n" | 1346 | ".wa_ntrim: \n" |
1215 | 1347 | ||
1216 | "move.l %%d2,%%d0 \n" | 1348 | "move.l %%d2,%%d0 \n" /** rotate pattern **/ |
1217 | "lsl.l %%d1,%%d0 \n" | 1349 | "lsl.l %%d1,%%d0 \n" |
1218 | "sub.l %[dpth],%%d1\n" | 1350 | "sub.l %[dpth],%%d1 \n" |
1219 | "neg.l %%d1 \n" /* d1 = depth - d1 */ | 1351 | "neg.l %%d1 \n" /* d1 = depth - d1 */ |
1220 | "lsr.l %%d1,%%d2 \n" | 1352 | "lsr.l %%d1,%%d2 \n" |
1221 | "or.l %%d0,%%d2 \n" | 1353 | "or.l %%d0,%%d2 \n" |
1222 | 1354 | ||
1223 | ".wa_skip: \n" | 1355 | ".wa_skip: \n" |
1224 | "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ | 1356 | "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ |
1225 | 1357 | ||
1226 | "add.l %[stri],%[src] \n" /* src += stride; */ | 1358 | "add.l %[stri],%[src] \n" /* src += stride; */ |
1227 | "subq.l #1,%%d3 \n" /* decrease loop count */ | 1359 | "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ |
1228 | "bne.b .wa_loop \n" /* yes: loop */ | 1360 | "bne.b .wa_loop \n" |
1229 | : /* outputs */ | 1361 | : /* outputs */ |
1230 | [src] "+a"(_src), | 1362 | [src] "+a"(_src), |
1231 | [patp]"+a"(pat_ptr), | 1363 | [patp]"+a"(pat_ptr), |
@@ -1250,78 +1382,76 @@ static void _writearray(unsigned char *address, const unsigned char *src, | |||
1250 | asm volatile ( | 1382 | asm volatile ( |
1251 | "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" | 1383 | "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" |
1252 | /* pop all 8 patterns */ | 1384 | /* pop all 8 patterns */ |
1253 | "not.l %[mask] \n" /* "set" mask -> "keep" mask */ | 1385 | "not.l %[mask] \n" /* "set" mask -> "keep" mask */ |
1254 | "and.l #0xFF,%[mask] \n" | 1386 | "and.l #0xFF,%[mask] \n" |
1255 | "beq.b .wa_sstart \n" /* yes: jump to short loop */ | 1387 | "beq.b .wa_sstart \n" /* short loop if nothing to keep */ |
1256 | 1388 | ||
1257 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ | 1389 | ".wa_floop: \n" /** full loop (there are bits to keep)**/ |
1258 | "clr.l %%d0 \n" | 1390 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ |
1259 | "lsr.l #1,%%d2 \n" /* shift out mask bit */ | 1391 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ |
1260 | "addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ | 1392 | "lsr.l #1,%%d3 \n" |
1261 | "lsr.l #1,%%d3 \n" | 1393 | "addx.l %%d0,%%d0 \n" |
1262 | "addx.l %%d0,%%d0 \n" | 1394 | "lsr.l #1,%%d4 \n" |
1263 | "lsr.l #1,%%d4 \n" | 1395 | "addx.l %%d0,%%d0 \n" |
1264 | "addx.l %%d0,%%d0 \n" | 1396 | "lsr.l #1,%%d5 \n" |
1265 | "lsr.l #1,%%d5 \n" | 1397 | "addx.l %%d0,%%d0 \n" |
1266 | "addx.l %%d0,%%d0 \n" | 1398 | "lsr.l #1,%%d6 \n" |
1267 | "lsr.l #1,%%d6 \n" | 1399 | "addx.l %%d0,%%d0 \n" |
1268 | "addx.l %%d0,%%d0 \n" | 1400 | "move.l %%a0,%%d1 \n" |
1269 | "move.l %%a0,%%d1 \n" | 1401 | "lsr.l #1,%%d1 \n" |
1270 | "lsr.l #1,%%d1 \n" | 1402 | "addx.l %%d0,%%d0 \n" |
1271 | "addx.l %%d0,%%d0 \n" | 1403 | "move.l %%d1,%%a0 \n" |
1272 | "move.l %%d1,%%a0 \n" | 1404 | "move.l %%a1,%%d1 \n" |
1273 | "move.l %%a1,%%d1 \n" | 1405 | "lsr.l #1,%%d1 \n" |
1274 | "lsr.l #1,%%d1 \n" | 1406 | "addx.l %%d0,%%d0 \n" |
1275 | "addx.l %%d0,%%d0 \n" | 1407 | "move.l %%d1,%%a1 \n" |
1276 | "move.l %%d1,%%a1 \n" | 1408 | "move.l %[ax],%%d1 \n" |
1277 | "move.l %[ax],%%d1 \n" | 1409 | "lsr.l #1,%%d1 \n" |
1278 | "lsr.l #1,%%d1 \n" | 1410 | "addx.l %%d0,%%d0 \n" |
1279 | "addx.l %%d0,%%d0 \n" | 1411 | "move.l %%d1,%[ax] \n" |
1280 | "move.l %%d1,%[ax] \n" | ||
1281 | 1412 | ||
1282 | "move.b (%[addr]),%%d1 \n" /* read old value */ | 1413 | "move.b (%[addr]),%%d1 \n" /* read old value */ |
1283 | "and.l %[mask],%%d1 \n" /* mask out unneeded bits */ | 1414 | "and.l %[mask],%%d1 \n" /* mask out replaced bits */ |
1284 | "or.l %%d0,%%d1 \n" /* set new bits */ | 1415 | "or.l %%d0,%%d1 \n" /* set new bits */ |
1285 | "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ | 1416 | "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ |
1286 | 1417 | ||
1287 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1418 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ |
1288 | "cmp.l %[addr],%[end] \n" /* last bitplane done? */ | 1419 | "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ |
1289 | "bhi.b .wa_floop \n" /* no: loop */ | 1420 | "bhi.b .wa_floop \n" |
1290 | 1421 | ||
1291 | "bra.b .wa_end \n" | 1422 | "bra.b .wa_end \n" |
1292 | 1423 | ||
1293 | ".wa_sstart: \n" | 1424 | ".wa_sstart: \n" |
1294 | "move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */ | 1425 | "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ |
1295 | 1426 | ||
1296 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ | 1427 | ".wa_sloop: \n" /** short loop (nothing to keep) **/ |
1297 | "clr.l %%d0 \n" | 1428 | "lsr.l #1,%%d2 \n" /* shift out pattern bit */ |
1298 | "lsr.l #1,%%d2 \n" /* shift out mask bit */ | 1429 | "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ |
1299 | "addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ | 1430 | "lsr.l #1,%%d3 \n" |
1300 | "lsr.l #1,%%d3 \n" | 1431 | "addx.l %%d0,%%d0 \n" |
1301 | "addx.l %%d0,%%d0 \n" | 1432 | "lsr.l #1,%%d4 \n" |
1302 | "lsr.l #1,%%d4 \n" | 1433 | "addx.l %%d0,%%d0 \n" |
1303 | "addx.l %%d0,%%d0 \n" | 1434 | "lsr.l #1,%%d5 \n" |
1304 | "lsr.l #1,%%d5 \n" | 1435 | "addx.l %%d0,%%d0 \n" |
1305 | "addx.l %%d0,%%d0 \n" | 1436 | "lsr.l #1,%%d6 \n" |
1306 | "lsr.l #1,%%d6 \n" | 1437 | "addx.l %%d0,%%d0 \n" |
1307 | "addx.l %%d0,%%d0 \n" | 1438 | "lsr.l #1,%[mask] \n" |
1308 | "lsr.l #1,%[mask] \n" | 1439 | "addx.l %%d0,%%d0 \n" |
1309 | "addx.l %%d0,%%d0 \n" | 1440 | "move.l %%a1,%%d1 \n" |
1310 | "move.l %%a1,%%d1 \n" | 1441 | "lsr.l #1,%%d1 \n" |
1311 | "lsr.l #1,%%d1 \n" | 1442 | "addx.l %%d0,%%d0 \n" |
1312 | "addx.l %%d0,%%d0 \n" | 1443 | "move.l %%d1,%%a1 \n" |
1313 | "move.l %%d1,%%a1 \n" | 1444 | "move.l %[ax],%%d1 \n" |
1314 | "move.l %[ax],%%d1 \n" | 1445 | "lsr.l #1,%%d1 \n" |
1315 | "lsr.l #1,%%d1 \n" | 1446 | "addx.l %%d0,%%d0 \n" |
1316 | "addx.l %%d0,%%d0 \n" | 1447 | "move.l %%d1,%[ax] \n" |
1317 | "move.l %%d1,%[ax] \n" | ||
1318 | 1448 | ||
1319 | "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ | 1449 | "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ |
1320 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ | 1450 | "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ |
1321 | "cmp.l %[addr],%[end] \n" /* last bitplane done? */ | 1451 | "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ |
1322 | "bhi.b .wa_sloop \n" /* no: loop */ | 1452 | "bhi.b .wa_sloop \n" |
1323 | 1453 | ||
1324 | ".wa_end: \n" | 1454 | ".wa_end: \n" |
1325 | : /* outputs */ | 1455 | : /* outputs */ |
1326 | [addr]"+a"(addr), | 1456 | [addr]"+a"(addr), |
1327 | [mask]"+d"(_mask), | 1457 | [mask]"+d"(_mask), |