author    Jens Arnold <amiconn@rockbox.org>  2006-08-11 14:13:01 +0000
committer Jens Arnold <amiconn@rockbox.org>  2006-08-11 14:13:01 +0000
commit    71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 (patch)
tree      b9a97081ec04d4d311a7b45747393e68837912a2 /apps/plugins/lib/gray_draw.c
parent    bcd94a9b01d19d87a437cd8158a758f206b30825 (diff)
download  rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.tar.gz
          rockbox-71dc284b5d4f7bfd27fb50fd91184d2d5f70db21.zip
New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins/lib/gray_draw.c')
-rw-r--r--  apps/plugins/lib/gray_draw.c  1156
1 files changed, 865 insertions, 291 deletions
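
Before the diff itself, here is a minimal plain-C sketch of the technique the new _writearray() code implements; the standalone name, signature and the forward plane loop are illustrative only, not the plugin's actual C API. The eight per-pixel pattern words are bit-transposed in three masked-XOR "comb" stages (masks 0xF0F0F0F0, 0xCCCCCCCC, 0xAAAAAAAA, as in the assembly below) so that each byte of the transposed words is a ready-made framebuffer byte for one bitplane; a round can then write eight planes with one byte store each instead of assembling every plane bit by bit. The real assembly additionally enters the store streak through a jump table when fewer than 8 planes remain, and walks each round's planes backwards so that the final partial round keeps the bit order (less flicker).

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only: a standalone stand-in for the plane-writing part
 * of the new _writearray().  pat[i] is the 32-bit pattern for pixel i of the
 * 8-pixel block (bit p = that pixel's value in bitplane p); keep_mask is the
 * inverted pixel mask (0 means "overwrite the whole byte", i.e. the
 * assembly's short loop). */
static void writearray_sketch(uint8_t *address, const uint32_t pat[8],
                              uint8_t keep_mask, int depth, long plane_size)
{
    uint32_t r[8], t;
    memcpy(r, pat, sizeof(r));

    /* Rotate (transpose) the four 8x8 bit blocks held in r[0..7]. */
    for (int i = 0; i < 4; i++) {                /* stage 1: 4 bit "comb" */
        t = (r[i] ^ (r[i + 4] << 4)) & 0xF0F0F0F0u;
        r[i] ^= t;
        r[i + 4] ^= t >> 4;
    }
    for (int i = 0; i < 8; i += 4)               /* stage 2: 2 bit "comb" */
        for (int j = i; j < i + 2; j++) {
            t = (r[j] ^ (r[j + 2] << 2)) & 0xCCCCCCCCu;
            r[j] ^= t;
            r[j + 2] ^= t >> 2;
        }
    for (int i = 0; i < 8; i += 2) {             /* stage 3: 1 bit "comb" */
        t = (r[i] ^ (r[i + 1] << 1)) & 0xAAAAAAAAu;
        r[i] ^= t;
        r[i + 1] ^= t >> 1;
    }

    /* Byte k of r[n] is now the finished framebuffer byte for bitplane
     * 8*k + n (one bit per pixel of the block).  The assembly walks each
     * 8-plane round backwards and jumps into the middle of the store streak
     * for a partial round; a plain forward loop gives the same bytes. */
    for (int plane = 0; plane < depth; plane++) {
        uint8_t byte = (uint8_t)(r[plane & 7] >> (8 * (plane >> 3)));
        uint8_t *dst = address + (long)plane * plane_size;
        *dst = keep_mask ? (uint8_t)((*dst & keep_mask) | byte) : byte;
    }
}

Compared with the previous loop, which rebuilt every plane byte with eight shift-and-carry steps, the transpose cost is fixed per group of up to 32 planes while each plane write shrinks to a load/mask/store (or a bare store), which is presumably why the measured gain grows with buffer depth.
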
diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c
index 9406664ea2..dcc65bdd09 100644
--- a/apps/plugins/lib/gray_draw.c
+++ b/apps/plugins/lib/gray_draw.c
@@ -868,24 +868,24 @@ void gray_ub_clear_display(void)
868 868
869/* Write a pixel block, defined by their brightnesses in a greymap. 869/* Write a pixel block, defined by their brightnesses in a greymap.
870 Address is the byte in the first bitplane, src is the greymap start address, 870 Address is the byte in the first bitplane, src is the greymap start address,
871 stride is the increment for the greymap to get to the next pixel, mask 871 mask determines which pixels of the destination block are changed. */
872 determines which pixels of the destination block are changed. */
873static void _writearray(unsigned char *address, const unsigned char *src, 872static void _writearray(unsigned char *address, const unsigned char *src,
874 unsigned mask) 873 unsigned mask)
875{ 874{
876 unsigned long pat_stack[8]; 875 unsigned long pat_stack[8];
877 unsigned long *pat_ptr = &pat_stack[8]; 876 unsigned long *pat_ptr = &pat_stack[8];
878 unsigned char *addr, *end; 877 unsigned char *addr;
879#ifdef CPU_ARM 878#ifdef CPU_ARM
880 const unsigned char *_src; 879 const unsigned char *_src;
881 unsigned _mask, trash; 880 unsigned _mask, depth, trash;
882 881
883 _mask = mask; 882 _mask = mask;
884 _src = src; 883 _src = src;
885 884
886 /* precalculate the bit patterns with random shifts 885 /* precalculate the bit patterns with random shifts
887 for all 8 pixels and put them on an extra "stack" */ 886 for all 8 pixels and put them on an extra "stack" */
888 asm volatile ( 887 asm volatile
888 (
889 "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ 889 "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */
890 "mov r3, #8 \n" /* loop count */ 890 "mov r3, #8 \n" /* loop count */
891 891
@@ -932,83 +932,228 @@ static void _writearray(unsigned char *address, const unsigned char *src,
932 ); 932 );
933 933
934 addr = address; 934 addr = address;
935 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
936 _mask = mask; 935 _mask = mask;
936 depth = _gray_info.depth;
937 937
938 /* set the bits for all 8 pixels in all bytes according to the 938 /* set the bits for all 8 pixels in all bytes according to the
939 * precalculated patterns on the pattern stack */ 939 * precalculated patterns on the pattern stack */
940 asm volatile ( 940 asm volatile
941 "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ 941 (
942 942 "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */
943 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ 943
944 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
945
946 "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/
947 "orr %[rx], %[rx], %[rx], lsl #8 \n"
948 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */
949 "eor r0, r1, r5, lsl #4 \n"
950 "and r0, r0, %[rx] \n"
951 "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
952 "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
953 "eor r0, r2, r6, lsl #4 \n"
954 "and r0, r0, %[rx] \n"
955 "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
956 "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
957 "eor r0, r3, r7, lsl #4 \n"
958 "and r0, r0, %[rx] \n"
959 "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
960 "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
961 "eor r0, r4, r8, lsl #4 \n"
962 "and r0, r0, %[rx] \n"
963 "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
964 "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
965
966 "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/
967 "orr %[rx], %[rx], %[rx], lsl #8 \n"
968 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */
969 "eor r0, r1, r3, lsl #2 \n"
970 "and r0, r0, %[rx] \n"
971 "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
972 "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
973 "eor r0, r2, r4, lsl #2 \n"
974 "and r0, r0, %[rx] \n"
975 "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
976 "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
977 "eor r0, r5, r7, lsl #2 \n"
978 "and r0, r0, %[rx] \n"
979 "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
980 "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
981 "eor r0, r6, r8, lsl #2 \n"
982 "and r0, r0, %[rx] \n"
983 "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
984 "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
985
986 "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/
987 "orr %[rx], %[rx], %[rx], lsl #8 \n"
988 "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */
989 "eor r0, r1, r2, lsl #1 \n"
990 "and r0, r0, %[rx] \n"
991 "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
992 "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
993 "eor r0, r3, r4, lsl #1 \n"
994 "and r0, r0, %[rx] \n"
995 "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
996 "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
997 "eor r0, r5, r6, lsl #1 \n"
998 "and r0, r0, %[rx] \n"
999 "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
1000 "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
1001 "eor r0, r7, r8, lsl #1 \n"
1002 "and r0, r0, %[rx] \n"
1003 "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
1004 "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
1005
1006 "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
944 "ands %[mask], %[mask], #0xff \n" 1007 "ands %[mask], %[mask], #0xff \n"
945 "beq .wa_sloop \n" /* short loop if nothing to keep */ 1008 "beq .wa_sloop \n" /* short loop if no bits to keep */
946 1009
947 ".wa_floop: \n" /** full loop (there are bits to keep)**/ 1010 ".wa_floop: \n" /** full loop (bits to keep)**/
948 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 1011 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
949 "adc r0, r0, r0 \n" /* put bit into LSB of byte */ 1012 "bhs .wa_f8 \n"
950 "movs r8, r8, lsr #1 \n" 1013
951 "adc r0, r0, r0 \n" 1014 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
952 "movs r7, r7, lsr #1 \n" 1015 "add %[addr], %[addr], r0 \n" /* for this round */
953 "adc r0, r0, r0 \n" 1016
954 "movs r6, r6, lsr #1 \n" 1017
955 "adc r0, r0, r0 \n" 1018 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
956 "movs r5, r5, lsr #1 \n" 1019 "add pc, pc, r0 \n"
957 "adc r0, r0, r0 \n" 1020 ".wa_ftable: \n"
958 "movs r4, r4, lsr #1 \n" 1021 ".byte .wa_f0 - .wa_ftable - 4 \n" /* [jump tables are tricky] */
959 "adc r0, r0, r0 \n" 1022 ".byte .wa_f1 - .wa_ftable - 4 \n"
960 "movs r3, r3, lsr #1 \n" 1023 ".byte .wa_f2 - .wa_ftable - 4 \n"
961 "adc r0, r0, r0 \n" 1024 ".byte .wa_f3 - .wa_ftable - 4 \n"
962 "movs r2, r2, lsr #1 \n" 1025 ".byte .wa_f4 - .wa_ftable - 4 \n"
963 "adc r0, r0, r0 \n" 1026 ".byte .wa_f5 - .wa_ftable - 4 \n"
964 1027 ".byte .wa_f6 - .wa_ftable - 4 \n"
965 "ldrb r1, [%[addr]] \n" /* read old value */ 1028 ".byte .wa_f7 - .wa_ftable - 4 \n"
966 "and r1, r1, %[mask] \n" /* mask out replaced bits */ 1029
967 "orr r1, r1, r0 \n" /* set new bits */ 1030 ".wa_f8: \n"
968 "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ 1031 "add %[addr], %[addr], %[psiz], lsl #3 \n"
969 1032 /* Point behind the last plane for this round. Note: We're using the
970 "cmp %[end], %[addr] \n" /* loop through all bitplanes */ 1033 * registers backwards in order to reuse the streak for the last round.
971 "bne .wa_floop \n" 1034 * Therefore we need to go thru the bitplanes backwards too, otherwise
972 1035 * the bit order would be destroyed which results in more flicker. */
1036 "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */
1037 "and r0, r0, %[mask] \n" /* mask out replaced bits */
1038 "orr r0, r0, r8 \n" /* set new bits */
1039 "strb r0, [%[addr]] \n" /* store byte */
1040 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
1041 ".wa_f7: \n"
1042 "ldrb r0, [%[addr], -%[psiz]]! \n"
1043 "and r0, r0, %[mask] \n"
1044 "orr r0, r0, r7 \n"
1045 "strb r0, [%[addr]] \n"
1046 "mov r7, r7, lsr #8 \n"
1047 ".wa_f6: \n"
1048 "ldrb r0, [%[addr], -%[psiz]]! \n"
1049 "and r0, r0, %[mask] \n"
1050 "orr r0, r0, r6 \n"
1051 "strb r0, [%[addr]] \n"
1052 "mov r6, r6, lsr #8 \n"
1053 ".wa_f5: \n"
1054 "ldrb r0, [%[addr], -%[psiz]]! \n"
1055 "and r0, r0, %[mask] \n"
1056 "orr r0, r0, r5 \n"
1057 "strb r0, [%[addr]] \n"
1058 "mov r5, r5, lsr #8 \n"
1059 ".wa_f4: \n"
1060 "ldrb r0, [%[addr], -%[psiz]]! \n"
1061 "and r0, r0, %[mask] \n"
1062 "orr r0, r0, r4 \n"
1063 "strb r0, [%[addr]] \n"
1064 "mov r4, r4, lsr #8 \n"
1065 ".wa_f3: \n"
1066 "ldrb r0, [%[addr], -%[psiz]]! \n"
1067 "and r0, r0, %[mask] \n"
1068 "orr r0, r0, r3 \n"
1069 "strb r0, [%[addr]] \n"
1070 "mov r3, r3, lsr #8 \n"
1071 ".wa_f2: \n"
1072 "ldrb r0, [%[addr], -%[psiz]]! \n"
1073 "and r0, r0, %[mask] \n"
1074 "orr r0, r0, r2 \n"
1075 "strb r0, [%[addr]] \n"
1076 "mov r2, r2, lsr #8 \n"
1077 ".wa_f1: \n"
1078 "ldrb r0, [%[addr], -%[psiz]]! \n"
1079 "and r0, r0, %[mask] \n"
1080 "orr r0, r0, r1 \n"
1081 "strb r0, [%[addr]] \n"
1082 "mov r1, r1, lsr #8 \n"
1083 ".wa_f0: \n"
1084
1085 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
1086 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
1087 "bhi .wa_floop \n"
1088
973 "b .wa_end \n" 1089 "b .wa_end \n"
974 1090
975 ".wa_sloop: \n" /** short loop (nothing to keep) **/ 1091 ".wa_sloop: \n" /** short loop (nothing to keep) **/
976 "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ 1092 "cmp %[dpth], #8 \n" /* 8 planes or more left? */
977 "adc r0, r0, r0 \n" /* put bit into LSB of byte */ 1093 "bhs .wa_s8 \n"
978 "movs r8, r8, lsr #1 \n"
979 "adc r0, r0, r0 \n"
980 "movs r7, r7, lsr #1 \n"
981 "adc r0, r0, r0 \n"
982 "movs r6, r6, lsr #1 \n"
983 "adc r0, r0, r0 \n"
984 "movs r5, r5, lsr #1 \n"
985 "adc r0, r0, r0 \n"
986 "movs r4, r4, lsr #1 \n"
987 "adc r0, r0, r0 \n"
988 "movs r3, r3, lsr #1 \n"
989 "adc r0, r0, r0 \n"
990 "movs r2, r2, lsr #1 \n"
991 "adc r0, r0, r0 \n"
992
993 "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
994 1094
995 "cmp %[end], %[addr] \n" /* loop through all bitplanes */ 1095 "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */
996 "bne .wa_sloop \n" 1096 "add %[addr], %[addr], r0 \n" /* for this round */
997 1097
1098 "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */
1099 "add pc, pc, r0 \n"
1100 ".wa_stable: \n"
1101 ".byte .wa_s0 - .wa_stable - 4 \n"
1102 ".byte .wa_s1 - .wa_stable - 4 \n"
1103 ".byte .wa_s2 - .wa_stable - 4 \n"
1104 ".byte .wa_s3 - .wa_stable - 4 \n"
1105 ".byte .wa_s4 - .wa_stable - 4 \n"
1106 ".byte .wa_s5 - .wa_stable - 4 \n"
1107 ".byte .wa_s6 - .wa_stable - 4 \n"
1108 ".byte .wa_s7 - .wa_stable - 4 \n"
1109
1110 ".wa_s8: \n"
1111 "add %[addr], %[addr], %[psiz], lsl #3 \n"
1112 /* Point behind the last plane for this round. See above. */
1113 "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */
1114 "mov r8, r8, lsr #8 \n" /* shift out used-up byte */
1115 ".wa_s7: \n"
1116 "strb r7, [%[addr], -%[psiz]]! \n"
1117 "mov r7, r7, lsr #8 \n"
1118 ".wa_s6: \n"
1119 "strb r6, [%[addr], -%[psiz]]! \n"
1120 "mov r6, r6, lsr #8 \n"
1121 ".wa_s5: \n"
1122 "strb r5, [%[addr], -%[psiz]]! \n"
1123 "mov r5, r5, lsr #8 \n"
1124 ".wa_s4: \n"
1125 "strb r4, [%[addr], -%[psiz]]! \n"
1126 "mov r4, r4, lsr #8 \n"
1127 ".wa_s3: \n"
1128 "strb r3, [%[addr], -%[psiz]]! \n"
1129 "mov r3, r3, lsr #8 \n"
1130 ".wa_s2: \n"
1131 "strb r2, [%[addr], -%[psiz]]! \n"
1132 "mov r2, r2, lsr #8 \n"
1133 ".wa_s1: \n"
1134 "strb r1, [%[addr], -%[psiz]]! \n"
1135 "mov r1, r1, lsr #8 \n"
1136 ".wa_s0: \n"
1137
1138 "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */
1139 "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */
1140 "bhi .wa_sloop \n"
1141
998 ".wa_end: \n" 1142 ".wa_end: \n"
999 : /* outputs */ 1143 : /* outputs */
1000 [addr]"+r"(addr), 1144 [addr]"+r"(addr),
1001 [mask]"+r"(_mask), 1145 [mask]"+r"(_mask),
1146 [dpth]"+r"(depth),
1002 [rx] "=&r"(trash) 1147 [rx] "=&r"(trash)
1003 : /* inputs */ 1148 : /* inputs */
1004 [psiz]"r"(_gray_info.plane_size), 1149 [psiz]"r"(_gray_info.plane_size),
1005 [end] "r"(end),
1006 [patp]"[rx]"(pat_ptr) 1150 [patp]"[rx]"(pat_ptr)
1007 : /* clobbers */ 1151 : /* clobbers */
1008 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" 1152 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
1009 ); 1153 );
1010#else /* C version, for reference*/ 1154#else /* C version, for reference*/
1011#warning C version of _writearray() used 1155#warning C version of _writearray() used
1156 unsigned char *end;
1012 unsigned test = 0x80; 1157 unsigned test = 0x80;
1013 int i; 1158 int i;
1014 1159
@@ -1143,67 +1288,70 @@ void gray_ub_gray_bitmap_part(const unsigned char *src, int src_x, int src_y,
1143 stride is the increment for the greymap to get to the next pixel, mask 1288 stride is the increment for the greymap to get to the next pixel, mask
1144 determines which pixels of the destination block are changed. */ 1289 determines which pixels of the destination block are changed. */
1145static void _writearray(unsigned char *address, const unsigned char *src, 1290static void _writearray(unsigned char *address, const unsigned char *src,
1291 int stride, unsigned mask) __attribute__((noinline));
1292static void _writearray(unsigned char *address, const unsigned char *src,
1146 int stride, unsigned mask) 1293 int stride, unsigned mask)
1147{ 1294{
1148 unsigned long pat_stack[8]; 1295 unsigned long pat_stack[8];
1149 unsigned long *pat_ptr = &pat_stack[8]; 1296 unsigned long *pat_ptr = &pat_stack[8];
1150 unsigned char *addr, *end; 1297 unsigned char *addr;
1151#if CONFIG_CPU == SH7034 1298#if CONFIG_CPU == SH7034
1152 const unsigned char *_src; 1299 const unsigned char *_src;
1153 unsigned _mask, trash; 1300 unsigned _mask, depth, trash;
1154 1301
1155 _mask = mask; 1302 _mask = mask;
1156 _src = src; 1303 _src = src;
1157 1304
1158 /* precalculate the bit patterns with random shifts 1305 /* precalculate the bit patterns with random shifts
1159 for all 8 pixels and put them on an extra "stack" */ 1306 for all 8 pixels and put them on an extra "stack" */
1160 asm volatile ( 1307 asm volatile
1161 "mov #8,r3 \n" /* loop count */ 1308 (
1162 1309 "mov #8, r3 \n" /* loop count */
1163 ".wa_loop: \n" /** load pattern for pixel **/ 1310
1164 "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ 1311 ".wa_loop: \n" /** load pattern for pixel **/
1165 "shlr %[mask] \n" /* shift out lsb of mask */ 1312 "mov #0, r0 \n" /* pattern for skipped pixel must be 0 */
1166 "bf .wa_skip \n" /* skip this pixel */ 1313 "shlr %[mask] \n" /* shift out lsb of mask */
1167 1314 "bf .wa_skip \n" /* skip this pixel */
1168 "mov.b @%[src],r0 \n" /* load src byte */ 1315
1169 "extu.b r0,r0 \n" /* extend unsigned */ 1316 "mov.b @%[src], r0 \n" /* load src byte */
1170 "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ 1317 "extu.b r0, r0 \n" /* extend unsigned */
1171 "extu.b r0,r0 \n" /* extend unsigned */ 1318 "mov.b @(r0,%[trns]), r0 \n" /* idxtable into pattern index */
1172 "shll2 r0 \n" 1319 "extu.b r0, r0 \n" /* extend unsigned */
1173 "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ 1320 "shll2 r0 \n"
1174 1321 "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */
1175 "mov #75,r0 \n" 1322
1176 "mulu r0,%[rnd] \n" /* multiply by 75 */ 1323 "mov #75, r0 \n"
1177 "sts macl,%[rnd] \n" 1324 "mulu r0, %[rnd] \n" /* multiply by 75 */
1178 "add #74,%[rnd] \n" /* add another 74 */ 1325 "sts macl, %[rnd] \n"
1326 "add #74, %[rnd] \n" /* add another 74 */
1179 /* Since the lower bits are not very random: */ 1327 /* Since the lower bits are not very random: */
1180 "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ 1328 "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */
1181 "and %[rmsk],r1 \n" /* mask out unneeded bits */ 1329 "and %[rmsk], r1 \n" /* mask out unneeded bits */
1182 1330
1183 "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ 1331 "cmp/hs %[dpth], r1 \n" /* random >= depth ? */
1184 "bf .wa_ntrim \n" 1332 "bf .wa_ntrim \n"
1185 "sub %[dpth],r1 \n" /* yes: random -= depth; */ 1333 "sub %[dpth], r1 \n" /* yes: random -= depth; */
1186 ".wa_ntrim: \n" 1334 ".wa_ntrim: \n"
1187 1335
1188 "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ 1336 "mov.l .ashlsi3, r0 \n" /** rotate pattern **/
1189 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ 1337 "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
1190 "mov r1,r5 \n" 1338 "mov r1, r5 \n"
1191 1339
1192 "mov %[dpth],r5 \n" 1340 "mov %[dpth], r5 \n"
1193 "sub r1,r5 \n" /* r5 = depth - r1 */ 1341 "sub r1, r5 \n" /* r5 = depth - r1 */
1194 "mov.l .lshrsi3,r1 \n" 1342 "mov.l .lshrsi3, r1 \n"
1195 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ 1343 "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
1196 "mov r0,r1 \n" /* store previous result in r1 */ 1344 "mov r0, r1 \n" /* store previous result in r1 */
1197 1345
1198 "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ 1346 "or r1, r0 \n" /* rotated_pattern = r0 | r1 */
1199 1347
1200 ".wa_skip: \n" 1348 ".wa_skip: \n"
1201 "mov.l r0,@-%[patp] \n" /* push on pattern stack */ 1349 "mov.l r0, @-%[patp] \n" /* push on pattern stack */
1202 1350
1203 "add %[stri],%[src] \n" /* src += stride; */ 1351 "add %[stri], %[src] \n" /* src += stride; */
1204 "add #-1,r3 \n" /* loop 8 times (pixel block) */ 1352 "add #-1, r3 \n" /* loop 8 times (pixel block) */
1205 "cmp/pl r3 \n" 1353 "cmp/pl r3 \n"
1206 "bt .wa_loop \n" 1354 "bt .wa_loop \n"
1207 : /* outputs */ 1355 : /* outputs */
1208 [src] "+r"(_src), 1356 [src] "+r"(_src),
1209 [rnd] "+r"(_gray_random_buffer), 1357 [rnd] "+r"(_gray_random_buffer),
@@ -1220,143 +1368,369 @@ static void _writearray(unsigned char *address, const unsigned char *src,
1220 ); 1368 );
1221 1369
1222 addr = address; 1370 addr = address;
1223 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
1224 _mask = mask; 1371 _mask = mask;
1372 depth = _gray_info.depth;
1225 1373
1226 /* set the bits for all 8 pixels in all bytes according to the 1374 /* set the bits for all 8 pixels in all bytes according to the
1227 * precalculated patterns on the pattern stack */ 1375 * precalculated patterns on the pattern stack */
1228 asm volatile ( 1376 asm volatile
1229 "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ 1377 (
1230 "mov.l @%[patp]+,r2 \n" 1378 "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */
1231 "mov.l @%[patp]+,r3 \n" 1379 "mov.l @%[patp]+, r7 \n"
1232 "mov.l @%[patp]+,r6 \n" 1380 "mov.l @%[patp]+, r6 \n"
1233 "mov.l @%[patp]+,r7 \n" 1381 "mov.l @%[patp]+, r5 \n"
1234 "mov.l @%[patp]+,r8 \n" 1382 "mov.l @%[patp]+, r4 \n"
1235 "mov.l @%[patp]+,r9 \n" 1383 "mov.l @%[patp]+, r3 \n"
1236 "mov.l @%[patp],r10 \n" 1384 "mov.l @%[patp]+, r2 \n"
1237 1385 "mov.l @%[patp], r1 \n"
1238 "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ 1386
1239 "extu.b %[mask],%[mask] \n" /* mask out high bits */ 1387 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1240 "tst %[mask],%[mask] \n" 1388
1241 "bt .wa_sloop \n" /* short loop if nothing to keep */ 1389 "mov.l .wa_mask4, %[rx] \n" /* bitmask = ...11110000 */
1242 1390 "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/
1243 ".wa_floop: \n" /** full loop (there are bits to keep)**/ 1391 "shll2 r0 \n"
1244 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1392 "shll2 r0 \n"
1245 "rotcl r0 \n" /* rotate t bit into r0 */ 1393 "xor r1, r0 \n"
1246 "shlr r2 \n" 1394 "and %[rx], r0 \n"
1247 "rotcl r0 \n" 1395 "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */
1248 "shlr r3 \n" 1396 "shlr2 r0 \n"
1249 "rotcl r0 \n" 1397 "shlr2 r0 \n"
1250 "shlr r6 \n" 1398 "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */
1251 "rotcl r0 \n" 1399 "mov r6, r0 \n"
1252 "shlr r7 \n" 1400 "shll2 r0 \n"
1253 "rotcl r0 \n" 1401 "shll2 r0 \n"
1254 "shlr r8 \n" 1402 "xor r2, r0 \n"
1255 "rotcl r0 \n" 1403 "and %[rx], r0 \n"
1256 "shlr r9 \n" 1404 "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */
1257 "rotcl r0 \n" 1405 "shlr2 r0 \n"
1258 "shlr r10 \n" 1406 "shlr2 r0 \n"
1259 "mov.b @%[addr],%[rx] \n" /* read old value */ 1407 "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */
1260 "rotcl r0 \n" 1408 "mov r7, r0 \n"
1261 "and %[mask],%[rx] \n" /* mask out replaced bits */ 1409 "shll2 r0 \n"
1262 "or %[rx],r0 \n" /* set new bits */ 1410 "shll2 r0 \n"
1263 "mov.b r0,@%[addr] \n" /* store value to bitplane */ 1411 "xor r3, r0 \n"
1264 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1412 "and %[rx], r0 \n"
1265 "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ 1413 "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */
1266 "bt .wa_floop \n" 1414 "shlr2 r0 \n"
1267 1415 "shlr2 r0 \n"
1268 "bra .wa_end \n" 1416 "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */
1269 "nop \n" 1417 "mov r8, r0 \n"
1418 "shll2 r0 \n"
1419 "shll2 r0 \n"
1420 "xor r4, r0 \n"
1421 "and %[rx], r0 \n"
1422 "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */
1423 "shlr2 r0 \n"
1424 "shlr2 r0 \n"
1425 "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */
1426
1427 "mov.l .wa_mask2, %[rx] \n" /* bitmask = ...11001100 */
1428 "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/
1429 "shll2 r0 \n"
1430 "xor r1, r0 \n"
1431 "and %[rx], r0 \n"
1432 "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */
1433 "shlr2 r0 \n"
1434 "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */
1435 "mov r4, r0 \n"
1436 "shll2 r0 \n"
1437 "xor r2, r0 \n"
1438 "and %[rx], r0 \n"
1439 "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1440 "shlr2 r0 \n"
1441 "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */
1442 "mov r7, r0 \n"
1443 "shll2 r0 \n"
1444 "xor r5, r0 \n"
1445 "and %[rx], r0 \n"
1446 "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */
1447 "shlr2 r0 \n"
1448 "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */
1449 "mov r8, r0 \n"
1450 "shll2 r0 \n"
1451 "xor r6, r0 \n"
1452 "and %[rx], r0 \n"
1453 "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */
1454 "shlr2 r0 \n"
1455 "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */
1456
1457 "mov.l .wa_mask1, %[rx] \n" /* bitmask = ...10101010 */
1458 "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/
1459 "shll r0 \n"
1460 "xor r1, r0 \n"
1461 "and %[rx], r0 \n"
1462 "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */
1463 "shlr r0 \n"
1464 "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */
1465 "mov r4, r0 \n"
1466 "shll r0 \n"
1467 "xor r3, r0 \n"
1468 "and %[rx], r0 \n"
1469 "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */
1470 "shlr r0 \n"
1471 "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */
1472 "mov r6, r0 \n"
1473 "shll r0 \n"
1474 "xor r5, r0 \n"
1475 "and %[rx], r0 \n"
1476 "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */
1477 "shlr r0 \n"
1478 "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */
1479 "mov r8, r0 \n"
1480 "shll r0 \n"
1481 "xor r7, r0 \n"
1482 "and %[rx], r0 \n"
1483 "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */
1484 "shlr r0 \n"
1485 "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */
1486
1487 "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
1488 "extu.b %[mask], %[mask] \n" /* mask out high bits */
1489 "tst %[mask], %[mask] \n"
1490 "bt .wa_sloop \n" /* short loop if nothing to keep */
1491
1492 ".wa_floop: \n" /** full loop (there are bits to keep)**/
1493 "mov #8, r0 \n"
1494 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1495 "bt .wa_f8 \n"
1496
1497 "mulu %[psiz], %[dpth] \n"
1498 "mova .wa_ftable, r0 \n"
1499 "mov.b @(r0, %[dpth]), %[rx] \n"
1500 "add %[rx], r0 \n"
1501 "sts macl, %[rx] \n" /* point behind the last plane.. */
1502 "jmp @r0 \n" /* jump into streak */
1503 "add %[rx], %[addr] \n" /* ..for this round */
1504
1505 ".align 2 \n"
1506 ".wa_ftable: \n"
1507 ".byte .wa_f0 - .wa_ftable \n"
1508 ".byte .wa_f1 - .wa_ftable \n"
1509 ".byte .wa_f2 - .wa_ftable \n"
1510 ".byte .wa_f3 - .wa_ftable \n"
1511 ".byte .wa_f4 - .wa_ftable \n"
1512 ".byte .wa_f5 - .wa_ftable \n"
1513 ".byte .wa_f6 - .wa_ftable \n"
1514 ".byte .wa_f7 - .wa_ftable \n"
1515
1516 ".wa_f8: \n"
1517 "mov %[psiz], %[rx] \n"
1518 "shll2 %[rx] \n"
1519 "add %[rx], %[rx] \n"
1520 "add %[rx], %[addr] \n"
1521 /* Point behind the last plane for this round. Note: We're using the
1522 * registers backwards in order to reuse the streak for the last round.
1523 * Therefore we need to go thru the bitplanes backwards too, otherwise
1524 * the bit order would be destroyed which results in more flicker. */
1525 "sub %[psiz], %[addr] \n"
1526 "mov.b @%[addr], r0 \n" /* load old byte */
1527 "and %[mask], r0 \n" /* mask out replaced bits */
1528 "or r8, r0 \n" /* set new bits */
1529 "mov.b r0, @%[addr] \n" /* store byte */
1530 "shlr8 r8 \n" /* shift out used-up byte */
1531 ".wa_f7: \n"
1532 "sub %[psiz], %[addr] \n"
1533 "mov.b @%[addr], r0 \n"
1534 "and %[mask], r0 \n"
1535 "or r7, r0 \n"
1536 "mov.b r0, @%[addr] \n"
1537 "shlr8 r7 \n"
1538 ".wa_f6: \n"
1539 "sub %[psiz], %[addr] \n"
1540 "mov.b @%[addr], r0 \n"
1541 "and %[mask], r0 \n"
1542 "or r6, r0 \n"
1543 "mov.b r0, @%[addr] \n"
1544 "shlr8 r6 \n"
1545 ".wa_f5: \n"
1546 "sub %[psiz], %[addr] \n"
1547 "mov.b @%[addr], r0 \n"
1548 "and %[mask], r0 \n"
1549 "or r5, r0 \n"
1550 "mov.b r0, @%[addr] \n"
1551 "shlr8 r5 \n"
1552 ".wa_f4: \n"
1553 "sub %[psiz], %[addr] \n"
1554 "mov.b @%[addr], r0 \n"
1555 "and %[mask], r0 \n"
1556 "or r4, r0 \n"
1557 "mov.b r0, @%[addr] \n"
1558 "shlr8 r4 \n"
1559 ".wa_f3: \n"
1560 "sub %[psiz], %[addr] \n"
1561 "mov.b @%[addr], r0 \n"
1562 "and %[mask], r0 \n"
1563 "or r3, r0 \n"
1564 "mov.b r0, @%[addr] \n"
1565 "shlr8 r3 \n"
1566 ".wa_f2: \n"
1567 "sub %[psiz], %[addr] \n"
1568 "mov.b @%[addr], r0 \n"
1569 "and %[mask], r0 \n"
1570 "or r2, r0 \n"
1571 "mov.b r0, @%[addr] \n"
1572 "shlr8 r2 \n"
1573 ".wa_f1: \n"
1574 "sub %[psiz], %[addr] \n"
1575 "mov.b @%[addr], r0 \n"
1576 "and %[mask], r0 \n"
1577 "or r1, r0 \n"
1578 "mov.b r0, @%[addr] \n"
1579 "shlr8 r1 \n"
1580 ".wa_f0: \n"
1581
1582 "add %[rx], %[addr] \n" /* correct address */
1583 "add #-8, %[dpth] \n"
1584 "cmp/pl %[dpth] \n" /* next round if anything left */
1585 "bt .wa_floop \n"
1586
1587 "bra .wa_end \n"
1588 "nop \n"
1270 1589
1271 /* References to C library routines used in the precalc block */ 1590 /* References to C library routines used in the precalc block */
1272 ".align 2 \n" 1591 ".align 2 \n"
1273 ".ashlsi3: \n" /* C library routine: */ 1592 ".ashlsi3: \n" /* C library routine: */
1274 ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ 1593 ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
1275 ".lshrsi3: \n" /* C library routine: */ 1594 ".lshrsi3: \n" /* C library routine: */
1276 ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ 1595 ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
1277 /* both routines preserve r4, destroy r5 and take ~16 cycles */ 1596 /* both routines preserve r4, destroy r5 and take ~16 cycles */
1278 1597
1279 ".wa_sloop: \n" /** short loop (nothing to keep) **/ 1598 /* Bitmasks for the bit block rotation */
1280 "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ 1599 ".wa_mask4: \n"
1281 "rotcl r0 \n" /* rotate t bit into r0 */ 1600 ".long 0xF0F0F0F0 \n"
1282 "shlr r2 \n" 1601 ".wa_mask2: \n"
1283 "rotcl r0 \n" 1602 ".long 0xCCCCCCCC \n"
1284 "shlr r3 \n" 1603 ".wa_mask1: \n"
1285 "rotcl r0 \n" 1604 ".long 0xAAAAAAAA \n"
1286 "shlr r6 \n" 1605
1287 "rotcl r0 \n" 1606 ".wa_sloop: \n" /** short loop (nothing to keep) **/
1288 "shlr r7 \n" 1607 "mov #8, r0 \n"
1289 "rotcl r0 \n" 1608 "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */
1290 "shlr r8 \n" 1609 "bt .wa_s8 \n"
1291 "rotcl r0 \n" 1610
1292 "shlr r9 \n" 1611 "mulu %[psiz], %[dpth] \n"
1293 "rotcl r0 \n" 1612 "mova .wa_stable, r0 \n"
1294 "shlr r10 \n" 1613 "mov.b @(r0, %[dpth]), %[rx] \n"
1295 "rotcl r0 \n" 1614 "add %[rx], r0 \n"
1296 "mov.b r0,@%[addr] \n" /* store byte to bitplane */ 1615 "sts macl, %[rx] \n" /* point behind the last plane.. */
1297 "add %[psiz],%[addr] \n" /* advance to next bitplane */ 1616 "jmp @r0 \n" /* jump into streak */
1298 "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ 1617 "add %[rx], %[addr] \n" /* ..for this round */
1299 "bt .wa_sloop \n" 1618
1300 1619 ".align 2 \n"
1301 ".wa_end: \n" 1620 ".wa_stable: \n"
1621 ".byte .wa_s0 - .wa_stable \n"
1622 ".byte .wa_s1 - .wa_stable \n"
1623 ".byte .wa_s2 - .wa_stable \n"
1624 ".byte .wa_s3 - .wa_stable \n"
1625 ".byte .wa_s4 - .wa_stable \n"
1626 ".byte .wa_s5 - .wa_stable \n"
1627 ".byte .wa_s6 - .wa_stable \n"
1628 ".byte .wa_s7 - .wa_stable \n"
1629
1630 ".wa_s8: \n"
1631 "mov %[psiz], %[rx] \n" /* Point behind the last plane */
1632 "shll2 %[rx] \n" /* for this round. */
1633 "add %[rx], %[rx] \n" /* See above. */
1634 "add %[rx], %[addr] \n"
1635
1636 "sub %[psiz], %[addr] \n"
1637 "mov.b r8, @%[addr] \n" /* store byte */
1638 "shlr8 r8 \n" /* shift out used-up byte */
1639 ".wa_s7: \n"
1640 "sub %[psiz], %[addr] \n"
1641 "mov.b r7, @%[addr] \n"
1642 "shlr8 r7 \n"
1643 ".wa_s6: \n"
1644 "sub %[psiz], %[addr] \n"
1645 "mov.b r6, @%[addr] \n"
1646 "shlr8 r6 \n"
1647 ".wa_s5: \n"
1648 "sub %[psiz], %[addr] \n"
1649 "mov.b r5, @%[addr] \n"
1650 "shlr8 r5 \n"
1651 ".wa_s4: \n"
1652 "sub %[psiz], %[addr] \n"
1653 "mov.b r4, @%[addr] \n"
1654 "shlr8 r4 \n"
1655 ".wa_s3: \n"
1656 "sub %[psiz], %[addr] \n"
1657 "mov.b r3, @%[addr] \n"
1658 "shlr8 r3 \n"
1659 ".wa_s2: \n"
1660 "sub %[psiz], %[addr] \n"
1661 "mov.b r2, @%[addr] \n"
1662 "shlr8 r2 \n"
1663 ".wa_s1: \n"
1664 "sub %[psiz], %[addr] \n"
1665 "mov.b r1, @%[addr] \n"
1666 "shlr8 r1 \n"
1667 ".wa_s0: \n"
1668
1669 "add %[rx], %[addr] \n" /* correct address */
1670 "add #-8, %[dpth] \n"
1671 "cmp/pl %[dpth] \n" /* next round if anything left */
1672 "bt .wa_sloop \n"
1673
1674 ".wa_end: \n"
1302 : /* outputs */ 1675 : /* outputs */
1303 [addr]"+r"(addr), 1676 [addr]"+r"(addr),
1304 [mask]"+r"(_mask), 1677 [mask]"+r"(_mask),
1678 [dpth]"+r"(depth),
1305 [rx] "=&r"(trash) 1679 [rx] "=&r"(trash)
1306 : /* inputs */ 1680 : /* inputs */
1307 [psiz]"r"(_gray_info.plane_size), 1681 [psiz]"r"(_gray_info.plane_size),
1308 [end] "r"(end),
1309 [patp]"[rx]"(pat_ptr) 1682 [patp]"[rx]"(pat_ptr)
1310 : /* clobbers */ 1683 : /* clobbers */
1311 "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" 1684 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
1312 ); 1685 );
1313#elif defined(CPU_COLDFIRE) 1686#elif defined(CPU_COLDFIRE)
1314 const unsigned char *_src; 1687 const unsigned char *_src;
1315 unsigned _mask, trash; 1688 unsigned _mask, depth, trash;
1316 1689
1317 _mask = mask; 1690 _mask = mask;
1318 _src = src; 1691 _src = src;
1319 1692
1320 /* precalculate the bit patterns with random shifts 1693 /* precalculate the bit patterns with random shifts
1321 for all 8 pixels and put them on an extra "stack" */ 1694 for all 8 pixels and put them on an extra "stack" */
1322 asm volatile ( 1695 asm volatile
1323 "moveq.l #8,%%d3 \n" /* loop count */ 1696 (
1324 1697 "moveq.l #8, %%d3 \n" /* loop count */
1325 ".wa_loop: \n" /** load pattern for pixel **/ 1698
1326 "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ 1699 ".wa_loop: \n" /** load pattern for pixel **/
1327 "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ 1700 "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
1328 "bcc.b .wa_skip \n" /* skip this pixel */ 1701 "lsr.l #1, %[mask] \n" /* shift out lsb of mask */
1329 1702 "bcc.b .wa_skip \n" /* skip this pixel */
1330 "clr.l %%d0 \n" 1703
1331 "move.b (%[src]),%%d0 \n" /* load src byte */ 1704 "clr.l %%d0 \n"
1332 "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ 1705 "move.b (%[src]), %%d0 \n" /* load src byte */
1333 "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ 1706 "move.b (%%d0:l:1, %[trns]), %%d0 \n" /* idxtable into pattern index */
1334 1707 "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */
1335 "mulu.w #75,%[rnd] \n" /* multiply by 75 */ 1708
1336 "add.l #74,%[rnd] \n" /* add another 74 */ 1709 "mulu.w #75, %[rnd] \n" /* multiply by 75 */
1710 "add.l #74, %[rnd] \n" /* add another 74 */
1337 /* Since the lower bits are not very random: */ 1711 /* Since the lower bits are not very random: */
1338 "move.l %[rnd],%%d1 \n" 1712 "move.l %[rnd], %%d1 \n"
1339 "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ 1713 "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */
1340 "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ 1714 "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */
1341 1715
1342 "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ 1716 "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */
1343 "blo.b .wa_ntrim \n" 1717 "blo.b .wa_ntrim \n"
1344 "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ 1718 "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */
1345 ".wa_ntrim: \n" 1719 ".wa_ntrim: \n"
1346 1720
1347 "move.l %%d2,%%d0 \n" /** rotate pattern **/ 1721 "move.l %%d2, %%d0 \n" /** rotate pattern **/
1348 "lsl.l %%d1,%%d0 \n" 1722 "lsl.l %%d1, %%d0 \n"
1349 "sub.l %[dpth],%%d1 \n" 1723 "sub.l %[dpth], %%d1 \n"
1350 "neg.l %%d1 \n" /* d1 = depth - d1 */ 1724 "neg.l %%d1 \n" /* d1 = depth - d1 */
1351 "lsr.l %%d1,%%d2 \n" 1725 "lsr.l %%d1, %%d2 \n"
1352 "or.l %%d0,%%d2 \n" 1726 "or.l %%d0, %%d2 \n"
1353 1727
1354 ".wa_skip: \n" 1728 ".wa_skip: \n"
1355 "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ 1729 "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */
1356 1730
1357 "add.l %[stri],%[src] \n" /* src += stride; */ 1731 "add.l %[stri], %[src] \n" /* src += stride; */
1358 "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ 1732 "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */
1359 "bne.b .wa_loop \n" 1733 "bne.b .wa_loop \n"
1360 : /* outputs */ 1734 : /* outputs */
1361 [src] "+a"(_src), 1735 [src] "+a"(_src),
1362 [patp]"+a"(pat_ptr), 1736 [patp]"+a"(pat_ptr),
@@ -1373,97 +1747,297 @@ static void _writearray(unsigned char *address, const unsigned char *src,
1373 ); 1747 );
1374 1748
1375 addr = address; 1749 addr = address;
1376 end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); 1750 _mask = ~mask & 0xff;
1377 _mask = mask; 1751 depth = _gray_info.depth;
1378 1752
1379 /* set the bits for all 8 pixels in all bytes according to the 1753 /* set the bits for all 8 pixels in all bytes according to the
1380 * precalculated patterns on the pattern stack */ 1754 * precalculated patterns on the pattern stack */
1381 asm volatile ( 1755 asm volatile
1382 "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" 1756 (
1383 /* pop all 8 patterns */ 1757 "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */
1384 "not.l %[mask] \n" /* "set" mask -> "keep" mask */ 1758 /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */
1385 "and.l #0xFF,%[mask] \n" 1759
1386 "beq.b .wa_sstart \n" /* short loop if nothing to keep */ 1760 /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
1387 1761
1388 ".wa_floop: \n" /** full loop (there are bits to keep)**/ 1762 "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/
1389 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1763 "lsl.l #4, %%d0 \n"
1390 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1764 /* move.l %[ax], %%d5 */ /* already in d5 */
1391 "lsr.l #1,%%d3 \n" 1765 "eor.l %%d5, %%d0 \n"
1392 "addx.l %%d0,%%d0 \n" 1766 "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */
1393 "lsr.l #1,%%d4 \n" 1767 "eor.l %%d0, %%d5 \n"
1394 "addx.l %%d0,%%d0 \n" 1768 "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */
1395 "lsr.l #1,%%d5 \n" 1769 "lsr.l #4, %%d0 \n"
1396 "addx.l %%d0,%%d0 \n" 1770 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */
1397 "lsr.l #1,%%d6 \n" 1771 "move.l %%d2, %%d0 \n"
1398 "addx.l %%d0,%%d0 \n" 1772 "lsl.l #4, %%d0 \n"
1399 "move.l %%a0,%%d1 \n" 1773 "eor.l %%d6, %%d0 \n"
1400 "lsr.l #1,%%d1 \n" 1774 "and.l #0xF0F0F0F0, %%d0 \n"
1401 "addx.l %%d0,%%d0 \n" 1775 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */
1402 "move.l %%d1,%%a0 \n" 1776 "lsr.l #4, %%d0 \n"
1403 "move.l %%a1,%%d1 \n" 1777 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */
1404 "lsr.l #1,%%d1 \n" 1778 "move.l %%d3, %%d0 \n"
1405 "addx.l %%d0,%%d0 \n" 1779 "lsl.l #4, %%d0 \n"
1406 "move.l %%d1,%%a1 \n" 1780 "eor.l %%d7, %%d0 \n"
1407 "move.l %[ax],%%d1 \n" 1781 "and.l #0xF0F0F0F0, %%d0 \n"
1408 "lsr.l #1,%%d1 \n" 1782 "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */
1409 "addx.l %%d0,%%d0 \n" 1783 "lsr.l #4, %%d0 \n"
1410 "move.l %%d1,%[ax] \n" 1784 "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */
1411 1785 "move.l %%d4, %%d0 \n"
1412 "move.b (%[addr]),%%d1 \n" /* read old value */ 1786 "lsl.l #4, %%d0 \n"
1413 "and.l %[mask],%%d1 \n" /* mask out replaced bits */ 1787 "move.l %%a0, %%d5 \n"
1414 "or.l %%d0,%%d1 \n" /* set new bits */ 1788 "eor.l %%d5, %%d0 \n"
1415 "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ 1789 "and.l #0xF0F0F0F0, %%d0 \n"
1416 1790 "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */
1417 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1791 /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */
1418 "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ 1792 "lsr.l #4, %%d0 \n"
1419 "bhi.b .wa_floop \n" 1793 "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */
1420 1794
1421 "bra.b .wa_end \n" 1795 "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/
1422 1796 "lsl.l #2, %%d0 \n"
1423 ".wa_sstart: \n" 1797 /* move.l %%a0, %%d5 */ /* still in d5 */
1424 "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ 1798 "eor.l %%d5, %%d0 \n"
1425 1799 "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */
1426 ".wa_sloop: \n" /** short loop (nothing to keep) **/ 1800 "eor.l %%d0, %%d5 \n"
1427 "lsr.l #1,%%d2 \n" /* shift out pattern bit */ 1801 "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */
1428 "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ 1802 "lsr.l #2, %%d0 \n"
1429 "lsr.l #1,%%d3 \n" 1803 "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */
1430 "addx.l %%d0,%%d0 \n" 1804 "move.l %[ax], %%d5 \n"
1431 "lsr.l #1,%%d4 \n" 1805 "move.l %%d5, %%d0 \n"
1432 "addx.l %%d0,%%d0 \n" 1806 "lsl.l #2, %%d0 \n"
1433 "lsr.l #1,%%d5 \n" 1807 "eor.l %%d7, %%d0 \n"
1434 "addx.l %%d0,%%d0 \n" 1808 "and.l #0xCCCCCCCC, %%d0 \n"
1435 "lsr.l #1,%%d6 \n" 1809 "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */
1436 "addx.l %%d0,%%d0 \n" 1810 "lsr.l #2, %%d0 \n"
1437 "lsr.l #1,%[mask] \n" 1811 "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */
1438 "addx.l %%d0,%%d0 \n" 1812 /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */
1439 "move.l %%a1,%%d1 \n" 1813 "move.l %%d2, %%d0 \n"
1440 "lsr.l #1,%%d1 \n" 1814 "lsl.l #2, %%d0 \n"
1441 "addx.l %%d0,%%d0 \n" 1815 "eor.l %%d4, %%d0 \n"
1442 "move.l %%d1,%%a1 \n" 1816 "and.l #0xCCCCCCCC, %%d0 \n"
1443 "move.l %[ax],%%d1 \n" 1817 "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */
1444 "lsr.l #1,%%d1 \n" 1818 "lsr.l #2, %%d0 \n"
1445 "addx.l %%d0,%%d0 \n" 1819 "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */
1446 "move.l %%d1,%[ax] \n" 1820 "move.l %%d1, %%d0 \n"
1447 1821 "lsl.l #2, %%d0 \n"
1448 "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ 1822 "eor.l %%d3, %%d0 \n"
1449 "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ 1823 "and.l #0xCCCCCCCC, %%d0 \n"
1450 "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ 1824 "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */
1451 "bhi.b .wa_sloop \n" 1825 "lsr.l #2, %%d0 \n"
1452 1826 "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */
1453 ".wa_end: \n" 1827
1828 "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/
1829 "lsl.l #1, %%d0 \n"
1830 "eor.l %%d2, %%d0 \n"
1831 "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */
1832 "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */
1833 "lsr.l #1, %%d0 \n"
1834 "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */
1835 "move.l %%d3, %%d0 \n"
1836 "lsl.l #1, %%d0 \n"
1837 "eor.l %%d4, %%d0 \n"
1838 "and.l #0xAAAAAAAA, %%d0 \n"
1839 "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */
1840 "lsr.l #1, %%d0 \n"
1841 "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */
1842 /* move.l %[ax], %%d5 */ /* still in d5 */
1843 "move.l %%d5, %%d0 \n"
1844 "lsl.l #1, %%d0 \n"
1845 "eor.l %%d6, %%d0 \n"
1846 "and.l #0xAAAAAAAA, %%d0 \n"
1847 "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */
1848 "lsr.l #1, %%d0 \n"
1849 "eor.l %%d0, %%d5 \n"
1850 "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */
1851 "move.l %%d7, %%d0 \n"
1852 "lsl.l #1, %%d0 \n"
1853 "move.l %%a0, %%d5 \n"
1854 "eor.l %%d5, %%d0 \n"
1855 "and.l #0xAAAAAAAA, %%d0 \n"
1856 "eor.l %%d0, %%d5 \n"
1857 "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */
1858 "lsr.l #1, %%d0 \n"
1859 "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */
1860
1861 "tst.l %[mask] \n"
1862 "jeq .wa_sloop \n" /* short loop if nothing to keep */
1863
1864 "move.l %[mask], %%d5 \n" /* need mask in data reg. */
1865 "move.l %%d1, %[mask] \n" /* free d1 as working reg. */
1866
1867 ".wa_floop: \n" /** full loop (there are bits to keep)**/
1868 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1869 "bhs.s .wa_f8 \n"
1870
1871 "move.l %[psiz], %%d0 \n"
1872 "move.l %[dpth], %%d1 \n"
1873 "mulu.w %%d1, %%d0 \n" /* point behind the last plane */
1874 "add.l %%d0, %[addr] \n" /* for this round */
1875 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1876 "bra.s .wa_f1 \n" /* dpth == 0 should never happen */
1877 "bra.s .wa_f2 \n"
1878 "bra.s .wa_f3 \n"
1879 "bra.s .wa_f4 \n"
1880 "bra.s .wa_f5 \n"
1881 "bra.s .wa_f6 \n"
1882 "bra.s .wa_f7 \n"
1883
1884 ".wa_f8: \n"
1885 "move.l %[psiz], %%d0 \n"
1886 "lsl.l #3, %%d0 \n"
1887 "add.l %%d0, %[addr] \n"
1888 /* Point behind the last plane for this round. Note: We're using the
1889 * registers backwards in order to reuse the streak for the last round.
1890 * Therefore we need to go thru the bitplanes backwards too, otherwise
1891 * the bit order would be destroyed which results in more flicker. */
1892 "sub.l %[psiz], %[addr] \n"
1893 "move.b (%[addr]), %%d0 \n" /* load old byte */
1894 "and.l %%d5, %%d0 \n" /* mask out replaced bits */
1895 "move.l %[mask], %%d1 \n"
1896 "or.l %%d1, %%d0 \n" /* set new bits */
1897 "move.b %%d0, (%[addr]) \n" /* store byte */
1898 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1899 "move.l %%d1, %[mask] \n"
1900 ".wa_f7: \n"
1901 "sub.l %[psiz], %[addr] \n"
1902 "move.b (%[addr]), %%d0 \n"
1903 "and.l %%d5, %%d0 \n"
1904 "or.l %%d2, %%d0 \n"
1905 "move.b %%d0, (%[addr]) \n"
1906 "lsr.l #8, %%d2 \n"
1907 ".wa_f6: \n"
1908 "sub.l %[psiz], %[addr] \n"
1909 "move.b (%[addr]), %%d0 \n"
1910 "and.l %%d5, %%d0 \n"
1911 "or.l %%d3, %%d0 \n"
1912 "move.b %%d0, (%[addr]) \n"
1913 "lsr.l #8, %%d3 \n"
1914 ".wa_f5: \n"
1915 "sub.l %[psiz], %[addr] \n"
1916 "move.b (%[addr]), %%d0 \n"
1917 "and.l %%d5, %%d0 \n"
1918 "or.l %%d4, %%d0 \n"
1919 "move.b %%d0, (%[addr]) \n"
1920 "lsr.l #8, %%d4 \n"
1921 ".wa_f4: \n"
1922 "sub.l %[psiz], %[addr] \n"
1923 "move.b (%[addr]), %%d0 \n"
1924 "and.l %%d5, %%d0 \n"
1925 "move.l %[ax], %%d1 \n"
1926 "or.l %%d1, %%d0 \n"
1927 "move.b %%d0, (%[addr]) \n"
1928 "lsr.l #8, %%d1 \n"
1929 "move.l %%d1, %[ax] \n"
1930 ".wa_f3: \n"
1931 "sub.l %[psiz], %[addr] \n"
1932 "move.b (%[addr]), %%d0 \n"
1933 "and.l %%d5, %%d0 \n"
1934 "or.l %%d6, %%d0 \n"
1935 "move.b %%d0, (%[addr]) \n"
1936 "lsr.l #8, %%d6 \n"
1937 ".wa_f2: \n"
1938 "sub.l %[psiz], %[addr] \n"
1939 "move.b (%[addr]), %%d0 \n"
1940 "and.l %%d5, %%d0 \n"
1941 "or.l %%d7, %%d0 \n"
1942 "move.b %%d0, (%[addr]) \n"
1943 "lsr.l #8, %%d7 \n"
1944 ".wa_f1: \n"
1945 "sub.l %[psiz], %[addr] \n"
1946 "move.b (%[addr]), %%d0 \n"
1947 "and.l %%d5, %%d0 \n"
1948 "move.l %%a0, %%d1 \n"
1949 "or.l %%d1, %%d0 \n"
1950 "move.b %%d0, (%[addr]) \n"
1951 "lsr.l #8, %%d1 \n"
1952 "move.l %%d1, %%a0 \n"
1953
1954 "move.l %[psiz], %%d0 \n"
1955 "lsl.l #3, %%d0 \n"
1956 "add.l %%d0, %[addr] \n" /* correct address */
1957 "subq.l #8, %[dpth] \n"
1958 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
1959 "jgt .wa_floop \n" /* next round if anything left */
1960
1961 "jra .wa_end \n"
1962
1963 ".wa_sloop: \n" /** short loop (nothing to keep) **/
1964 "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */
1965 "bhs.s .wa_s8 \n"
1966
1967 "move.l %[psiz], %%d0 \n"
1968 "move.l %[dpth], %%d5 \n"
1969 "mulu.w %%d5, %%d0 \n" /* point behind the last plane */
1970 "add.l %%d0, %[addr] \n" /* for this round */
1971 "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */
1972 "bra.s .wa_s1 \n" /* dpth == 0 should never happen */
1973 "bra.s .wa_s2 \n"
1974 "bra.s .wa_s3 \n"
1975 "bra.s .wa_s4 \n"
1976 "bra.s .wa_s5 \n"
1977 "bra.s .wa_s6 \n"
1978 "bra.s .wa_s7 \n"
1979
1980 ".wa_s8: \n"
1981 "move.l %[psiz], %%d0 \n" /* Point behind the last plane */
1982 "lsl.l #3, %%d0 \n" /* for this round. */
1983 "add.l %%d0, %[addr] \n" /* See above. */
1984
1985 "sub.l %[psiz], %[addr] \n"
1986 "move.b %%d1, (%[addr]) \n" /* store byte */
1987 "lsr.l #8, %%d1 \n" /* shift out used-up byte */
1988 ".wa_s7: \n"
1989 "sub.l %[psiz], %[addr] \n"
1990 "move.b %%d2, (%[addr]) \n"
1991 "lsr.l #8, %%d2 \n"
1992 ".wa_s6: \n"
1993 "sub.l %[psiz], %[addr] \n"
1994 "move.b %%d3, (%[addr]) \n"
1995 "lsr.l #8, %%d3 \n"
1996 ".wa_s5: \n"
1997 "sub.l %[psiz], %[addr] \n"
1998 "move.b %%d4, (%[addr]) \n"
1999 "lsr.l #8, %%d4 \n"
2000 ".wa_s4: \n"
2001 "sub.l %[psiz], %[addr] \n"
2002 "move.l %[ax], %%d5 \n"
2003 "move.b %%d5, (%[addr]) \n"
2004 "lsr.l #8, %%d5 \n"
2005 "move.l %%d5, %[ax] \n"
2006 ".wa_s3: \n"
2007 "sub.l %[psiz], %[addr] \n"
2008 "move.b %%d6, (%[addr]) \n"
2009 "lsr.l #8, %%d6 \n"
2010 ".wa_s2: \n"
2011 "sub.l %[psiz], %[addr] \n"
2012 "move.b %%d7, (%[addr]) \n"
2013 "lsr.l #8, %%d7 \n"
2014 ".wa_s1: \n"
2015 "sub.l %[psiz], %[addr] \n"
2016 "move.l %%a0, %%d5 \n"
2017 "move.b %%d5, (%[addr]) \n"
2018 "lsr.l #8, %%d5 \n"
2019 "move.l %%d5, %%a0 \n"
2020
2021 "add.l %%d0, %[addr] \n" /* correct address */
2022 "subq.l #8, %[dpth] \n"
2023 "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */
2024 "jgt .wa_sloop \n" /* next round if anything left */
2025
2026 ".wa_end: \n"
1454 : /* outputs */ 2027 : /* outputs */
1455 [addr]"+a"(addr), 2028 [addr]"+a"(addr),
1456 [mask]"+d"(_mask), 2029 [dpth]"+a"(depth),
2030 [mask]"+a"(_mask),
1457 [ax] "=&a"(trash) 2031 [ax] "=&a"(trash)
1458 : /* inputs */ 2032 : /* inputs */
1459 [psiz]"a"(_gray_info.plane_size), 2033 [psiz]"a"(_gray_info.plane_size),
1460 [end] "a"(end),
1461 [patp]"[ax]"(pat_ptr) 2034 [patp]"[ax]"(pat_ptr)
1462 : /* clobbers */ 2035 : /* clobbers */
1463 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" 2036 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
1464 ); 2037 );
1465#else /* C version, for reference*/ 2038#else /* C version, for reference*/
1466#warning C version of _writearray() used 2039#warning C version of _writearray() used
2040 unsigned char *end;
1467 unsigned test = 1; 2041 unsigned test = 1;
1468 int i; 2042 int i;
1469 2043