From 71dc284b5d4f7bfd27fb50fd91184d2d5f70db21 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Fri, 11 Aug 2006 14:13:01 +0000 Subject: New algorithm for grayscale buffer updates which is faster for large buffer depths. Speedup (unbuffered, depth==32): +8% on H1x0, +17% on Recorder (depth==24), and +83% on iPod Mini. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10529 a1c6a512-1295-4272-9138-f99709370657 --- apps/plugins/lib/gray_core.c | 1417 +++++++++++++++++++++++++++++------------- apps/plugins/lib/gray_draw.c | 1156 +++++++++++++++++++++++++--------- 2 files changed, 1858 insertions(+), 715 deletions(-) (limited to 'apps/plugins/lib') diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c index e65a7f259e..809e88dba1 100644 --- a/apps/plugins/lib/gray_core.c +++ b/apps/plugins/lib/gray_core.c @@ -649,7 +649,8 @@ void gray_update_rect(int x, int y, int width, int height) bbuf = _gray_info.back_buffer + srcofs_row; #ifdef CPU_ARM - asm volatile ( + asm volatile + ( "ldr r0, [%[cbuf]] \n" "ldr r1, [%[bbuf]] \n" "eor r1, r0, r1 \n" @@ -668,137 +669,281 @@ void gray_update_rect(int x, int y, int width, int height) if (change != 0) { - unsigned char *addr, *end; - unsigned mask, trash; + unsigned char *addr; + unsigned mask, depth, trash; pat_ptr = &pat_stack[8]; /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ - asm volatile ( - "mov r3, #8 \n" /* loop count */ - "mov %[mask], #0 \n" - - ".ur_pre_loop: \n" - "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ - "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ - "ldrb r1, [%[bbuf]] \n" /* read back buffer */ - "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ - "mov r2, #0 \n" /* preset for skipped pixel */ - "cmp r0, r1 \n" /* no change? */ - "beq .ur_skip \n" /* -> skip */ - - "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ - - "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ - "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" - "add %[rnd], %[rnd], #74 \n" /* add another 74 */ - /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ - "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ - - "cmp r1, %[dpth] \n" /* random >= depth ? */ - "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ - - "mov r0, r2, lsl r1 \n" /** rotate pattern **/ - "sub r1, %[dpth], r1 \n" - "orr r2, r0, r2, lsr r1 \n" - - "orr %[mask], %[mask], #1 \n" /* set mask bit */ + asm volatile + ( + "mov r3, #8 \n" /* loop count */ + "mov %[mask], #0 \n" + + ".ur_pre_loop: \n" + "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ + "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ + "ldrb r1, [%[bbuf]] \n" /* read back buffer */ + "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ + "mov r2, #0 \n" /* preset for skipped pixel */ + "cmp r0, r1 \n" /* no change? */ + "beq .ur_skip \n" /* -> skip */ + + "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ + + "add %[rnd], %[rnd], %[rnd], lsl #2 \n" /* multiply by 75 */ + "rsb %[rnd], %[rnd], %[rnd], lsl #4 \n" + "add %[rnd], %[rnd], #74 \n" /* add another 74 */ + /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ + "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ + + "cmp r1, %[dpth] \n" /* random >= depth ? 
*/ + "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ + + "mov r0, r2, lsl r1 \n" /** rotate pattern **/ + "sub r1, %[dpth], r1 \n" + "orr r2, r0, r2, lsr r1 \n" + + "orr %[mask], %[mask], #1 \n" /* set mask bit */ - ".ur_skip: \n" - "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ - - "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ - "bne .ur_pre_loop \n" - : /* outputs */ - [cbuf]"+r"(cbuf), - [bbuf]"+r"(bbuf), - [patp]"+r"(pat_ptr), - [rnd] "+r"(_gray_random_buffer), - [mask]"=&r"(mask) - : /* inputs */ - [bpat]"r"(_gray_info.bitpattern), - [dpth]"r"(_gray_info.depth), - [rmsk]"r"(_gray_info.randmask) - : /* clobbers */ - "r0", "r1", "r2", "r3" + ".ur_skip: \n" + "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ + + "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ + "bne .ur_pre_loop \n" + : /* outputs */ + [cbuf]"+r"(cbuf), + [bbuf]"+r"(bbuf), + [patp]"+r"(pat_ptr), + [rnd] "+r"(_gray_random_buffer), + [mask]"=&r"(mask) + : /* inputs */ + [bpat]"r"(_gray_info.bitpattern), + [dpth]"r"(_gray_info.depth), + [rmsk]"r"(_gray_info.randmask) + : /* clobbers */ + "r0", "r1", "r2", "r3" ); addr = dst_row; - end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); + depth = _gray_info.depth; /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - asm volatile ( - "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ + asm volatile + ( + "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */ + + /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ + + "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/ + "orr %[rx], %[rx], %[rx], lsl #8 \n" + "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */ + "eor r0, r1, r5, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ + "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ + "eor r0, r2, r6, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ + "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ + "eor r0, r3, r7, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ + "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ + "eor r0, r4, r8, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ + "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ + + "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/ + "orr %[rx], %[rx], %[rx], lsl #8 \n" + "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */ + "eor r0, r1, r3, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ + "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ + "eor r0, r2, r4, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ + "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ + "eor r0, r5, r7, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ + "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ + "eor r0, r6, r8, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ + "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ + + "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/ + "orr %[rx], %[rx], %[rx], lsl #8 \n" + "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */ + "eor r0, r1, r2, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ + "eor r2, r2, r0, lsr #1 \n" /* r2 = 
...h1g1f1e1d1c1b1a1 */ + "eor r0, r3, r4, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ + "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ + "eor r0, r5, r6, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ + "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ + "eor r0, r7, r8, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ + "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + + "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ + "ands %[mask], %[mask], #0xff \n" + "beq .ur_sloop \n" /* short loop if no bits to keep */ + + ".ur_floop: \n" /** full loop (bits to keep)**/ + "cmp %[dpth], #8 \n" /* 8 planes or more left? */ + "bhs .ur_f8 \n" + + "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ + "add %[addr], %[addr], r0 \n" /* for this round */ + + "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + "add pc, pc, r0 \n" + ".ur_ftable: \n" + ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */ + ".byte .ur_f1 - .ur_ftable - 4 \n" + ".byte .ur_f2 - .ur_ftable - 4 \n" + ".byte .ur_f3 - .ur_ftable - 4 \n" + ".byte .ur_f4 - .ur_ftable - 4 \n" + ".byte .ur_f5 - .ur_ftable - 4 \n" + ".byte .ur_f6 - .ur_ftable - 4 \n" + ".byte .ur_f7 - .ur_ftable - 4 \n" + + ".ur_f8: \n" + "add %[addr], %[addr], %[psiz], lsl #3 \n" + /* Point behind the last plane for this round. Note: We're using the + * registers backwards in order to reuse the streak for the last round. + * Therefore we need to go thru the bitplanes backwards too, otherwise + * the bit order would be destroyed which results in more flicker. */ + "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ + "and r0, r0, %[mask] \n" /* mask out replaced bits */ + "orr r0, r0, r8 \n" /* set new bits */ + "strb r0, [%[addr]] \n" /* store byte */ + "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + ".ur_f7: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r7 \n" + "strb r0, [%[addr]] \n" + "mov r7, r7, lsr #8 \n" + ".ur_f6: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r6 \n" + "strb r0, [%[addr]] \n" + "mov r6, r6, lsr #8 \n" + ".ur_f5: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r5 \n" + "strb r0, [%[addr]] \n" + "mov r5, r5, lsr #8 \n" + ".ur_f4: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r4 \n" + "strb r0, [%[addr]] \n" + "mov r4, r4, lsr #8 \n" + ".ur_f3: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r3 \n" + "strb r0, [%[addr]] \n" + "mov r3, r3, lsr #8 \n" + ".ur_f2: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r2 \n" + "strb r0, [%[addr]] \n" + "mov r2, r2, lsr #8 \n" + ".ur_f1: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r1 \n" + "strb r0, [%[addr]] \n" + "mov r1, r1, lsr #8 \n" + ".ur_f0: \n" + + "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ + "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ + "bhi .ur_floop \n" + + "b .ur_end \n" + + ".ur_sloop: \n" /** short loop (nothing to keep) **/ + "cmp %[dpth], #8 \n" /* 8 planes or more left? 
*/ + "bhs .ur_s8 \n" + + "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ + "add %[addr], %[addr], r0 \n" /* for this round */ - "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ - "ands %[mask], %[mask], #0xff \n" - "beq .ur_sloop \n" /* short loop if nothing to keep */ - - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ - "adc r0, r0, r0 \n" /* put bit into LSB for byte */ - "movs r8, r8, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r7, r7, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r6, r6, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r5, r5, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r4, r4, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r3, r3, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r2, r2, lsr #1 \n" - "adc r0, r0, r0 \n" - - "ldrb r1, [%[addr]] \n" /* read old value */ - "and r1, r1, %[mask] \n" /* mask out replaced bits */ - "orr r1, r1, r0 \n" /* set new bits */ - "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ - - "cmp %[end], %[addr] \n" /* loop for all bitplanes */ - "bne .ur_floop \n" - - "b .ur_end \n" - - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ - "adc r0, r0, r0 \n" /* put bit into LSB for byte */ - "movs r8, r8, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r7, r7, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r6, r6, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r5, r5, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r4, r4, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r3, r3, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r2, r2, lsr #1 \n" - "adc r0, r0, r0 \n" - - "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ - - "cmp %[end], %[addr] \n" /* loop for all bitplanes */ - "bne .ur_sloop \n" - - ".ur_end: \n" - : /* outputs */ - [addr]"+r"(addr), - [mask]"+r"(mask), - [rx] "=&r"(trash) - : /* inputs */ - [psiz]"r"(_gray_info.plane_size), - [end] "r"(end), - [patp]"[rx]"(pat_ptr) - : /* clobbers */ - "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" - ); + "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + "add pc, pc, r0 \n" + ".ur_stable: \n" + ".byte .ur_s0 - .ur_stable - 4 \n" + ".byte .ur_s1 - .ur_stable - 4 \n" + ".byte .ur_s2 - .ur_stable - 4 \n" + ".byte .ur_s3 - .ur_stable - 4 \n" + ".byte .ur_s4 - .ur_stable - 4 \n" + ".byte .ur_s5 - .ur_stable - 4 \n" + ".byte .ur_s6 - .ur_stable - 4 \n" + ".byte .ur_s7 - .ur_stable - 4 \n" + + ".ur_s8: \n" + "add %[addr], %[addr], %[psiz], lsl #3 \n" + /* Point behind the last plane for this round. See above. */ + "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ + "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + ".ur_s7: \n" + "strb r7, [%[addr], -%[psiz]]! \n" + "mov r7, r7, lsr #8 \n" + ".ur_s6: \n" + "strb r6, [%[addr], -%[psiz]]! \n" + "mov r6, r6, lsr #8 \n" + ".ur_s5: \n" + "strb r5, [%[addr], -%[psiz]]! \n" + "mov r5, r5, lsr #8 \n" + ".ur_s4: \n" + "strb r4, [%[addr], -%[psiz]]! \n" + "mov r4, r4, lsr #8 \n" + ".ur_s3: \n" + "strb r3, [%[addr], -%[psiz]]! \n" + "mov r3, r3, lsr #8 \n" + ".ur_s2: \n" + "strb r2, [%[addr], -%[psiz]]! \n" + "mov r2, r2, lsr #8 \n" + ".ur_s1: \n" + "strb r1, [%[addr], -%[psiz]]! 
\n" + "mov r1, r1, lsr #8 \n" + ".ur_s0: \n" + + "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ + "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ + "bhi .ur_sloop \n" + + ".ur_end: \n" + : /* outputs */ + [addr]"+r"(addr), + [mask]"+r"(mask), + [dpth]"+r"(depth), + [rx] "=&r"(trash) + : /* inputs */ + [psiz]"r"(_gray_info.plane_size), + [patp]"[rx]"(pat_ptr) + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" + ); } #else /* C version, for reference*/ #warning C version of gray_update_rect() used @@ -873,7 +1018,7 @@ void gray_update_rect(int x, int y, int width, int height) for (i = 7; i >= 0; i--) data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); - + *addr = (*addr & mask) | data; addr += _gray_info.plane_size; test <<= 1; @@ -935,13 +1080,13 @@ void gray_update_rect(int x, int y, int width, int height) #if CONFIG_CPU == SH7034 asm volatile ( - "mov.l @%[cbuf],r1 \n" - "mov.l @%[bbuf],r2 \n" - "xor r1,r2 \n" - "mov.l @(4,%[cbuf]),r1 \n" - "mov.l @(4,%[bbuf]),%[chg] \n" - "xor r1,%[chg] \n" - "or r2,%[chg] \n" + "mov.l @%[cbuf], r1 \n" + "mov.l @%[bbuf], r2 \n" + "xor r1, r2 \n" + "mov.l @(4,%[cbuf]), r1 \n" + "mov.l @(4,%[bbuf]), %[chg]\n" + "xor r1, %[chg] \n" + "or r2, %[chg] \n" : /* outputs */ [chg] "=r"(change) : /* inputs */ @@ -953,176 +1098,402 @@ void gray_update_rect(int x, int y, int width, int height) if (change != 0) { - unsigned char *addr, *end; - unsigned mask, trash; + unsigned char *addr; + unsigned mask, depth, trash; pat_ptr = &pat_stack[8]; /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ - asm volatile ( - "mov #8,r3 \n" /* loop count */ - - ".ur_pre_loop: \n" - "mov.b @%[cbuf]+,r0\n" /* read current buffer */ - "mov.b @%[bbuf],r1 \n" /* read back buffer */ - "mov #0,r2 \n" /* preset for skipped pixel */ - "mov.b r0,@%[bbuf] \n" /* update back buffer */ - "add #1,%[bbuf] \n" - "cmp/eq r0,r1 \n" /* no change? */ - "bt .ur_skip \n" /* -> skip */ - - "shll2 r0 \n" /* pixel value -> pattern offset */ - "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ - - "mov #75,r0 \n" - "mulu r0,%[rnd] \n" /* multiply by 75 */ - "sts macl,%[rnd] \n" - "add #74,%[rnd] \n" /* add another 74 */ - /* Since the lower bits are not very random: */ - "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ - "and %[rmsk],r1 \n" /* mask out unneeded bits */ - - "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ - "bf .ur_ntrim \n" - "sub %[dpth],r1 \n" /* yes: random -= depth; */ - ".ur_ntrim: \n" + asm volatile + ( + "mov #8, r3 \n" /* loop count */ + + ".ur_pre_loop: \n" + "mov.b @%[cbuf]+, r0 \n" /* read current buffer */ + "mov.b @%[bbuf], r1 \n" /* read back buffer */ + "mov #0, r2 \n" /* preset for skipped pixel */ + "mov.b r0, @%[bbuf] \n" /* update back buffer */ + "add #1, %[bbuf] \n" + "cmp/eq r0, r1 \n" /* no change? */ + "bt .ur_skip \n" /* -> skip */ + + "shll2 r0 \n" /* pixel value -> pattern offset */ + "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */ + + "mov #75, r0 \n" + "mulu r0, %[rnd] \n" /* multiply by 75 */ + "sts macl, %[rnd] \n" + "add #74, %[rnd] \n" /* add another 74 */ + /* Since the lower bits are not very random: */ + "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */ + "and %[rmsk], r1 \n" /* mask out unneeded bits */ + + "cmp/hs %[dpth], r1 \n" /* random >= depth ? 
*/ + "bf .ur_ntrim \n" + "sub %[dpth], r1 \n" /* yes: random -= depth; */ + ".ur_ntrim: \n" - "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ - "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ - "mov r1,r5 \n" - - "mov %[dpth],r5 \n" - "sub r1,r5 \n" /* r5 = depth - r1 */ - "mov.l .lshrsi3,r1 \n" - "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ - "mov r0,r2 \n" /* store previous result in r2 */ + "mov.l .ashlsi3, r0 \n" /** rotate pattern **/ + "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ + "mov r1, r5 \n" + + "mov %[dpth], r5 \n" + "sub r1, r5 \n" /* r5 = depth - r1 */ + "mov.l .lshrsi3, r1 \n" + "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ + "mov r0, r2 \n" /* store previous result in r2 */ - "or r0,r2 \n" /* rotated_pattern = r2 | r0 */ - "clrt \n" /* mask bit = 0 (replace) */ - - ".ur_skip: \n" /* T == 1 if skipped */ - "rotcr %[mask] \n" /* get mask bit */ - "mov.l r2,@-%[patp]\n" /* push on pattern stack */ - - "add #-1,r3 \n" /* loop 8 times (pixel block) */ - "cmp/pl r3 \n" - "bt .ur_pre_loop\n" - - "shlr8 %[mask] \n" /* shift mask to low byte */ - "shlr16 %[mask] \n" - : /* outputs */ - [cbuf]"+r"(cbuf), - [bbuf]"+r"(bbuf), - [rnd] "+r"(_gray_random_buffer), - [patp]"+r"(pat_ptr), - [mask]"=&r"(mask) - : /* inputs */ - [dpth]"r"(_gray_info.depth), - [bpat]"r"(_gray_info.bitpattern), - [rmsk]"r"(_gray_info.randmask) - : /* clobbers */ - "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" + "or r0, r2 \n" /* rotated_pattern = r2 | r0 */ + "clrt \n" /* mask bit = 0 (replace) */ + + ".ur_skip: \n" /* T == 1 if skipped */ + "rotcr %[mask] \n" /* get mask bit */ + "mov.l r2, @-%[patp] \n" /* push on pattern stack */ + + "add #-1, r3 \n" /* loop 8 times (pixel block) */ + "cmp/pl r3 \n" + "bt .ur_pre_loop \n" + + "shlr8 %[mask] \n" /* shift mask to low byte */ + "shlr16 %[mask] \n" + : /* outputs */ + [cbuf]"+r"(cbuf), + [bbuf]"+r"(bbuf), + [rnd] "+r"(_gray_random_buffer), + [patp]"+r"(pat_ptr), + [mask]"=&r"(mask) + : /* inputs */ + [dpth]"r"(_gray_info.depth), + [bpat]"r"(_gray_info.bitpattern), + [rmsk]"r"(_gray_info.randmask) + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr" ); addr = dst_row; - end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); + depth = _gray_info.depth; /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - asm volatile ( - "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ - "mov.l @%[patp]+,r2 \n" - "mov.l @%[patp]+,r3 \n" - "mov.l @%[patp]+,r6 \n" - "mov.l @%[patp]+,r7 \n" - "mov.l @%[patp]+,r8 \n" - "mov.l @%[patp]+,r9 \n" - "mov.l @%[patp],r10 \n" - - "tst %[mask],%[mask] \n" - "bt .ur_sloop \n" /* short loop if nothing to keep */ - - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" - "mov.b @%[addr],%[rx] \n" /* read old value */ - "rotcl r0 \n" - "and %[mask],%[rx] \n" /* mask out replaced bits */ - "or %[rx],r0 \n" /* set new bits */ - "mov.b r0,@%[addr] \n" /* store value to bitplane */ - "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ - "bt .ur_floop \n" - - "bra .ur_end \n" - "nop \n" - - /* References to C library routines used in the precalc block */ - ".align 2 \n" - ".ashlsi3: \n" /* C library 
routine: */ - ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ - ".lshrsi3: \n" /* C library routine: */ - ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ - /* both routines preserve r4, destroy r5 and take ~16 cycles */ - - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" - "rotcl r0 \n" - "mov.b r0,@%[addr] \n" /* store byte to bitplane */ - "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ - "bt .ur_sloop \n" - - ".ur_end: \n" - : /* outputs */ - [addr]"+r"(addr), - [mask]"+r"(mask), - [rx] "=&r"(trash) - : /* inputs */ - [psiz]"r"(_gray_info.plane_size), - [end] "r"(end), - [patp]"[rx]"(pat_ptr) - : /* clobbers */ - "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" + asm volatile + ( + "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */ + "mov.l @%[patp]+, r7 \n" + "mov.l @%[patp]+, r6 \n" + "mov.l @%[patp]+, r5 \n" + "mov.l @%[patp]+, r4 \n" + "mov.l @%[patp]+, r3 \n" + "mov.l @%[patp]+, r2 \n" + "mov.l @%[patp], r1 \n" + + /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ + + "mov.l .ur_mask4, %[rx] \n" /* bitmask = ...11110000 */ + "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/ + "shll2 r0 \n" + "shll2 r0 \n" + "xor r1, r0 \n" + "and %[rx], r0 \n" + "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ + "mov r6, r0 \n" + "shll2 r0 \n" + "shll2 r0 \n" + "xor r2, r0 \n" + "and %[rx], r0 \n" + "xor r0, r2 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ + "mov r7, r0 \n" + "shll2 r0 \n" + "shll2 r0 \n" + "xor r3, r0 \n" + "and %[rx], r0 \n" + "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ + "mov r8, r0 \n" + "shll2 r0 \n" + "shll2 r0 \n" + "xor r4, r0 \n" + "and %[rx], r0 \n" + "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ + + "mov.l .ur_mask2, %[rx] \n" /* bitmask = ...11001100 */ + "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/ + "shll2 r0 \n" + "xor r1, r0 \n" + "and %[rx], r0 \n" + "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ + "shlr2 r0 \n" + "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ + "mov r4, r0 \n" + "shll2 r0 \n" + "xor r2, r0 \n" + "and %[rx], r0 \n" + "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ + "shlr2 r0 \n" + "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ + "mov r7, r0 \n" + "shll2 r0 \n" + "xor r5, r0 \n" + "and %[rx], r0 \n" + "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ + "shlr2 r0 \n" + "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ + "mov r8, r0 \n" + "shll2 r0 \n" + "xor r6, r0 \n" + "and %[rx], r0 \n" + "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ + "shlr2 r0 \n" + "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ + + "mov.l .ur_mask1, %[rx] \n" /* bitmask = ...10101010 */ + "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/ + "shll r0 \n" + "xor r1, r0 \n" + "and %[rx], r0 \n" + "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ + "shlr r0 \n" + "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ + "mov r4, r0 \n" + "shll r0 \n" + "xor r3, r0 \n" + "and %[rx], r0 \n" + "xor 
r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ + "shlr r0 \n" + "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ + "mov r6, r0 \n" + "shll r0 \n" + "xor r5, r0 \n" + "and %[rx], r0 \n" + "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ + "shlr r0 \n" + "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ + "mov r8, r0 \n" + "shll r0 \n" + "xor r7, r0 \n" + "and %[rx], r0 \n" + "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ + "shlr r0 \n" + "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + + "tst %[mask], %[mask] \n" + "bt .ur_sloop \n" /* short loop if nothing to keep */ + + ".ur_floop: \n" /** full loop (there are bits to keep)**/ + "mov #8, r0 \n" + "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ + "bt .ur_f8 \n" + + "mulu %[psiz], %[dpth] \n" + "mova .ur_ftable, r0 \n" + "mov.b @(r0, %[dpth]), %[rx] \n" + "add %[rx], r0 \n" + "sts macl, %[rx] \n" /* point behind the last plane.. */ + "jmp @r0 \n" /* jump into streak */ + "add %[rx], %[addr] \n" /* ..for this round */ + + ".align 2 \n" + ".ur_ftable: \n" + ".byte .ur_f0 - .ur_ftable \n" + ".byte .ur_f1 - .ur_ftable \n" + ".byte .ur_f2 - .ur_ftable \n" + ".byte .ur_f3 - .ur_ftable \n" + ".byte .ur_f4 - .ur_ftable \n" + ".byte .ur_f5 - .ur_ftable \n" + ".byte .ur_f6 - .ur_ftable \n" + ".byte .ur_f7 - .ur_ftable \n" + + ".ur_f8: \n" + "mov %[psiz], %[rx] \n" + "shll2 %[rx] \n" + "add %[rx], %[rx] \n" + "add %[rx], %[addr] \n" + /* Point behind the last plane for this round. Note: We're using the + * registers backwards in order to reuse the streak for the last round. + * Therefore we need to go thru the bitplanes backwards too, otherwise + * the bit order would be destroyed which results in more flicker. */ + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" /* load old byte */ + "and %[mask], r0 \n" /* mask out replaced bits */ + "or r8, r0 \n" /* set new bits */ + "mov.b r0, @%[addr] \n" /* store byte */ + "shlr8 r8 \n" /* shift out used-up byte */ + ".ur_f7: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r7, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r7 \n" + ".ur_f6: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r6, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r6 \n" + ".ur_f5: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r5, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r5 \n" + ".ur_f4: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r4, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r4 \n" + ".ur_f3: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r3, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r3 \n" + ".ur_f2: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r2, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r2 \n" + ".ur_f1: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r1, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r1 \n" + ".ur_f0: \n" + + "add %[rx], %[addr] \n" /* correct address */ + "add #-8, %[dpth] \n" + "cmp/pl %[dpth] \n" /* next round if anything left */ + "bt .ur_floop \n" + + "bra .ur_end \n" + "nop \n" + + /* References to C library routines used in the precalc block */ + ".align 2 \n" + ".ashlsi3: \n" /* C library routine: */ + ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ + ".lshrsi3: \n" /* C library routine: */ + ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ + /* both routines preserve r4, destroy r5 and take ~16 cycles */ + + /* 
Bitmasks for the bit block rotation */ + ".ur_mask4: \n" + ".long 0xF0F0F0F0 \n" + ".ur_mask2: \n" + ".long 0xCCCCCCCC \n" + ".ur_mask1: \n" + ".long 0xAAAAAAAA \n" + + ".ur_sloop: \n" /** short loop (nothing to keep) **/ + "mov #8, r0 \n" + "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ + "bt .ur_s8 \n" + + "mulu %[psiz], %[dpth] \n" + "mova .ur_stable, r0 \n" + "mov.b @(r0, %[dpth]), %[rx] \n" + "add %[rx], r0 \n" + "sts macl, %[rx] \n" /* point behind the last plane.. */ + "jmp @r0 \n" /* jump into streak */ + "add %[rx], %[addr] \n" /* ..for this round */ + + ".align 2 \n" + ".ur_stable: \n" + ".byte .ur_s0 - .ur_stable \n" + ".byte .ur_s1 - .ur_stable \n" + ".byte .ur_s2 - .ur_stable \n" + ".byte .ur_s3 - .ur_stable \n" + ".byte .ur_s4 - .ur_stable \n" + ".byte .ur_s5 - .ur_stable \n" + ".byte .ur_s6 - .ur_stable \n" + ".byte .ur_s7 - .ur_stable \n" + + ".ur_s8: \n" + "mov %[psiz], %[rx] \n" /* Point behind the last plane */ + "shll2 %[rx] \n" /* for this round. */ + "add %[rx], %[rx] \n" /* See above. */ + "add %[rx], %[addr] \n" + + "sub %[psiz], %[addr] \n" + "mov.b r8, @%[addr] \n" /* store byte */ + "shlr8 r8 \n" /* shift out used-up byte */ + ".ur_s7: \n" + "sub %[psiz], %[addr] \n" + "mov.b r7, @%[addr] \n" + "shlr8 r7 \n" + ".ur_s6: \n" + "sub %[psiz], %[addr] \n" + "mov.b r6, @%[addr] \n" + "shlr8 r6 \n" + ".ur_s5: \n" + "sub %[psiz], %[addr] \n" + "mov.b r5, @%[addr] \n" + "shlr8 r5 \n" + ".ur_s4: \n" + "sub %[psiz], %[addr] \n" + "mov.b r4, @%[addr] \n" + "shlr8 r4 \n" + ".ur_s3: \n" + "sub %[psiz], %[addr] \n" + "mov.b r3, @%[addr] \n" + "shlr8 r3 \n" + ".ur_s2: \n" + "sub %[psiz], %[addr] \n" + "mov.b r2, @%[addr] \n" + "shlr8 r2 \n" + ".ur_s1: \n" + "sub %[psiz], %[addr] \n" + "mov.b r1, @%[addr] \n" + "shlr8 r1 \n" + ".ur_s0: \n" + + "add %[rx], %[addr] \n" /* correct address */ + "add #-8, %[dpth] \n" + "cmp/pl %[dpth] \n" /* next round if anything left */ + "bt .ur_sloop \n" + + ".ur_end: \n" + : /* outputs */ + [addr]"+r"(addr), + [dpth]"+r"(depth), + [rx] "=&r"(trash) + : /* inputs */ + [mask]"r"(mask), + [psiz]"r"(_gray_info.plane_size), + [patp]"[rx]"(pat_ptr) + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl" ); } #elif defined(CPU_COLDFIRE) asm volatile ( - "move.l (%[cbuf]),%%d0 \n" - "move.l (%[bbuf]),%%d1 \n" - "eor.l %%d0,%%d1 \n" - "move.l (4,%[cbuf]),%%d0 \n" - "move.l (4,%[bbuf]),%[chg] \n" - "eor.l %%d0,%[chg] \n" - "or.l %%d1,%[chg] \n" + "move.l (%[cbuf]), %%d0 \n" + "move.l (%[bbuf]), %%d1 \n" + "eor.l %%d0, %%d1 \n" + "move.l (4,%[cbuf]), %%d0 \n" + "move.l (4,%[bbuf]), %[chg] \n" + "eor.l %%d0, %[chg] \n" + "or.l %%d1, %[chg] \n" : /* outputs */ [chg] "=&d"(change) : /* inputs */ @@ -1134,160 +1505,359 @@ void gray_update_rect(int x, int y, int width, int height) if (change != 0) { - unsigned char *addr, *end; - unsigned mask, trash; + unsigned char *addr; + unsigned mask, depth, trash; pat_ptr = &pat_stack[8]; /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ - asm volatile ( - "moveq.l #8,%%d3 \n" /* loop count */ - "clr.l %[mask] \n" - - ".ur_pre_loop: \n" - "clr.l %%d0 \n" - "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ - "clr.l %%d1 \n" - "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ - "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ - "clr.l %%d2 \n" /* preset for skipped pixel */ - "cmp.l %%d0,%%d1 \n" /* no change? 
*/ - "beq.b .ur_skip \n" /* -> skip */ - - "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ - - "mulu.w #75,%[rnd] \n" /* multiply by 75 */ - "add.l #74,%[rnd] \n" /* add another 74 */ - /* Since the lower bits are not very random: */ - "move.l %[rnd],%%d1 \n" - "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ - "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ - - "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ - "blo.b .ur_ntrim \n" - "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ - ".ur_ntrim: \n" - - "move.l %%d2,%%d0 \n" /** rotate pattern **/ - "lsl.l %%d1,%%d0 \n" - "sub.l %[dpth],%%d1 \n" - "neg.l %%d1 \n" /* d1 = depth - d1 */ - "lsr.l %%d1,%%d2 \n" - "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ - - "or.l #0x0100,%[mask] \n" /* set mask bit */ - - ".ur_skip: \n" - "lsr.l #1,%[mask] \n" /* shift mask */ - "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ - - "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ - "bne.b .ur_pre_loop \n" - : /* outputs */ - [cbuf]"+a"(cbuf), - [bbuf]"+a"(bbuf), - [patp]"+a"(pat_ptr), - [rnd] "+d"(_gray_random_buffer), - [mask]"=&d"(mask) - : /* inputs */ - [bpat]"a"(_gray_info.bitpattern), - [dpth]"d"(_gray_info.depth), - [rmsk]"d"(_gray_info.randmask) - : /* clobbers */ - "d0", "d1", "d2", "d3" + asm volatile + ( + "moveq.l #8, %%d3 \n" /* loop count */ + "clr.l %[mask] \n" + + ".ur_pre_loop: \n" + "clr.l %%d0 \n" + "move.b (%[cbuf])+, %%d0 \n" /* read current buffer */ + "clr.l %%d1 \n" + "move.b (%[bbuf]), %%d1 \n" /* read back buffer */ + "move.b %%d0, (%[bbuf])+ \n" /* update back buffer */ + "clr.l %%d2 \n" /* preset for skipped pixel */ + "cmp.l %%d0, %%d1 \n" /* no change? */ + "beq.b .ur_skip \n" /* -> skip */ + + "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */ + + "mulu.w #75, %[rnd] \n" /* multiply by 75 */ + "add.l #74, %[rnd] \n" /* add another 74 */ + /* Since the lower bits are not very random: */ + "move.l %[rnd], %%d1 \n" + "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */ + "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */ + + "cmp.l %[dpth], %%d1 \n" /* random >= depth ? 
*/ + "blo.b .ur_ntrim \n" + "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */ + ".ur_ntrim: \n" + + "move.l %%d2, %%d0 \n" /** rotate pattern **/ + "lsl.l %%d1, %%d0 \n" + "sub.l %[dpth], %%d1 \n" + "neg.l %%d1 \n" /* d1 = depth - d1 */ + "lsr.l %%d1, %%d2 \n" + "or.l %%d0, %%d2 \n" /* rotated_pattern = d2 | d0 */ + + "or.l #0x0100, %[mask] \n" /* set mask bit */ + + ".ur_skip: \n" + "lsr.l #1, %[mask] \n" /* shift mask */ + "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */ + + "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */ + "bne.b .ur_pre_loop \n" + : /* outputs */ + [cbuf]"+a"(cbuf), + [bbuf]"+a"(bbuf), + [patp]"+a"(pat_ptr), + [rnd] "+d"(_gray_random_buffer), + [mask]"=&d"(mask) + : /* inputs */ + [bpat]"a"(_gray_info.bitpattern), + [dpth]"d"(_gray_info.depth), + [rmsk]"d"(_gray_info.randmask) + : /* clobbers */ + "d0", "d1", "d2", "d3" ); addr = dst_row; - end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); + mask = ~mask & 0xff; + depth = _gray_info.depth; /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - asm volatile ( - "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" - /* pop all 8 patterns */ - "not.l %[mask] \n" /* "set" mask -> "keep" mask */ - "and.l #0xFF,%[mask] \n" - "beq.b .ur_sstart \n" /* short loop if nothing to keep */ - - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "clr.l %%d0 \n" - "lsr.l #1,%%d2 \n" /* shift out pattern bit */ - "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a0,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" - - "move.b (%[addr]),%%d1 \n" /* read old value */ - "and.l %[mask],%%d1 \n" /* mask out replaced bits */ - "or.l %%d0,%%d1 \n" /* set new bits */ - "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ - - "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ - "bhi.b .ur_floop \n" - - "bra.b .ur_end \n" - - ".ur_sstart: \n" - "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ - - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "clr.l %%d0 \n" - "lsr.l #1,%%d2 \n" /* shift out pattern bit */ - "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%[mask] \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" - - "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ - "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ - "bhi.b .ur_sloop \n" - - ".ur_end: \n" - : /* outputs */ - [addr]"+a"(addr), - [mask]"+d"(mask), - [ax] "=&a"(trash) - : /* inputs */ - [psiz]"a"(_gray_info.plane_size), - [end] "a"(end), - [patp]"[ax]"(pat_ptr) - : /* clobbers */ - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" + asm volatile + ( + "movem.l 
(%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */ + /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */ + + /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ + + "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/ + "lsl.l #4, %%d0 \n" + /* move.l %[ax], %%d5 */ /* already in d5 */ + "eor.l %%d5, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */ + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */ + "move.l %%d2, %%d0 \n" + "lsl.l #4, %%d0 \n" + "eor.l %%d6, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" + "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */ + "move.l %%d3, %%d0 \n" + "lsl.l #4, %%d0 \n" + "eor.l %%d7, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" + "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */ + "move.l %%d4, %%d0 \n" + "lsl.l #4, %%d0 \n" + "move.l %%a0, %%d5 \n" + "eor.l %%d5, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" + "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */ + /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */ + + "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/ + "lsl.l #2, %%d0 \n" + /* move.l %%a0, %%d5 */ /* still in d5 */ + "eor.l %%d5, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */ + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */ + "move.l %[ax], %%d5 \n" + "move.l %%d5, %%d0 \n" + "lsl.l #2, %%d0 \n" + "eor.l %%d7, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" + "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */ + /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */ + "move.l %%d2, %%d0 \n" + "lsl.l #2, %%d0 \n" + "eor.l %%d4, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" + "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */ + "move.l %%d1, %%d0 \n" + "lsl.l #2, %%d0 \n" + "eor.l %%d3, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" + "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */ + + "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/ + "lsl.l #1, %%d0 \n" + "eor.l %%d2, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */ + "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */ + "move.l %%d3, %%d0 \n" + "lsl.l #1, %%d0 \n" + "eor.l %%d4, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" + "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */ + /* move.l %[ax], %%d5 */ /* still in d5 */ + "move.l %%d5, %%d0 \n" + "lsl.l #1, %%d0 \n" + "eor.l %%d6, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" + "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */ + "move.l %%d7, %%d0 \n" + "lsl.l #1, %%d0 \n" + "move.l %%a0, %%d5 \n" + "eor.l %%d5, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ + "lsr.l #1, 
%%d0 \n" + "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ + + "tst.l %[mask] \n" + "jeq .ur_sloop \n" /* short loop if nothing to keep */ + + "move.l %[mask], %%d5 \n" /* need mask in data reg. */ + "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ + + ".ur_floop: \n" /** full loop (there are bits to keep)**/ + "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ + "bhs.s .ur_f8 \n" + + "move.l %[psiz], %%d0 \n" + "move.l %[dpth], %%d1 \n" + "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ + "add.l %%d0, %[addr] \n" /* for this round */ + "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ + "bra.s .ur_f1 \n" /* dpth == 0 should never happen */ + "bra.s .ur_f2 \n" + "bra.s .ur_f3 \n" + "bra.s .ur_f4 \n" + "bra.s .ur_f5 \n" + "bra.s .ur_f6 \n" + "bra.s .ur_f7 \n" + + ".ur_f8: \n" + "move.l %[psiz], %%d0 \n" + "lsl.l #3, %%d0 \n" + "add.l %%d0, %[addr] \n" + /* Point behind the last plane for this round. Note: We're using the + * registers backwards in order to reuse the streak for the last round. + * Therefore we need to go thru the bitplanes backwards too, otherwise + * the bit order would be destroyed which results in more flicker. */ + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" /* load old byte */ + "and.l %%d5, %%d0 \n" /* mask out replaced bits */ + "move.l %[mask], %%d1 \n" + "or.l %%d1, %%d0 \n" /* set new bits */ + "move.b %%d0, (%[addr]) \n" /* store byte */ + "lsr.l #8, %%d1 \n" /* shift out used-up byte */ + "move.l %%d1, %[mask] \n" + ".ur_f7: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d2, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d2 \n" + ".ur_f6: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d3, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d3 \n" + ".ur_f5: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d4, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d4 \n" + ".ur_f4: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "move.l %[ax], %%d1 \n" + "or.l %%d1, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d1 \n" + "move.l %%d1, %[ax] \n" + ".ur_f3: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d6, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d6 \n" + ".ur_f2: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d7, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d7 \n" + ".ur_f1: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "move.l %%a0, %%d1 \n" + "or.l %%d1, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d1 \n" + "move.l %%d1, %%a0 \n" + + "move.l %[psiz], %%d0 \n" + "lsl.l #3, %%d0 \n" + "add.l %%d0, %[addr] \n" /* correct address */ + "subq.l #8, %[dpth] \n" + "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ + "jgt .ur_floop \n" /* next round if anything left */ + + "jra .ur_end \n" + + ".ur_sloop: \n" /** short loop (nothing to keep) **/ + "cmp.l #8, %[dpth] \n" /* 8 planes or more left? 
*/ + "bhs.s .ur_s8 \n" + + "move.l %[psiz], %%d0 \n" + "move.l %[dpth], %%d5 \n" + "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ + "add.l %%d0, %[addr] \n" /* for this round */ + "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ + "bra.s .ur_s1 \n" /* dpth == 0 should never happen */ + "bra.s .ur_s2 \n" + "bra.s .ur_s3 \n" + "bra.s .ur_s4 \n" + "bra.s .ur_s5 \n" + "bra.s .ur_s6 \n" + "bra.s .ur_s7 \n" + + ".ur_s8: \n" + "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ + "lsl.l #3, %%d0 \n" /* for this round. */ + "add.l %%d0, %[addr] \n" /* See above. */ + + "sub.l %[psiz], %[addr] \n" + "move.b %%d1, (%[addr]) \n" /* store byte */ + "lsr.l #8, %%d1 \n" /* shift out used-up byte */ + ".ur_s7: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d2, (%[addr]) \n" + "lsr.l #8, %%d2 \n" + ".ur_s6: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d3, (%[addr]) \n" + "lsr.l #8, %%d3 \n" + ".ur_s5: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d4, (%[addr]) \n" + "lsr.l #8, %%d4 \n" + ".ur_s4: \n" + "sub.l %[psiz], %[addr] \n" + "move.l %[ax], %%d5 \n" + "move.b %%d5, (%[addr]) \n" + "lsr.l #8, %%d5 \n" + "move.l %%d5, %[ax] \n" + ".ur_s3: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d6, (%[addr]) \n" + "lsr.l #8, %%d6 \n" + ".ur_s2: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d7, (%[addr]) \n" + "lsr.l #8, %%d7 \n" + ".ur_s1: \n" + "sub.l %[psiz], %[addr] \n" + "move.l %%a0, %%d5 \n" + "move.b %%d5, (%[addr]) \n" + "lsr.l #8, %%d5 \n" + "move.l %%d5, %%a0 \n" + + "add.l %%d0, %[addr] \n" /* correct address */ + "subq.l #8, %[dpth] \n" + "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ + "jgt .ur_sloop \n" /* next round if anything left */ + + ".ur_end: \n" + : /* outputs */ + [addr]"+a"(addr), + [dpth]"+a"(depth), + [mask]"+a"(mask), + [ax] "=&a"(trash) + : /* inputs */ + [psiz]"a"(_gray_info.plane_size), + [patp]"[ax]"(pat_ptr) + : /* clobbers */ + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0" ); } #else /* C version, for reference*/ @@ -1680,4 +2250,3 @@ static void gray_screendump_hook(int fd) } #endif /* HAVE_LCD_BITMAP */ - diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c index 9406664ea2..dcc65bdd09 100644 --- a/apps/plugins/lib/gray_draw.c +++ b/apps/plugins/lib/gray_draw.c @@ -868,24 +868,24 @@ void gray_ub_clear_display(void) /* Write a pixel block, defined by their brightnesses in a greymap. Address is the byte in the first bitplane, src is the greymap start address, - stride is the increment for the greymap to get to the next pixel, mask - determines which pixels of the destination block are changed. */ + mask determines which pixels of the destination block are changed. 
*/ static void _writearray(unsigned char *address, const unsigned char *src, unsigned mask) { unsigned long pat_stack[8]; unsigned long *pat_ptr = &pat_stack[8]; - unsigned char *addr, *end; + unsigned char *addr; #ifdef CPU_ARM const unsigned char *_src; - unsigned _mask, trash; + unsigned _mask, depth, trash; _mask = mask; _src = src; /* precalculate the bit patterns with random shifts for all 8 pixels and put them on an extra "stack" */ - asm volatile ( + asm volatile + ( "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ "mov r3, #8 \n" /* loop count */ @@ -932,83 +932,228 @@ static void _writearray(unsigned char *address, const unsigned char *src, ); addr = address; - end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); _mask = mask; + depth = _gray_info.depth; /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - asm volatile ( - "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ - - "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ + asm volatile + ( + "ldmia %[patp], {r1 - r8} \n" /* pop all 8 patterns */ + + /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ + + "mov %[rx], #0xF0 \n" /** Stage 1: 4 bit "comb" **/ + "orr %[rx], %[rx], %[rx], lsl #8 \n" + "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11110000 */ + "eor r0, r1, r5, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r1, r1, r0 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ + "eor r5, r5, r0, lsr #4 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ + "eor r0, r2, r6, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r2, r2, r0 \n" /* r2 = ...f3f2f1f0b3b2b1b0 */ + "eor r6, r6, r0, lsr #4 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ + "eor r0, r3, r7, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r3, r3, r0 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ + "eor r7, r7, r0, lsr #4 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ + "eor r0, r4, r8, lsl #4 \n" + "and r0, r0, %[rx] \n" + "eor r4, r4, r0 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ + "eor r8, r8, r0, lsr #4 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ + + "mov %[rx], #0xCC \n" /** Stage 2: 2 bit "comb" **/ + "orr %[rx], %[rx], %[rx], lsl #8 \n" + "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...11001100 */ + "eor r0, r1, r3, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r1, r1, r0 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ + "eor r3, r3, r0, lsr #2 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ + "eor r0, r2, r4, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r2, r2, r0 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ + "eor r4, r4, r0, lsr #2 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ + "eor r0, r5, r7, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r5, r5, r0 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ + "eor r7, r7, r0, lsr #2 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ + "eor r0, r6, r8, lsl #2 \n" + "and r0, r0, %[rx] \n" + "eor r6, r6, r0 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ + "eor r8, r8, r0, lsr #2 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ + + "mov %[rx], #0xAA \n" /** Stage 3: 1 bit "comb" **/ + "orr %[rx], %[rx], %[rx], lsl #8 \n" + "orr %[rx], %[rx], %[rx], lsl #16\n" /* bitmask = ...10101010 */ + "eor r0, r1, r2, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r1, r1, r0 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ + "eor r2, r2, r0, lsr #1 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ + "eor r0, r3, r4, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r3, r3, r0 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ + "eor r4, r4, r0, lsr #1 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ + "eor r0, r5, r6, lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r5, r5, r0 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ + "eor r6, r6, r0, lsr #1 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ + "eor r0, r7, r8, 
lsl #1 \n" + "and r0, r0, %[rx] \n" + "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ + "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + + "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ "ands %[mask], %[mask], #0xff \n" - "beq .wa_sloop \n" /* short loop if nothing to keep */ - - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ - "adc r0, r0, r0 \n" /* put bit into LSB of byte */ - "movs r8, r8, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r7, r7, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r6, r6, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r5, r5, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r4, r4, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r3, r3, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r2, r2, lsr #1 \n" - "adc r0, r0, r0 \n" - - "ldrb r1, [%[addr]] \n" /* read old value */ - "and r1, r1, %[mask] \n" /* mask out replaced bits */ - "orr r1, r1, r0 \n" /* set new bits */ - "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ - - "cmp %[end], %[addr] \n" /* loop through all bitplanes */ - "bne .wa_floop \n" - + "beq .wa_sloop \n" /* short loop if no bits to keep */ + + ".wa_floop: \n" /** full loop (bits to keep)**/ + "cmp %[dpth], #8 \n" /* 8 planes or more left? */ + "bhs .wa_f8 \n" + + "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ + "add %[addr], %[addr], r0 \n" /* for this round */ + + + "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + "add pc, pc, r0 \n" + ".wa_ftable: \n" + ".byte .wa_f0 - .wa_ftable - 4 \n" /* [jump tables are tricky] */ + ".byte .wa_f1 - .wa_ftable - 4 \n" + ".byte .wa_f2 - .wa_ftable - 4 \n" + ".byte .wa_f3 - .wa_ftable - 4 \n" + ".byte .wa_f4 - .wa_ftable - 4 \n" + ".byte .wa_f5 - .wa_ftable - 4 \n" + ".byte .wa_f6 - .wa_ftable - 4 \n" + ".byte .wa_f7 - .wa_ftable - 4 \n" + + ".wa_f8: \n" + "add %[addr], %[addr], %[psiz], lsl #3 \n" + /* Point behind the last plane for this round. Note: We're using the + * registers backwards in order to reuse the streak for the last round. + * Therefore we need to go thru the bitplanes backwards too, otherwise + * the bit order would be destroyed which results in more flicker. */ + "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ + "and r0, r0, %[mask] \n" /* mask out replaced bits */ + "orr r0, r0, r8 \n" /* set new bits */ + "strb r0, [%[addr]] \n" /* store byte */ + "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + ".wa_f7: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r7 \n" + "strb r0, [%[addr]] \n" + "mov r7, r7, lsr #8 \n" + ".wa_f6: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r6 \n" + "strb r0, [%[addr]] \n" + "mov r6, r6, lsr #8 \n" + ".wa_f5: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r5 \n" + "strb r0, [%[addr]] \n" + "mov r5, r5, lsr #8 \n" + ".wa_f4: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r4 \n" + "strb r0, [%[addr]] \n" + "mov r4, r4, lsr #8 \n" + ".wa_f3: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r3 \n" + "strb r0, [%[addr]] \n" + "mov r3, r3, lsr #8 \n" + ".wa_f2: \n" + "ldrb r0, [%[addr], -%[psiz]]! \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r2 \n" + "strb r0, [%[addr]] \n" + "mov r2, r2, lsr #8 \n" + ".wa_f1: \n" + "ldrb r0, [%[addr], -%[psiz]]! 
\n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r1 \n" + "strb r0, [%[addr]] \n" + "mov r1, r1, lsr #8 \n" + ".wa_f0: \n" + + "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ + "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ + "bhi .wa_floop \n" + "b .wa_end \n" - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ - "adc r0, r0, r0 \n" /* put bit into LSB of byte */ - "movs r8, r8, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r7, r7, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r6, r6, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r5, r5, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r4, r4, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r3, r3, lsr #1 \n" - "adc r0, r0, r0 \n" - "movs r2, r2, lsr #1 \n" - "adc r0, r0, r0 \n" - - "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ + ".wa_sloop: \n" /** short loop (nothing to keep) **/ + "cmp %[dpth], #8 \n" /* 8 planes or more left? */ + "bhs .wa_s8 \n" - "cmp %[end], %[addr] \n" /* loop through all bitplanes */ - "bne .wa_sloop \n" + "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ + "add %[addr], %[addr], r0 \n" /* for this round */ + "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + "add pc, pc, r0 \n" + ".wa_stable: \n" + ".byte .wa_s0 - .wa_stable - 4 \n" + ".byte .wa_s1 - .wa_stable - 4 \n" + ".byte .wa_s2 - .wa_stable - 4 \n" + ".byte .wa_s3 - .wa_stable - 4 \n" + ".byte .wa_s4 - .wa_stable - 4 \n" + ".byte .wa_s5 - .wa_stable - 4 \n" + ".byte .wa_s6 - .wa_stable - 4 \n" + ".byte .wa_s7 - .wa_stable - 4 \n" + + ".wa_s8: \n" + "add %[addr], %[addr], %[psiz], lsl #3 \n" + /* Point behind the last plane for this round. See above. */ + "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ + "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + ".wa_s7: \n" + "strb r7, [%[addr], -%[psiz]]! \n" + "mov r7, r7, lsr #8 \n" + ".wa_s6: \n" + "strb r6, [%[addr], -%[psiz]]! \n" + "mov r6, r6, lsr #8 \n" + ".wa_s5: \n" + "strb r5, [%[addr], -%[psiz]]! \n" + "mov r5, r5, lsr #8 \n" + ".wa_s4: \n" + "strb r4, [%[addr], -%[psiz]]! \n" + "mov r4, r4, lsr #8 \n" + ".wa_s3: \n" + "strb r3, [%[addr], -%[psiz]]! \n" + "mov r3, r3, lsr #8 \n" + ".wa_s2: \n" + "strb r2, [%[addr], -%[psiz]]! \n" + "mov r2, r2, lsr #8 \n" + ".wa_s1: \n" + "strb r1, [%[addr], -%[psiz]]! \n" + "mov r1, r1, lsr #8 \n" + ".wa_s0: \n" + + "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ + "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ + "bhi .wa_sloop \n" + ".wa_end: \n" : /* outputs */ [addr]"+r"(addr), [mask]"+r"(_mask), + [dpth]"+r"(depth), [rx] "=&r"(trash) : /* inputs */ [psiz]"r"(_gray_info.plane_size), - [end] "r"(end), [patp]"[rx]"(pat_ptr) : /* clobbers */ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); #else /* C version, for reference*/ #warning C version of _writearray() used + unsigned char *end; unsigned test = 0x80; int i; @@ -1142,68 +1287,71 @@ void gray_ub_gray_bitmap_part(const unsigned char *src, int src_x, int src_y, Address is the byte in the first bitplane, src is the greymap start address, stride is the increment for the greymap to get to the next pixel, mask determines which pixels of the destination block are changed. 
*/ +static void _writearray(unsigned char *address, const unsigned char *src, + int stride, unsigned mask) __attribute__((noinline)); static void _writearray(unsigned char *address, const unsigned char *src, int stride, unsigned mask) { unsigned long pat_stack[8]; unsigned long *pat_ptr = &pat_stack[8]; - unsigned char *addr, *end; + unsigned char *addr; #if CONFIG_CPU == SH7034 const unsigned char *_src; - unsigned _mask, trash; + unsigned _mask, depth, trash; _mask = mask; _src = src; /* precalculate the bit patterns with random shifts for all 8 pixels and put them on an extra "stack" */ - asm volatile ( - "mov #8,r3 \n" /* loop count */ - - ".wa_loop: \n" /** load pattern for pixel **/ - "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ - "shlr %[mask] \n" /* shift out lsb of mask */ - "bf .wa_skip \n" /* skip this pixel */ - - "mov.b @%[src],r0 \n" /* load src byte */ - "extu.b r0,r0 \n" /* extend unsigned */ - "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ - "extu.b r0,r0 \n" /* extend unsigned */ - "shll2 r0 \n" - "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ - - "mov #75,r0 \n" - "mulu r0,%[rnd] \n" /* multiply by 75 */ - "sts macl,%[rnd] \n" - "add #74,%[rnd] \n" /* add another 74 */ + asm volatile + ( + "mov #8, r3 \n" /* loop count */ + + ".wa_loop: \n" /** load pattern for pixel **/ + "mov #0, r0 \n" /* pattern for skipped pixel must be 0 */ + "shlr %[mask] \n" /* shift out lsb of mask */ + "bf .wa_skip \n" /* skip this pixel */ + + "mov.b @%[src], r0 \n" /* load src byte */ + "extu.b r0, r0 \n" /* extend unsigned */ + "mov.b @(r0,%[trns]), r0 \n" /* idxtable into pattern index */ + "extu.b r0, r0 \n" /* extend unsigned */ + "shll2 r0 \n" + "mov.l @(r0,%[bpat]), r4 \n" /* r4 = bitpattern[byte]; */ + + "mov #75, r0 \n" + "mulu r0, %[rnd] \n" /* multiply by 75 */ + "sts macl, %[rnd] \n" + "add #74, %[rnd] \n" /* add another 74 */ /* Since the lower bits are not very random: */ - "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ - "and %[rmsk],r1 \n" /* mask out unneeded bits */ + "swap.b %[rnd], r1 \n" /* get bits 8..15 (need max. 5) */ + "and %[rmsk], r1 \n" /* mask out unneeded bits */ - "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ - "bf .wa_ntrim \n" - "sub %[dpth],r1 \n" /* yes: random -= depth; */ - ".wa_ntrim: \n" + "cmp/hs %[dpth], r1 \n" /* random >= depth ? 
*/ + "bf .wa_ntrim \n" + "sub %[dpth], r1 \n" /* yes: random -= depth; */ + ".wa_ntrim: \n" - "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ - "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ - "mov r1,r5 \n" + "mov.l .ashlsi3, r0 \n" /** rotate pattern **/ + "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ + "mov r1, r5 \n" - "mov %[dpth],r5 \n" - "sub r1,r5 \n" /* r5 = depth - r1 */ - "mov.l .lshrsi3,r1 \n" - "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ - "mov r0,r1 \n" /* store previous result in r1 */ + "mov %[dpth], r5 \n" + "sub r1, r5 \n" /* r5 = depth - r1 */ + "mov.l .lshrsi3, r1 \n" + "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ + "mov r0, r1 \n" /* store previous result in r1 */ - "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ + "or r1, r0 \n" /* rotated_pattern = r0 | r1 */ - ".wa_skip: \n" - "mov.l r0,@-%[patp] \n" /* push on pattern stack */ + ".wa_skip: \n" + "mov.l r0, @-%[patp] \n" /* push on pattern stack */ - "add %[stri],%[src] \n" /* src += stride; */ - "add #-1,r3 \n" /* loop 8 times (pixel block) */ - "cmp/pl r3 \n" - "bt .wa_loop \n" + "add %[stri], %[src] \n" /* src += stride; */ + "add #-1, r3 \n" /* loop 8 times (pixel block) */ + "cmp/pl r3 \n" + "bt .wa_loop \n" : /* outputs */ [src] "+r"(_src), [rnd] "+r"(_gray_random_buffer), @@ -1220,143 +1368,369 @@ static void _writearray(unsigned char *address, const unsigned char *src, ); addr = address; - end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); _mask = mask; + depth = _gray_info.depth; /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - asm volatile ( - "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ - "mov.l @%[patp]+,r2 \n" - "mov.l @%[patp]+,r3 \n" - "mov.l @%[patp]+,r6 \n" - "mov.l @%[patp]+,r7 \n" - "mov.l @%[patp]+,r8 \n" - "mov.l @%[patp]+,r9 \n" - "mov.l @%[patp],r10 \n" - - "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ - "extu.b %[mask],%[mask] \n" /* mask out high bits */ - "tst %[mask],%[mask] \n" - "bt .wa_sloop \n" /* short loop if nothing to keep */ - - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" - "mov.b @%[addr],%[rx] \n" /* read old value */ - "rotcl r0 \n" - "and %[mask],%[rx] \n" /* mask out replaced bits */ - "or %[rx],r0 \n" /* set new bits */ - "mov.b r0,@%[addr] \n" /* store value to bitplane */ - "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ - "bt .wa_floop \n" - - "bra .wa_end \n" - "nop \n" + asm volatile + ( + "mov.l @%[patp]+, r8 \n" /* pop all 8 patterns */ + "mov.l @%[patp]+, r7 \n" + "mov.l @%[patp]+, r6 \n" + "mov.l @%[patp]+, r5 \n" + "mov.l @%[patp]+, r4 \n" + "mov.l @%[patp]+, r3 \n" + "mov.l @%[patp]+, r2 \n" + "mov.l @%[patp], r1 \n" + + /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ + + "mov.l .wa_mask4, %[rx] \n" /* bitmask = ...11110000 */ + "mov r5, r0 \n" /** Stage 1: 4 bit "comb" **/ + "shll2 r0 \n" + "shll2 r0 \n" + "xor r1, r0 \n" + "and %[rx], r0 \n" + "xor r0, r1 \n" /* r1 = ...e3e2e1e0a3a2a1a0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r5 \n" /* r5 = ...e7e6e5e4a7a6a5a4 */ + "mov r6, r0 \n" + "shll2 r0 \n" + "shll2 r0 \n" + "xor r2, r0 \n" + "and %[rx], r0 \n" + "xor r0, r2 \n" /* r2 = 
...f3f2f1f0b3b2b1b0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r6 \n" /* r6 = ...f7f6f5f4f7f6f5f4 */ + "mov r7, r0 \n" + "shll2 r0 \n" + "shll2 r0 \n" + "xor r3, r0 \n" + "and %[rx], r0 \n" + "xor r0, r3 \n" /* r3 = ...g3g2g1g0c3c2c1c0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r7 \n" /* r7 = ...g7g6g5g4c7c6c5c4 */ + "mov r8, r0 \n" + "shll2 r0 \n" + "shll2 r0 \n" + "xor r4, r0 \n" + "and %[rx], r0 \n" + "xor r0, r4 \n" /* r4 = ...h3h2h1h0d3d2d1d0 */ + "shlr2 r0 \n" + "shlr2 r0 \n" + "xor r0, r8 \n" /* r8 = ...h7h6h5h4d7d6d5d4 */ + + "mov.l .wa_mask2, %[rx] \n" /* bitmask = ...11001100 */ + "mov r3, r0 \n" /** Stage 2: 2 bit "comb" **/ + "shll2 r0 \n" + "xor r1, r0 \n" + "and %[rx], r0 \n" + "xor r0, r1 \n" /* r1 = ...g1g0e1e0c1c0a1a0 */ + "shlr2 r0 \n" + "xor r0, r3 \n" /* r3 = ...g3g2e3e2c3c2a3a2 */ + "mov r4, r0 \n" + "shll2 r0 \n" + "xor r2, r0 \n" + "and %[rx], r0 \n" + "xor r0, r2 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ + "shlr2 r0 \n" + "xor r0, r4 \n" /* r4 = ...h3h2f3f2d3d2b3b2 */ + "mov r7, r0 \n" + "shll2 r0 \n" + "xor r5, r0 \n" + "and %[rx], r0 \n" + "xor r0, r5 \n" /* r5 = ...g5g4e5e4c5c4a5a4 */ + "shlr2 r0 \n" + "xor r0, r7 \n" /* r7 = ...g7g6e7e6c7c6a7a6 */ + "mov r8, r0 \n" + "shll2 r0 \n" + "xor r6, r0 \n" + "and %[rx], r0 \n" + "xor r0, r6 \n" /* r6 = ...h5h4f5f4d5d4b5b4 */ + "shlr2 r0 \n" + "xor r0, r8 \n" /* r8 = ...h7h6f7f6d7d6b7b6 */ + + "mov.l .wa_mask1, %[rx] \n" /* bitmask = ...10101010 */ + "mov r2, r0 \n" /** Stage 3: 1 bit "comb" **/ + "shll r0 \n" + "xor r1, r0 \n" + "and %[rx], r0 \n" + "xor r0, r1 \n" /* r1 = ...h0g0f0e0d0c0b0a0 */ + "shlr r0 \n" + "xor r0, r2 \n" /* r2 = ...h1g1f1e1d1c1b1a1 */ + "mov r4, r0 \n" + "shll r0 \n" + "xor r3, r0 \n" + "and %[rx], r0 \n" + "xor r0, r3 \n" /* r3 = ...h2g2f2e2d2c2b2a2 */ + "shlr r0 \n" + "xor r0, r4 \n" /* r4 = ...h3g3f3e3d3c3b3a3 */ + "mov r6, r0 \n" + "shll r0 \n" + "xor r5, r0 \n" + "and %[rx], r0 \n" + "xor r0, r5 \n" /* r5 = ...h4g4f4e4d4c4b4a4 */ + "shlr r0 \n" + "xor r0, r6 \n" /* r6 = ...h5g5f5e5d5c5b5a5 */ + "mov r8, r0 \n" + "shll r0 \n" + "xor r7, r0 \n" + "and %[rx], r0 \n" + "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ + "shlr r0 \n" + "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + + "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ + "extu.b %[mask], %[mask] \n" /* mask out high bits */ + "tst %[mask], %[mask] \n" + "bt .wa_sloop \n" /* short loop if nothing to keep */ + + ".wa_floop: \n" /** full loop (there are bits to keep)**/ + "mov #8, r0 \n" + "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ + "bt .wa_f8 \n" + + "mulu %[psiz], %[dpth] \n" + "mova .wa_ftable, r0 \n" + "mov.b @(r0, %[dpth]), %[rx] \n" + "add %[rx], r0 \n" + "sts macl, %[rx] \n" /* point behind the last plane.. */ + "jmp @r0 \n" /* jump into streak */ + "add %[rx], %[addr] \n" /* ..for this round */ + + ".align 2 \n" + ".wa_ftable: \n" + ".byte .wa_f0 - .wa_ftable \n" + ".byte .wa_f1 - .wa_ftable \n" + ".byte .wa_f2 - .wa_ftable \n" + ".byte .wa_f3 - .wa_ftable \n" + ".byte .wa_f4 - .wa_ftable \n" + ".byte .wa_f5 - .wa_ftable \n" + ".byte .wa_f6 - .wa_ftable \n" + ".byte .wa_f7 - .wa_ftable \n" + + ".wa_f8: \n" + "mov %[psiz], %[rx] \n" + "shll2 %[rx] \n" + "add %[rx], %[rx] \n" + "add %[rx], %[addr] \n" + /* Point behind the last plane for this round. Note: We're using the + * registers backwards in order to reuse the streak for the last round. + * Therefore we need to go thru the bitplanes backwards too, otherwise + * the bit order would be destroyed which results in more flicker. 
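/* For reference: the 4/2/1-bit "comb" stages above are an 8x8 bit-matrix
 * transpose done with XOR deltas.  A rough C sketch for a single 8x8 block held
 * in eight bytes (hypothetical names, not the committed code); the assembly
 * transposes four such blocks at once, one per byte lane of the 32-bit
 * registers.  Afterwards, bit p of b[k] equals bit k of the original b[p]. */
#include <stdint.h>

static void transpose8_sketch(uint8_t b[8])
{
    uint8_t t;
    int i, j;

    for (i = 0; i < 4; i++)                     /* stage 1: 4 bit "comb" */
    {
        t = (b[i] ^ (b[i + 4] << 4)) & 0xF0;
        b[i]     ^= t;
        b[i + 4] ^= t >> 4;
    }
    for (j = 0; j < 8; j += 4)                  /* stage 2: 2 bit "comb" */
        for (i = j; i < j + 2; i++)
        {
            t = (b[i] ^ (b[i + 2] << 2)) & 0xCC;
            b[i]     ^= t;
            b[i + 2] ^= t >> 2;
        }
    for (i = 0; i < 8; i += 2)                  /* stage 3: 1 bit "comb" */
    {
        t = (b[i] ^ (b[i + 1] << 1)) & 0xAA;
        b[i]     ^= t;
        b[i + 1] ^= t >> 1;
    }
}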
*/ + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" /* load old byte */ + "and %[mask], r0 \n" /* mask out replaced bits */ + "or r8, r0 \n" /* set new bits */ + "mov.b r0, @%[addr] \n" /* store byte */ + "shlr8 r8 \n" /* shift out used-up byte */ + ".wa_f7: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r7, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r7 \n" + ".wa_f6: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r6, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r6 \n" + ".wa_f5: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r5, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r5 \n" + ".wa_f4: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r4, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r4 \n" + ".wa_f3: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r3, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r3 \n" + ".wa_f2: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r2, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r2 \n" + ".wa_f1: \n" + "sub %[psiz], %[addr] \n" + "mov.b @%[addr], r0 \n" + "and %[mask], r0 \n" + "or r1, r0 \n" + "mov.b r0, @%[addr] \n" + "shlr8 r1 \n" + ".wa_f0: \n" + + "add %[rx], %[addr] \n" /* correct address */ + "add #-8, %[dpth] \n" + "cmp/pl %[dpth] \n" /* next round if anything left */ + "bt .wa_floop \n" + + "bra .wa_end \n" + "nop \n" /* References to C library routines used in the precalc block */ - ".align 2 \n" - ".ashlsi3: \n" /* C library routine: */ - ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ - ".lshrsi3: \n" /* C library routine: */ - ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ + ".align 2 \n" + ".ashlsi3: \n" /* C library routine: */ + ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ + ".lshrsi3: \n" /* C library routine: */ + ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ /* both routines preserve r4, destroy r5 and take ~16 cycles */ - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" - "rotcl r0 \n" - "mov.b r0,@%[addr] \n" /* store byte to bitplane */ - "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ - "bt .wa_sloop \n" - - ".wa_end: \n" + /* Bitmasks for the bit block rotation */ + ".wa_mask4: \n" + ".long 0xF0F0F0F0 \n" + ".wa_mask2: \n" + ".long 0xCCCCCCCC \n" + ".wa_mask1: \n" + ".long 0xAAAAAAAA \n" + + ".wa_sloop: \n" /** short loop (nothing to keep) **/ + "mov #8, r0 \n" + "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ + "bt .wa_s8 \n" + + "mulu %[psiz], %[dpth] \n" + "mova .wa_stable, r0 \n" + "mov.b @(r0, %[dpth]), %[rx] \n" + "add %[rx], r0 \n" + "sts macl, %[rx] \n" /* point behind the last plane.. 
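/* For reference: the masked ("full loop") variant in rough C (hypothetical
 * names, not the committed code).  'keep' is the inverted pixel mask
 * (~mask & 0xFF); bits belonging to unselected pixels are preserved by a
 * read-modify-write of each plane byte, while skipped pixels contribute
 * all-zero pattern words and therefore set nothing. */
static void write_planes_masked_sketch(unsigned char *addr, unsigned long pat[8],
                                       unsigned keep, int depth, long plane_size)
{
    while (depth > 0)
    {
        int planes = (depth < 8) ? depth : 8;
        unsigned char *p = addr + planes * plane_size;
        int i;

        for (i = planes; i > 0; i--)
        {
            p -= plane_size;
            *p = (*p & keep) | (unsigned char)pat[i - 1];  /* keep masked bits */
            pat[i - 1] >>= 8;
        }
        addr  += 8 * plane_size;
        depth -= 8;
    }
}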
*/ + "jmp @r0 \n" /* jump into streak */ + "add %[rx], %[addr] \n" /* ..for this round */ + + ".align 2 \n" + ".wa_stable: \n" + ".byte .wa_s0 - .wa_stable \n" + ".byte .wa_s1 - .wa_stable \n" + ".byte .wa_s2 - .wa_stable \n" + ".byte .wa_s3 - .wa_stable \n" + ".byte .wa_s4 - .wa_stable \n" + ".byte .wa_s5 - .wa_stable \n" + ".byte .wa_s6 - .wa_stable \n" + ".byte .wa_s7 - .wa_stable \n" + + ".wa_s8: \n" + "mov %[psiz], %[rx] \n" /* Point behind the last plane */ + "shll2 %[rx] \n" /* for this round. */ + "add %[rx], %[rx] \n" /* See above. */ + "add %[rx], %[addr] \n" + + "sub %[psiz], %[addr] \n" + "mov.b r8, @%[addr] \n" /* store byte */ + "shlr8 r8 \n" /* shift out used-up byte */ + ".wa_s7: \n" + "sub %[psiz], %[addr] \n" + "mov.b r7, @%[addr] \n" + "shlr8 r7 \n" + ".wa_s6: \n" + "sub %[psiz], %[addr] \n" + "mov.b r6, @%[addr] \n" + "shlr8 r6 \n" + ".wa_s5: \n" + "sub %[psiz], %[addr] \n" + "mov.b r5, @%[addr] \n" + "shlr8 r5 \n" + ".wa_s4: \n" + "sub %[psiz], %[addr] \n" + "mov.b r4, @%[addr] \n" + "shlr8 r4 \n" + ".wa_s3: \n" + "sub %[psiz], %[addr] \n" + "mov.b r3, @%[addr] \n" + "shlr8 r3 \n" + ".wa_s2: \n" + "sub %[psiz], %[addr] \n" + "mov.b r2, @%[addr] \n" + "shlr8 r2 \n" + ".wa_s1: \n" + "sub %[psiz], %[addr] \n" + "mov.b r1, @%[addr] \n" + "shlr8 r1 \n" + ".wa_s0: \n" + + "add %[rx], %[addr] \n" /* correct address */ + "add #-8, %[dpth] \n" + "cmp/pl %[dpth] \n" /* next round if anything left */ + "bt .wa_sloop \n" + + ".wa_end: \n" : /* outputs */ [addr]"+r"(addr), [mask]"+r"(_mask), + [dpth]"+r"(depth), [rx] "=&r"(trash) : /* inputs */ [psiz]"r"(_gray_info.plane_size), - [end] "r"(end), [patp]"[rx]"(pat_ptr) : /* clobbers */ - "r0", "r1", "r2", "r3", "r6", "r7", "r8", "r9", "r10" + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl" ); #elif defined(CPU_COLDFIRE) const unsigned char *_src; - unsigned _mask, trash; + unsigned _mask, depth, trash; _mask = mask; _src = src; /* precalculate the bit patterns with random shifts for all 8 pixels and put them on an extra "stack" */ - asm volatile ( - "moveq.l #8,%%d3 \n" /* loop count */ - - ".wa_loop: \n" /** load pattern for pixel **/ - "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ - "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ - "bcc.b .wa_skip \n" /* skip this pixel */ - - "clr.l %%d0 \n" - "move.b (%[src]),%%d0 \n" /* load src byte */ - "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ - "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ - - "mulu.w #75,%[rnd] \n" /* multiply by 75 */ - "add.l #74,%[rnd] \n" /* add another 74 */ + asm volatile + ( + "moveq.l #8, %%d3 \n" /* loop count */ + + ".wa_loop: \n" /** load pattern for pixel **/ + "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ + "lsr.l #1, %[mask] \n" /* shift out lsb of mask */ + "bcc.b .wa_skip \n" /* skip this pixel */ + + "clr.l %%d0 \n" + "move.b (%[src]), %%d0 \n" /* load src byte */ + "move.b (%%d0:l:1, %[trns]), %%d0 \n" /* idxtable into pattern index */ + "move.l (%%d0:l:4, %[bpat]), %%d2 \n" /* d2 = bitpattern[byte]; */ + + "mulu.w #75, %[rnd] \n" /* multiply by 75 */ + "add.l #74, %[rnd] \n" /* add another 74 */ /* Since the lower bits are not very random: */ - "move.l %[rnd],%%d1 \n" - "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ - "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ - - "cmp.l %[dpth],%%d1 \n" /* random >= depth ? 
*/ - "blo.b .wa_ntrim \n" - "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ - ".wa_ntrim: \n" - - "move.l %%d2,%%d0 \n" /** rotate pattern **/ - "lsl.l %%d1,%%d0 \n" - "sub.l %[dpth],%%d1 \n" - "neg.l %%d1 \n" /* d1 = depth - d1 */ - "lsr.l %%d1,%%d2 \n" - "or.l %%d0,%%d2 \n" - - ".wa_skip: \n" - "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ - - "add.l %[stri],%[src] \n" /* src += stride; */ - "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ - "bne.b .wa_loop \n" + "move.l %[rnd], %%d1 \n" + "lsr.l #8, %%d1 \n" /* get bits 8..15 (need max. 5) */ + "and.l %[rmsk], %%d1 \n" /* mask out unneeded bits */ + + "cmp.l %[dpth], %%d1 \n" /* random >= depth ? */ + "blo.b .wa_ntrim \n" + "sub.l %[dpth], %%d1 \n" /* yes: random -= depth; */ + ".wa_ntrim: \n" + + "move.l %%d2, %%d0 \n" /** rotate pattern **/ + "lsl.l %%d1, %%d0 \n" + "sub.l %[dpth], %%d1 \n" + "neg.l %%d1 \n" /* d1 = depth - d1 */ + "lsr.l %%d1, %%d2 \n" + "or.l %%d0, %%d2 \n" + + ".wa_skip: \n" + "move.l %%d2, -(%[patp]) \n" /* push on pattern stack */ + + "add.l %[stri], %[src] \n" /* src += stride; */ + "subq.l #1, %%d3 \n" /* loop 8 times (pixel block) */ + "bne.b .wa_loop \n" : /* outputs */ [src] "+a"(_src), [patp]"+a"(pat_ptr), @@ -1373,97 +1747,297 @@ static void _writearray(unsigned char *address, const unsigned char *src, ); addr = address; - end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); - _mask = mask; + _mask = ~mask & 0xff; + depth = _gray_info.depth; /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - asm volatile ( - "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" - /* pop all 8 patterns */ - "not.l %[mask] \n" /* "set" mask -> "keep" mask */ - "and.l #0xFF,%[mask] \n" - "beq.b .wa_sstart \n" /* short loop if nothing to keep */ - - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "lsr.l #1,%%d2 \n" /* shift out pattern bit */ - "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a0,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" - - "move.b (%[addr]),%%d1 \n" /* read old value */ - "and.l %[mask],%%d1 \n" /* mask out replaced bits */ - "or.l %%d0,%%d1 \n" /* set new bits */ - "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ - - "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ - "bhi.b .wa_floop \n" - - "bra.b .wa_end \n" - - ".wa_sstart: \n" - "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ - - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "lsr.l #1,%%d2 \n" /* shift out pattern bit */ - "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%[mask] \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" - - "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ - 
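/* For reference: the scheme being removed here, in rough C (hypothetical names,
 * exact pixel-to-bit ordering glossed over).  It makes one pass per bitplane and
 * rebuilds every output byte bit by bit from the eight pattern words, which is
 * why it loses to the new byte-wise rounds at larger depths. */
static void write_planes_old_sketch(unsigned char *addr, unsigned long pat[8],
                                    int depth, long plane_size)
{
    unsigned char *end = addr + depth * plane_size;

    while (addr < end)
    {
        unsigned byte = 0;
        int i;

        for (i = 0; i < 8; i++)            /* gather one bit per pixel */
        {
            byte = (byte << 1) | (pat[i] & 1);
            pat[i] >>= 1;
        }
        *addr = (unsigned char)byte;
        addr += plane_size;                /* advance to the next bitplane */
    }
}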
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ - "bhi.b .wa_sloop \n" - - ".wa_end: \n" + asm volatile + ( + "movem.l (%[patp]), %%d1-%%d7/%%a0 \n" /* pop all 8 patterns */ + /* move.l %%d5, %[ax] */ /* need %%d5 as workspace, but not yet */ + + /** Rotate the four 8x8 bit "blocks" within r1..r8 **/ + + "move.l %%d1, %%d0 \n" /** Stage 1: 4 bit "comb" **/ + "lsl.l #4, %%d0 \n" + /* move.l %[ax], %%d5 */ /* already in d5 */ + "eor.l %%d5, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" /* bitmask = ...11110000 */ + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %[ax] \n" /* ax = ...h3h2h1h0d3d2d1d0 */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6h5h4d7d6d5d4 */ + "move.l %%d2, %%d0 \n" + "lsl.l #4, %%d0 \n" + "eor.l %%d6, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" + "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2g1g0c3c2c1c0 */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6g5g4c7c6c5c4 */ + "move.l %%d3, %%d0 \n" + "lsl.l #4, %%d0 \n" + "eor.l %%d7, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" + "eor.l %%d0, %%d7 \n" /* d7 = ...f3f2f1f0b3b2b1b0 */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d3 \n" /* d3 = ...f7f6f5f4f7f6f5f4 */ + "move.l %%d4, %%d0 \n" + "lsl.l #4, %%d0 \n" + "move.l %%a0, %%d5 \n" + "eor.l %%d5, %%d0 \n" + "and.l #0xF0F0F0F0, %%d0 \n" + "eor.l %%d0, %%d5 \n" /* (a0 = ...e3e2e1e0a3a2a1a0) */ + /* move.l %%d5, %%a0 */ /* but d5 is kept until next usage */ + "lsr.l #4, %%d0 \n" + "eor.l %%d0, %%d4 \n" /* d4 = ...e7e6e5e4a7a6a5a4 */ + + "move.l %%d6, %%d0 \n" /** Stage 2: 2 bit "comb" **/ + "lsl.l #2, %%d0 \n" + /* move.l %%a0, %%d5 */ /* still in d5 */ + "eor.l %%d5, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" /* bitmask = ...11001100 */ + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %%a0 \n" /* a0 = ...g1g0e1e0c1c0a1a0 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d6 \n" /* d6 = ...g3g2e3e2c3c2a3a2 */ + "move.l %[ax], %%d5 \n" + "move.l %%d5, %%d0 \n" + "lsl.l #2, %%d0 \n" + "eor.l %%d7, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" + "eor.l %%d0, %%d7 \n" /* r2 = ...h1h0f1f0d1d0b1b0 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d5 \n" /* (ax = ...h3h2f3f2d3d2b3b2) */ + /* move.l %%d5, %[ax] */ /* but d5 is kept until next usage */ + "move.l %%d2, %%d0 \n" + "lsl.l #2, %%d0 \n" + "eor.l %%d4, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" + "eor.l %%d0, %%d4 \n" /* d4 = ...g5g4e5e4c5c4a5a4 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d2 \n" /* d2 = ...g7g6e7e6c7c6a7a6 */ + "move.l %%d1, %%d0 \n" + "lsl.l #2, %%d0 \n" + "eor.l %%d3, %%d0 \n" + "and.l #0xCCCCCCCC, %%d0 \n" + "eor.l %%d0, %%d3 \n" /* d3 = ...h5h4f5f4d5d4b5b4 */ + "lsr.l #2, %%d0 \n" + "eor.l %%d0, %%d1 \n" /* d1 = ...h7h6f7f6d7d6b7b6 */ + + "move.l %%d1, %%d0 \n" /** Stage 3: 1 bit "comb" **/ + "lsl.l #1, %%d0 \n" + "eor.l %%d2, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" /* bitmask = ...10101010 */ + "eor.l %%d0, %%d2 \n" /* d2 = ...h6g6f6e6d6c6b6a6 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d1 \n" /* d1 = ...h7g7f7e7d7c7b7a7 */ + "move.l %%d3, %%d0 \n" + "lsl.l #1, %%d0 \n" + "eor.l %%d4, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" + "eor.l %%d0, %%d4 \n" /* d4 = ...h4g4f4e4d4c4b4a4 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d3 \n" /* d3 = ...h5g5f5e5d5c5b5a5 */ + /* move.l %[ax], %%d5 */ /* still in d5 */ + "move.l %%d5, %%d0 \n" + "lsl.l #1, %%d0 \n" + "eor.l %%d6, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" + "eor.l %%d0, %%d6 \n" /* d6 = ...h2g2f2e2d2c2b2a2 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %[ax] \n" /* ax = ...h3g3f3e3d3c3b3a3 */ + "move.l %%d7, %%d0 \n" + "lsl.l 
#1, %%d0 \n" + "move.l %%a0, %%d5 \n" + "eor.l %%d5, %%d0 \n" + "and.l #0xAAAAAAAA, %%d0 \n" + "eor.l %%d0, %%d5 \n" + "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ + "lsr.l #1, %%d0 \n" + "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ + + "tst.l %[mask] \n" + "jeq .wa_sloop \n" /* short loop if nothing to keep */ + + "move.l %[mask], %%d5 \n" /* need mask in data reg. */ + "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ + + ".wa_floop: \n" /** full loop (there are bits to keep)**/ + "cmp.l #8, %[dpth] \n" /* 8 planes or more left? */ + "bhs.s .wa_f8 \n" + + "move.l %[psiz], %%d0 \n" + "move.l %[dpth], %%d1 \n" + "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ + "add.l %%d0, %[addr] \n" /* for this round */ + "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ + "bra.s .wa_f1 \n" /* dpth == 0 should never happen */ + "bra.s .wa_f2 \n" + "bra.s .wa_f3 \n" + "bra.s .wa_f4 \n" + "bra.s .wa_f5 \n" + "bra.s .wa_f6 \n" + "bra.s .wa_f7 \n" + + ".wa_f8: \n" + "move.l %[psiz], %%d0 \n" + "lsl.l #3, %%d0 \n" + "add.l %%d0, %[addr] \n" + /* Point behind the last plane for this round. Note: We're using the + * registers backwards in order to reuse the streak for the last round. + * Therefore we need to go thru the bitplanes backwards too, otherwise + * the bit order would be destroyed which results in more flicker. */ + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" /* load old byte */ + "and.l %%d5, %%d0 \n" /* mask out replaced bits */ + "move.l %[mask], %%d1 \n" + "or.l %%d1, %%d0 \n" /* set new bits */ + "move.b %%d0, (%[addr]) \n" /* store byte */ + "lsr.l #8, %%d1 \n" /* shift out used-up byte */ + "move.l %%d1, %[mask] \n" + ".wa_f7: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d2, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d2 \n" + ".wa_f6: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d3, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d3 \n" + ".wa_f5: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d4, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d4 \n" + ".wa_f4: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "move.l %[ax], %%d1 \n" + "or.l %%d1, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d1 \n" + "move.l %%d1, %[ax] \n" + ".wa_f3: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d6, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d6 \n" + ".wa_f2: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d7, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d7 \n" + ".wa_f1: \n" + "sub.l %[psiz], %[addr] \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "move.l %%a0, %%d1 \n" + "or.l %%d1, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "lsr.l #8, %%d1 \n" + "move.l %%d1, %%a0 \n" + + "move.l %[psiz], %%d0 \n" + "lsl.l #3, %%d0 \n" + "add.l %%d0, %[addr] \n" /* correct address */ + "subq.l #8, %[dpth] \n" + "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ + "jgt .wa_floop \n" /* next round if anything left */ + + "jra .wa_end \n" + + ".wa_sloop: \n" /** short loop (nothing to keep) **/ + "cmp.l #8, %[dpth] \n" /* 8 planes or more left? 
*/ + "bhs.s .wa_s8 \n" + + "move.l %[psiz], %%d0 \n" + "move.l %[dpth], %%d5 \n" + "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ + "add.l %%d0, %[addr] \n" /* for this round */ + "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ + "bra.s .wa_s1 \n" /* dpth == 0 should never happen */ + "bra.s .wa_s2 \n" + "bra.s .wa_s3 \n" + "bra.s .wa_s4 \n" + "bra.s .wa_s5 \n" + "bra.s .wa_s6 \n" + "bra.s .wa_s7 \n" + + ".wa_s8: \n" + "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ + "lsl.l #3, %%d0 \n" /* for this round. */ + "add.l %%d0, %[addr] \n" /* See above. */ + + "sub.l %[psiz], %[addr] \n" + "move.b %%d1, (%[addr]) \n" /* store byte */ + "lsr.l #8, %%d1 \n" /* shift out used-up byte */ + ".wa_s7: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d2, (%[addr]) \n" + "lsr.l #8, %%d2 \n" + ".wa_s6: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d3, (%[addr]) \n" + "lsr.l #8, %%d3 \n" + ".wa_s5: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d4, (%[addr]) \n" + "lsr.l #8, %%d4 \n" + ".wa_s4: \n" + "sub.l %[psiz], %[addr] \n" + "move.l %[ax], %%d5 \n" + "move.b %%d5, (%[addr]) \n" + "lsr.l #8, %%d5 \n" + "move.l %%d5, %[ax] \n" + ".wa_s3: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d6, (%[addr]) \n" + "lsr.l #8, %%d6 \n" + ".wa_s2: \n" + "sub.l %[psiz], %[addr] \n" + "move.b %%d7, (%[addr]) \n" + "lsr.l #8, %%d7 \n" + ".wa_s1: \n" + "sub.l %[psiz], %[addr] \n" + "move.l %%a0, %%d5 \n" + "move.b %%d5, (%[addr]) \n" + "lsr.l #8, %%d5 \n" + "move.l %%d5, %%a0 \n" + + "add.l %%d0, %[addr] \n" /* correct address */ + "subq.l #8, %[dpth] \n" + "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ + "jgt .wa_sloop \n" /* next round if anything left */ + + ".wa_end: \n" : /* outputs */ [addr]"+a"(addr), - [mask]"+d"(_mask), + [dpth]"+a"(depth), + [mask]"+a"(_mask), [ax] "=&a"(trash) : /* inputs */ [psiz]"a"(_gray_info.plane_size), - [end] "a"(end), [patp]"[ax]"(pat_ptr) : /* clobbers */ - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "a0", "a1" + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0" ); #else /* C version, for reference*/ #warning C version of _writearray() used + unsigned char *end; unsigned test = 1; int i; -- cgit v1.2.3
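The partial-round entry in all three assembly versions (the .wa_ftable/.wa_stable
offset tables and the Coldfire "jmp (%pc, ...)" dispatch) is a computed jump into
a fixed streak of per-plane stores. A rough C equivalent of that dispatch, using
a switch with deliberate fall-through (hypothetical names, not the committed code):

static void store_streak_sketch(unsigned char *addr, unsigned long pat[8],
                                int planes, long plane_size)
{
    /* 'planes' is the number of bitplanes left in this round, 1..8 */
    unsigned char *p = addr + planes * plane_size;

    switch (planes)                      /* jump into the "streak" */
    {
    case 8: p -= plane_size; *p = (unsigned char)pat[7]; pat[7] >>= 8;
            /* fall through */
    case 7: p -= plane_size; *p = (unsigned char)pat[6]; pat[6] >>= 8;
            /* fall through */
    case 6: p -= plane_size; *p = (unsigned char)pat[5]; pat[5] >>= 8;
            /* fall through */
    case 5: p -= plane_size; *p = (unsigned char)pat[4]; pat[4] >>= 8;
            /* fall through */
    case 4: p -= plane_size; *p = (unsigned char)pat[3]; pat[3] >>= 8;
            /* fall through */
    case 3: p -= plane_size; *p = (unsigned char)pat[2]; pat[2] >>= 8;
            /* fall through */
    case 2: p -= plane_size; *p = (unsigned char)pat[1]; pat[1] >>= 8;
            /* fall through */
    case 1: p -= plane_size; *p = (unsigned char)pat[0]; pat[0] >>= 8;
    }
}

Storing backwards through the planes lets the same streak serve both the full
8-plane rounds and the final partial round without disturbing the bit order,
as the comments in the assembly note.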