From c00d799fa3a568ecb8649b5ce6d40366707b9551 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Mon, 7 Aug 2006 17:21:38 +0000 Subject: * Assembler optimised gray_update_rect() and writearray() for arm (greyscale iPods). * Some slight optimisations for coldfire (H1x0) and SH1 (archos). * Comment and formatting cleanup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10473 a1c6a512-1295-4272-9138-f99709370657 --- apps/plugins/lib/gray_core.c | 536 ++++++++++++++++++++++++++--------------- apps/plugins/lib/gray_draw.c | 496 ++++++++++++++++++++++++-------------- apps/plugins/lib/gray_scroll.c | 83 ++++--- 3 files changed, 692 insertions(+), 423 deletions(-) (limited to 'apps') diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c index c253a7112e..c162349f76 100644 --- a/apps/plugins/lib/gray_core.c +++ b/apps/plugins/lib/gray_core.c @@ -648,14 +648,165 @@ void gray_update_rect(int x, int y, int width, int height) cbuf = _gray_info.cur_buffer + srcofs_row; bbuf = _gray_info.back_buffer + srcofs_row; -#if 0 /* CPU specific asm versions will go here */ +#ifdef CPU_ARM + asm volatile ( + "ldr r0, [%[cbuf]] \n" + "ldr r1, [%[bbuf]] \n" + "eor r1, r0, r1 \n" + "ldr r0, [%[cbuf], #4] \n" + "ldr %[chg], [%[bbuf], #4] \n" + "eor %[chg], r0, %[chg] \n" + "orr %[chg], %[chg], r1 \n" + : /* outputs */ + [chg] "=&r"(change) + : /* inputs */ + [cbuf]"r"(cbuf), + [bbuf]"r"(bbuf) + : /* clobbers */ + "r0", "r1" + ); + + if (change != 0) + { + unsigned char *addr, *end; + unsigned mask, trash; + + pat_ptr = &pat_stack[8]; + + /* precalculate the bit patterns with random shifts + * for all 8 pixels and put them on an extra "stack" */ + asm volatile ( + "mov r3, #8 \n" /* loop count */ + "mov %[mask], #0 \n" + + ".ur_pre_loop: \n" + "mov %[mask], %[mask], lsl #1 \n" /* shift mask */ + "ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */ + "ldrb r1, [%[bbuf]] \n" /* read back buffer */ + "strb r0, [%[bbuf]], #1 \n" /* update back buffer */ + "mov r2, #0 \n" /* preset for skipped pixel */ + "cmp r0, r1 \n" /* no change? */ + "beq .ur_skip \n" /* -> skip */ + + "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ + + "add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */ + "add %[rnd], %[rnd], %[rnd], lsl #1 \n" + "add %[rnd], %[rnd], r0, lsl #3 \n" + "add %[rnd], %[rnd], #74 \n" /* add another 74 */ + /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ + "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ + + "cmp r1, %[dpth] \n" /* random >= depth ? */ + "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ + + "mov r0, r2, lsl r1 \n" /** rotate pattern **/ + "sub r1, %[dpth], r1 \n" + "orr r2, r0, r2, lsr r1 \n" + + "orr %[mask], %[mask], #1 \n" /* set mask bit */ + + ".ur_skip: \n" + "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ + + "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ + "bne .ur_pre_loop \n" + : /* outputs */ + [cbuf]"+r"(cbuf), + [bbuf]"+r"(bbuf), + [patp]"+r"(pat_ptr), + [rnd] "+r"(_gray_random_buffer), + [mask]"=&r"(mask) + : /* inputs */ + [bpat]"r"(_gray_info.bitpattern), + [dpth]"r"(_gray_info.depth), + [rmsk]"r"(_gray_info.randmask) + : /* clobbers */ + "r0", "r1", "r2", "r3" + ); + + addr = dst_row; + end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); + + /* set the bits for all 8 pixels in all bytes according to the + * precalculated patterns on the pattern stack */ + asm volatile ( + "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ + + "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ + "ands %[mask], %[mask], #0xff \n" + "beq .ur_sloop \n" /* short loop if nothing to keep */ + + ".ur_floop: \n" /** full loop (there are bits to keep)**/ + "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ + "adc r0, r0, r0 \n" /* put bit into LSB for byte */ + "movs r8, r8, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r7, r7, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r6, r6, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r5, r5, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r4, r4, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r3, r3, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r2, r2, lsr #1 \n" + "adc r0, r0, r0 \n" + + "ldrb r1, [%[addr]] \n" /* read old value */ + "and r1, r1, %[mask] \n" /* mask out replaced bits */ + "orr r1, r1, r0 \n" /* set new bits */ + "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ + + "cmp %[end], %[addr] \n" /* loop for all bitplanes */ + "bne .ur_floop \n" + + "b .ur_end \n" + + ".ur_sloop: \n" /** short loop (nothing to keep) **/ + "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ + "adc r0, r0, r0 \n" /* put bit into LSB for byte */ + "movs r8, r8, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r7, r7, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r6, r6, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r5, r5, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r4, r4, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r3, r3, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r2, r2, lsr #1 \n" + "adc r0, r0, r0 \n" + + "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ + + "cmp %[end], %[addr] \n" /* loop for all bitplanes */ + "bne .ur_sloop \n" + + ".ur_end: \n" + : /* outputs */ + [addr]"+r"(addr), + [mask]"+r"(mask), + [rx] "=&r"(trash) + : /* inputs */ + [psiz]"r"(_gray_info.plane_size), + [end] "r"(end), + [patp]"[rx]"(pat_ptr) + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" + ); + } #else /* C version, for reference*/ +#warning C version of gray_update_rect() used (void)pat_ptr; /* check whether anything changed in the 8-pixel block */ change = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf; - cbuf += sizeof(uint32_t); - bbuf += sizeof(uint32_t); - change |= *(uint32_t *)cbuf ^ *(uint32_t *)bbuf; + change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4); if (change != 0) { @@ -664,9 +815,6 @@ void gray_update_rect(int x, int y, int width, int height) unsigned test = 1; int i; - cbuf = _gray_info.cur_buffer + srcofs_row; - bbuf = _gray_info.back_buffer + srcofs_row; - /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ for (i = 7; i >= 0; i--) @@ -711,7 +859,7 @@ void gray_update_rect(int x, int y, int width, int height) for (i = 7; i >= 0; i--) data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0); - + *addr = data; addr += _gray_info.plane_size; test <<= 1; @@ -788,18 +936,18 @@ void gray_update_rect(int x, int y, int width, int height) #if CONFIG_CPU == SH7034 asm volatile ( - "mov.l @%[cbuf]+,r1 \n" - "mov.l @%[bbuf]+,r2 \n" - "xor r1,r2 \n" - "mov.l @%[cbuf],r1 \n" - "mov.l @%[bbuf],%[chg] \n" - "xor r1,%[chg] \n" - "or r2,%[chg] \n" + "mov.l @%[cbuf],r1 \n" + "mov.l @%[bbuf],r2 \n" + "xor r1,r2 \n" + "mov.l @(4,%[cbuf]),r1 \n" + "mov.l @(4,%[bbuf]),%[chg] \n" + "xor r1,%[chg] \n" + "or r2,%[chg] \n" : /* outputs */ - [cbuf]"+r"(cbuf), - [bbuf]"+r"(bbuf), [chg] "=r"(change) : /* inputs */ + [cbuf]"r"(cbuf), + [bbuf]"r"(bbuf) : /* clobbers */ "r1", "r2" ); @@ -810,13 +958,11 @@ void gray_update_rect(int x, int y, int width, int height) unsigned mask, trash; pat_ptr = &pat_stack[8]; - cbuf = _gray_info.cur_buffer + srcofs_row; - bbuf = _gray_info.back_buffer + srcofs_row; /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ asm volatile ( - "mov #8,r3 \n" /* loop count in r3: 8 pixels */ + "mov #8,r3 \n" /* loop count */ ".ur_pre_loop: \n" "mov.b @%[cbuf]+,r0\n" /* read current buffer */ @@ -860,10 +1006,11 @@ void gray_update_rect(int x, int y, int width, int height) "rotcr %[mask] \n" /* get mask bit */ "mov.l r2,@-%[patp]\n" /* push on pattern stack */ - "add #-1,r3 \n" /* decrease loop count */ - "cmp/pl r3 \n" /* loop count > 0? */ - "bt .ur_pre_loop\n" /* yes: loop */ - "shlr8 %[mask] \n" + "add #-1,r3 \n" /* loop 8 times (pixel block) */ + "cmp/pl r3 \n" + "bt .ur_pre_loop\n" + + "shlr8 %[mask] \n" /* shift mask to low byte */ "shlr16 %[mask] \n" : /* outputs */ [cbuf]"+r"(cbuf), @@ -885,77 +1032,77 @@ void gray_update_rect(int x, int y, int width, int height) /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ asm volatile ( - "mov.l @%[patp]+,r1\n" /* pop all 8 patterns */ - "mov.l @%[patp]+,r2\n" - "mov.l @%[patp]+,r3\n" - "mov.l @%[patp]+,r6\n" - "mov.l @%[patp]+,r7\n" - "mov.l @%[patp]+,r8\n" - "mov.l @%[patp]+,r9\n" - "mov.l @%[patp],r10\n" - - "tst %[mask],%[mask] \n" /* nothing to keep? */ - "bt .ur_sloop \n" /* yes: jump to short loop */ - - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" + "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ + "mov.l @%[patp]+,r2 \n" + "mov.l @%[patp]+,r3 \n" + "mov.l @%[patp]+,r6 \n" + "mov.l @%[patp]+,r7 \n" + "mov.l @%[patp]+,r8 \n" + "mov.l @%[patp]+,r9 \n" + "mov.l @%[patp],r10 \n" + + "tst %[mask],%[mask] \n" + "bt .ur_sloop \n" /* short loop if nothing to keep */ + + ".ur_floop: \n" /** full loop (there are bits to keep)**/ + "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ + "rotcl r0 \n" /* rotate t bit into r0 */ + "shlr r2 \n" + "rotcl r0 \n" + "shlr r3 \n" + "rotcl r0 \n" + "shlr r6 \n" + "rotcl r0 \n" + "shlr r7 \n" + "rotcl r0 \n" + "shlr r8 \n" + "rotcl r0 \n" + "shlr r9 \n" + "rotcl r0 \n" + "shlr r10 \n" "mov.b @%[addr],%[rx] \n" /* read old value */ - "rotcl r0 \n" - "and %[mask],%[rx] \n" /* mask out unneeded bits */ - "or %[rx],r0 \n" /* set new bits */ - "mov.b r0,@%[addr] \n" /* store value to bitplane */ + "rotcl r0 \n" + "and %[mask],%[rx] \n" /* mask out replaced bits */ + "or %[rx],r0 \n" /* set new bits */ + "mov.b r0,@%[addr] \n" /* store value to bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* last bitplane done? */ - "bt .ur_floop \n" /* no: loop */ + "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ + "bt .ur_floop \n" - "bra .ur_end \n" - "nop \n" + "bra .ur_end \n" + "nop \n" /* References to C library routines used in the precalc block */ - ".align 2 \n" - ".ashlsi3: \n" /* C library routine: */ - ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ - ".lshrsi3: \n" /* C library routine: */ - ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ + ".align 2 \n" + ".ashlsi3: \n" /* C library routine: */ + ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ + ".lshrsi3: \n" /* C library routine: */ + ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ /* both routines preserve r4, destroy r5 and take ~16 cycles */ - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" - "rotcl r0 \n" + ".ur_sloop: \n" /** short loop (nothing to keep) **/ + "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ + "rotcl r0 \n" /* rotate t bit into r0 */ + "shlr r2 \n" + "rotcl r0 \n" + "shlr r3 \n" + "rotcl r0 \n" + "shlr r6 \n" + "rotcl r0 \n" + "shlr r7 \n" + "rotcl r0 \n" + "shlr r8 \n" + "rotcl r0 \n" + "shlr r9 \n" + "rotcl r0 \n" + "shlr r10 \n" + "rotcl r0 \n" "mov.b r0,@%[addr] \n" /* store byte to bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* last bitplane done? */ - "bt .ur_sloop \n" /* no: loop */ + "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */ + "bt .ur_sloop \n" - ".ur_end: \n" + ".ur_end: \n" : /* outputs */ [addr]"+r"(addr), [mask]"+r"(mask), @@ -970,18 +1117,18 @@ void gray_update_rect(int x, int y, int width, int height) } #elif defined(CPU_COLDFIRE) asm volatile ( - "move.l (%[cbuf])+,%%d0 \n" - "move.l (%[bbuf])+,%%d1 \n" - "eor.l %%d0,%%d1 \n" - "move.l (%[cbuf]),%%d0 \n" - "move.l (%[bbuf]),%[chg]\n" - "eor.l %%d0,%[chg] \n" - "or.l %%d1,%[chg] \n" + "move.l (%[cbuf]),%%d0 \n" + "move.l (%[bbuf]),%%d1 \n" + "eor.l %%d0,%%d1 \n" + "move.l (4,%[cbuf]),%%d0 \n" + "move.l (4,%[bbuf]),%[chg] \n" + "eor.l %%d0,%[chg] \n" + "or.l %%d1,%[chg] \n" : /* outputs */ - [cbuf]"+a"(cbuf), - [bbuf]"+a"(bbuf), [chg] "=&d"(change) : /* inputs */ + [cbuf]"a"(cbuf), + [bbuf]"a"(bbuf) : /* clobbers */ "d0", "d1" ); @@ -992,54 +1139,52 @@ void gray_update_rect(int x, int y, int width, int height) unsigned mask, trash; pat_ptr = &pat_stack[8]; - cbuf = _gray_info.cur_buffer + srcofs_row; - bbuf = _gray_info.back_buffer + srcofs_row; /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ asm volatile ( - "moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */ - "clr.l %[mask] \n" - - ".ur_pre_loop: \n" - "clr.l %%d0 \n" - "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ - "clr.l %%d1 \n" - "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ - "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ - "clr.l %%d2 \n" /* preset for skipped pixel */ - "cmp.l %%d0,%%d1 \n" /* no change? */ - "beq.b .ur_skip \n" /* -> skip */ - - "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ - - "mulu.w #75,%[rnd] \n" /* multiply by 75 */ - "add.l #74,%[rnd] \n" /* add another 74 */ + "moveq.l #8,%%d3 \n" /* loop count */ + "clr.l %[mask] \n" + + ".ur_pre_loop: \n" + "clr.l %%d0 \n" + "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ + "clr.l %%d1 \n" + "move.b (%[bbuf]),%%d1 \n" /* read back buffer */ + "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ + "clr.l %%d2 \n" /* preset for skipped pixel */ + "cmp.l %%d0,%%d1 \n" /* no change? */ + "beq.b .ur_skip \n" /* -> skip */ + + "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ + + "mulu.w #75,%[rnd] \n" /* multiply by 75 */ + "add.l #74,%[rnd] \n" /* add another 74 */ /* Since the lower bits are not very random: */ - "move.l %[rnd],%%d1 \n" - "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ - "and.l %[rmsk],%%d1\n" /* mask out unneeded bits */ - - "cmp.l %[dpth],%%d1\n" /* random >= depth ? */ - "blo.b .ur_ntrim \n" - "sub.l %[dpth],%%d1\n" /* yes: random -= depth; */ - ".ur_ntrim: \n" - - "move.l %%d2,%%d0 \n" - "lsl.l %%d1,%%d0 \n" - "sub.l %[dpth],%%d1\n" - "neg.l %%d1 \n" /* d1 = depth - d1 */ - "lsr.l %%d1,%%d2 \n" - "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ + "move.l %[rnd],%%d1 \n" + "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ + "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ + + "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ + "blo.b .ur_ntrim \n" + "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ + ".ur_ntrim: \n" + + "move.l %%d2,%%d0 \n" /** rotate pattern **/ + "lsl.l %%d1,%%d0 \n" + "sub.l %[dpth],%%d1 \n" + "neg.l %%d1 \n" /* d1 = depth - d1 */ + "lsr.l %%d1,%%d2 \n" + "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ "or.l #0x0100,%[mask] \n" /* set mask bit */ - ".ur_skip: \n" - "lsr.l #1,%[mask] \n" /* shift mask */ + ".ur_skip: \n" + "lsr.l #1,%[mask] \n" /* shift mask */ "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ - "subq.l #1,%%d3 \n" /* decrease loop count */ - "bne.b .ur_pre_loop\n" /* yes: loop */ + "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ + "bne.b .ur_pre_loop \n" : /* outputs */ [cbuf]"+a"(cbuf), [bbuf]"+a"(bbuf), @@ -1061,79 +1206,79 @@ void gray_update_rect(int x, int y, int width, int height) * precalculated patterns on the pattern stack */ asm volatile ( "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" - /* pop all 8 patterns */ - "not.l %[mask] \n" /* set mask -> keep mask */ + /* pop all 8 patterns */ + "not.l %[mask] \n" /* "set" mask -> "keep" mask */ "and.l #0xFF,%[mask] \n" - "beq.b .ur_sstart \n" /* yes: jump to short loop */ - - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "clr.l %%d0 \n" - "lsr.l #1,%%d2 \n" /* shift out mask bit */ - "addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a0,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" + "beq.b .ur_sstart \n" /* short loop if nothing to keep */ + + ".ur_floop: \n" /** full loop (there are bits to keep)**/ + "clr.l %%d0 \n" + "lsr.l #1,%%d2 \n" /* shift out pattern bit */ + "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ + "lsr.l #1,%%d3 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d4 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d5 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d6 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%a0,%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%%a0 \n" + "move.l %%a1,%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%%a1 \n" + "move.l %[ax],%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%[ax] \n" "move.b (%[addr]),%%d1 \n" /* read old value */ - "and.l %[mask],%%d1 \n" /* mask out unneeded bits */ + "and.l %[mask],%%d1 \n" /* mask out replaced bits */ "or.l %%d0,%%d1 \n" /* set new bits */ "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* last bitplane done? */ - "bhi.b .ur_floop \n" /* no: loop */ + "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ + "bhi.b .ur_floop \n" - "bra.b .ur_end \n" + "bra.b .ur_end \n" - ".ur_sstart: \n" - "move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */ - - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "clr.l %%d0 \n" - "lsr.l #1,%%d2 \n" /* shift out mask bit */ - "addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%[mask] \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" + ".ur_sstart: \n" + "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ + + ".ur_sloop: \n" /** short loop (nothing to keep) **/ + "clr.l %%d0 \n" + "lsr.l #1,%%d2 \n" /* shift out pattern bit */ + "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ + "lsr.l #1,%%d3 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d4 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d5 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d6 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%[mask] \n" + "addx.l %%d0,%%d0 \n" + "move.l %%a1,%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%%a1 \n" + "move.l %[ax],%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%[ax] \n" "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* last bitplane done? */ - "bhi.b .ur_sloop \n" /* no: loop */ + "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */ + "bhi.b .ur_sloop \n" - ".ur_end: \n" + ".ur_end: \n" : /* outputs */ [addr]"+a"(addr), [mask]"+d"(mask), @@ -1151,9 +1296,7 @@ void gray_update_rect(int x, int y, int width, int height) (void)pat_ptr; /* check whether anything changed in the 8-pixel block */ change = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf; - cbuf += sizeof(uint32_t); - bbuf += sizeof(uint32_t); - change |= *(uint32_t *)cbuf ^ *(uint32_t *)bbuf; + change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4); if (change != 0) { @@ -1162,9 +1305,6 @@ void gray_update_rect(int x, int y, int width, int height) unsigned test = 1; int i; - cbuf = _gray_info.cur_buffer + srcofs_row; - bbuf = _gray_info.back_buffer + srcofs_row; - /* precalculate the bit patterns with random shifts * for all 8 pixels and put them on an extra "stack" */ for (i = 0; i < 8; i++) diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c index 396046d1e6..7df3e13c56 100644 --- a/apps/plugins/lib/gray_draw.c +++ b/apps/plugins/lib/gray_draw.c @@ -876,8 +876,140 @@ static void _writearray(unsigned char *address, const unsigned char *src, unsigned long pat_stack[8]; unsigned long *pat_ptr = &pat_stack[8]; unsigned char *addr, *end; -#if 0 /* CPU specific asm versions will go here */ +#ifdef CPU_ARM + const unsigned char *_src; + unsigned _mask, trash; + + _mask = mask; + _src = src; + + /* precalculate the bit patterns with random shifts + for all 8 pixels and put them on an extra "stack" */ + asm volatile ( + "mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */ + "mov r3, #8 \n" /* loop count */ + + ".wa_loop: \n" /** load pattern for pixel **/ + "mov r2, #0 \n" /* pattern for skipped pixel must be 0 */ + "movs %[mask], %[mask], lsl #1 \n" /* shift out msb of mask */ + "bcc .wa_skip \n" /* skip this pixel */ + + "ldrb r0, [%[src]] \n" /* load src byte */ + "ldrb r0, [%[trns], r0] \n" /* idxtable into pattern index */ + "ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */ + + "add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */ + "add %[rnd], %[rnd], %[rnd], lsl #1 \n" + "add %[rnd], %[rnd], r0, lsl #3 \n" + "add %[rnd], %[rnd], #74 \n" /* add another 74 */ + /* Since the lower bits are not very random: get bits 8..15 (need max. 5) */ + "and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */ + + "cmp r1, %[dpth] \n" /* random >= depth ? */ + "subhs r1, r1, %[dpth] \n" /* yes: random -= depth */ + + "mov r0, r2, lsl r1 \n" /** rotate pattern **/ + "sub r1, %[dpth], r1 \n" + "orr r2, r0, r2, lsr r1 \n" + + ".wa_skip: \n" + "str r2, [%[patp], #-4]! \n" /* push on pattern stack */ + + "add %[src], %[src], #1 \n" /* src++; */ + "subs r3, r3, #1 \n" /* loop 8 times (pixel block) */ + "bne .wa_loop \n" + : /* outputs */ + [src] "+r"(_src), + [patp]"+r"(pat_ptr), + [rnd] "+r"(_gray_random_buffer), + [mask]"+r"(_mask) + : /* inputs */ + [bpat]"r"(_gray_info.bitpattern), + [trns]"r"(_gray_info.idxtable), + [dpth]"r"(_gray_info.depth), + [rmsk]"r"(_gray_info.randmask) + : /* clobbers */ + "r0", "r1", "r2", "r3" + ); + + addr = address; + end = addr + MULU16(_gray_info.depth, _gray_info.plane_size); + _mask = mask; + + /* set the bits for all 8 pixels in all bytes according to the + * precalculated patterns on the pattern stack */ + asm volatile ( + "ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */ + + "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ + "ands %[mask], %[mask], #0xff \n" + "beq .wa_sloop \n" /* short loop if nothing to keep */ + + ".wa_floop: \n" /** full loop (there are bits to keep)**/ + "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ + "adc r0, r0, r0 \n" /* put bit into LSB of byte */ + "movs r8, r8, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r7, r7, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r6, r6, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r5, r5, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r4, r4, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r3, r3, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r2, r2, lsr #1 \n" + "adc r0, r0, r0 \n" + + "ldrb r1, [%[addr]] \n" /* read old value */ + "and r1, r1, %[mask] \n" /* mask out replaced bits */ + "orr r1, r1, r0 \n" /* set new bits */ + "strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */ + + "cmp %[end], %[addr] \n" /* loop through all bitplanes */ + "bne .wa_floop \n" + + "b .wa_end \n" + + ".wa_sloop: \n" /** short loop (nothing to keep) **/ + "movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */ + "adc r0, r0, r0 \n" /* put bit into LSB of byte */ + "movs r8, r8, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r7, r7, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r6, r6, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r5, r5, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r4, r4, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r3, r3, lsr #1 \n" + "adc r0, r0, r0 \n" + "movs r2, r2, lsr #1 \n" + "adc r0, r0, r0 \n" + + "strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */ + + "cmp %[end], %[addr] \n" /* loop through all bitplanes */ + "bne .wa_sloop \n" + + ".wa_end: \n" + : /* outputs */ + [addr]"+r"(addr), + [mask]"+r"(_mask), + [rx] "=&r"(trash) + : /* inputs */ + [psiz]"r"(_gray_info.plane_size), + [end] "r"(end), + [patp]"[rx]"(pat_ptr) + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" + ); #else /* C version, for reference*/ +#warning C version of _writearray() used unsigned test = 0x80; int i; @@ -1027,52 +1159,52 @@ static void _writearray(unsigned char *address, const unsigned char *src, /* precalculate the bit patterns with random shifts for all 8 pixels and put them on an extra "stack" */ asm volatile ( - "mov #8,r3 \n" /* loop count in r3: 8 pixels */ + "mov #8,r3 \n" /* loop count */ - ".wa_loop: \n" /** load pattern for pixel **/ - "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ - "shlr %[mask] \n" /* shift out lsb of mask */ - "bf .wa_skip \n" /* skip this pixel */ + ".wa_loop: \n" /** load pattern for pixel **/ + "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ + "shlr %[mask] \n" /* shift out lsb of mask */ + "bf .wa_skip \n" /* skip this pixel */ - "mov.b @%[src],r0 \n" /* load src byte */ - "extu.b r0,r0 \n" /* extend unsigned */ + "mov.b @%[src],r0 \n" /* load src byte */ + "extu.b r0,r0 \n" /* extend unsigned */ "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ - "extu.b r0,r0 \n" /* extend unsigned */ - "shll2 r0 \n" + "extu.b r0,r0 \n" /* extend unsigned */ + "shll2 r0 \n" "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ - "mov #75,r0 \n" - "mulu r0,%[rnd] \n" /* multiply by 75 */ - "sts macl,%[rnd] \n" - "add #74,%[rnd] \n" /* add another 74 */ + "mov #75,r0 \n" + "mulu r0,%[rnd] \n" /* multiply by 75 */ + "sts macl,%[rnd] \n" + "add #74,%[rnd] \n" /* add another 74 */ /* Since the lower bits are not very random: */ - "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ - "and %[rmsk],r1 \n" /* mask out unneeded bits */ + "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ + "and %[rmsk],r1 \n" /* mask out unneeded bits */ - "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ - "bf .wa_ntrim \n" - "sub %[dpth],r1 \n" /* yes: random -= depth; */ - ".wa_ntrim: \n" + "cmp/hs %[dpth],r1 \n" /* random >= depth ? */ + "bf .wa_ntrim \n" + "sub %[dpth],r1 \n" /* yes: random -= depth; */ + ".wa_ntrim: \n" - "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ - "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ - "mov r1,r5 \n" + "mov.l .ashlsi3,r0 \n" /** rotate pattern **/ + "jsr @r0 \n" /* r4 -> r0, shift left by r5 */ + "mov r1,r5 \n" - "mov %[dpth],r5 \n" - "sub r1,r5 \n" /* r5 = depth - r1 */ - "mov.l .lshrsi3,r1 \n" - "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ - "mov r0,r1 \n" /* store previous result in r1 */ + "mov %[dpth],r5 \n" + "sub r1,r5 \n" /* r5 = depth - r1 */ + "mov.l .lshrsi3,r1 \n" + "jsr @r1 \n" /* r4 -> r0, shift right by r5 */ + "mov r0,r1 \n" /* store previous result in r1 */ - "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ + "or r1,r0 \n" /* rotated_pattern = r0 | r1 */ - ".wa_skip: \n" - "mov.l r0,@-%[patp]\n" /* push on pattern stack */ + ".wa_skip: \n" + "mov.l r0,@-%[patp] \n" /* push on pattern stack */ "add %[stri],%[src] \n" /* src += stride; */ - "add #-1,r3 \n" /* decrease loop count */ - "cmp/pl r3 \n" /* loop count > 0? */ - "bt .wa_loop \n" /* yes: loop */ + "add #-1,r3 \n" /* loop 8 times (pixel block) */ + "cmp/pl r3 \n" + "bt .wa_loop \n" : /* outputs */ [src] "+r"(_src), [rnd] "+r"(_gray_random_buffer), @@ -1095,79 +1227,79 @@ static void _writearray(unsigned char *address, const unsigned char *src, /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ asm volatile ( - "mov.l @%[patp]+,r1\n" /* pop all 8 patterns */ - "mov.l @%[patp]+,r2\n" - "mov.l @%[patp]+,r3\n" - "mov.l @%[patp]+,r6\n" - "mov.l @%[patp]+,r7\n" - "mov.l @%[patp]+,r8\n" - "mov.l @%[patp]+,r9\n" - "mov.l @%[patp],r10\n" + "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */ + "mov.l @%[patp]+,r2 \n" + "mov.l @%[patp]+,r3 \n" + "mov.l @%[patp]+,r6 \n" + "mov.l @%[patp]+,r7 \n" + "mov.l @%[patp]+,r8 \n" + "mov.l @%[patp]+,r9 \n" + "mov.l @%[patp],r10 \n" "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ "extu.b %[mask],%[mask] \n" /* mask out high bits */ - "tst %[mask],%[mask] \n" /* nothing to keep? */ - "bt .wa_sloop \n" /* yes: jump to short loop */ - - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" + "tst %[mask],%[mask] \n" + "bt .wa_sloop \n" /* short loop if nothing to keep */ + + ".wa_floop: \n" /** full loop (there are bits to keep)**/ + "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ + "rotcl r0 \n" /* rotate t bit into r0 */ + "shlr r2 \n" + "rotcl r0 \n" + "shlr r3 \n" + "rotcl r0 \n" + "shlr r6 \n" + "rotcl r0 \n" + "shlr r7 \n" + "rotcl r0 \n" + "shlr r8 \n" + "rotcl r0 \n" + "shlr r9 \n" + "rotcl r0 \n" + "shlr r10 \n" "mov.b @%[addr],%[rx] \n" /* read old value */ - "rotcl r0 \n" - "and %[mask],%[rx] \n" /* mask out unneeded bits */ - "or %[rx],r0 \n" /* set new bits */ - "mov.b r0,@%[addr] \n" /* store value to bitplane */ + "rotcl r0 \n" + "and %[mask],%[rx] \n" /* mask out replaced bits */ + "or %[rx],r0 \n" /* set new bits */ + "mov.b r0,@%[addr] \n" /* store value to bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* last bitplane done? */ - "bt .wa_floop \n" /* no: loop */ + "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ + "bt .wa_floop \n" - "bra .wa_end \n" - "nop \n" + "bra .wa_end \n" + "nop \n" /* References to C library routines used in the precalc block */ - ".align 2 \n" - ".ashlsi3: \n" /* C library routine: */ - ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ - ".lshrsi3: \n" /* C library routine: */ - ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ + ".align 2 \n" + ".ashlsi3: \n" /* C library routine: */ + ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ + ".lshrsi3: \n" /* C library routine: */ + ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ /* both routines preserve r4, destroy r5 and take ~16 cycles */ - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ - "rotcl r0 \n" /* rotate t bit into r0 */ - "shlr r2 \n" - "rotcl r0 \n" - "shlr r3 \n" - "rotcl r0 \n" - "shlr r6 \n" - "rotcl r0 \n" - "shlr r7 \n" - "rotcl r0 \n" - "shlr r8 \n" - "rotcl r0 \n" - "shlr r9 \n" - "rotcl r0 \n" - "shlr r10 \n" - "rotcl r0 \n" - "mov.b r0,@%[addr] \n" /* store byte to bitplane */ + ".wa_sloop: \n" /** short loop (nothing to keep) **/ + "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ + "rotcl r0 \n" /* rotate t bit into r0 */ + "shlr r2 \n" + "rotcl r0 \n" + "shlr r3 \n" + "rotcl r0 \n" + "shlr r6 \n" + "rotcl r0 \n" + "shlr r7 \n" + "rotcl r0 \n" + "shlr r8 \n" + "rotcl r0 \n" + "shlr r9 \n" + "rotcl r0 \n" + "shlr r10 \n" + "rotcl r0 \n" + "mov.b r0,@%[addr] \n" /* store byte to bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp/hi %[addr],%[end] \n" /* last bitplane done? */ - "bt .wa_sloop \n" /* no: loop */ + "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */ + "bt .wa_sloop \n" - ".wa_end: \n" + ".wa_end: \n" : /* outputs */ [addr]"+r"(addr), [mask]"+r"(_mask), @@ -1189,43 +1321,43 @@ static void _writearray(unsigned char *address, const unsigned char *src, /* precalculate the bit patterns with random shifts for all 8 pixels and put them on an extra "stack" */ asm volatile ( - "moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */ + "moveq.l #8,%%d3 \n" /* loop count */ - ".wa_loop: \n" /** load pattern for pixel **/ - "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ - "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ - "bcc.b .wa_skip \n" /* skip this pixel */ + ".wa_loop: \n" /** load pattern for pixel **/ + "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ + "lsr.l #1,%[mask] \n" /* shift out lsb of mask */ + "bcc.b .wa_skip \n" /* skip this pixel */ - "clr.l %%d0 \n" + "clr.l %%d0 \n" "move.b (%[src]),%%d0 \n" /* load src byte */ "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ - "mulu.w #75,%[rnd] \n" /* multiply by 75 */ - "add.l #74,%[rnd] \n" /* add another 74 */ + "mulu.w #75,%[rnd] \n" /* multiply by 75 */ + "add.l #74,%[rnd] \n" /* add another 74 */ /* Since the lower bits are not very random: */ - "move.l %[rnd],%%d1 \n" - "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ - "and.l %[rmsk],%%d1\n" /* mask out unneeded bits */ - - "cmp.l %[dpth],%%d1\n" /* random >= depth ? */ - "blo.b .wa_ntrim \n" - "sub.l %[dpth],%%d1\n" /* yes: random -= depth; */ - ".wa_ntrim: \n" - - "move.l %%d2,%%d0 \n" - "lsl.l %%d1,%%d0 \n" - "sub.l %[dpth],%%d1\n" - "neg.l %%d1 \n" /* d1 = depth - d1 */ - "lsr.l %%d1,%%d2 \n" - "or.l %%d0,%%d2 \n" - - ".wa_skip: \n" + "move.l %[rnd],%%d1 \n" + "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ + "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */ + + "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */ + "blo.b .wa_ntrim \n" + "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */ + ".wa_ntrim: \n" + + "move.l %%d2,%%d0 \n" /** rotate pattern **/ + "lsl.l %%d1,%%d0 \n" + "sub.l %[dpth],%%d1 \n" + "neg.l %%d1 \n" /* d1 = depth - d1 */ + "lsr.l %%d1,%%d2 \n" + "or.l %%d0,%%d2 \n" + + ".wa_skip: \n" "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ "add.l %[stri],%[src] \n" /* src += stride; */ - "subq.l #1,%%d3 \n" /* decrease loop count */ - "bne.b .wa_loop \n" /* yes: loop */ + "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */ + "bne.b .wa_loop \n" : /* outputs */ [src] "+a"(_src), [patp]"+a"(pat_ptr), @@ -1250,78 +1382,76 @@ static void _writearray(unsigned char *address, const unsigned char *src, asm volatile ( "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" /* pop all 8 patterns */ - "not.l %[mask] \n" /* "set" mask -> "keep" mask */ + "not.l %[mask] \n" /* "set" mask -> "keep" mask */ "and.l #0xFF,%[mask] \n" - "beq.b .wa_sstart \n" /* yes: jump to short loop */ - - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "clr.l %%d0 \n" - "lsr.l #1,%%d2 \n" /* shift out mask bit */ - "addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a0,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" + "beq.b .wa_sstart \n" /* short loop if nothing to keep */ + + ".wa_floop: \n" /** full loop (there are bits to keep)**/ + "lsr.l #1,%%d2 \n" /* shift out pattern bit */ + "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ + "lsr.l #1,%%d3 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d4 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d5 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d6 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%a0,%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%%a0 \n" + "move.l %%a1,%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%%a1 \n" + "move.l %[ax],%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%[ax] \n" "move.b (%[addr]),%%d1 \n" /* read old value */ - "and.l %[mask],%%d1 \n" /* mask out unneeded bits */ + "and.l %[mask],%%d1 \n" /* mask out replaced bits */ "or.l %%d0,%%d1 \n" /* set new bits */ "move.b %%d1,(%[addr]) \n" /* store value to bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* last bitplane done? */ - "bhi.b .wa_floop \n" /* no: loop */ - - "bra.b .wa_end \n" - - ".wa_sstart: \n" - "move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */ - - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "clr.l %%d0 \n" - "lsr.l #1,%%d2 \n" /* shift out mask bit */ - "addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ - "lsr.l #1,%%d3 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d4 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d5 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%%d6 \n" - "addx.l %%d0,%%d0 \n" - "lsr.l #1,%[mask] \n" - "addx.l %%d0,%%d0 \n" - "move.l %%a1,%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%%a1 \n" - "move.l %[ax],%%d1 \n" - "lsr.l #1,%%d1 \n" - "addx.l %%d0,%%d0 \n" - "move.l %%d1,%[ax] \n" + "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ + "bhi.b .wa_floop \n" + + "bra.b .wa_end \n" + + ".wa_sstart: \n" + "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */ + + ".wa_sloop: \n" /** short loop (nothing to keep) **/ + "lsr.l #1,%%d2 \n" /* shift out pattern bit */ + "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */ + "lsr.l #1,%%d3 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d4 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d5 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%%d6 \n" + "addx.l %%d0,%%d0 \n" + "lsr.l #1,%[mask] \n" + "addx.l %%d0,%%d0 \n" + "move.l %%a1,%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%%a1 \n" + "move.l %[ax],%%d1 \n" + "lsr.l #1,%%d1 \n" + "addx.l %%d0,%%d0 \n" + "move.l %%d1,%[ax] \n" "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */ - "cmp.l %[addr],%[end] \n" /* last bitplane done? */ - "bhi.b .wa_sloop \n" /* no: loop */ + "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */ + "bhi.b .wa_sloop \n" - ".wa_end: \n" + ".wa_end: \n" : /* outputs */ [addr]"+a"(addr), [mask]"+d"(_mask), diff --git a/apps/plugins/lib/gray_scroll.c b/apps/plugins/lib/gray_scroll.c index df5dc57044..8f60e7cef1 100644 --- a/apps/plugins/lib/gray_scroll.c +++ b/apps/plugins/lib/gray_scroll.c @@ -283,32 +283,32 @@ void gray_ub_scroll_left(int count) if (count) { asm ( - "mov r4, %[high] \n" + "mov r4, %[high] \n" /* rows = height */ - ".sl_rloop: \n" - "mov r5, %[addr] \n" - "mov r2, %[dpth] \n" + ".sl_rloop: \n" /* repeat for every row */ + "mov r5, %[addr] \n" /* get start address */ + "mov r2, %[dpth] \n" /* planes = depth */ - ".sl_oloop: \n" - "mov r6, r5 \n" - "mov r3, %[cols] \n" - "mov r1, #0 \n" + ".sl_oloop: \n" /* repeat for every bitplane */ + "mov r6, r5 \n" /* get start address */ + "mov r3, %[cols] \n" /* cols = col_count */ + "mov r1, #0 \n" /* fill with zero */ - ".sl_iloop: \n" - "mov r1, r1, lsr #8 \n" - "ldrb r0, [r6, #-1]! \n" - "orr r1, r1, r0, lsl %[cnt] \n" - "strb r1, [r6] \n" + ".sl_iloop: \n" /* repeat for all cols */ + "mov r1, r1, lsr #8 \n" /* shift right to get residue */ + "ldrb r0, [r6, #-1]! \n" /* decrement addr & get data byte */ + "orr r1, r1, r0, lsl %[cnt] \n" /* combine with last residue */ + "strb r1, [r6] \n" /* store data */ - "subs r3, r3, #1 \n" - "bne .sl_iloop \n" + "subs r3, r3, #1 \n" /* cols-- */ + "bne .sl_iloop \n" - "add r5, r5, %[psiz] \n" - "subs r2, r2, #1 \n" - "bne .sl_oloop \n" + "add r5, r5, %[psiz] \n" /* start_address += plane_size */ + "subs r2, r2, #1 \n" /* planes-- */ + "bne .sl_oloop \n" - "add %[addr],%[addr],%[bwid] \n" - "subs r4, r4, #1 \n" + "add %[addr],%[addr],%[bwid] \n" /* start_address += bwidth */ + "subs r4, r4, #1 \n" /* rows-- */ "bne .sl_rloop \n" : /* outputs */ : /* inputs */ @@ -364,32 +364,32 @@ void gray_ub_scroll_right(int count) if (count) { asm ( - "mov r4, %[high] \n" + "mov r4, %[high] \n" /* rows = height */ - ".sr_rloop: \n" - "mov r5, %[addr] \n" - "mov r2, %[dpth] \n" + ".sr_rloop: \n" /* repeat for every row */ + "mov r5, %[addr] \n" /* get start address */ + "mov r2, %[dpth] \n" /* planes = depth */ - ".sr_oloop: \n" - "mov r6, r5 \n" - "mov r3, %[cols] \n" - "mov r1, #0 \n" + ".sr_oloop: \n" /* repeat for every bitplane */ + "mov r6, r5 \n" /* get start address */ + "mov r3, %[cols] \n" /* cols = col_count */ + "mov r1, #0 \n" /* fill with zero */ - ".sr_iloop: \n" - "ldrb r0, [r6] \n" - "orr r1, r0, r1, lsl #8 \n" - "mov r0, r1, lsr %[cnt] \n" - "strb r0, [r6], #1 \n" + ".sr_iloop: \n" /* repeat for all cols */ + "ldrb r0, [r6] \n" /* get data byte */ + "orr r1, r0, r1, lsl #8 \n" /* combine w/ old data shifted to 2nd byte */ + "mov r0, r1, lsr %[cnt] \n" /* shift right */ + "strb r0, [r6], #1 \n" /* store data, increment addr */ - "subs r3, r3, #1 \n" - "bne .sr_iloop \n" + "subs r3, r3, #1 \n" /* cols-- */ + "bne .sr_iloop \n" - "add r5, r5, %[psiz] \n" - "subs r2, r2, #1 \n" - "bne .sr_oloop \n" + "add r5, r5, %[psiz] \n" /* start_address += plane_size */ + "subs r2, r2, #1 \n" /* planes-- */ + "bne .sr_oloop \n" - "add %[addr],%[addr],%[bwid] \n" - "subs r4, r4, #1 \n" + "add %[addr],%[addr],%[bwid] \n" /* start_address += bwidth */ + "subs r4, r4, #1 \n" /* rows-- */ "bne .sr_rloop \n" : /* outputs */ : /* inputs */ @@ -714,8 +714,7 @@ void gray_ub_scroll_up(int count) "move.b (%%a1),%%d0 \n" /* get data byte */ "lsl.l #8,%%d1 \n" /* old data to 2nd byte */ "or.l %%d1,%%d0 \n" /* combine old data */ - "clr.l %%d1 \n" - "move.b %%d0,%%d1 \n" /* keep data for next round */ + "move.l %%d0,%%d1 \n" /* keep data for next round */ "lsr.l %[cnt],%%d0 \n" /* shift right */ "move.b %%d0,(%%a1) \n" /* store data */ -- cgit v1.2.3