From f8b1da2f7bddebc9c7026bd5d106dec118ce70a9 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Sat, 4 Nov 2006 00:42:18 +0000 Subject: H300, X5: Faster lcd_yuv_blit() using EMAC. Speedup of the function itself at 124MHz: 10.5% on X5, 16.5% on H300. mpegplayer speedup 3..4% git-svn-id: svn://svn.rockbox.org/rockbox/trunk@11429 a1c6a512-1295-4272-9138-f99709370657 --- firmware/drivers/lcd-h300.c | 12 +- firmware/target/coldfire/iaudio/x5/lcd-as-x5.S | 388 ++++++++------------- firmware/target/coldfire/iaudio/x5/lcd-x5.c | 28 +- firmware/target/coldfire/iriver/h300/lcd-as-h300.S | 346 +++++++----------- 4 files changed, 284 insertions(+), 490 deletions(-) (limited to 'firmware') diff --git a/firmware/drivers/lcd-h300.c b/firmware/drivers/lcd-h300.c index b7865fa7c5..3e5642e35d 100644 --- a/firmware/drivers/lcd-h300.c +++ b/firmware/drivers/lcd-h300.c @@ -304,10 +304,11 @@ void lcd_blit(const fb_data* data, int x, int by, int width, /* Line write helper function for lcd_yuv_blit. Write two lines of yuv420. * y should have two lines of Y back to back. * bu and rv should contain the Cb and Cr data for the two lines of Y. - * Stores bu, guv and rv in repective buffers for use in second line. + * Needs EMAC set to saturated, signed integer mode. */ extern void lcd_write_yuv420_lines(const unsigned char *y, - unsigned char *bu, unsigned char *guv, unsigned char *rv, int width); + const unsigned char *bu, + const unsigned char *rv, int width); /* Performance function to blit a YUV bitmap directly to the LCD * src_x, src_y, width and height should be even @@ -317,10 +318,9 @@ void lcd_yuv_blit(unsigned char * const src[3], int src_x, int src_y, int stride, int x, int y, int width, int height) { - /* IRAM Y, Cb/bu, guv and Cb/rv buffers. */ + /* IRAM Y, Cb and Cr buffers. 
*/ unsigned char y_ibuf[LCD_WIDTH*2]; unsigned char bu_ibuf[LCD_WIDTH/2]; - unsigned char guv_ibuf[LCD_WIDTH/2]; unsigned char rv_ibuf[LCD_WIDTH/2]; const unsigned char *ysrc, *usrc, *vsrc; const unsigned char *ysrc_max; @@ -342,13 +342,14 @@ void lcd_yuv_blit(unsigned char * const src[3], vsrc = src[2] + (src_y * stride >> 2) + (src_x >> 1); ysrc_max = ysrc + height * stride; + coldfire_set_macsr(EMAC_SATURATE); do { memcpy(y_ibuf, ysrc, width); memcpy(y_ibuf + width, ysrc + stride, width); memcpy(bu_ibuf, usrc, width >> 1); memcpy(rv_ibuf, vsrc, width >> 1); - lcd_write_yuv420_lines(y_ibuf, bu_ibuf, guv_ibuf, rv_ibuf, width); + lcd_write_yuv420_lines(y_ibuf, bu_ibuf, rv_ibuf, width); ysrc += 2 * stride; usrc += stride >> 1; vsrc += stride >> 1; @@ -381,6 +382,7 @@ void lcd_update(void) } } + /* Update a fraction of the display. */ void lcd_update_rect(int, int, int, int) ICODE_ATTR; void lcd_update_rect(int x, int y, int width, int height) diff --git a/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S b/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S index 6d5d324ebf..11150203af 100644 --- a/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S +++ b/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S @@ -40,260 +40,158 @@ * |G| = |1.000000 -0.334136 -0.714136| |Pb| * |B| |1.000000 1.772000 0.000000| |Pr| * Scaled, normalized, rounded and tweaked to yield RGB 666: - * |R| |74 0 101| |Y' - 16| / 256 - * |G| = |74 -24 -51| |Cb - 128| / 256 - * |B| |74 128 0| |Cr - 128| / 256 + * |R| |19611723 0 26881894| |Y' - 16| >> 26 + * |G| = |19611723 -6406711 -13692816| |Cb - 128| >> 26 + * |B| |19611723 33976259 0| |Cr - 128| >> 26 + * + * Needs EMAC set to saturated, signed integer mode. 
*/ .align 2 .global lcd_write_yuv420_lines - .type lcd_write_yuv420_lines,@function + .type lcd_write_yuv420_lines, @function + lcd_write_yuv420_lines: - lea.l (-36,%sp),%sp /* free up some registers */ - movem.l %d2-%d6/%a2-%a5,(%sp) - - lea.l 0xf0008002,%a0 /* LCD data port */ - movem.l (36+4,%sp),%a1-%a5 /* Y data, Cb data, guv storage, Cr data, width */ - lea.l (%a1,%a5),%a5 /* end address */ - -.yuv_line_loop1: - /** Write first pixel **/ - clr.l %d1 /* get bu component */ - move.b (%a2),%d1 - clr.l %d3 /* get rv component */ - move.b (%a4),%d3 - moveq.l #-128,%d0 - add.l %d0,%d1 - add.l %d0,%d3 - - move.l %d1,%d2 /* %d2 = cb component for guv */ - asr.l #1,%d1 /* %d1 = 128 * (Cb - 128) / 256 */ - move.b %d1,(%a2)+ /* save bu for next line */ - moveq.l #-24,%d0 /* multiply first term of guv */ - muls.w %d0,%d2 - moveq.l #-51,%d0 /* multiply second term of guv */ - muls.w %d3,%d0 - add.l %d0,%d2 - asr.l #8,%d2 - move.b %d2,(%a3)+ /* save guv for next line */ - moveq.l #101,%d0 - muls.w %d0,%d3 - asr.l #8,%d3 - move.b %d3,(%a4)+ /* save rv for next line */ - - clr.l %d4 /* get y component */ - move.b (%a1)+,%d4 - moveq.l #74,%d0 - muls.w %d0,%d4 - asr.l #8,%d4 - subq.l #4,%d4 - move.l %d4,%d5 - move.l %d4,%d6 - /* : %d4,%d5,%d6 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ - - add.l %d3,%d4 /* get r */ - add.l %d2,%d5 /* get g */ - add.l %d1,%d6 /* get b */ - - move.l %d6,%d0 /* is clamping needed? 
*/ - or.l %d5,%d0 - or.l %d4,%d0 - asr.l #6,%d0 - beq.b .yuv_no_clamp1 /* values in range: skip clamping */ - moveq.l #63, %d0 - cmp.l %d0, %d4 - bls.s .yuv_red_ok1 - spl.b %d4 - and.l %d0, %d4 -.yuv_red_ok1: - cmp.l %d0, %d5 - bls.s .yuv_green_ok1 - spl.b %d5 - and.l %d0, %d5 -.yuv_green_ok1: - cmp.l %d0, %d6 - bls.s .yuv_blue_ok1 - spl.b %d6 - and.l %d0, %d6 -.yuv_blue_ok1: -.yuv_no_clamp1: - /* : %d4 = R, %d5 = G, %d6 = B */ - - move.l %d5,%d0 /* save g for lower 9 bits */ - lsl.l #3,%d4 /* R << 3 */ - lsr.l #3,%d0 /* G >> 3 */ - or.l %d4,%d0 - move.w %d0,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */ - lsl.l #6,%d5 /* B << 6 */ - or.l %d5,%d6 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d6,(%a0) - - /** Write second pixel **/ - clr.l %d4 - move.b (%a1)+,%d4 /* get y component */ - moveq.l #74,%d0 - muls.w %d0,%d4 - asr.l #8,%d4 - subq.l #4,%d4 - /* : %d4 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ - - /* Add Y + each chroma component (can clobber %d1-%d3 values now) */ - add.l %d4,%d3 /* get r */ - add.l %d4,%d2 /* get g */ - add.l %d4,%d1 /* get b */ - - move.l %d1,%d0 /* is clamping needed? 
*/ - or.l %d2,%d0 - or.l %d3,%d0 - asr.l #6,%d0 - beq.b .yuv_no_clamp2 /* values in range: skip clamping */ - moveq.l #63, %d0 - cmp.l %d0, %d3 - bls.s .yuv_red_ok2 - spl.b %d3 - and.l %d0, %d3 -.yuv_red_ok2: - cmp.l %d0, %d2 - bls.s .yuv_green_ok2 - spl.b %d2 - and.l %d0, %d2 -.yuv_green_ok2: - cmp.l %d0, %d1 - bls.s .yuv_blue_ok2 - spl.b %d1 - and.l %d0, %d1 -.yuv_blue_ok2: -.yuv_no_clamp2: - /* : %d3 = R, %d2 = G, %d1 = B */ - - move.l %d2,%d0 /* save g for lower 9 bits */ - lsl.l #3,%d3 /* R << 3 */ - lsr.l #3,%d0 /* G >> 3 */ - or.l %d3,%d0 /* |00000000|000000000|0000000r|rrrrrggg| */ - move.w %d0,(%a0) - lsl.l #6,%d2 /* G << 6 */ - or.l %d2,%d1 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d1,(%a0) - - cmp.l %a1,%a5 /* run %a1 up to end of line */ - bhi.w .yuv_line_loop1 + lea.l (-44, %sp), %sp /* free up some registers */ + movem.l %d2-%d7/%a2-%a6, (%sp) + + lea.l 0xf0008002, %a0 /* LCD data port */ + movem.l (44+4, %sp), %a1-%a4 /* Y data, Cb data, Cr data, width */ + lea.l (%a1, %a4), %a4 /* end address */ + + move.l #19611723, %a5 /* y factor */ + move.l #33976259, %a6 /* bu factor */ + move.l #-6406711, %d5 /* gu factor */ + move.l #-13692816, %d6 /* gv factor */ + move.l #0x01040820, %d7 /* bitmask for signed->unsigned conversion + * of R, G and B within RGGB6666 at once */ + + /* chroma for (very) first & second pixel */ + clr.l %d2 /* load u component */ + move.b (%a2)+, %d2 + clr.l %d3 /* load v component */ + move.b (%a3)+, %d3 + moveq.l #-128, %d0 + add.l %d0, %d2 + add.l %d0, %d3 + + mac.l %a6, %d2, %acc0 /* bu */ + mac.l %d5, %d2, %acc1 /* gu */ + mac.l %d6, %d3, %acc1 /* gv */ + move.l #26881894, %d0 /* rv factor */ + mac.l %d0, %d3, %acc2 /* rv */ + + /* luma for (very) first pixel */ + clr.l %d1 + move.b (%a1)+, %d1 + moveq.l #-126, %d0 + add.l %d1, %d0 /* y' (-0.5 ... 
+0.5) */ + mac.l %a5, %d0, %acc0 + mac.l %a5, %d0, %acc1 + mac.l %a5, %d0, %acc2 + + bra.b .yuv_line_entry + +.yuv_line_loop: + /* chroma for first & second pixel */ + clr.l %d2 /* load u component */ + move.b (%a2)+, %d2 + clr.l %d3 /* load v component */ + move.b (%a3)+, %d3 + moveq.l #-128, %d0 + add.l %d0, %d2 + add.l %d0, %d3 + + mac.l %a6, %d2, %acc0 /* bu */ + mac.l %d5, %d2, %acc1 /* gu */ + mac.l %d6, %d3, %acc1 /* gv */ + move.l #26881894, %d0 /* rv factor */ + mac.l %d0, %d3, %acc2 /* rv */ + + /* luma for first pixel */ + clr.l %d1 + move.b (%a1)+, %d1 + moveq.l #-126, %d0 + add.l %d1, %d0 /* y' (-0.5 ... +0.5) */ + mac.l %a5, %d0, %acc0 + mac.l %a5, %d0, %acc1 + mac.l %a5, %d0, %acc2 + + move.w %d4, (%a0) + /* 2nd LCD write is delayed one pixel to use it for filling the EMAC latency */ + + /* convert to RGB666, pack and output */ +.yuv_line_entry: + moveq.l #26, %d0 + move.l %acc0, %d4 + move.l %acc1, %d3 + move.l %acc2, %d2 + lsr.l %d0, %d4 + lsr.l %d0, %d3 + lsr.l %d0, %d2 + + lsl.l #6, %d2 + or.l %d3, %d2 /* |00000000|00000000|0000Rrrr|rrGggggg| */ + lsl.l #7, %d2 + or.l %d2, %d3 /* |00000000|00000Rrr|rrrGgggg|g0Gggggg| */ + lsl.l #6, %d3 + or.l %d3, %d4 /* |0000000R|rrrrrGgg|ggg0Gggg|ggBbbbbb| */ + eor.l %d7, %d4 /* |0000000r|rrrrrggg|ggg0gggg|ggbbbbbb| */ + swap %d4 + move.w %d4, (%a0) + swap %d4 + + /* luma for second pixel as delta from the first */ + clr.l %d0 + move.b (%a1)+, %d0 + sub.l %d1, %d0 + mac.l %a5, %d0, %acc0 + mac.l %a5, %d0, %acc1 + mac.l %a5, %d0, %acc2 + + move.w %d4, (%a0) + /* 2nd LCD write is delayed one pixel to use it for filling the EMAC latency */ + + /* convert to RGB666, pack and output */ + moveq.l #26, %d0 + movclr.l %acc0, %d4 + movclr.l %acc1, %d3 + movclr.l %acc2, %d2 + lsr.l %d0, %d4 + lsr.l %d0, %d3 + lsr.l %d0, %d2 + + lsl.l #6, %d2 + or.l %d3, %d2 /* |00000000|00000000|0000Rrrr|rrGggggg| */ + lsl.l #7, %d2 + or.l %d2, %d3 /* |00000000|00000Rrr|rrrGgggg|g0Gggggg| */ + lsl.l #6, %d3 + or.l %d3, %d4 /* 
|0000000R|rrrrrGgg|ggg0Gggg|ggBbbbbb| */ + eor.l %d7, %d4 /* |0000000r|rrrrrggg|ggg0gggg|ggbbbbbb| */ + swap %d4 + move.w %d4, (%a0) + swap %d4 + + cmp.l %a1, %a4 /* run %a1 up to end of line */ + bhi.w .yuv_line_loop + + tst.l (44+4, %sp) /* use original Y pointer as a flag to */ + beq.b .yuv_exit /* distinguish between first and second */ + clr.l (44+4, %sp) /* pixel line */ /* Rewind chroma pointers */ - movem.l (36+8, %sp), %a2-%a5 /* bu data, guv data, rv data, width */ - lea.l (%a1, %a5), %a5 /* next end address */ - -.yuv_line_loop2: - move.b (%a2)+,%d1 /* read save chromas and sign extend */ - extb.l %d1 - move.b (%a3)+,%d2 - extb.l %d2 - move.b (%a4)+,%d3 - extb.l %d3 - - clr.l %d4 - move.b (%a1)+,%d4 /* get y component */ - moveq.l #74,%d0 - muls.w %d0,%d4 - asr.l #8,%d4 - subq.l #4,%d4 - move.l %d4,%d5 - move.l %d4,%d6 - /* : %d4,%d5,%d6 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ - - add.l %d3,%d4 /* get r */ - add.l %d2,%d5 /* get g */ - add.l %d1,%d6 /* get b */ - - move.l %d6,%d0 /* is clamping needed? 
*/ - or.l %d5,%d0 - or.l %d4,%d0 - asr.l #6,%d0 - beq.b .yuv_no_clamp3 /* values in range: skip clamping */ - moveq.l #63, %d0 - cmp.l %d0, %d4 - bls.s .yuv_red_ok3 - spl.b %d4 - and.l %d0, %d4 -.yuv_red_ok3: - cmp.l %d0, %d5 - bls.s .yuv_green_ok3 - spl.b %d5 - and.l %d0, %d5 -.yuv_green_ok3: - cmp.l %d0, %d6 - bls.s .yuv_blue_ok3 - spl.b %d6 - and.l %d0, %d6 -.yuv_blue_ok3: -.yuv_no_clamp3: - /* : %d4 = R, %d5 = G, %d6 = B */ - - move.l %d5,%d0 /* save g for lower 9 bits */ - lsl.l #3,%d4 /* R << 3 */ - lsr.l #3,%d0 /* G >> 3 */ - or.l %d4,%d0 - move.w %d0,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */ - lsl.l #6,%d5 /* B << 6 */ - or.l %d5,%d6 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d6,(%a0) - - /** Write second pixel **/ - clr.l %d4 - move.b (%a1)+,%d4 /* get y component */ - moveq.l #74,%d0 - muls.w %d0,%d4 - asr.l #8,%d4 - subq.l #4,%d4 - /* : %d4 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ - - /* Add Y + each chroma component (can clobber %d1-%d3 values now) */ - add.l %d4,%d3 /* get r */ - add.l %d4,%d2 /* get g */ - add.l %d4,%d1 /* get b */ - - move.l %d1,%d0 /* is clamping needed? 
*/ - or.l %d2,%d0 - or.l %d3,%d0 - asr.l #6,%d0 - beq.b .yuv_no_clamp4 /* values in range: skip clamping */ - moveq.l #63, %d0 - cmp.l %d0, %d3 - bls.s .yuv_red_ok4 - spl.b %d3 - and.l %d0, %d3 -.yuv_red_ok4: - cmp.l %d0, %d2 - bls.s .yuv_green_ok4 - spl.b %d2 - and.l %d0, %d2 -.yuv_green_ok4: - cmp.l %d0, %d1 - bls.s .yuv_blue_ok4 - spl.b %d1 - and.l %d0, %d1 -.yuv_blue_ok4: -.yuv_no_clamp4: - /* : %d3 = R, %d2 = G, %d1 = B */ - - move.l %d2,%d0 /* save g for lower 9 bits */ - lsl.l #3,%d3 /* R << 3 */ - lsr.l #3,%d0 /* G >> 3 */ - or.l %d3,%d0 /* |00000000|000000000|0000000r|rrrrrggg| */ - move.w %d0,(%a0) - lsl.l #6,%d2 /* G << 6 */ - or.l %d2,%d1 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d1,(%a0) - - cmp.l %a1,%a5 /* run %a0 up to end of line */ - bhi.w .yuv_line_loop2 - - movem.l (%sp),%d2-%d6/%a2-%a5 - lea.l (36,%sp),%sp /* restore registers */ + movem.l (44+8, %sp), %a2-%a4 /* Cb data, Cr data, width */ + lea.l (%a1, %a4), %a4 /* end address */ + bra.w .yuv_line_loop + +.yuv_exit: + move.w %d4, (%a0) /* write (very) last 2nd word */ - rts + movem.l (%sp), %d2-%d7/%a2-%a6 + lea.l (44, %sp), %sp /* restore registers */ + rts .yuv_end: - .size lcd_write_yuv420_lines,.yuv_end-lcd_write_yuv420_lines -/* end lcd_write_yuv420_lines */ + .size lcd_write_yuv420_lines, .yuv_end - lcd_write_yuv420_lines /* begin lcd_write_data */ diff --git a/firmware/target/coldfire/iaudio/x5/lcd-x5.c b/firmware/target/coldfire/iaudio/x5/lcd-x5.c index 698ae477fa..92b9fde2e2 100755 --- a/firmware/target/coldfire/iaudio/x5/lcd-x5.c +++ b/firmware/target/coldfire/iaudio/x5/lcd-x5.c @@ -429,11 +429,11 @@ void lcd_blit(const fb_data* data, int x, int by, int width, /* Line write helper function for lcd_yuv_blit. Write two lines of yuv420. * y should have two lines of Y back to back. * bu and rv should contain the Cb and Cr data for the two lines of Y. - * Stores bu, guv and rv in repective buffers for use in second line. + * Needs EMAC set to saturated, signed integer mode. 
*/ extern void lcd_write_yuv420_lines(const unsigned char *y, - unsigned char *bu, unsigned char *guv, unsigned char *rv, - int width); + const unsigned char *bu, + const unsigned char *rv, int width); /* Performance function to blit a YUV bitmap directly to the LCD * src_x, src_y, width and height should be even and within the LCD's @@ -446,7 +446,6 @@ void lcd_yuv_blit(unsigned char * const src[3], /* IRAM Y, Cb/bu, guv and Cb/rv buffers. */ unsigned char y_ibuf[LCD_WIDTH*2]; unsigned char bu_ibuf[LCD_WIDTH/2]; - unsigned char guv_ibuf[LCD_WIDTH/2]; unsigned char rv_ibuf[LCD_WIDTH/2]; const unsigned char *ysrc, *usrc, *vsrc; const unsigned char *ysrc_max; @@ -457,28 +456,29 @@ void lcd_yuv_blit(unsigned char * const src[3], if (r_entry_mode == R_ENTRY_MODE_SOLID) hw_dither(true); - width = (width + 1) & ~1; - height = (height + 1) & ~1; + width &= ~1; /* stay on the safe side */ + height &= ~1; - /* Set start position and window */ + /* Set start position and window */ lcd_write_reg(R_RAM_ADDR_SET, (x << 8) | (y + y_offset)); lcd_write_reg(R_VERT_RAM_ADDR_POS, ((x + width - 1) << 8) | x); lcd_begin_write_gram(); - ysrc = src[0] + src_y*stride + src_x; - usrc = src[1] + (src_y*stride >> 2) + (src_x >> 1); - vsrc = src[2] + (usrc - src[1]); - ysrc_max = ysrc + height*stride; + ysrc = src[0] + src_y * stride + src_x; + usrc = src[1] + (src_y * stride >> 2) + (src_x >> 1); + vsrc = src[2] + (src_y * stride >> 2) + (src_x >> 1); + ysrc_max = ysrc + height * stride; + coldfire_set_macsr(EMAC_SATURATE); do { memcpy(y_ibuf, ysrc, width); - memcpy(&y_ibuf[width], &ysrc[stride], width); + memcpy(y_ibuf + width, ysrc + stride, width); memcpy(bu_ibuf, usrc, width >> 1); memcpy(rv_ibuf, vsrc, width >> 1); - lcd_write_yuv420_lines(y_ibuf, bu_ibuf, guv_ibuf, rv_ibuf, width); - ysrc += stride << 1; + lcd_write_yuv420_lines(y_ibuf, bu_ibuf, rv_ibuf, width); + ysrc += 2 * stride; usrc += stride >> 1; vsrc += stride >> 1; } diff --git 
a/firmware/target/coldfire/iriver/h300/lcd-as-h300.S b/firmware/target/coldfire/iriver/h300/lcd-as-h300.S index ae55dfb224..1873b905c6 100755 --- a/firmware/target/coldfire/iriver/h300/lcd-as-h300.S +++ b/firmware/target/coldfire/iriver/h300/lcd-as-h300.S @@ -22,7 +22,7 @@ .section .icode, "ax", @progbits -/* lcd_write_yuv420_lines(), based on lcd-as-x5.S +/* lcd_write_yuv420_lines() * * See http://en.wikipedia.org/wiki/YCbCr * ITU-R BT.601 (formerly CCIR 601): @@ -38,252 +38,146 @@ * |R| |1.000000 0.000000 1.402000| |Y'| * |G| = |1.000000 -0.334136 -0.714136| |Pb| * |B| |1.000000 1.772000 0.000000| |Pr| - * Scaled, normalized, rounded and tweaked to yield RGB666, as converting - * directly to RGB565 gives too much roundoff error: - * |R| |74 0 101| |Y' - 16| / 256 - * |G| = |74 -24 -51| |Cb - 128| / 256 - * |B| |74 128 0| |Cr - 128| / 256 + * Scaled, normalized, rounded and tweaked to yield RGB565: + * |R| |19611723 0 26881894| |Y' - 16| >> 27 + * |G| = |19611723 -6406711 -13692816| |Cb - 128| >> 26 + * |B| |19611723 33976259 0| |Cr - 128| >> 27 + * + * Needs EMAC set to saturated, signed integer mode. 
*/ - .align 2 .global lcd_write_yuv420_lines .type lcd_write_yuv420_lines, @function lcd_write_yuv420_lines: - lea.l (-36, %sp), %sp /* free up some registers */ - movem.l %d2-%d6/%a2-%a5, (%sp) + lea.l (-44, %sp), %sp /* free up some registers */ + movem.l %d2-%d7/%a2-%a6, (%sp) lea.l 0xf0000002, %a0 /* LCD data port */ - movem.l (36+4, %sp), %a1-%a5 /* Y data, Cb data, guv storage, Cr data, width */ - lea.l (%a1, %a5), %a5 /* end address */ - -.yuv_line_loop1: - /* chroma for first & second pixel */ - clr.l %d1 /* load bu component */ - move.b (%a2), %d1 - clr.l %d3 /* load rv component */ - move.b (%a4), %d3 + movem.l (44+4, %sp), %a1-%a4 /* Y data, Cb data, Cr data, width */ + lea.l (%a1, %a4), %a4 /* end address */ + + move.l #19611723, %a5 /* y factor */ + move.l #33976259, %a6 /* bu factor */ + move.l #-6406711, %d5 /* gu factor */ + move.l #-13692816, %d6 /* gv factor */ + move.l #0x8410, %d7 /* bitmask for signed->unsigned conversion + * of R, G and B within RGB565 at once */ + + /* chroma for (very) first & second pixel */ + clr.l %d2 /* load u component */ + move.b (%a2)+, %d2 + clr.l %d3 /* load v component */ + move.b (%a3)+, %d3 moveq.l #-128, %d0 - add.l %d0, %d1 + add.l %d0, %d2 add.l %d0, %d3 - move.l %d1, %d2 /* %d2 = cb component for guv */ - asr.l #1, %d1 /* %d1 = 128 * (Cb - 128) / 256 */ - move.b %d1, (%a2)+ /* save bu for next line */ - moveq.l #-24, %d0 - muls.w %d0, %d2 /* %d2 = -24 * (Cb - 128)*/ - moveq.l #-51, %d0 - muls.w %d3, %d0 - add.l %d0, %d2 /* %d2 = -24 * (Cb - 128) - 51 * (Cr - 128) */ - asr.l #8, %d2 - move.b %d2, (%a3)+ /* save guv for next line */ - moveq.l #101, %d0 - muls.w %d0, %d3 /* %d3 = 101 * (Cr - 128) */ - asr.l #8, %d3 - move.b %d3, (%a4)+ /* save rv for next line */ + mac.l %a6, %d2, %acc0 /* bu */ + mac.l %d5, %d2, %acc1 /* gu */ + mac.l %d6, %d3, %acc1 /* gv */ + move.l #26881894, %d0 /* rv factor */ + mac.l %d0, %d3, %acc2 /* rv */ - /* luma for first pixel */ - clr.l %d4 /* load y component */ - move.b (%a1)+, 
%d4 - moveq.l #74, %d0 - muls.w %d0, %d4 /* %d4 = 36 * Y */ - asr.l #8, %d4 - subq.l #4, %d4 /* correction for (Y - 16) and rounding */ - move.l %d4, %d5 - move.l %d4, %d6 - - /* combine & write first pixel */ - add.l %d1, %d4 /* %d4 = blue */ - add.l %d2, %d5 /* %d5 = green */ - add.l %d3, %d6 /* %d6 = red */ - - move.l %d4, %d0 /* clamping */ - or.l %d5, %d0 - or.l %d6, %d0 - asr.l #6, %d0 - beq.s .yuv_all_ok1 - moveq.l #63, %d0 - cmp.l %d0, %d4 - bls.s .yuv_blue_ok1 - spl.b %d4 - and.l %d0, %d4 -.yuv_blue_ok1: - cmp.l %d0, %d5 - bls.s .yuv_green_ok1 - spl.b %d5 - and.l %d0, %d5 -.yuv_green_ok1: - cmp.l %d0, %d6 - bls.s .yuv_red_ok1 - spl.b %d6 - and.l %d0, %d6 -.yuv_red_ok1: -.yuv_all_ok1: - - lsr.l #1, %d6 /* pack, convert to RGB565 and output */ - lsr.l #1, %d4 - lsl.l #6, %d6 - or.l %d6, %d5 - lsl.l #5, %d5 - or.l %d5, %d4 - move.w %d4, (%a0) - - /* luma for second pixel */ - clr.l %d4 /* load y component */ - move.b (%a1)+, %d4 - moveq.l #74, %d0 - muls.w %d0, %d4 /* %d4 = 36 * Y */ - asr.l #8, %d4 - subq.l #4, %d4 /* correction for (Y - 16) and rounding */ - - /* combine & write second pixel */ - add.l %d4, %d1 /* %d1 = blue */ - add.l %d4, %d2 /* %d2 = green */ - add.l %d4, %d3 /* %d3 = red */ - - move.l %d1, %d0 /* clamping */ - or.l %d2, %d0 - or.l %d3, %d0 - asr.l #6, %d0 - beq.s .yuv_all_ok2 - moveq.l #63, %d0 - cmp.l %d0, %d1 - bls.s .yuv_blue_ok2 - spl.b %d1 - and.l %d0, %d1 -.yuv_blue_ok2: - cmp.l %d0, %d2 - bls.s .yuv_green_ok2 - spl.b %d2 - and.l %d0, %d2 -.yuv_green_ok2: - cmp.l %d0, %d3 - bls.s .yuv_red_ok2 - spl.b %d3 - and.l %d0, %d3 -.yuv_red_ok2: -.yuv_all_ok2: + /* luma for (very) first pixel */ + clr.l %d1 + move.b (%a1)+, %d1 + moveq.l #-126, %d0 + add.l %d1, %d0 /* y' (-0.5 ... 
+0.5) */ + mac.l %a5, %d0, %acc0 + mac.l %a5, %d0, %acc1 + mac.l %a5, %d0, %acc2 - lsr.l #1, %d3 /* pack, convert to RGB565 and output */ - lsr.l #1, %d1 - lsl.l #6, %d3 - or.l %d3, %d2 - lsl.l #5, %d2 - or.l %d2, %d1 - move.w %d1, (%a0) + bra.b .yuv_line_entry - cmp.l %a1,%a5 /* run %a1 up to end of line */ - bhi.w .yuv_line_loop1 - - /* Rewind chroma pointers */ - movem.l (36+8, %sp), %a2-%a5 /* bu data, guv data, rv data, width */ - lea.l (%a1, %a5), %a5 /* next end address */ +.yuv_line_loop: + /* chroma for first & second pixel */ + clr.l %d2 /* load u component */ + move.b (%a2)+, %d2 + clr.l %d3 /* load v component */ + move.b (%a3)+, %d3 + moveq.l #-128, %d0 + add.l %d0, %d2 + add.l %d0, %d3 -.yuv_line_loop2: - /* read saved chromas and sign extend */ - move.b (%a2)+, %d1 - extb.l %d1 - move.b (%a3)+, %d2 - extb.l %d2 - move.b (%a4)+, %d3 - extb.l %d3 + mac.l %a6, %d2, %acc0 /* bu */ + mac.l %d5, %d2, %acc1 /* gu */ + mac.l %d6, %d3, %acc1 /* gv */ + move.l #26881894, %d0 /* rv factor */ + mac.l %d0, %d3, %acc2 /* rv */ /* luma for first pixel */ - clr.l %d4 /* load y component */ - move.b (%a1)+, %d4 - moveq.l #74, %d0 - muls.w %d0, %d4 /* %d4 = 36 * Y */ - asr.l #8, %d4 - subq.l #4, %d4 /* correction for (Y - 16) and rounding */ - move.l %d4, %d5 - move.l %d4, %d6 + clr.l %d1 + move.b (%a1)+, %d1 + moveq.l #-126, %d0 + add.l %d1, %d0 /* y' (-0.5 ... 
+0.5) */ + mac.l %a5, %d0, %acc0 + mac.l %a5, %d0, %acc1 + mac.l %a5, %d0, %acc2 - /* combine & write first pixel */ - add.l %d1, %d4 /* %d4 = blue */ - add.l %d2, %d5 /* %d5 = green */ - add.l %d3, %d6 /* %d6 = red */ + move.w %d4, (%a0) + /* LCD write is delayed one pixel to use it for filling the EMAC latency */ + + /* convert to RGB565, pack and output */ +.yuv_line_entry: + moveq.l #27, %d0 + move.l %acc0, %d2 + move.l %acc1, %d3 + move.l %acc2, %d4 + lsr.l %d0, %d2 + lsr.l %d0, %d4 + moveq.l #26, %d0 + lsr.l %d0, %d3 + lsl.l #6, %d4 + or.l %d3, %d4 + lsl.l #5, %d4 + or.l %d2, %d4 + eor.l %d7, %d4 + + /* luma for second pixel as delta from the first */ + clr.l %d0 + move.b (%a1)+, %d0 + sub.l %d1, %d0 + mac.l %a5, %d0, %acc0 + mac.l %a5, %d0, %acc1 + mac.l %a5, %d0, %acc2 + + move.w %d4, (%a0) + /* LCD write is delayed one pixel to use it for filling the EMAC latency */ + + /* convert to RGB565, pack and output */ + moveq.l #27, %d0 + movclr.l %acc0, %d2 + movclr.l %acc1, %d3 + movclr.l %acc2, %d4 + lsr.l %d0, %d2 + lsr.l %d0, %d4 + moveq.l #26, %d0 + lsr.l %d0, %d3 + lsl.l #6, %d4 + or.l %d3, %d4 + lsl.l #5, %d4 + or.l %d2, %d4 + eor.l %d7, %d4 + + cmp.l %a1, %a4 /* run %a1 up to end of line */ + bhi.w .yuv_line_loop - move.l %d4, %d0 /* clamping */ - or.l %d5, %d0 - or.l %d6, %d0 - asr.l #6, %d0 - beq.s .yuv_all_ok3 - moveq.l #63, %d0 - cmp.l %d0, %d4 - bls.s .yuv_blue_ok3 - spl.b %d4 - and.l %d0, %d4 -.yuv_blue_ok3: - cmp.l %d0, %d5 - bls.s .yuv_green_ok3 - spl.b %d5 - and.l %d0, %d5 -.yuv_green_ok3: - cmp.l %d0, %d6 - bls.s .yuv_red_ok3 - spl.b %d6 - and.l %d0, %d6 -.yuv_red_ok3: -.yuv_all_ok3: + tst.l (44+4, %sp) /* use original Y pointer as a flag to */ + beq.b .yuv_exit /* distinguish between first and second */ + clr.l (44+4, %sp) /* pixel line */ - lsr.l #1, %d6 /* pack, convert to RGB565 and output */ - lsr.l #1, %d4 - lsl.l #6, %d6 - or.l %d6, %d5 - lsl.l #5, %d5 - or.l %d5, %d4 - move.w %d4, (%a0) - - /* luma for second pixel */ - clr.l %d4 /* load 
y component */ - move.b (%a1)+, %d4 - moveq.l #74, %d0 - muls.w %d0, %d4 /* %d4 = 36 * Y */ - asr.l #8, %d4 - subq.l #4, %d4 /* correction for (Y - 16) and rounding */ - - /* combine & write second pixel */ - add.l %d4, %d1 /* %d1 = blue */ - add.l %d4, %d2 /* %d2 = green */ - add.l %d4, %d3 /* %d3 = red */ + /* Rewind chroma pointers */ + movem.l (44+8, %sp), %a2-%a4 /* Cb data, Cr data, width */ + lea.l (%a1, %a4), %a4 /* end address */ + bra.w .yuv_line_loop - move.l %d1, %d0 /* clamping */ - or.l %d2, %d0 - or.l %d3, %d0 - asr.l #6, %d0 - beq.s .yuv_all_ok4 - moveq.l #63, %d0 - cmp.l %d0, %d1 - bls.s .yuv_blue_ok4 - spl.b %d1 - and.l %d0, %d1 -.yuv_blue_ok4: - cmp.l %d0, %d2 - bls.s .yuv_green_ok4 - spl.b %d2 - and.l %d0, %d2 -.yuv_green_ok4: - cmp.l %d0, %d3 - bls.s .yuv_red_ok4 - spl.b %d3 - and.l %d0, %d3 -.yuv_red_ok4: -.yuv_all_ok4: - - lsr.l #1, %d3 /* pack, convert to RGB565 and output */ - lsr.l #1, %d1 - lsl.l #6, %d3 - or.l %d3, %d2 - lsl.l #5, %d2 - or.l %d2, %d1 - move.w %d1, (%a0) - - cmp.l %a1, %a5 /* run %a1 up to end of line */ - bhi.w .yuv_line_loop2 +.yuv_exit: + move.w %d4, (%a0) /* write (very) last pixel */ - movem.l (%sp), %d2-%d6/%a2-%a5 - lea.l (36, %sp), %sp /* restore registers */ + movem.l (%sp), %d2-%d7/%a2-%a6 + lea.l (44, %sp), %sp /* restore registers */ rts -.lcd_write_yuv420_lines_end: - .size lcd_write_yuv420_lines, .lcd_write_yuv420_lines_end - lcd_write_yuv420_lines +.yuv_end: + .size lcd_write_yuv420_lines, .yuv_end - lcd_write_yuv420_lines -- cgit v1.2.3