From 92fe88a3a7318903b1eb377f92bea750d8f66068 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Thu, 24 Aug 2006 06:23:03 +0000 Subject: X5: Applied tweaks from the H300 lcd_yuv_blit to the X5 version. Smaller code and ca. 1% speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10729 a1c6a512-1295-4272-9138-f99709370657 --- firmware/target/coldfire/iaudio/x5/lcd-as-x5.S | 476 +++++++++++-------------- 1 file changed, 207 insertions(+), 269 deletions(-) diff --git a/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S b/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S index 1a527bb8f3..7c89fb925e 100644 --- a/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S +++ b/firmware/target/coldfire/iaudio/x5/lcd-as-x5.S @@ -40,316 +40,254 @@ * |G| = |1.000000 -0.334136 -0.714136| |Pb| * |B| |1.000000 1.772000 0.000000| |Pr| * Scaled, normalized, rounded and tweaked to yield RGB 666: - * |R| |74 0 102| |Y' - 16| / 256 - * |G| = |74 -25 -52| |Cb - 128| / 256 - * |B| |74 129 0| |Cr - 128| / 256 + * |R| |74 0 101| |Y' - 16| / 256 + * |G| = |74 -24 -51| |Cb - 128| / 256 + * |B| |74 128 0| |Cr - 128| / 256 */ .align 2 .global lcd_write_yuv420_lines .type lcd_write_yuv420_lines,@function lcd_write_yuv420_lines: - lea.l (-40,%sp),%sp /* free up some registers */ - movem.l %d2-%d7/%a2-%a5,(%sp) + lea.l (-36,%sp),%sp /* free up some registers */ + movem.l %d2-%d6/%a2-%a5,(%sp) - lea.l 0xf0008002,%a0 /* LCD data port */ - move.l (40+4,%sp),%a1 /* Y data */ - move.l (40+8,%sp),%a2 /* Cb data */ - move.l (40+12,%sp),%a3 /* guv storage */ - move.l (40+16,%sp),%a4 /* Cr data */ - move.l (40+20,%sp),%d0 /* width */ - lea.l (%a1,%d0.l),%a5 /* end address */ + lea.l 0xf0008002,%a0 /* LCD data port */ + movem.l (36+4,%sp),%a1-%a5 /* Y data, Cb data, guv storage, Cr data, width */ + lea.l (%a1,%a5),%a5 /* end address */ .yuv_line_loop1: /** Write first pixel **/ - clr.l %d1 /* get y component */ - move.b (%a1)+,%d1 - subq.l #8,%d1 - subq.l #8,%d1 - moveq.l #74,%d6 - muls.w %d6,%d1 - asr.l #8,%d1 - - clr %d2 /* get bu component */ - move.b (%a2),%d2 - moveq.l #-128,%d6 - add.l %d6,%d2 - move.l %d2,%d3 /* %d3 = cb component for guv */ - move.w #129,%d6 - muls.w %d6,%d2 + clr.l %d1 /* get bu component */ + move.b (%a2),%d1 + clr.l %d3 /* get rv component */ + move.b (%a4),%d3 + moveq.l #-128,%d0 + add.l %d0,%d1 + add.l %d0,%d3 + + move.l %d1,%d2 /* %d2 = cb component for guv */ + asr.l #1,%d1 /* %d1 = 128 * (Cb - 128) / 256 */ + move.b %d1,(%a2)+ /* save bu for next line */ + moveq.l #-24,%d0 /* multiply first term of guv */ + muls.w %d0,%d2 + moveq.l #-51,%d0 /* multiply second term of guv */ + muls.w %d3,%d0 + add.l %d0,%d2 asr.l #8,%d2 - move.b %d2,(%a2)+ /* save bu for next line */ - - moveq.l #-25,%d6 /* multiply first term of guv */ - muls.w %d6,%d3 - - clr %d4 /* get rv component */ - move.b (%a4),%d4 - moveq.l #-128,%d6 - add.l %d6,%d4 - move.l %d4,%d7 /* %d7 = cr component for guv */ - moveq.l #102,%d6 - muls.w %d6,%d4 + move.b %d2,(%a3)+ /* save guv for next line */ + moveq.l #101,%d0 + muls.w %d0,%d3 + asr.l #8,%d3 + move.b %d3,(%a4)+ /* save rv for next line */ + + clr.l %d4 /* get y component */ + move.b (%a1)+,%d4 + moveq.l #74,%d0 + muls.w %d0,%d4 asr.l #8,%d4 - move.b %d4,(%a4)+ /* save rv for next line */ + subq.l #4,%d4 + move.l %d4,%d5 + move.l %d4,%d6 + /* : %d4,%d5,%d6 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ - moveq.l #-52,%d6 /* multiply second term of guv */ - muls.w %d6,%d7 - add.l %d7,%d3 - asr.l #8,%d3 - move.b %d3,(%a3)+ /* save guv for next line */ - /* : %d1 = Y, %d2 = bu, %d3 = guv, %d4 = rv */ - - move.l %d1,%d5 /* get r */ - add.l %d4,%d5 - move.l %d1,%d6 /* get g */ - add.l %d3,%d6 - move.l %d1,%d7 /* get b */ - add.l %d2,%d7 - - move.l %d7,%d1 /* is clamping needed? */ - or.l %d6,%d1 - or.l %d5,%d1 - asr.l #6,%d1 + add.l %d3,%d4 /* get r */ + add.l %d2,%d5 /* get g */ + add.l %d1,%d6 /* get b */ + + move.l %d6,%d0 /* is clamping needed? */ + or.l %d5,%d0 + or.l %d4,%d0 + asr.l #6,%d0 beq.b .yuv_no_clamp1 /* values in range: skip clamping */ - bpl.b .yuv_r63_test1 /* no negative values: skip to high bounds checks */ -.yuv_r0_test1: - clr.l %d1 /* check for any values < 0 */ - cmp.l %d1,%d5 - bgt.b .yuv_g0_test1 - clr.l %d5 -.yuv_g0_test1: - cmp.l %d1,%d6 - bgt.b .yuv_b0_test1 - clr.l %d6 -.yuv_b0_test1: - cmp.l %d1,%d7 - bgt.b .yuv_r63_test1 - clr.l %d7 -.yuv_r63_test1: /* check for any values > 63 */ - moveq.l #63,%d1 - cmp.l %d1,%d5 - blt.b .yuv_g63_test1 - move.l %d1,%d5 -.yuv_g63_test1: - cmp.l %d1,%d6 - blt.b .yuv_b63_test1 - move.l %d1,%d6 -.yuv_b63_test1: - cmp.l %d1,%d7 - blt.b .yuv_no_clamp1 - move.l %d1,%d7 + moveq.l #63, %d0 + cmp.l %d0, %d4 + bls.s .yuv_red_ok1 + spl.b %d4 + and.l %d0, %d4 +.yuv_red_ok1: + cmp.l %d0, %d5 + bls.s .yuv_green_ok1 + spl.b %d5 + and.l %d0, %d5 +.yuv_green_ok1: + cmp.l %d0, %d6 + bls.s .yuv_blue_ok1 + spl.b %d6 + and.l %d0, %d6 +.yuv_blue_ok1: .yuv_no_clamp1: - /* : %d5 = R, %d6 = G, %d7 = B */ + /* : %d4 = R, %d5 = G, %d6 = B */ - move.l %d6,%d1 /* save g for lower 9 bits */ - lsl.l #3,%d5 /* R << 3 */ - lsr.l #3,%d1 /* G >> 3 */ - or.l %d5,%d1 - move.w %d1,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */ - lsl.l #6,%d6 /* B << 6 */ - or.l %d6,%d7 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d7,(%a0) + move.l %d5,%d0 /* save g for lower 9 bits */ + lsl.l #3,%d4 /* R << 3 */ + lsr.l #3,%d0 /* G >> 3 */ + or.l %d4,%d0 + move.w %d0,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */ + lsl.l #6,%d5 /* B << 6 */ + or.l %d5,%d6 /* |00000000|000000000|0000gggg|ggbbbbbb| */ + move.w %d6,(%a0) /** Write second pixel **/ - clr %d1 - move.b (%a1)+,%d1 /* get y component */ - subq.l #8,%d1 - subq.l #8,%d1 - moveq.l #74,%d6 - muls.w %d6,%d1 - asr.l #8,%d1 - /* : %d1 = Y, %d2 = bu, %d3 = guv, %d4 = rv */ - - /* Add Y + each chroma component (can clobber %d2-%d4 values now) */ - add.l %d1,%d4 /* get r */ - add.l %d1,%d3 /* get g */ - add.l %d1,%d2 /* get b */ - - move.l %d2,%d1 /* is clamping needed? */ - or.l %d3,%d1 - or.l %d4,%d1 - asr.l #6,%d1 - beq.b .yuv_no_clamp2 /* values in range: skip clamping */ - bpl.b .yuv_r63_test2 /* no negative values: skip to high bounds checks */ -.yuv_r0_test2: - clr.l %d1 /* check for any values < 0 */ - cmp.l %d1,%d4 - bgt.b .yuv_g0_test2 clr.l %d4 -.yuv_g0_test2: - cmp.l %d1,%d3 - bgt.b .yuv_b0_test2 - clr.l %d3 -.yuv_b0_test2: - cmp.l %d1,%d2 - bgt.b .yuv_r63_test2 - clr.l %d2 -.yuv_r63_test2: /* check for any values > 63 */ - moveq.l #63,%d1 - cmp.l %d1,%d4 - blt.b .yuv_g63_test2 - move.l %d1,%d4 -.yuv_g63_test2: - cmp.l %d1,%d3 - blt.b .yuv_b63_test2 - move.l %d1,%d3 -.yuv_b63_test2: - cmp.l %d1,%d2 - blt.b .yuv_no_clamp2 - move.l %d1,%d2 + move.b (%a1)+,%d4 /* get y component */ + moveq.l #74,%d0 + muls.w %d0,%d4 + asr.l #8,%d4 + subq.l #4,%d4 + /* : %d4 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ + + /* Add Y + each chroma component (can clobber %d1-%d3 values now) */ + add.l %d4,%d3 /* get r */ + add.l %d4,%d2 /* get g */ + add.l %d4,%d1 /* get b */ + + move.l %d1,%d0 /* is clamping needed? */ + or.l %d2,%d0 + or.l %d3,%d0 + asr.l #6,%d0 + beq.b .yuv_no_clamp2 /* values in range: skip clamping */ + moveq.l #63, %d0 + cmp.l %d0, %d3 + bls.s .yuv_red_ok2 + spl.b %d3 + and.l %d0, %d3 +.yuv_red_ok2: + cmp.l %d0, %d2 + bls.s .yuv_green_ok2 + spl.b %d2 + and.l %d0, %d2 +.yuv_green_ok2: + cmp.l %d0, %d1 + bls.s .yuv_blue_ok2 + spl.b %d1 + and.l %d0, %d1 +.yuv_blue_ok2: .yuv_no_clamp2: - /* : %d4 = R, %d3 = G, %d2 = B */ - - move.l %d3,%d1 /* save g for lower 9 bits */ - lsl.l #3,%d4 /* R << 3 */ - lsr.l #3,%d1 /* G >> 3 */ - or.l %d4,%d1 /* |00000000|000000000|0000000r|rrrrrggg| */ + /* : %d3 = R, %d2 = G, %d1 = B */ + + move.l %d2,%d0 /* save g for lower 9 bits */ + lsl.l #3,%d3 /* R << 3 */ + lsr.l #3,%d0 /* G >> 3 */ + or.l %d3,%d0 /* |00000000|000000000|0000000r|rrrrrggg| */ + move.w %d0,(%a0) + lsl.l #6,%d2 /* G << 6 */ + or.l %d2,%d1 /* |00000000|000000000|0000gggg|ggbbbbbb| */ move.w %d1,(%a0) - lsl.l #6,%d3 /* G << 6 */ - or.l %d3,%d2 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d2,(%a0) cmp.l %a1,%a5 /* run %a1 up to end of line */ bhi.w .yuv_line_loop1 /* Rewind chroma pointers */ - move.l (40+8,%sp),%a2 /* bu data */ - move.l (40+12,%sp),%a3 /* guv data */ - move.l (40+16,%sp),%a4 /* rv data */ - lea.l (%a5,%d0),%a5 /* next end address */ + movem.l (36+8, %sp), %a2-%a5 /* bu data, guv data, rv data, width */ + lea.l (%a1, %a5), %a5 /* next end address */ .yuv_line_loop2: - clr %d1 - move.b (%a1)+,%d1 /* get y component */ - subq.l #8,%d1 - subq.l #8,%d1 - moveq.l #74,%d6 - muls.w %d6,%d1 - asr.l #8,%d1 - - move.b (%a2)+,%d2 /* read save chromas and sign extend */ + move.b (%a2)+,%d1 /* read save chromas and sign extend */ + extb.l %d1 + move.b (%a3)+,%d2 extb.l %d2 - move.b (%a3)+,%d3 + move.b (%a4)+,%d3 extb.l %d3 - move.b (%a4)+,%d4 - extb.l %d4 - /* : %d1 = Y, %d2 = bu, %d3 = guv, %d4 = rv */ - - move.l %d1,%d5 /* get r */ - add.l %d4,%d5 - move.l %d1,%d6 /* get g */ - add.l %d3,%d6 - move.l %d1,%d7 /* get b */ - add.l %d2,%d7 - - move.l %d7,%d1 /* is clamping needed? */ - or.l %d6,%d1 - or.l %d5,%d1 - asr.l #6,%d1 + + clr.l %d4 + move.b (%a1)+,%d4 /* get y component */ + moveq.l #74,%d0 + muls.w %d0,%d4 + asr.l #8,%d4 + subq.l #4,%d4 + move.l %d4,%d5 + move.l %d4,%d6 + /* : %d4,%d5,%d6 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ + + add.l %d3,%d4 /* get r */ + add.l %d2,%d5 /* get g */ + add.l %d1,%d6 /* get b */ + + move.l %d6,%d0 /* is clamping needed? */ + or.l %d5,%d0 + or.l %d4,%d0 + asr.l #6,%d0 beq.b .yuv_no_clamp3 /* values in range: skip clamping */ - bpl.b .yuv_r63_test3 /* no negative values: skip to high bounds checks */ -.yuv_r0_test3: - clr.l %d1 /* check for any values < 0 */ - cmp.l %d1,%d5 - bgt.b .yuv_g0_test3 - clr.l %d5 -.yuv_g0_test3: - cmp.l %d1,%d6 - bgt.b .yuv_b0_test3 - clr.l %d6 -.yuv_b0_test3: - cmp.l %d1,%d7 - bgt.b .yuv_r63_test3 - clr.l %d7 -.yuv_r63_test3: /* check for any values > 63 */ - moveq.l #63,%d1 - cmp.l %d1,%d5 - blt.b .yuv_g63_test3 - move.l %d1,%d5 -.yuv_g63_test3: - cmp.l %d1,%d6 - blt.b .yuv_b63_test3 - move.l %d1,%d6 -.yuv_b63_test3: - cmp.l %d1,%d7 - blt.b .yuv_no_clamp3 - move.l %d1,%d7 + moveq.l #63, %d0 + cmp.l %d0, %d4 + bls.s .yuv_red_ok3 + spl.b %d4 + and.l %d0, %d4 +.yuv_red_ok3: + cmp.l %d0, %d5 + bls.s .yuv_green_ok3 + spl.b %d5 + and.l %d0, %d5 +.yuv_green_ok3: + cmp.l %d0, %d6 + bls.s .yuv_blue_ok3 + spl.b %d6 + and.l %d0, %d6 +.yuv_blue_ok3: .yuv_no_clamp3: - /* : %d5 = R, %d6 = G, %d7 = B */ + /* : %d4 = R, %d5 = G, %d6 = B */ - move.l %d6,%d1 /* save g for lower 9 bits */ - lsl.l #3,%d5 /* R << 3 */ - lsr.l #3,%d1 /* G >> 3 */ - or.l %d5,%d1 - move.w %d1,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */ - lsl.l #6,%d6 /* B << 6 */ - or.l %d6,%d7 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d7,(%a0) + move.l %d5,%d0 /* save g for lower 9 bits */ + lsl.l #3,%d4 /* R << 3 */ + lsr.l #3,%d0 /* G >> 3 */ + or.l %d4,%d0 + move.w %d0,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */ + lsl.l #6,%d5 /* B << 6 */ + or.l %d5,%d6 /* |00000000|000000000|0000gggg|ggbbbbbb| */ + move.w %d6,(%a0) /** Write second pixel **/ - clr %d1 - move.b (%a1)+,%d1 /* get y component */ - subq.l #8,%d1 - subq.l #8,%d1 - moveq.l #74,%d6 - muls.w %d6,%d1 - asr.l #8,%d1 - /* : %d1 = Y, %d2 = bu, %d3 = guv, %d4 = rv */ - - /* Add Y + each chroma component (can clobber %d2-%d4 values now) */ - add.l %d1,%d4 /* get r */ - add.l %d1,%d3 /* get g */ - add.l %d1,%d2 /* get b */ - - move.l %d2,%d1 /* is clamping needed? */ - or.l %d3,%d1 - or.l %d4,%d1 - asr.l #6,%d1 - beq.b .yuv_no_clamp4 /* values in range: skip clamping */ - bpl.b .yuv_r63_test4 /* no negative values: skip to high bounds checks */ -.yuv_r0_test4: - clr.l %d1 /* check for any values < 0 */ - cmp.l %d1,%d4 - bgt.b .yuv_g0_test4 clr.l %d4 -.yuv_g0_test4: - cmp.l %d1,%d3 - bgt.b .yuv_b0_test4 - clr.l %d3 -.yuv_b0_test4: - cmp.l %d1,%d2 - bgt.b .yuv_r63_test4 - clr.l %d2 -.yuv_r63_test4: /* check for any values > 63 */ - moveq.l #63,%d1 - cmp.l %d1,%d4 - blt.b .yuv_g63_test4 - move.l %d1,%d4 -.yuv_g63_test4: - cmp.l %d1,%d3 - blt.b .yuv_b63_test4 - move.l %d1,%d3 -.yuv_b63_test4: - cmp.l %d1,%d2 - blt.b .yuv_no_clamp4 - move.l %d1,%d2 + move.b (%a1)+,%d4 /* get y component */ + moveq.l #74,%d0 + muls.w %d0,%d4 + asr.l #8,%d4 + subq.l #4,%d4 + /* : %d4 = Y, %d1 = bu, %d2 = guv, %d3 = rv */ + + /* Add Y + each chroma component (can clobber %d1-%d3 values now) */ + add.l %d4,%d3 /* get r */ + add.l %d4,%d2 /* get g */ + add.l %d4,%d1 /* get b */ + + move.l %d1,%d0 /* is clamping needed? */ + or.l %d2,%d0 + or.l %d3,%d0 + asr.l #6,%d0 + beq.b .yuv_no_clamp4 /* values in range: skip clamping */ + moveq.l #63, %d0 + cmp.l %d0, %d3 + bls.s .yuv_red_ok4 + spl.b %d3 + and.l %d0, %d3 +.yuv_red_ok4: + cmp.l %d0, %d2 + bls.s .yuv_green_ok4 + spl.b %d2 + and.l %d0, %d2 +.yuv_green_ok4: + cmp.l %d0, %d1 + bls.s .yuv_blue_ok4 + spl.b %d1 + and.l %d0, %d1 +.yuv_blue_ok4: .yuv_no_clamp4: - /* : %d4 = R, %d3 = G, %d2 = B */ - - move.l %d3,%d1 /* save g for lower 9 bits */ - lsl.l #3,%d4 /* R << 3 */ - lsr.l #3,%d1 /* G >> 3 */ - or.l %d4,%d1 /* |00000000|000000000|0000000r|rrrrrggg| */ + /* : %d3 = R, %d2 = G, %d1 = B */ + + move.l %d2,%d0 /* save g for lower 9 bits */ + lsl.l #3,%d3 /* R << 3 */ + lsr.l #3,%d0 /* G >> 3 */ + or.l %d3,%d0 /* |00000000|000000000|0000000r|rrrrrggg| */ + move.w %d0,(%a0) + lsl.l #6,%d2 /* G << 6 */ + or.l %d2,%d1 /* |00000000|000000000|0000gggg|ggbbbbbb| */ move.w %d1,(%a0) - lsl.l #6,%d3 /* G << 6 */ - or.l %d3,%d2 /* |00000000|000000000|0000gggg|ggbbbbbb| */ - move.w %d2,(%a0) cmp.l %a1,%a5 /* run %a0 up to end of line */ bhi.w .yuv_line_loop2 - movem.l (%sp),%d2-%d7/%a2-%a5 - lea.l (40,%sp),%sp /* restore registers */ + movem.l (%sp),%d2-%d6/%a2-%a5 + lea.l (36,%sp),%sp /* restore registers */ rts /* end lcd_write_yuv420_lines */ -- cgit v1.2.3