From f8b1da2f7bddebc9c7026bd5d106dec118ce70a9 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Sat, 4 Nov 2006 00:42:18 +0000 Subject: H300, X5: Faster lcd_yuv_blit() using EMAC. Speedup of the function itself at 124MHz: 10.5% on X5, 16.5% on H300. mpegplayer speedup 3..4% git-svn-id: svn://svn.rockbox.org/rockbox/trunk@11429 a1c6a512-1295-4272-9138-f99709370657 --- firmware/target/coldfire/iriver/h300/lcd-as-h300.S | 346 +++++++-------------- 1 file changed, 120 insertions(+), 226 deletions(-) (limited to 'firmware/target/coldfire/iriver/h300/lcd-as-h300.S') diff --git a/firmware/target/coldfire/iriver/h300/lcd-as-h300.S b/firmware/target/coldfire/iriver/h300/lcd-as-h300.S index ae55dfb224..1873b905c6 100755 --- a/firmware/target/coldfire/iriver/h300/lcd-as-h300.S +++ b/firmware/target/coldfire/iriver/h300/lcd-as-h300.S @@ -22,7 +22,7 @@ .section .icode, "ax", @progbits -/* lcd_write_yuv420_lines(), based on lcd-as-x5.S +/* lcd_write_yuv420_lines() * * See http://en.wikipedia.org/wiki/YCbCr * ITU-R BT.601 (formerly CCIR 601): @@ -38,252 +38,146 @@ * |R| |1.000000 0.000000 1.402000| |Y'| * |G| = |1.000000 -0.334136 -0.714136| |Pb| * |B| |1.000000 1.772000 0.000000| |Pr| - * Scaled, normalized, rounded and tweaked to yield RGB666, as converting - * directly to RGB565 gives too much roundoff error: - * |R| |74 0 101| |Y' - 16| / 256 - * |G| = |74 -24 -51| |Cb - 128| / 256 - * |B| |74 128 0| |Cr - 128| / 256 + * Scaled, normalized, rounded and tweaked to yield RGB565: + * |R| |19611723 0 26881894| |Y' - 16| >> 27 + * |G| = |19611723 -6406711 -13692816| |Cb - 128| >> 26 + * |B| |19611723 33976259 0| |Cr - 128| >> 27 + * + * Needs EMAC set to saturated, signed integer mode. 
*/

/* Implementation notes for lcd_write_yuv420_lines
 *
 * Converts YUV 4:2:0 data to RGB565 and streams it to the LCD data port,
 * two pixel lines per call. Each Cb/Cr sample is shared by two horizontally
 * adjacent pixels, and the same chroma row is reused for the second line
 * (the chroma pointers are reloaded from the stacked arguments).
 *
 * Stack arguments (offsets relative to %sp on entry):
 *    4  Y data pointer  - read continuously across both lines, so the
 *                         second luma line has to follow the first one
 *                         directly in memory
 *    8  Cb data pointer
 *   12  Cr data pointer
 *   16  width in pixels (bytes of luma per line)
 * NOTE(review): presumably called from C as
 *   void lcd_write_yuv420_lines(const unsigned char *y,
 *                               const unsigned char *cb,
 *                               const unsigned char *cr, int width);
 * confirm against the caller. The Y pointer slot on the stack is
 * overwritten with 0 (it doubles as the "second line pending" flag).
 *
 * Register roles:
 *   %a0      LCD data port
 *   %a1      running Y pointer
 *   %a2/%a3  running Cb / Cr pointers
 *   %a4      end address of the current luma line
 *   %a5/%a6  y factor / bu factor
 *   %d5/%d6  gu factor / gv factor
 *   %d7      sign-flip mask for packed RGB565
 *   %d0      scratch (offsets, shift counts, rv factor)
 *   %d1      luma of the first pixel of the current pair
 *   %d2-%d4  chroma inputs, then B / G / R and the packed pixel (%d4)
 *   %acc0-2  blue / green / red accumulators
 *
 * Needs EMAC set to saturated, signed integer mode: saturation performs the
 * clamping, and the signed results are turned into unsigned colour fields
 * by flipping each field's top bit after packing.
 *
 * Fixed-point factors are coefficient * 2^32 / 219 for luma and
 * coefficient * 2^32 / 224 for chroma.
 */

    .equ    YUV_Y_FACTOR,   19611723    /* 1.000000 * 2^32 / 219 */
    .equ    YUV_BU_FACTOR,  33976259    /* 1.772000 * 2^32 / 224 */
    /* BT.601 Cb->G coefficient is 0.344136 (= 0.114 * 1.772 / 0.587).
     * The value 6406711 used previously was derived from 0.334136, a digit
     * typo that is also present in the matrix comment preceding this
     * function; it made green slightly too strong for Cb > 128. */
    .equ    YUV_GU_FACTOR,  -6598450    /* -0.344136 * 2^32 / 224 */
    .equ    YUV_GV_FACTOR,  -13692816   /* -0.714136 * 2^32 / 224 */
    .equ    YUV_RV_FACTOR,  26881894    /* 1.402000 * 2^32 / 224 */
    .equ    YUV_SIGN_MASK,  0x8410      /* top bits of R (15), G (10), B (4) */

    .align      2
    .global     lcd_write_yuv420_lines
    .type       lcd_write_yuv420_lines, @function

lcd_write_yuv420_lines:
    lea.l   (-44, %sp), %sp         /* room for 11 saved registers */
    movem.l %d2-%d7/%a2-%a6, (%sp)

    lea.l   0xf0000002, %a0         /* LCD data port */
    movem.l (44+4, %sp), %a1-%a4    /* Y data, Cb data, Cr data, width */
    lea.l   (%a1, %a4), %a4         /* end address of first luma line */

    move.l  #YUV_Y_FACTOR, %a5      /* y factor */
    move.l  #YUV_BU_FACTOR, %a6     /* bu factor */
    move.l  #YUV_GU_FACTOR, %d5     /* gu factor */
    move.l  #YUV_GV_FACTOR, %d6     /* gv factor */
    move.l  #YUV_SIGN_MASK, %d7     /* bitmask for signed->unsigned conversion
                                     * of R, G and B within RGB565 at once */

    /* chroma for (very) first & second pixel */
    clr.l   %d2                     /* load u component */
    move.b  (%a2)+, %d2
    clr.l   %d3                     /* load v component */
    move.b  (%a3)+, %d3
    moveq.l #-128, %d0
    add.l   %d0, %d2                /* Cb - 128 */
    add.l   %d0, %d3                /* Cr - 128 */

    mac.l   %a6, %d2, %acc0         /* bu */
    mac.l   %d5, %d2, %acc1         /* gu */
    mac.l   %d6, %d3, %acc1         /* gv */
    move.l  #YUV_RV_FACTOR, %d0     /* rv factor (no register left to keep it) */
    mac.l   %d0, %d3, %acc2         /* rv */

    /* luma for (very) first pixel */
    clr.l   %d1
    move.b  (%a1)+, %d1
    moveq.l #-126, %d0
    add.l   %d1, %d0                /* y' (-0.5 ... +0.5) */
    mac.l   %a5, %d0, %acc0
    mac.l   %a5, %d0, %acc1
    mac.l   %a5, %d0, %acc2

    bra.b   .yuv_line_entry         /* nothing to write yet: skip the delayed
                                     * LCD write on the very first pixel */

.yuv_line_loop:
    /* chroma for first & second pixel */
    clr.l   %d2                     /* load u component */
    move.b  (%a2)+, %d2
    clr.l   %d3                     /* load v component */
    move.b  (%a3)+, %d3
    moveq.l #-128, %d0
    add.l   %d0, %d2                /* Cb - 128 */
    add.l   %d0, %d3                /* Cr - 128 */

    /* accumulators are empty here: the previous pair ended with movclr */
    mac.l   %a6, %d2, %acc0         /* bu */
    mac.l   %d5, %d2, %acc1         /* gu */
    mac.l   %d6, %d3, %acc1         /* gv */
    move.l  #YUV_RV_FACTOR, %d0     /* rv factor */
    mac.l   %d0, %d3, %acc2         /* rv */

    /* luma for first pixel */
    clr.l   %d1
    move.b  (%a1)+, %d1
    moveq.l #-126, %d0
    add.l   %d1, %d0                /* y' (-0.5 ... +0.5) */
    mac.l   %a5, %d0, %acc0
    mac.l   %a5, %d0, %acc1
    mac.l   %a5, %d0, %acc2

    move.w  %d4, (%a0)              /* write the previous pixel */
    /* LCD write is delayed one pixel to use it for filling the EMAC latency */

    /* first pixel of the pair: convert to RGB565 and pack into %d4 */
.yuv_line_entry:
    moveq.l #27, %d0
    move.l  %acc0, %d2              /* keep accumulators for second pixel */
    move.l  %acc1, %d3
    move.l  %acc2, %d4
    lsr.l   %d0, %d2                /* blue:  5 bits */
    lsr.l   %d0, %d4                /* red:   5 bits */
    moveq.l #26, %d0
    lsr.l   %d0, %d3                /* green: 6 bits */
    lsl.l   #6, %d4
    or.l    %d3, %d4
    lsl.l   #5, %d4
    or.l    %d2, %d4
    eor.l   %d7, %d4                /* signed -> unsigned for all 3 fields */

    /* luma for second pixel as delta from the first */
    clr.l   %d0
    move.b  (%a1)+, %d0
    sub.l   %d1, %d0
    mac.l   %a5, %d0, %acc0
    mac.l   %a5, %d0, %acc1
    mac.l   %a5, %d0, %acc2

    move.w  %d4, (%a0)              /* write first pixel of the pair */
    /* LCD write is delayed one pixel to use it for filling the EMAC latency */

    /* second pixel of the pair: convert to RGB565 and pack into %d4 */
    moveq.l #27, %d0
    movclr.l %acc0, %d2             /* fetch and reset for the next pair */
    movclr.l %acc1, %d3
    movclr.l %acc2, %d4
    lsr.l   %d0, %d2                /* blue:  5 bits */
    lsr.l   %d0, %d4                /* red:   5 bits */
    moveq.l #26, %d0
    lsr.l   %d0, %d3                /* green: 6 bits */
    lsl.l   #6, %d4
    or.l    %d3, %d4
    lsl.l   #5, %d4
    or.l    %d2, %d4
    eor.l   %d7, %d4                /* signed -> unsigned for all 3 fields */

    cmp.l   %a1, %a4                /* run %a1 up to end of line */
    bhi.w   .yuv_line_loop

    tst.l   (44+4, %sp)             /* use original Y pointer as a flag to */
    beq.b   .yuv_exit               /* distinguish between first and second */
    clr.l   (44+4, %sp)             /* pixel line */

    /* Rewind chroma pointers */
    movem.l (44+8, %sp), %a2-%a4    /* Cb data, Cr data, width */
    lea.l   (%a1, %a4), %a4         /* end address of second luma line */
    bra.w   .yuv_line_loop

.yuv_exit:
    move.w  %d4, (%a0)              /* write (very) last pixel */

    movem.l (%sp), %d2-%d7/%a2-%a6
    lea.l   (44, %sp), %sp          /* restore registers */
    rts
.yuv_end:
    .size   lcd_write_yuv420_lines, .yuv_end - lcd_write_yuv420_lines