summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndree Buschmann <AndreeBuschmann@t-online.de>2010-12-12 15:23:20 +0000
committerAndree Buschmann <AndreeBuschmann@t-online.de>2010-12-12 15:23:20 +0000
commit395d72f71aea7f727d5189b29ee3f72a1cc5538e (patch)
tree729daf89818a687be8195eed26d6f3694922d798
parenteadff408632571c93e39966e46d0c5736d7a6aa9 (diff)
downloadrockbox-395d72f71aea7f727d5189b29ee3f72a1cc5538e.tar.gz
rockbox-395d72f71aea7f727d5189b29ee3f72a1cc5538e.zip
FS#11807 - Major speedup of iPod nano 2G. Part 4: Introduce asm for yuv blitting. Overall speedup of part1-4 is +50% for RGB and +93% for YUV.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28813 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--firmware/SOURCES1
-rwxr-xr-xfirmware/target/arm/s5l8700/ipodnano2g/lcd-asm-nano2g.S254
-rw-r--r--firmware/target/arm/s5l8700/ipodnano2g/lcd-nano2g.c131
3 files changed, 276 insertions, 110 deletions
diff --git a/firmware/SOURCES b/firmware/SOURCES
index 7f21a89771..72b0217dfb 100644
--- a/firmware/SOURCES
+++ b/firmware/SOURCES
@@ -1504,6 +1504,7 @@ target/arm/s5l8700/kernel-s5l8700.c
1504target/arm/s5l8700/dma-s5l8700.c 1504target/arm/s5l8700/dma-s5l8700.c
1505target/arm/s5l8700/ipodnano2g/backlight-nano2g.c 1505target/arm/s5l8700/ipodnano2g/backlight-nano2g.c
1506target/arm/s5l8700/ipodnano2g/lcd-nano2g.c 1506target/arm/s5l8700/ipodnano2g/lcd-nano2g.c
1507target/arm/s5l8700/ipodnano2g/lcd-asm-nano2g.S
1507target/arm/s5l8700/ipodnano2g/powermgmt-nano2g.c 1508target/arm/s5l8700/ipodnano2g/powermgmt-nano2g.c
1508target/arm/s5l8700/ipodnano2g/power-nano2g.c 1509target/arm/s5l8700/ipodnano2g/power-nano2g.c
1509target/arm/s5l8700/ipodnano2g/ftl-nano2g.c 1510target/arm/s5l8700/ipodnano2g/ftl-nano2g.c
diff --git a/firmware/target/arm/s5l8700/ipodnano2g/lcd-asm-nano2g.S b/firmware/target/arm/s5l8700/ipodnano2g/lcd-asm-nano2g.S
new file mode 100755
index 0000000000..25cf662c3f
--- /dev/null
+++ b/firmware/target/arm/s5l8700/ipodnano2g/lcd-asm-nano2g.S
@@ -0,0 +1,254 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id: lcd-as-video.S 26756 2010-06-11 04:41:36Z funman $
9 *
10 * Copyright (C) 2010 by Andree Buschmann
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22#include "config.h"
23
24 .section .icode, "ax", %progbits
25
/****************************************************************************
 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
 *                                    unsigned LCD_BASE,
 *                                    int width,
 *                                    int stride);
 *
 * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is:
 * |R|   |1.164  0.000  1.596| |Y' -  16|
 * |G| = |1.164 -0.391 -0.813| |Pb - 128|
 * |B|   |1.164  2.018  0.000| |Pr - 128|
 *
 * Scaled, normalized, rounded and tweaked to yield RGB 565:
 * |R|   |74   0 101| |Y' -  16| >> 9
 * |G| = |74 -24 -51| |Cb - 128| >> 8
 * |B|   |74 128   0| |Cr - 128| >> 9
 *
 * Converts two lines from YUV to RGB565 and writes them to the LCD at once.
 * The first loop loads Cb/Cr, calculates the chroma offsets (bu, rv, guv) and
 * saves them to a buffer on the stack while converting the first luma line.
 * The second loop reloads these chroma offsets from the buffer for the second
 * luma line (at src[0] + stride). Each loop iteration converts two pixels
 * that share one chroma sample and writes both to the LCD.
 *
 * In:   r0 = src   (array of 3 pointers: Y', Cb, Cr)
 *       r1 = LCD_BASE
 *       r2 = width  (pixels per line; two pixels are handled per iteration,
 *                    and each loop body runs at least once)
 *       r3 = stride (byte offset from the first to the second luma line)
 * Out:  nothing
 * Regs: r0-r3, r12 are scratch; r4-r10 and lr are saved and restored.
 *
 * Stack frame after allocation (sizes in bytes):
 *       [sp, #0]   LCD_BASE
 *       [sp, #4]   width
 *       [sp, #8]   &ysrc[stride] (start of the second luma line)
 *       [sp, #12]  size of this allocation (used to free it again)
 *       [sp, #16]  chroma buffer, 3 words (bu, rv, guv) per pixel pair
 */

/* Offsets relative to LCD_BASE and status bit used by the FIFO handshake */
    .equ    LCD_STATUS_OFS,       0x1C  /* status register                  */
    .equ    LCD_DATA_OFS,         0x40  /* register the pixels are written to */
    .equ    LCD_STATUS_FIFO_HALF, 0x08  /* set while FIFO is half full      */

/* Convert one luma sample to a clamped RGB565 pixel.
 *
 * In:   r9 = luma pointer (post-incremented by 1)
 *       r0 = bu, r1 = rv, r2 = guv (chroma offsets of the current pair)
 * Out:  \out = r << 11 | g << 5 | b
 * Args: \ytmp = register used for the scaled luma value
 *       \chk  = scratch register for the combined range check
 *               (may be r0 once bu is no longer needed afterwards)
 *       \out  = destination register for the packed pixel
 * Clobbers r3, r4, r6, \ytmp, \chk and the flags.
 */
    .macro  yuv_to_rgb565 ytmp, chk, out
    ldrb    \ytmp, [r9], #1             /* ytmp = *ysrc++ = *Y'_p++         */
    sub     \ytmp, \ytmp, #16           /* ytmp = (Y'-16) * 37, which is    */
    add     r3, \ytmp, \ytmp, asl #2    /* (Y'-16) * 74 >> 1; the shifts    */
    add     \ytmp, r3, \ytmp, asl #5    /* below are one bit shorter for it */

    add     r6, r1, \ytmp, asr #8       /* r6 = r = (Y >> 9) + rv           */
    add     r3, r2, \ytmp, asr #7       /* r3 = g = (Y >> 8) + guv          */
    add     r4, r0, \ytmp, asr #8       /* r4 = b = (Y >> 9) + bu           */

    orr     \chk, r6, r4                /* check if clamping is needed...   */
    orr     \chk, \chk, r3, asr #1      /* ...at all                        */
    cmp     \chk, #31
    bls     15f                         /* -> no clamp                      */
    cmp     r6, #31                     /* clamp r: <0 -> 0, >31 -> 31      */
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #31
    cmp     r3, #63                     /* clamp g: <0 -> 0, >63 -> 63      */
    mvnhi   r3, r3, asr #31
    andhi   r3, r3, #63
    cmp     r4, #31                     /* clamp b: <0 -> 0, >31 -> 31      */
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #31
15:                                     /* no clamp                         */
    orr     r4, r4, r3, lsl #5          /* pixel = r<<11 | g<<5 | b         */
    orr     \out, r4, r6, lsl #11
    .endm

    .align  2
    .global lcd_write_yuv420_lines
    .type   lcd_write_yuv420_lines, %function
lcd_write_yuv420_lines:
    /* r0 = src = yuv_src */
    /* r1 = dst = LCD_BASE */
    /* r2 = width */
    /* r3 = stride */
    stmfd   sp!, { r4-r10, lr }         /* save non-scratch                 */
    ldmia   r0, { r9, r10, r12 }        /* r9  = yuv_src[0] = Y'_p          */
                                        /* r10 = yuv_src[1] = Cb_p          */
                                        /* r12 = yuv_src[2] = Cr_p          */
    add     r3, r9, r3                  /* r3 = &ysrc[stride]               */
    add     r4, r2, r2, asr #1          /* chroma buffer length = width/2*3 */
    mov     r4, r4, asl #2              /* use words for str/ldm possibility */
    add     r4, r4, #19                 /* plus room for 4 additional words, */
    bic     r4, r4, #3                  /* rounded up to multiples of 4 byte */
    sub     sp, sp, r4                  /* and allocate on stack            */
    stmia   sp, {r1-r4}                 /* LCD_BASE, width, &ysrc[stride],  */
                                        /* stack_alloc                      */

    mov     r7, r2                      /* r7 = loop count                  */
    add     r8, sp, #16                 /* chroma buffer                    */
    mov     lr, r1                      /* LCD data port = LCD_BASE         */

    /* 1st loop start */
10:                                     /* loop start                       */

    ldrb    r0, [r10], #1               /* r0 = *usrc++ = *Cb_p++           */
    ldrb    r1, [r12], #1               /* r1 = *vsrc++ = *Cr_p++           */

    sub     r0, r0, #128                /* r0 = Cb-128                      */
    sub     r1, r1, #128                /* r1 = Cr-128                      */

    add     r2, r1, r1, asl #1          /* r2 = Cr*51 + Cb*24               */
    add     r2, r2, r2, asl #4
    add     r2, r2, r0, asl #3
    add     r2, r2, r0, asl #4

    add     r4, r1, r1, asl #2          /* r1 = Cr*101                      */
    add     r4, r4, r1, asl #5
    add     r1, r4, r1, asl #6

    add     r1, r1, #256                /* r1 = rv = (r1 + 256) >> 9        */
    mov     r1, r1, asr #9
    rsb     r2, r2, #128                /* r2 = guv = (-r2 + 128) >> 8      */
    mov     r2, r2, asr #8
    add     r0, r0, #2                  /* r0 = bu = (Cb*128 + 256) >> 9    */
    mov     r0, r0, asr #2
    stmia   r8!, {r0-r2}                /* store bu, rv, guv for 2nd loop   */

    /* 1st loop, first pixel: r5 = pixel_1, kept for the paired write */
    yuv_to_rgb565 r5, r5, r5

    /* 1st loop, second pixel: r4 = pixel_2 (r0/bu is dead after this) */
    yuv_to_rgb565 r4, r0, r4

    /* wait for FIFO half full */
.fifo_wait1:
    ldr     r3, [lr, #LCD_STATUS_OFS]   /* while (LCD_STATUS & 0x08);       */
    tst     r3, #LCD_STATUS_FIFO_HALF
    bne     .fifo_wait1                 /* TST only sets N/Z, so test Z     */
                                        /* (GT would also read a stale V)   */

    str     r5, [lr, #LCD_DATA_OFS]     /* write pixel_1                    */
    str     r4, [lr, #LCD_DATA_OFS]     /* write pixel_2                    */

    subs    r7, r7, #2                  /* check for loop end               */
    bgt     10b                         /* back to beginning                */
    /* 1st loop end */

    /* Reload several registers for pointer rewinding for next loop */
    add     r8, sp, #16                 /* chroma buffer                    */
    ldmia   sp, { r1, r7, r9}           /* r1 = LCD_BASE                    */
                                        /* r7 = loop count                  */
                                        /* r9 = &ysrc[stride]               */

    /* 2nd loop start */
20:                                     /* loop start                       */
    /* restore r0 (bu), r1 (rv) and r2 (guv) from chroma buffer */
    ldmia   r8!, {r0-r2}

    /* 2nd loop, first pixel: r5 = pixel_1, kept for the paired write */
    yuv_to_rgb565 r5, r5, r5

    /* 2nd loop, second pixel: r4 = pixel_2 (r0/bu is dead after this) */
    yuv_to_rgb565 r4, r0, r4

    /* wait for FIFO half full */
.fifo_wait2:
    ldr     r3, [lr, #LCD_STATUS_OFS]   /* while (LCD_STATUS & 0x08);       */
    tst     r3, #LCD_STATUS_FIFO_HALF
    bne     .fifo_wait2                 /* TST only sets N/Z, so test Z     */
                                        /* (GT would also read a stale V)   */

    str     r5, [lr, #LCD_DATA_OFS]     /* write pixel_1                    */
    str     r4, [lr, #LCD_DATA_OFS]     /* write pixel_2                    */

    subs    r7, r7, #2                  /* check for loop end               */
    bgt     20b                         /* back to beginning                */
    /* 2nd loop end */

    ldr     r3, [sp, #12]               /* r3 = size allocated at entry     */
    add     sp, sp, r3                  /* deallocate buffer                */
    ldmpc   regs=r4-r10                 /* restore registers and return     */

    .ltorg
    .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
diff --git a/firmware/target/arm/s5l8700/ipodnano2g/lcd-nano2g.c b/firmware/target/arm/s5l8700/ipodnano2g/lcd-nano2g.c
index 071b68fde3..d1c41d652f 100644
--- a/firmware/target/arm/s5l8700/ipodnano2g/lcd-nano2g.c
+++ b/firmware/target/arm/s5l8700/ipodnano2g/lcd-nano2g.c
@@ -400,48 +400,21 @@ void lcd_update_rect(int x, int y, int width, int height)
400 } 400 }
401} 401}
402 402
403/*** update functions ***/ 403/* Line write helper function for lcd_yuv_blit. Writes two lines of yuv420. */
404 404extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
405#define CSUB_X 2 405 unsigned lcd_baseadress,
406#define CSUB_Y 2 406 int width,
407 407 int stride);
408/* YUV- > RGB565 conversion
409 * |R| |1.000000 -0.000001 1.402000| |Y'|
410 * |G| = |1.000000 -0.334136 -0.714136| |Pb|
411 * |B| |1.000000 1.772000 0.000000| |Pr|
412 * Scaled, normalized, rounded and tweaked to yield RGB 565:
413 * |R| |74 0 101| |Y' - 16| >> 9
414 * |G| = |74 -24 -51| |Cb - 128| >> 8
415 * |B| |74 128 0| |Cr - 128| >> 9
416*/
417 408
418#define RGBYFAC 74 /* 1.0 */ 409/* Blit a YUV bitmap directly to the LCD */
419#define RVFAC 101 /* 1.402 */
420#define GVFAC (-51) /* -0.714136 */
421#define GUFAC (-24) /* -0.334136 */
422#define BUFAC 128 /* 1.772 */
423
424/* ROUNDOFFS contain constant for correct round-offs as well as
425 constant parts of the conversion matrix (e.g. (Y'-16)*RGBYFAC
426 -> constant part = -16*RGBYFAC). Through extraction of these
427 constant parts we save at leat 4 substractions in the conversion
428 loop */
429#define ROUNDOFFSR (256 - 16*RGBYFAC - 128*RVFAC)
430#define ROUNDOFFSG (128 - 16*RGBYFAC - 128*GVFAC - 128*GUFAC)
431#define ROUNDOFFSB (256 - 16*RGBYFAC - 128*BUFAC)
432
433#define MAX_5BIT 0x1f
434#define MAX_6BIT 0x3f
435
436/* Performance function to blit a YUV bitmap directly to the LCD */
437void lcd_blit_yuv(unsigned char * const src[3], 410void lcd_blit_yuv(unsigned char * const src[3],
438 int src_x, int src_y, int stride, 411 int src_x, int src_y, int stride,
439 int x, int y, int width, int height) 412 int x, int y, int width, int height)
440{ 413{
441 int h; 414 unsigned int z, y0, x0, y1, x1;;
442 int y0, x0, y1, x1; 415 unsigned char const * yuv_src[3];
443 416
444 width = (width + 1) & ~1; 417 width = (width + 1) & ~1; /* ensure width is even */
445 418
446 x0 = x; /* start horiz */ 419 x0 = x; /* start horiz */
447 y0 = y; /* start vert */ 420 y0 = y; /* start vert */
@@ -471,79 +444,17 @@ void lcd_blit_yuv(unsigned char * const src[3],
471 s5l_lcd_write_cmd(R_MEMORY_WRITE); 444 s5l_lcd_write_cmd(R_MEMORY_WRITE);
472 } 445 }
473 446
474 const int stride_div_csub_x = stride/CSUB_X; 447 z = stride * src_y;
475 448 yuv_src[0] = src[0] + z + src_x;
476 h = height; 449 yuv_src[1] = src[1] + (z >> 2) + (src_x >> 1);
477 while (h > 0) { 450 yuv_src[2] = src[2] + (yuv_src[1] - src[1]);
478 /* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */
479 const unsigned char *ysrc = src[0] + stride * src_y + src_x;
480
481 const int uvoffset = stride_div_csub_x * (src_y/CSUB_Y) +
482 (src_x/CSUB_X);
483 451
484 const unsigned char *usrc = src[1] + uvoffset; 452 height >>= 1;
485 const unsigned char *vsrc = src[2] + uvoffset;
486 const unsigned char *row_end = ysrc + width;
487 453
488 int yp, up, vp; 454 do {
489 int red1, green1, blue1; 455 lcd_write_yuv420_lines(yuv_src, LCD_BASE, width, stride);
490 int red2, green2, blue2; 456 yuv_src[0] += stride << 1;
491 457 yuv_src[1] += stride >> 1; /* Skip down one chroma line */
492 int rc, gc, bc; 458 yuv_src[2] += stride >> 1;
493 459 } while (--height > 0);
494 do
495 {
496 up = *usrc++;
497 vp = *vsrc++;
498 rc = RVFAC * vp + ROUNDOFFSR;
499 gc = GVFAC * vp + GUFAC * up + ROUNDOFFSG;
500 bc = BUFAC * up + ROUNDOFFSB;
501
502 /* Pixel 1 -> RGB565 */
503 yp = *ysrc++ * RGBYFAC;
504 red1 = (yp + rc) >> 9;
505 green1 = (yp + gc) >> 8;
506 blue1 = (yp + bc) >> 9;
507
508 /* Pixel 2 -> RGB565 */
509 yp = *ysrc++ * RGBYFAC;
510 red2 = (yp + rc) >> 9;
511 green2 = (yp + gc) >> 8;
512 blue2 = (yp + bc) >> 9;
513
514 /* Since out of bounds errors are relatively rare, we check two
515 pixels at once to see if any components are out of bounds, and
516 then fix whichever is broken. This works due to high values and
517 negative values both being !=0 when bitmasking them.
518 We first check for red and blue components (5bit range). */
519 if ((red1 | blue1 | red2 | blue2) & ~MAX_5BIT)
520 {
521 if (red1 & ~MAX_5BIT)
522 red1 = (red1 >> 31) ? 0 : MAX_5BIT;
523 if (blue1 & ~MAX_5BIT)
524 blue1 = (blue1 >> 31) ? 0 : MAX_5BIT;
525 if (red2 & ~MAX_5BIT)
526 red2 = (red2 >> 31) ? 0 : MAX_5BIT;
527 if (blue2 & ~MAX_5BIT)
528 blue2 = (blue2 >> 31) ? 0 : MAX_5BIT;
529 }
530 /* We second check for green component (6bit range) */
531 if ((green1 | green2) & ~MAX_6BIT)
532 {
533 if (green1 & ~MAX_6BIT)
534 green1 = (green1 >> 31) ? 0 : MAX_6BIT;
535 if (green2 & ~MAX_6BIT)
536 green2 = (green2 >> 31) ? 0 : MAX_6BIT;
537 }
538
539 /* output 2 pixels */
540 while (LCD_STATUS & 0x08); /* wait while FIFO is half full */
541 lcd_write_pixel((red1 << 11) | (green1 << 5) | blue1);
542 lcd_write_pixel((red2 << 11) | (green2 << 5) | blue2);
543 }
544 while (ysrc < row_end);
545
546 src_y++;
547 h--;
548 }
549} 460}