diff options
Diffstat (limited to 'firmware/target/arm/s5l8702')
-rw-r--r-- | firmware/target/arm/s5l8702/ipod6g/lcd-6g.c | 46 | ||||
-rw-r--r-- | firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S | 1013 |
2 files changed, 0 insertions, 1059 deletions
diff --git a/firmware/target/arm/s5l8702/ipod6g/lcd-6g.c b/firmware/target/arm/s5l8702/ipod6g/lcd-6g.c index 14647a5697..e1406549f4 100644 --- a/firmware/target/arm/s5l8702/ipod6g/lcd-6g.c +++ b/firmware/target/arm/s5l8702/ipod6g/lcd-6g.c | |||
@@ -530,49 +530,3 @@ void lcd_update_rect(int x, int y, int width, int height) | |||
530 | 530 | ||
531 | displaylcd_dma(pixels); | 531 | displaylcd_dma(pixels); |
532 | } | 532 | } |
533 | |||
534 | /* Line write helper function for lcd_blit_yuv. Converts two lines of yuv420 to RGB565 and stores them in outbuf. Implemented in assembly (lcd-asm-6g.S). */ | ||
535 | extern void lcd_write_yuv420_lines(unsigned char const * const src[3], /* src[0..2] = Y', Cb, Cr pointers for the current line pair */ | ||
536 | uint16_t* outbuf, /* destination for the converted RGB565 pixels */ | ||
537 | int width, /* pixels per line */ | ||
538 | int stride); /* byte offset from the first luma line to the second */ | ||
539 | |||
540 | /* Blit a planar YUV 4:2:0 bitmap directly to the LCD. The image is converted to RGB565 into lcd_dblbuf[0], two lines per helper call, and then handed to displaylcd_dma(). Regions with fewer than two lines are ignored. */ | ||
541 | void lcd_blit_yuv(unsigned char * const src[3], | ||
542 | int src_x, int src_y, int stride, | ||
543 | int x, int y, int width, int height) ICODE_ATTR; | ||
544 | void lcd_blit_yuv(unsigned char * const src[3], /* src[0..2]: Y, Cb, Cr plane base pointers */ | ||
545 | int src_x, int src_y, int stride, /* source origin in pixels; stride = luma bytes per line */ | ||
546 | int x, int y, int width, int height) /* destination rectangle handed to displaylcd_setup() */ | ||
547 | { | ||
548 | unsigned int z; | ||
549 | unsigned char const * yuv_src[3]; | ||
550 | |||
551 | #ifdef HAVE_LCD_SLEEP | ||
552 | if (!lcd_active()) return; | ||
553 | #endif | ||
554 | |||
555 | width = (width + 1) & ~1; /* ensure width is even */ | ||
556 | if (height < 2) return; /* no complete line pair: the do/while below would run once and then count down past zero */ | ||
557 | int pixels = width * height; | ||
558 | uint16_t* out = lcd_dblbuf[0]; | ||
559 | |||
560 | z = stride * src_y; /* luma offset of the first source line */ | ||
561 | yuv_src[0] = src[0] + z + src_x; | ||
562 | yuv_src[1] = src[1] + (z >> 2) + (src_x >> 1); /* chroma planes are subsampled by two in both directions */ | ||
563 | yuv_src[2] = src[2] + (yuv_src[1] - src[1]); /* Cr uses the same offset as Cb */ | ||
564 | |||
565 | displaylcd_setup(x, y, width, height); | ||
566 | |||
567 | height >>= 1; /* from here on height counts line pairs */ | ||
568 | |||
569 | do { | ||
570 | lcd_write_yuv420_lines(yuv_src, out, width, stride); | ||
571 | yuv_src[0] += stride << 1; /* Skip down two luma lines */ | ||
572 | yuv_src[1] += stride >> 1; /* Skip down one chroma line */ | ||
573 | yuv_src[2] += stride >> 1; | ||
574 | out += width << 1; /* two output lines of width pixels each */ | ||
575 | } while (--height); | ||
576 | |||
577 | displaylcd_dma(pixels); | ||
578 | } | ||
diff --git a/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S b/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S deleted file mode 100644 index 1ed7c4e189..0000000000 --- a/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S +++ /dev/null | |||
@@ -1,1013 +0,0 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id: lcd-as-video.S 26756 2010-06-11 04:41:36Z funman $ | ||
9 | * | ||
10 | * Copyright (C) 2010 by Andree Buschmann | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version 2 | ||
15 | * of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
18 | * KIND, either express or implied. | ||
19 | * | ||
20 | ****************************************************************************/ | ||
21 | |||
22 | /* Version history: | ||
23 | * | ||
24 | * SVN: | ||
25 | * - initial SVN version. | ||
26 | * | ||
27 | * ARMv4: | ||
28 | * - use all available registers to calculate four pixels within each | ||
29 | * loop iteration. | ||
30 | * - avoid LDR interlocks. | ||
31 | * | ||
32 | * ARMv5TE: | ||
33 | * - use ARMv5TE+ 1-cycle multiply-accumulate instructions. | ||
34 | * | ||
35 | * ARMv5TE_WST: | ||
36 | * - use data tables (256 bytes) for RGB565 saturation. | ||
37 | * | ||
38 | * All versions are based on current SVN algorithm (round->scale->add) | ||
39 | * using the same coefficients, so output results are identical. | ||
40 | * | ||
41 | * TODO?: SVN coefficients are a very nice approximation for operations | ||
42 | * with shift+add instructions. When 16x16+32 MLA instructions are used, | ||
43 | * NBR and COEF_N could probably be adjusted to slightly increase accuracy. | ||
44 | */ | ||
45 | #define VERSION_SVN 0 /* identifiers for the implementations described in the version history */ | ||
46 | #define VERSION_ARMV4 1 | ||
47 | #define VERSION_ARMV5TE 2 | ||
48 | #define VERSION_ARMV5TE_WST 3 | ||
49 | |||
50 | #define YUV2RGB_VERSION VERSION_ARMV5TE_WST /* selects which of the #if/#elif branches below gets assembled */ | ||
51 | |||
52 | |||
53 | #define ASM | ||
54 | #include "config.h" | ||
55 | #include "cpu.h" | ||
56 | |||
57 | #if (YUV2RGB_VERSION == VERSION_SVN) | ||
58 | .section .icode, "ax", %progbits | ||
59 | |||
60 | |||
61 | /**************************************************************************** | ||
62 | * extern void lcd_write_yuv420_lines(unsigned char const * const src[3], | ||
63 | * uint16_t* out, | ||
64 | * int width, | ||
65 | * int stride); | ||
66 | * | ||
67 | * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is: | ||
68 | * |R| |1.164 0.000 1.596| |Y' - 16| | ||
69 | * |G| = |1.164 -0.391 -0.813| |Pb - 128| | ||
70 | * |B| |1.164 2.018 0.000| |Pr - 128| | ||
71 | * | ||
72 | * Scaled, normalized, rounded and tweaked to yield RGB 565: | ||
73 | * |R| |74 0 101| |Y' - 16| >> 9 | ||
74 | * |G| = |74 -24 -51| |Cb - 128| >> 8 | ||
75 | * |B| |74 128 0| |Cr - 128| >> 9 | ||
76 | * | ||
77 | * Converts two lines from YUV to RGB565 and writes them to the output buffer. | ||
78 | * The first loop loads Cb/Cr, calculates the chroma offsets and saves them to a | ||
79 | * buffer on the stack. Within the second loop these chroma offsets are reloaded | ||
80 | * from that buffer. Each loop iteration calculates two pixels and stores them. | ||
81 | */ | ||
82 | .align 2 | ||
83 | .global lcd_write_yuv420_lines | ||
84 | .type lcd_write_yuv420_lines, %function | ||
85 | lcd_write_yuv420_lines: | ||
86 | /* r0 = src = yuv_src */ | ||
87 | /* r1 = dst = out */ | ||
88 | /* r2 = width */ | ||
89 | /* r3 = stride */ | ||
90 | stmfd sp!, { r4-r10, lr } /* save non-scratch */ | ||
91 | ldmia r0, { r9, r10, r12 } /* r9 = yuv_src[0] = Y'_p */ | ||
92 | /* r10 = yuv_src[1] = Cb_p */ | ||
93 | /* r12 = yuv_src[2] = Cr_p */ | ||
94 | add r3, r9, r3 /* r3 = &ysrc[stride] */ | ||
95 | add r4, r2, r2, asr #1 /* chroma buffer length = width/2 *3 */ | ||
96 | mov r4, r4, asl #2 /* use words for str/ldm possibility */ | ||
97 | add r4, r4, #15 /* plus room for 3 additional words, */ | ||
98 | bic r4, r4, #3 /* rounded up to multiples of 4 byte */ | ||
99 | sub sp, sp, r4 /* and allocate on stack */ | ||
100 | stmia sp, {r2-r4} /* width, &ysrc[stride], stack_alloc */ | ||
101 | |||
102 | mov r7, r2 /* r7 = loop count */ | ||
103 | add r8, sp, #12 /* chroma buffer */ | ||
104 | mov lr, r1 /* RGB565 data destination buffer */ | ||
105 | |||
106 | /* 1st loop start */ | ||
107 | 10: /* loop start */ | ||
108 | |||
109 | ldrb r0, [r10], #1 /* r0 = *usrc++ = *Cb_p++ */ | ||
110 | ldrb r1, [r12], #1 /* r1 = *vsrc++ = *Cr_p++ */ | ||
111 | |||
112 | sub r0, r0, #128 /* r0 = Cb-128 */ | ||
113 | sub r1, r1, #128 /* r1 = Cr-128 */ | ||
114 | |||
115 | add r2, r1, r1, asl #1 /* r2 = Cr*51 + Cb*24 */ | ||
116 | add r2, r2, r2, asl #4 | ||
117 | add r2, r2, r0, asl #3 | ||
118 | add r2, r2, r0, asl #4 | ||
119 | |||
120 | add r4, r1, r1, asl #2 /* r1 = Cr*101 */ | ||
121 | add r4, r4, r1, asl #5 | ||
122 | add r1, r4, r1, asl #6 | ||
123 | |||
124 | add r1, r1, #256 /* r1 = rv = (r1 + 256) >> 9 */ | ||
125 | mov r1, r1, asr #9 | ||
126 | rsb r2, r2, #128 /* r2 = guv = (-r2 + 128) >> 8 */ | ||
127 | mov r2, r2, asr #8 | ||
128 | add r0, r0, #2 /* r0 = bu = (Cb*128 + 256) >> 9 */ | ||
129 | mov r0, r0, asr #2 | ||
130 | stmia r8!, {r0-r2} /* store r0, r1 and r2 to chroma buffer */ | ||
131 | |||
132 | /* 1st loop, first pixel */ | ||
133 | ldrb r5, [r9], #1 /* r5 = *ysrc++ = *Y'_p++ */ | ||
134 | sub r5, r5, #16 /* r5 = (Y'-16) * (74/2) */ | ||
135 | add r3, r5, r5, asl #2 | ||
136 | add r5, r3, r5, asl #5 | ||
137 | |||
138 | add r6, r1, r5, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
139 | add r3, r2, r5, asr #7 /* r3 = g = (Y >> 8) + guv */ | ||
140 | add r4, r0, r5, asr #8 /* r4 = b = (Y >> 9) + bu */ | ||
141 | |||
142 | orr r5, r6, r4 /* check if clamping is needed... */ | ||
143 | orr r5, r5, r3, asr #1 /* ...at all */ | ||
144 | cmp r5, #31 | ||
145 | bls 15f /* -> no clamp */ | ||
146 | cmp r6, #31 /* clamp r */ | ||
147 | mvnhi r6, r6, asr #31 /* out of range: negative -> 0, too large -> all ones */ | ||
148 | andhi r6, r6, #31 /* ...then masked to the channel maximum */ | ||
149 | cmp r3, #63 /* clamp g */ | ||
150 | mvnhi r3, r3, asr #31 | ||
151 | andhi r3, r3, #63 | ||
152 | cmp r4, #31 /* clamp b */ | ||
153 | mvnhi r4, r4, asr #31 | ||
154 | andhi r4, r4, #31 | ||
155 | 15: /* no clamp */ | ||
156 | |||
157 | /* calculate pixel_1 and save to r4 for later pixel packing */ | ||
158 | orr r4, r4, r3, lsl #5 /* pixel_1 = r<<11 | g<<5 | b */ | ||
159 | orr r4, r4, r6, lsl #11 /* r4 = pixel_1 */ | ||
160 | |||
161 | /* 1st loop, second pixel */ | ||
162 | ldrb r5, [r9], #1 /* r5 = *ysrc++ = *Y'_p++ */ | ||
163 | sub r5, r5, #16 /* r5 = (Y'-16) * (74/2) */ | ||
164 | add r3, r5, r5, asl #2 | ||
165 | add r5, r3, r5, asl #5 | ||
166 | |||
167 | add r6, r1, r5, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
168 | add r3, r2, r5, asr #7 /* r3 = g = (Y >> 8) + guv */ | ||
169 | add r5, r0, r5, asr #8 /* r5 = b = (Y >> 9) + bu */ | ||
170 | |||
171 | orr r0, r6, r5 /* check if clamping is needed... */ | ||
172 | orr r0, r0, r3, asr #1 /* ...at all */ | ||
173 | cmp r0, #31 | ||
174 | bls 15f /* -> no clamp */ | ||
175 | cmp r6, #31 /* clamp r */ | ||
176 | mvnhi r6, r6, asr #31 | ||
177 | andhi r6, r6, #31 | ||
178 | cmp r3, #63 /* clamp g */ | ||
179 | mvnhi r3, r3, asr #31 | ||
180 | andhi r3, r3, #63 | ||
181 | cmp r5, #31 /* clamp b */ | ||
182 | mvnhi r5, r5, asr #31 | ||
183 | andhi r5, r5, #31 | ||
184 | 15: /* no clamp */ | ||
185 | |||
186 | /* calculate pixel_2 and pack with pixel_1 before writing */ | ||
187 | orr r5, r5, r3, lsl #5 /* pixel_2 = r<<11 | g<<5 | b */ | ||
188 | orr r5, r5, r6, lsl #11 /* r5 = pixel_2 */ | ||
189 | orr r4, r4, r5, lsl #16 | ||
190 | str r4, [lr], #4 /* write pixel_1 and pixel_2 */ | ||
191 | |||
192 | subs r7, r7, #2 /* check for loop end */ | ||
193 | bgt 10b /* back to beginning */ | ||
194 | /* 1st loop end */ | ||
195 | |||
196 | /* Reload several registers for pointer rewinding for next loop */ | ||
197 | add r8, sp, #12 /* chroma buffer */ | ||
198 | ldmia sp, {r7, r9} /* r7 = loop count */ | ||
199 | /* r9 = &ysrc[stride] */ | ||
200 | |||
201 | /* 2nd loop start */ | ||
202 | 20: /* loop start */ | ||
203 | /* restore r0 (bu), r1 (rv) and r2 (guv) from chroma buffer */ | ||
204 | ldmia r8!, {r0-r2} | ||
205 | |||
206 | /* 2nd loop, first pixel */ | ||
207 | ldrb r5, [r9], #1 /* r5 = *ysrc++ = *Y'_p++ */ | ||
208 | sub r5, r5, #16 /* r5 = (Y'-16) * (74/2) */ | ||
209 | add r3, r5, r5, asl #2 | ||
210 | add r5, r3, r5, asl #5 | ||
211 | |||
212 | add r6, r1, r5, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
213 | add r3, r2, r5, asr #7 /* r3 = g = (Y >> 8) + guv */ | ||
214 | add r4, r0, r5, asr #8 /* r4 = b = (Y >> 9) + bu */ | ||
215 | |||
216 | orr r5, r6, r4 /* check if clamping is needed... */ | ||
217 | orr r5, r5, r3, asr #1 /* ...at all */ | ||
218 | cmp r5, #31 | ||
219 | bls 15f /* -> no clamp */ | ||
220 | cmp r6, #31 /* clamp r */ | ||
221 | mvnhi r6, r6, asr #31 | ||
222 | andhi r6, r6, #31 | ||
223 | cmp r3, #63 /* clamp g */ | ||
224 | mvnhi r3, r3, asr #31 | ||
225 | andhi r3, r3, #63 | ||
226 | cmp r4, #31 /* clamp b */ | ||
227 | mvnhi r4, r4, asr #31 | ||
228 | andhi r4, r4, #31 | ||
229 | 15: /* no clamp */ | ||
230 | /* calculate pixel_1 and save to r4 for later pixel packing */ | ||
231 | orr r4, r4, r3, lsl #5 /* pixel_1 = r<<11 | g<<5 | b */ | ||
232 | orr r4, r4, r6, lsl #11 /* r4 = pixel_1 */ | ||
233 | |||
234 | /* 2nd loop, second pixel */ | ||
235 | ldrb r5, [r9], #1 /* r5 = *ysrc++ = *Y'_p++ */ | ||
236 | sub r5, r5, #16 /* r5 = (Y'-16) * (74/2) */ | ||
237 | add r3, r5, r5, asl #2 | ||
238 | add r5, r3, r5, asl #5 | ||
239 | |||
240 | add r6, r1, r5, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
241 | add r3, r2, r5, asr #7 /* r3 = g = (Y >> 8) + guv */ | ||
242 | add r5, r0, r5, asr #8 /* r5 = b = (Y >> 9) + bu */ | ||
243 | |||
244 | orr r0, r6, r5 /* check if clamping is needed... */ | ||
245 | orr r0, r0, r3, asr #1 /* ...at all */ | ||
246 | cmp r0, #31 | ||
247 | bls 15f /* -> no clamp */ | ||
248 | cmp r6, #31 /* clamp r */ | ||
249 | mvnhi r6, r6, asr #31 | ||
250 | andhi r6, r6, #31 | ||
251 | cmp r3, #63 /* clamp g */ | ||
252 | mvnhi r3, r3, asr #31 | ||
253 | andhi r3, r3, #63 | ||
254 | cmp r5, #31 /* clamp b */ | ||
255 | mvnhi r5, r5, asr #31 | ||
256 | andhi r5, r5, #31 | ||
257 | 15: /* no clamp */ | ||
258 | |||
259 | /* calculate pixel_2 and pack with pixel_1 before writing */ | ||
260 | orr r5, r5, r3, lsl #5 /* pixel_2 = r<<11 | g<<5 | b */ | ||
261 | orr r5, r5, r6, lsl #11 /* r5 = pixel_2 */ | ||
262 | orr r4, r4, r5, lsl #16 | ||
263 | str r4, [lr], #4 /* write pixel_1 and pixel_2 */ | ||
264 | |||
265 | subs r7, r7, #2 /* check for loop end */ | ||
266 | bgt 20b /* back to beginning */ | ||
267 | /* 2nd loop end */ | ||
268 | |||
269 | ldr r3, [sp, #8] /* r3 = stack_alloc saved at function entry */ | ||
270 | add sp, sp, r3 /* deallocate buffer */ | ||
271 | ldmpc regs=r4-r10 /* restore registers */ | ||
272 | |||
273 | .ltorg | ||
274 | .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines | ||
275 | |||
276 | |||
277 | #elif (YUV2RGB_VERSION == VERSION_ARMV4) | ||
278 | /**************************************************************************** | ||
279 | * extern void lcd_write_yuv420_lines(unsigned char const * const src[3], | ||
280 | * uint16_t* out, | ||
281 | * int width, | ||
282 | * int stride); | ||
283 | * | ||
284 | * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is: | ||
285 | * |R| |1.164 0.000 1.596| |Y' - 16| | ||
286 | * |G| = |1.164 -0.391 -0.813| |Pb - 128| | ||
287 | * |B| |1.164 2.018 0.000| |Pr - 128| | ||
288 | * | ||
289 | * Scaled, normalized, rounded and tweaked to yield RGB 565: | ||
290 | * |R| |74 0 101| |Y' - 16| >> 9 | ||
291 | * |G| = |74 -24 -51| |Cb - 128| >> 8 | ||
292 | * |B| |74 128 0| |Cr - 128| >> 9 | ||
293 | * | ||
294 | * Converts two lines from YUV420 to RGB565, within each iteration four | ||
295 | * pixels (2 per line) are calculated and written to destination buffer. | ||
296 | */ | ||
297 | .section .icode, "ax", %progbits | ||
298 | |||
299 | .align 2 | ||
300 | .global lcd_write_yuv420_lines | ||
301 | .type lcd_write_yuv420_lines, %function | ||
302 | |||
303 | lcd_write_yuv420_lines: | ||
304 | /* r0 = src = yuv_src */ | ||
305 | /* r1 = dst = out */ | ||
306 | /* r2 = width */ | ||
307 | /* r3 = stride */ | ||
308 | stmfd sp!, {r4-r11,lr} /* save non-scratch */ | ||
309 | ldmia r0, {r10-r12} /* r10 = yuv_src[0] = Y'_p */ | ||
310 | /* r11 = yuv_src[1] = Cb_p */ | ||
311 | /* r12 = yuv_src[2] = Cr_p */ | ||
312 | mov r9, r2, lsl #1 /* r9 = 2*width (loop count) */ | ||
313 | str r9, [sp, #-4]! /* [--sp] = 2*width (constant) */ | ||
314 | add r8, r10, r3 /* r8 = Y'_p + stride = Y'stride_p */ | ||
315 | mov lr, r1 /* RGB565 data destination buffer */ | ||
316 | |||
317 | 10: /* loop start */ | ||
318 | ldrb r0, [r11], #1 /* r0 = *Cb_p++ */ | ||
319 | ldrb r1, [r12], #1 /* r1 = *Cr_p++ */ | ||
320 | ldrb r3, [r8], #1 /* r3 = Y'3 */ | ||
321 | ldrb r4, [r8], #1 /* r4 = Y'4 */ | ||
322 | |||
323 | sub r0, r0, #128 /* r0 = Cb-128 */ | ||
324 | sub r1, r1, #128 /* r1 = Cr-128 */ | ||
325 | |||
326 | add r2, r1, r1, asl #1 /* r2 = Cr*51 + Cb*24 */ | ||
327 | add r2, r2, r2, asl #4 | ||
328 | add r2, r2, r0, asl #3 | ||
329 | add r2, r2, r0, asl #4 | ||
330 | |||
331 | add r5, r1, r1, asl #2 /* r1 = Cr*101 */ | ||
332 | add r5, r5, r1, asl #5 | ||
333 | add r1, r5, r1, asl #6 | ||
334 | |||
335 | add r1, r1, #256 /* r1 = rv = (r1 + 256) >> 9 */ | ||
336 | mov r1, r1, asr #9 | ||
337 | rsb r2, r2, #128 /* r2 = guv = (-r2 + 128) >> 8 */ | ||
338 | mov r2, r2, asr #8 | ||
339 | add r0, r0, #2 /* r0 = bu = (Cb*128 + 256) >> 9 */ | ||
340 | mov r0, r0, asr #2 | ||
341 | |||
342 | /* pixel_3 */ | ||
343 | sub r3, r3, #16 /* r3 = (Y'-16) * (74/2) */ | ||
344 | add r7, r3, r3, asl #2 | ||
345 | add r3, r7, r3, asl #5 | ||
346 | |||
347 | add r6, r1, r3, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
348 | add r7, r2, r3, asr #7 /* r7 = g = (Y >> 8) + guv */ | ||
349 | add r5, r0, r3, asr #8 /* r5 = b = (Y >> 9) + bu */ | ||
350 | |||
351 | orr r3, r6, r5 /* check if clamping is needed... */ | ||
352 | orr r3, r3, r7, asr #1 /* ...at all */ | ||
353 | cmp r3, #31 | ||
354 | bls 15f /* no clamp */ | ||
355 | cmp r6, #31 /* clamp r */ | ||
356 | mvnhi r6, r6, asr #31 /* out of range: negative -> 0, too large -> all ones */ | ||
357 | andhi r6, r6, #31 /* ...then masked to the channel maximum */ | ||
358 | cmp r7, #63 /* clamp g */ | ||
359 | mvnhi r7, r7, asr #31 | ||
360 | andhi r7, r7, #63 | ||
361 | cmp r5, #31 /* clamp b */ | ||
362 | mvnhi r5, r5, asr #31 | ||
363 | andhi r5, r5, #31 | ||
364 | 15: /* no clamp */ | ||
365 | |||
366 | /* calculate pixel_3 and save to r5 for later pixel packing */ | ||
367 | orr r5, r5, r7, lsl #5 /* pixel_3 = r<<11 | g<<5 | b */ | ||
368 | orr r5, r5, r6, lsl #11 /* r5 = pixel_3 */ | ||
369 | |||
370 | /* pixel_4 */ | ||
371 | sub r4, r4, #16 /* r4 = (Y'-16) * (74/2) */ | ||
372 | add r7, r4, r4, asl #2 | ||
373 | add r4, r7, r4, asl #5 | ||
374 | |||
375 | add r6, r1, r4, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
376 | add r7, r2, r4, asr #7 /* r7 = g = (Y >> 8) + guv */ | ||
377 | add r4, r0, r4, asr #8 /* r4 = b = (Y >> 9) + bu */ | ||
378 | |||
379 | orr r3, r6, r4 /* check if clamping is needed... */ | ||
380 | orr r3, r3, r7, asr #1 /* ...at all */ | ||
381 | cmp r3, #31 | ||
382 | bls 15f /* no clamp */ | ||
383 | cmp r6, #31 /* clamp r */ | ||
384 | mvnhi r6, r6, asr #31 | ||
385 | andhi r6, r6, #31 | ||
386 | cmp r7, #63 /* clamp g */ | ||
387 | mvnhi r7, r7, asr #31 | ||
388 | andhi r7, r7, #63 | ||
389 | cmp r4, #31 /* clamp b */ | ||
390 | mvnhi r4, r4, asr #31 | ||
391 | andhi r4, r4, #31 | ||
392 | 15: /* no clamp */ | ||
393 | |||
394 | /* calculate pixel_4 and pack with pixel_3 before writing */ | ||
395 | orr r4, r4, r7, lsl #5 /* pixel_4 = r<<11 | g<<5 | b */ | ||
396 | orr r4, r4, r6, lsl #11 /* r4 = pixel_4 */ | ||
397 | orr r5, r5, r4, lsl #16 /* r5 = pixel_4<<16 | pixel_3 */ | ||
398 | |||
399 | ldr r7, [sp] /* r7 = 2*width */ | ||
400 | ldrb r3, [r10], #1 /* r3 = Y'1 */ | ||
401 | ldrb r4, [r10], #1 /* r4 = Y'2 */ | ||
402 | |||
403 | str r5, [lr, r7] /* write pixel_3 and pixel_4 (second line, 2*width bytes after the first) */ | ||
404 | |||
405 | /* pixel_1 */ | ||
406 | sub r3, r3, #16 /* r3 = (Y'-16) * (74/2) */ | ||
407 | add r7, r3, r3, asl #2 | ||
408 | add r3, r7, r3, asl #5 | ||
409 | |||
410 | add r6, r1, r3, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
411 | add r7, r2, r3, asr #7 /* r7 = g = (Y >> 8) + guv */ | ||
412 | add r5, r0, r3, asr #8 /* r5 = b = (Y >> 9) + bu */ | ||
413 | |||
414 | orr r3, r6, r5 /* check if clamping is needed... */ | ||
415 | orr r3, r3, r7, asr #1 /* ...at all */ | ||
416 | cmp r3, #31 | ||
417 | bls 15f /* no clamp */ | ||
418 | cmp r6, #31 /* clamp r */ | ||
419 | mvnhi r6, r6, asr #31 | ||
420 | andhi r6, r6, #31 | ||
421 | cmp r7, #63 /* clamp g */ | ||
422 | mvnhi r7, r7, asr #31 | ||
423 | andhi r7, r7, #63 | ||
424 | cmp r5, #31 /* clamp b */ | ||
425 | mvnhi r5, r5, asr #31 | ||
426 | andhi r5, r5, #31 | ||
427 | 15: /* no clamp */ | ||
428 | |||
429 | /* calculate pixel_1 and save to r5 for later pixel packing */ | ||
430 | orr r5, r5, r7, lsl #5 /* pixel_1 = r<<11 | g<<5 | b */ | ||
431 | orr r5, r5, r6, lsl #11 /* r5 = pixel_1 */ | ||
432 | |||
433 | /* pixel_2 */ | ||
434 | sub r4, r4, #16 /* r4 = (Y'-16) * (74/2) */ | ||
435 | add r7, r4, r4, asl #2 | ||
436 | add r4, r7, r4, asl #5 | ||
437 | |||
438 | add r6, r1, r4, asr #8 /* r6 = r = (Y >> 9) + rv */ | ||
439 | add r7, r2, r4, asr #7 /* r7 = g = (Y >> 8) + guv */ | ||
440 | add r4, r0, r4, asr #8 /* r4 = b = (Y >> 9) + bu */ | ||
441 | |||
442 | orr r3, r6, r4 /* check if clamping is needed... */ | ||
443 | orr r3, r3, r7, asr #1 /* ...at all */ | ||
444 | cmp r3, #31 | ||
445 | bls 15f /* no clamp */ | ||
446 | cmp r6, #31 /* clamp r */ | ||
447 | mvnhi r6, r6, asr #31 | ||
448 | andhi r6, r6, #31 | ||
449 | cmp r7, #63 /* clamp g */ | ||
450 | mvnhi r7, r7, asr #31 | ||
451 | andhi r7, r7, #63 | ||
452 | cmp r4, #31 /* clamp b */ | ||
453 | mvnhi r4, r4, asr #31 | ||
454 | andhi r4, r4, #31 | ||
455 | 15: /* no clamp */ | ||
456 | |||
457 | /* calculate pixel_2 and pack with pixel_1 before writing */ | ||
458 | orr r4, r4, r7, lsl #5 /* pixel_2 = r<<11 | g<<5 | b */ | ||
459 | orr r4, r4, r6, lsl #11 /* r4 = pixel_2 */ | ||
460 | orr r5, r5, r4, lsl #16 /* r5 = pixel_2<<16 | pixel_1 */ | ||
461 | |||
462 | str r5, [lr], #4 /* write pixel_1 and pixel_2 */ | ||
463 | |||
464 | subs r9, r9, #4 /* check for loop end */ | ||
465 | bgt 10b /* back to beginning */ | ||
466 | |||
467 | /* loop end */ | ||
468 | add sp, sp, #4 /* deallocate stack */ | ||
469 | ldmpc regs=r4-r11 /* restore registers */ | ||
470 | |||
471 | .ltorg | ||
472 | .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines | ||
473 | |||
474 | |||
475 | #elif (YUV2RGB_VERSION == VERSION_ARMV5TE) | ||
476 | /**************************************************************************** | ||
477 | * How do I encode Y'CBCR components from R'G'B' in [0, +1]? (see ColorFAQ) | ||
478 | * |R| |0.00456621 0 0.00625893| |Y' - 16| | ||
479 | * |G| = |0.00456621 -0.00153632 -0.00318811| |Pb - 128| | ||
480 | * |B| |0.00456621 0.00791071 0 | |Pr - 128| | ||
481 | * | ||
482 | * Scaled, normalized, rounded and tweaked to yield RGB 565: | ||
483 | * |R| |74 0 101| |Y' - 16| >> 9 | ||
484 | * |G| = |74 -24 -51| |Cb - 128| >> 8 | ||
485 | * |B| |74 128 0| |Cr - 128| >> 9 | ||
486 | */ | ||
487 | #define NBR 14 /* 14-bit resolution (SVN) */ | ||
488 | #define COEF_C0 74 /* Y' gain, common to R, G and B */ | ||
489 | #define COEF_C1 101 /* Cr contribution to R */ | ||
490 | #define COEF_C2 -24 /* Cb contribution to G */ | ||
491 | #define COEF_C3 -51 /* Cr contribution to G */ | ||
492 | #define COEF_C4 128 /* Cb contribution to B */ | ||
493 | #define C4_IS_POW2 /* lets the code replace the C4 multiply by a shift */ | ||
494 | |||
495 | /* constant for rounding a NBR number before down-scaling it to RS bits */ | ||
496 | #define ROUND(RS) (1 << (NBR - RS - 1)) | ||
497 | |||
498 | /* packed 16-bit coefficients */ | ||
499 | #define COEF_C4_C1 ((COEF_C4 << 16) | (COEF_C1 & 0xffff)) /* top = C4, bottom = C1 */ | ||
500 | #define COEF_2C3_2C2 ((COEF_C3 << 17) | ((COEF_C2 << 1) & 0xffff)) /* top = 2*C3, bottom = 2*C2 */ | ||
501 | /* 32-bit MLA constants */ | ||
502 | #define CONST_MLA_Y (-16 * COEF_C0) /* accumulator start value, so C0*Y' + this = C0*(Y'-16) */ | ||
503 | |||
504 | /**************************************************************************** | ||
505 | * extern void lcd_write_yuv420_lines(unsigned char const * const src[3], | ||
506 | * uint16_t* out, | ||
507 | * int width, | ||
508 | * int stride); | ||
509 | * | ||
510 | * Converts two lines from YUV420 to RGB565, within each iteration four | ||
511 | * pixels (2 per line) are calculated and written to destination buffer. | ||
512 | * | ||
513 | * - use ARMv5TE+ 1-cycle multiply+accumulator instructions. | ||
514 | */ | ||
515 | .section .icode, "ax", %progbits | ||
516 | |||
517 | .align 2 | ||
518 | .global lcd_write_yuv420_lines | ||
519 | .type lcd_write_yuv420_lines, %function | ||
520 | |||
521 | lcd_write_yuv420_lines: | ||
522 | @ r0 = src = yuv_src | ||
523 | @ r1 = out = dst_p | ||
524 | @ r2 = width | ||
525 | @ r3 = stride | ||
526 | stmfd sp!, {r4-r11,lr} @ save non-scratch | ||
527 | ldmia r0, {r10-r12} @ r10 = yuv_src[0] = Y'_p | ||
528 | @ r11 = yuv_src[1] = Cb_p | ||
529 | @ r12 = yuv_src[2] = Cr_p | ||
530 | adr r0, const_data @ load constants | ||
531 | ldmia r0, {r5-r8} @ r5 = COEF_C4_C1 | ||
532 | @ r6 = COEF_2C3_2C2 | ||
533 | @ r7 = COEF_C0 | ||
534 | @ r8 = CONST_MLA_Y | ||
535 | sub r4, r12, r11 @ r4 = Cr_p-Cb_p | ||
536 | mov r9, r2, asl #1 @ r9 = 2*width | ||
537 | stmfd sp!, {r4-r6,r9} @ SP -> Cr_p-Cb_p | ||
538 | @ COEF_C4_C1 | ||
539 | @ COEF_2C3_2C2 | ||
540 | @ 2*width | ||
541 | add r12, r10, r3 @ r12 = Y'_p + stride = Y'stride_p | ||
542 | mov lr, r1 @ RGB565 data destination buffer | ||
543 | orr r9, r7, r2, lsl #15 @ loop_count = width/2; | ||
544 | @ r9 = loop_count<<16 | COEF_C0 | ||
545 | sub r9, r9, #0x10000 @ loop_count-- | ||
546 | |||
547 | 10: @ loop_start | ||
548 | |||
549 | @ register usage: | ||
550 | @ r8 = CONST_MLA_Y | ||
551 | @ r9 = loop count<<16 | COEF_C0 | ||
552 | @ r10 = Y'_p | ||
553 | @ r11 = Cb_p | ||
554 | @ r12 = Y'stride_p | ||
555 | @ lr = dst_p | ||
556 | @ free: r0-r7 | ||
557 | |||
558 | ldmia sp, {r2-r4} @ r2 = Cr_p-Cb_p | ||
559 | @ r3 = COEF_C4_C1 | ||
560 | @ r4 = COEF_2C3_2C2 | ||
561 | mov r5, #ROUND(5) @ r5 = round constant | ||
562 | |||
563 | ldrb r6, [r12], #1 @ r6 = Y'3 | ||
564 | ldrb r7, [r12], #1 @ r7 = Y'4 | ||
565 | |||
566 | ldrb r1, [r11, r2] @ r1 = Cr = *Cr_p++ | ||
567 | ldrb r0, [r11], #1 @ r0 = Cb = *Cb_p++ | ||
568 | |||
569 | /* calculate Y3 and Y4 */ | ||
570 | smlabb r6, r6, r9, r8 @ r6 = Y3 = C0*Y'3 - C0*16 | ||
571 | smlabb r7, r7, r9, r8 @ r7 = Y4 = C0*Y'4 - C0*16 | ||
572 | |||
573 | /* calculate rv, guv, bu */ | ||
574 | sub r1, r1, #128 @ r1 = Cr" = Cr-128 | ||
575 | sub r0, r0, #128 @ r0 = Cb" = Cb-128 | ||
576 | |||
577 | smlabt r2, r1, r4, r5 @ r2 = guv" = Cr"*(2*C3) + | ||
578 | smlabb r2, r0, r4, r2 @ Cb"*(2*C2) + round | ||
579 | smlabb r1, r1, r3, r5 @ r1 = rv" = Cr"*C1 + round | ||
580 | #ifdef C4_IS_POW2 | ||
581 | add r0, r5, r0, asl #NBR-7 @ r0 = bu" = Cb"*C4 + round | ||
582 | #else | ||
583 | smlabt r0, r0, r3, r5 @ r0 = bu" = Cb"*C4 + round | ||
584 | #endif | ||
585 | |||
586 | /* scale rv",guv",bu" */ | ||
587 | mov r2, r2, asr #NBR-5 @ r2 = guv = guv" >> scale | ||
588 | mov r1, r1, asr #NBR-5 @ r1 = rv = rv" >> scale | ||
589 | mov r0, r0, asr #NBR-5 @ r0 = bu = bu" >> scale | ||
590 | |||
591 | @ register usage: | ||
592 | @ r8-r12,lr: pointers, counters | ||
593 | @ r0,r1,r2 = bu,rv,guv (rounded and scaled to RGB565) | ||
594 | @ r6,r7 = Y'3,Y'4 | ||
595 | @ free: r3-r5 | ||
596 | |||
597 | /* pixel_3 */ | ||
598 | add r5, r1, r6, asr #NBR-5 @ r5 = r = (Y3 >> scale) + rv | ||
599 | add r4, r2, r6, asr #NBR-6 @ r4 = g = (Y3 >> scale) + guv | ||
600 | add r3, r0, r6, asr #NBR-5 @ r3 = b = (Y3 >> scale) + bu | ||
601 | |||
602 | orr r6, r5, r3 @ check if clamping is needed... | ||
603 | orr r6, r6, r4, asr #1 @ ...at all | ||
604 | cmp r6, #31 | ||
605 | bls 15f @ no clamp | ||
606 | cmp r5, #31 @ clamp r | ||
607 | mvnhi r5, r5, asr #31 | ||
608 | andhi r5, r5, #31 | ||
609 | cmp r4, #63 @ clamp g | ||
610 | mvnhi r4, r4, asr #31 | ||
611 | andhi r4, r4, #63 | ||
612 | cmp r3, #31 @ clamp b | ||
613 | mvnhi r3, r3, asr #31 | ||
614 | andhi r3, r3, #31 | ||
615 | 15: @ no clamp | ||
616 | |||
617 | /* calculate pixel_3 and save to r3 for later pixel packing */ | ||
618 | orr r3, r3, r4, lsl #5 @ r3 = pixel_3 = r<<11 | g<<5 | b | ||
619 | orr r3, r3, r5, lsl #11 | ||
620 | |||
621 | /* pixel_4 */ | ||
622 | add r5, r1, r7, asr #NBR-5 @ r5 = r = (Y4 >> scale) + rv | ||
623 | add r4, r2, r7, asr #NBR-6 @ r4 = g = (Y4 >> scale) + guv | ||
624 | add r7, r0, r7, asr #NBR-5 @ r7 = b = (Y4 >> scale) + bu | ||
625 | |||
626 | orr r6, r5, r7 @ check if clamping is needed... | ||
627 | orr r6, r6, r4, asr #1 @ ...at all | ||
628 | cmp r6, #31 | ||
629 | bls 15f @ no clamp | ||
630 | cmp r5, #31 @ clamp r | ||
631 | mvnhi r5, r5, asr #31 | ||
632 | andhi r5, r5, #31 | ||
633 | cmp r4, #63 @ clamp g | ||
634 | mvnhi r4, r4, asr #31 | ||
635 | andhi r4, r4, #63 | ||
636 | cmp r7, #31 @ clamp b | ||
637 | mvnhi r7, r7, asr #31 | ||
638 | andhi r7, r7, #31 | ||
639 | 15: @ no clamp | ||
640 | |||
641 | /* calculate pixel_4 and pack with pixel_3 before writing */ | ||
642 | orr r7, r7, r4, lsl #5 @ r7 = pixel_4 = r<<11 | g<<5 | b | ||
643 | orr r7, r7, r5, lsl #11 | ||
644 | orr r3, r3, r7, lsl #16 @ r3 = pixel_4<<16 | pixel_3 | ||
645 | |||
646 | /* avoid interlocks when writing pixel_3 and pixel_4 */ | ||
647 | ldr r5, [sp, #12] @ r5 = 2*width | ||
648 | |||
649 | ldrb r6, [r10], #1 @ r6 = Y'1 | ||
650 | ldrb r7, [r10], #1 @ r7 = Y'2 | ||
651 | |||
652 | /* write pixel_3 and pixel_4 */ | ||
653 | str r3, [lr, r5] @ [dst_p + 2*width] = r3 | ||
654 | |||
655 | @ register usage: | ||
656 | @ r8-r12,lr: pointers, counters | ||
657 | @ r0,r1,r2 = bu,rv,guv (rounded and scaled to RGB565) | ||
658 | @ r6,r7 = Y'1,Y'2 | ||
659 | @ free: r3-r5 | ||
660 | |||
661 | /* calculate Y1 and Y2 */ | ||
662 | smlabb r6, r6, r9, r8 @ r6 = Y1 = C0*Y'1 - C0*16 | ||
663 | smlabb r7, r7, r9, r8 @ r7 = Y2 = C0*Y'2 - C0*16 | ||
664 | |||
665 | /* pixel_1 */ | ||
666 | add r5, r1, r6, asr #NBR-5 @ r5 = r = (Y1 >> scale) + rv | ||
667 | add r4, r2, r6, asr #NBR-6 @ r4 = g = (Y1 >> scale) + guv | ||
668 | add r3, r0, r6, asr #NBR-5 @ r3 = b = (Y1 >> scale) + bu | ||
669 | |||
670 | orr r6, r5, r3 @ check if clamping is needed... | ||
671 | orr r6, r6, r4, asr #1 @ ...at all | ||
672 | cmp r6, #31 | ||
673 | bls 15f @ no clamp | ||
674 | cmp r5, #31 @ clamp r | ||
675 | mvnhi r5, r5, asr #31 | ||
676 | andhi r5, r5, #31 | ||
677 | cmp r4, #63 @ clamp g | ||
678 | mvnhi r4, r4, asr #31 | ||
679 | andhi r4, r4, #63 | ||
680 | cmp r3, #31 @ clamp b | ||
681 | mvnhi r3, r3, asr #31 | ||
682 | andhi r3, r3, #31 | ||
683 | 15: @ no clamp | ||
684 | |||
685 | /* calculate pixel_1 and save to r3 for later pixel packing */ | ||
686 | orr r3, r3, r4, lsl #5 @ r3 = pixel_1 = r<<11 | g<<5 | b | ||
687 | orr r3, r3, r5, lsl #11 | ||
688 | |||
689 | /* pixel_2 */ | ||
690 | add r5, r1, r7, asr #NBR-5 @ r5 = r = (Y2 >> scale) + rv | ||
691 | add r4, r2, r7, asr #NBR-6 @ r4 = g = (Y2 >> scale) + guv | ||
692 | add r7, r0, r7, asr #NBR-5 @ r7 = b = (Y2 >> scale) + bu | ||
693 | |||
694 | orr r6, r5, r7 @ check if clamping is needed... | ||
695 | orr r6, r6, r4, asr #1 @ ...at all | ||
696 | cmp r6, #31 | ||
697 | bls 15f @ no clamp | ||
698 | cmp r5, #31 @ clamp r | ||
699 | mvnhi r5, r5, asr #31 | ||
700 | andhi r5, r5, #31 | ||
701 | cmp r4, #63 @ clamp g | ||
702 | mvnhi r4, r4, asr #31 | ||
703 | andhi r4, r4, #63 | ||
704 | cmp r7, #31 @ clamp b | ||
705 | mvnhi r7, r7, asr #31 | ||
706 | andhi r7, r7, #31 | ||
707 | 15: @ no clamp | ||
708 | |||
709 | /* calculate pixel_2 and pack with pixel_1 before writing */ | ||
710 | orr r7, r7, r4, lsl #5 @ r7 = pixel_2 = r<<11 | g<<5 | b | ||
711 | orr r7, r7, r5, lsl #11 | ||
712 | orr r3, r3, r7, lsl #16 @ r3 = pixel_2 << 16 | pixel_1 | ||
713 | |||
714 | str r3, [lr], #4 @ write pixel_1 and pixel_2 | ||
715 | |||
716 | /* check for loop end */ | ||
717 | subs r9, r9, #0x10000 @ loop_count-- | ||
718 | bge 10b @ back to beginning | ||
719 | |||
720 | /* bye */ | ||
721 | add sp, sp, #16 | ||
722 | ldmpc regs=r4-r11 @ restore registers | ||
723 | |||
724 | .ltorg | ||
725 | .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines | ||
726 | |||
727 | /* data */ | ||
728 | .align 2 | ||
729 | const_data: | ||
730 | .word COEF_C4_C1 | ||
731 | .word COEF_2C3_2C2 | ||
732 | .word COEF_C0 | ||
733 | .word CONST_MLA_Y | ||
734 | |||
735 | .size const_data, .-const_data | ||
736 | |||
737 | |||
738 | #else /* YUV2RGB_VERSION == VERSION_ARMV5TE_WST */ | ||
739 | /**************************************************************************** | ||
740 | * How do I encode Y'CBCR components from R'G'B' in [0, +1]? (see ColorFAQ) | ||
741 | * |R| |0.00456621 0 0.00625893| |Y' - 16| | ||
742 | * |G| = |0.00456621 -0.00153632 -0.00318811| |Pb - 128| | ||
743 | * |B| |0.00456621 0.00791071 0 | |Pr - 128| | ||
744 | * | ||
745 | * Scaled, normalized, rounded and tweaked to yield RGB 565: | ||
746 | * |R| |74 0 101| |Y' - 16| >> 9 | ||
747 | * |G| = |74 -24 -51| |Cb - 128| >> 8 | ||
748 | * |B| |74 128 0| |Cr - 128| >> 9 | ||
749 | */ | ||
750 | #define NBR 14 /* 14-bit resolution (SVN) */ | ||
751 | #define COEF_C0 74 /* Y' gain, common to R, G and B */ | ||
752 | #define COEF_C1 101 /* Cr contribution to R */ | ||
753 | #define COEF_C2 -24 /* Cb contribution to G */ | ||
754 | #define COEF_C3 -51 /* Cr contribution to G */ | ||
755 | #define COEF_C4 128 /* Cb contribution to B */ | ||
756 | #define C4_IS_POW2 | ||
757 | |||
758 | /* packed 16-bit coefficients */ | ||
759 | #define COEF_C4_C1 ((COEF_C4 << 16) | (COEF_C1 & 0xffff)) /* top = C4, bottom = C1 */ | ||
760 | #define COEF_C3_C2 ((COEF_C3 << 16) | (COEF_C2 & 0xffff)) /* top = C3, bottom = C2 */ | ||
761 | |||
762 | /* constant for rounding an NBR number before down-scaling it to RS bits */ | ||
763 | #define ROUND(RS) (1 << (NBR - RS - 1)) | ||
764 | |||
765 | /* 32-bit MLA constants */ | ||
766 | #define CONST_MLA_Y (-16 * COEF_C0) /* accumulator start value, so C0*Y' + this = C0*(Y'-16) */ | ||
767 | #define CONST_MLA_RV ((-128 * COEF_C1) + ROUND(5)) /* the -128 chroma bias times C1, plus rounding */ | ||
768 | #define CONST_MLA_BU ((-128 * COEF_C4) + ROUND(5)) /* the -128 chroma bias times C4, plus rounding */ | ||
769 | /* trick to save the register needed for table_sat6 reference: | ||
770 | add table_sat6-table_sat5 offset (conveniently scaled) to guv MLA */ | ||
771 | #define CONST_MLA_GUV (-128 * (COEF_C2 + COEF_C3) + ROUND(6) + \ | ||
772 | ((table_sat6 - table_sat5) << (NBR - 6))) | ||
773 | |||
774 | /**************************************************************************** | ||
775 | * extern void lcd_write_yuv420_lines(unsigned char const * const src[3], | ||
776 | * uint16_t* out, | ||
777 | * int width, | ||
778 | * int stride); | ||
779 | * | ||
780 | * Converts two lines from YUV420 to RGB565; within each iteration four | ||
781 | * pixels (2 per line) are calculated and written to the destination buffer. | ||
782 | * | ||
783 | * - use ARMv5TE+ 1-cycle multiply+accumulator instructions. | ||
784 | * - use data tables (256 bytes) for RGB565 saturation. | ||
785 | */ | ||
786 | .section .icode, "ax", %progbits | ||
787 | |||
788 | .align 2 | ||
789 | .global lcd_write_yuv420_lines | ||
790 | .type lcd_write_yuv420_lines, %function | ||
791 | |||
792 | lcd_write_yuv420_lines: | ||
793 | @ r0 = src = yuv_src | ||
794 | @ r1 = out = dst1_p | ||
795 | @ r2 = width | ||
796 | @ r3 = stride | ||
797 | stmfd sp!, {r4-r11,lr} @ save non-scratch | ||
798 | ldmia r0, {r10-r12} @ r10 = yuv_src[0] = Y'_p | ||
799 | @ r11 = yuv_src[1] = Cb_p | ||
800 | @ r12 = yuv_src[2] = Cr_p | ||
801 | /* prepare data and fill stack */ | ||
802 | adr r0, const_data @ load constants | ||
803 | ldmia r0, {r4-r9,lr} @ r4 = COEF_C0 | ||
804 | @ r5 = CONST_MLA_GUV | ||
805 | @ r6 = COEF_C3_C2 | ||
806 | @ r7 = CONST_MLA_BU | ||
807 | @ r8 = COEF_C4_C1 | ||
808 | @ r9 = CONST_MLA_RV | ||
809 | @ lr = table_sat5 | ||
810 | sub r0, r12, r11 @ r0 = Cr_p-Cb_p | ||
811 | #define STACK_SZ 28 | ||
812 | stmfd sp!, {r0,r5-r9,lr} @ SP -> Cr_p-Cb_p | ||
813 | @ CONST_MLA_GUV | ||
814 | @ COEF_C3_C2 | ||
815 | @ CONST_MLA_BU | ||
816 | @ COEF_C4_C1 | ||
817 | @ CONST_MLA_RV | ||
818 | @ table_sat5 | ||
819 | mov r8, r4, lsl #4 @ r8 = 16*COEF_C0 | ||
820 | rsb r8, #0 @ r8 = -16*COEF_C0 = CONST_MLA_Y | ||
821 | mov lr, r1 @ RGB565 data destination buffer | ||
822 | add r9, lr, r2, asl #1 @ r9 = out + 2*width = dst2_p | ||
823 | add r12, r3, r10 @ r12 = Y'_p + stride | ||
824 | orr r7, r4, r2, lsl #15 @ loop_count = width/2; | ||
825 | @ r7 = loop_count<<16 | COEF_C0 | ||
826 | sub r7, r7, #0x10000 @ loop_count-- | ||
827 | |||
828 | /* align loop code to minimize occupied lines, execution | ||
829 | time per loop is optimized ~10% on ARM926EJ-S */ | ||
830 | .align CACHEALIGN_BITS | ||
831 | loop_start: | ||
832 | |||
833 | @ register usage: | ||
834 | @ r7 = loop count<<16 | COEF_C0 | ||
835 | @ r8 = CONST_MLA_Y | ||
836 | @ r9 = dst2_p | ||
837 | @ r10 = Y'_p | ||
838 | @ r11 = Cb_p | ||
839 | @ r12 = Y'stride_p | ||
840 | @ lr = dst1_p | ||
841 | @ free: r0-r6 | ||
842 | |||
843 | /* load constants from stack */ | ||
844 | ldmia sp, {r1-r3,r6} @ r1 = Cr_p-Cb_p | ||
845 | @ r2 = CONST_MLA_GUV | ||
846 | @ r3 = COEF_C3_C2 | ||
847 | @ r6 = CONST_MLA_BU | ||
848 | |||
849 | /* read Cr", Cb" */ | ||
850 | ldrb r1, [r11, r1] @ r1 = Cr = *(Cb_p + (Cr_p-Cb_p)) | ||
851 | ldrb r0, [r11], #1 @ r0 = Cb = *Cb_p++ | ||
852 | |||
853 | /* load more constants (avoids r1 interlock) */ | ||
854 | ldrd r4, [sp, #16] @ r4 = COEF_C4_C1 | ||
855 | @ r5 = CONST_MLA_RV | ||
856 | |||
857 | /* calculate rv", guv", bu" */ | ||
858 | smlabt r2, r1, r3, r2 @ r2 = guv" = Cr*C3 + Cb*C2 | ||
859 | smlabb r2, r0, r3, r2 @ + CONST_MLA_GUV | ||
860 | smlabb r1, r1, r4, r5 @ r1 = rv" = Cr*C1 + CONST_MLA_RV | ||
861 | #ifdef C4_IS_POW2 | ||
862 | add r0, r6, r0, asl #NBR-7 @ r0 = bu" = Cb*C4 + CONST_MLA_BU | ||
863 | #else | ||
864 | smlabt r0, r0, r4, r6 @ r0 = bu" = Cb*C4 + CONST_MLA_BU | ||
865 | #endif | ||
866 | |||
867 | ldr r4, [sp, #STACK_SZ-4] @ r4 = table_sat5 | ||
868 | |||
869 | /* read Y'1 and Y'2 */ | ||
870 | ldrb r5, [r10], #1 @ r5 = Y'1 = *Y'_p++ | ||
871 | ldrb r6, [r10], #1 @ r6 = Y'2 = *Y'_p++ | ||
872 | |||
873 | /* scale rv",guv",bu", adding sat5_p here saves instructions later */ | ||
874 | add r1, r4, r1, asr #NBR-5 @ r1 = rv' = sat5_p + rv">>scale | ||
875 | add r2, r4, r2, asr #NBR-6 @ r2 = guv' = sat5_p + guv">>scale (lands in table_sat6, see CONST_MLA_GUV) | ||
876 | add r0, r4, r0, asr #NBR-5 @ r0 = bu' = sat5_p + bu">>scale | ||
877 | |||
878 | @ register usage: | ||
879 | @ r7-r12,lr: pointers, counters, tables | ||
880 | @ r0,r1,r2 = (bu,rv,guv) rounded and RGB565 scaled | ||
881 | @ r5,r6 = Y'1,Y'2 | ||
882 | @ free: r3,r4 | ||
883 | |||
884 | /* calculate Y1 and Y2 */ | ||
885 | smlabb r5, r5, r7, r8 @ r5 = Y1 = C0*Y'1 - 16*C0 | ||
886 | smlabb r6, r6, r7, r8 @ r6 = Y2 = C0*Y'2 - 16*C0 | ||
887 | |||
888 | /* pixel_1 */ | ||
889 | ldrb r3, [r0, r5, asr #NBR-5] @ r3 = b = sat5[Y1>>scale + bu'] | ||
890 | ldrb r4, [r2, r5, asr #NBR-6] @ r4 = g = sat6[Y1>>scale + guv'] | ||
891 | ldrb r5, [r1, r5, asr #NBR-5] @ r5 = r = sat5[Y1>>scale + rv'] | ||
892 | |||
893 | /* calculate pixel_1 */ | ||
894 | orr r3, r3, r4, lsl #5 @ r3 = pixel_1 = g<<5 | b | ||
895 | |||
896 | /* pixel_2 (avoid r5 interlock) */ | ||
897 | ldrb r4, [r0, r6, asr #NBR-5] @ r4 = b = sat5[Y2>>scale + bu'] | ||
898 | |||
899 | /* calculate pixel_1 and save to r3 for later pixel packing */ | ||
900 | orr r3, r3, r5, lsl #11 @ r3 = pixel_1 = r<<11 | g<<5 | b | ||
901 | |||
902 | /* pixel_2 */ | ||
903 | ldrb r5, [r2, r6, asr #NBR-6] @ r5 = g = sat6[Y2>>scale + guv'] | ||
904 | ldrb r6, [r1, r6, asr #NBR-5] @ r6 = r = sat5[Y2>>scale + rv'] | ||
905 | |||
906 | /* calculate pixel_2 and pack with pixel_1 before writing */ | ||
907 | orr r3, r3, r4, lsl #16 @ r3 = pixel_2<<16 | pixel_1 | ||
908 | orr r3, r3, r5, lsl #21 | ||
909 | orr r3, r3, r6, lsl #27 | ||
910 | |||
911 | /* read Y'3 and Y'4 */ | ||
912 | ldrb r5, [r12], #1 @ r5 = Y'3 = *Y'stride_p++ | ||
913 | ldrb r6, [r12], #1 @ r6 = Y'4 = *Y'stride_p++ | ||
914 | |||
915 | /* write pixel_1 and pixel_2 */ | ||
916 | str r3, [lr], #4 @ *dst1_p++ = r3 | ||
917 | |||
918 | @ register usage: | ||
919 | @ r7-r12,lr: pointers, counters, tables | ||
920 | @ r0,r1,r2 = (bu,rv,guv) rounded and RGB565 scaled | ||
921 | @ r5,r6 = Y'3,Y'4 | ||
922 | @ free: r3,r4 | ||
923 | |||
924 | /* calculate Y3 and Y4 */ | ||
925 | smlabb r5, r5, r7, r8 @ r5 = Y3 = C0*Y'3 - 16*C0 | ||
926 | smlabb r6, r6, r7, r8 @ r6 = Y4 = C0*Y'4 - 16*C0 | ||
927 | |||
928 | /* pixel_3 */ | ||
929 | ldrb r3, [r0, r5, asr #NBR-5] @ r3 = b = sat5[Y3>>scale + bu'] | ||
930 | ldrb r4, [r2, r5, asr #NBR-6] @ r4 = g = sat6[Y3>>scale + guv'] | ||
931 | ldrb r5, [r1, r5, asr #NBR-5] @ r5 = r = sat5[Y3>>scale + rv'] | ||
932 | |||
933 | /* calculate pixel_3 */ | ||
934 | orr r3, r3, r4, lsl #5 @ r3 = pixel_3 = g<<5 | b | ||
935 | |||
936 | /* pixel_4 (avoid r5 interlock) */ | ||
937 | ldrb r4, [r0, r6, asr #NBR-5] @ r4 = b = sat5[Y4>>scale + bu'] | ||
938 | |||
939 | /* calculate pixel_3 and save to r3 for later pixel packing */ | ||
940 | orr r3, r3, r5, lsl #11 @ r3 = pixel_3 = r<<11 | g<<5 | b | ||
941 | |||
942 | /* pixel_4 */ | ||
943 | ldrb r5, [r2, r6, asr #NBR-6] @ r5 = g = sat6[Y4>>scale + guv'] | ||
944 | ldrb r6, [r1, r6, asr #NBR-5] @ r6 = r = sat5[Y4>>scale + rv'] | ||
945 | |||
946 | /* calculate pixel_4 and pack with pixel_3 before writing */ | ||
947 | orr r3, r3, r4, lsl #16 @ r3 = pixel_4 << 16 | pixel_3 | ||
948 | orr r3, r3, r5, lsl #21 | ||
949 | orr r3, r3, r6, lsl #27 | ||
950 | |||
951 | /* write pixel_3 and pixel_4 */ | ||
952 | str r3, [r9], #4 @ *dst2_p++ = r3 | ||
953 | |||
954 | /* check for loop end */ | ||
955 | subs r7, r7, #0x10000 @ loop_count-- | ||
956 | bge loop_start @ back to beginning | ||
957 | |||
958 | /* bye */ | ||
959 | add sp, sp, #STACK_SZ @ deallocate stack | ||
960 | ldmpc regs=r4-r11 @ restore registers | ||
961 | |||
962 | .ltorg | ||
963 | .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines | ||
964 | |||
965 | /* data: word order must match the "ldmia r0, {r4-r9,lr}" load in lcd_write_yuv420_lines */ | ||
966 | .align 2 | ||
967 | const_data: | ||
968 | .word COEF_C0 | ||
969 | .word CONST_MLA_GUV | ||
970 | .word COEF_C3_C2 | ||
971 | .word CONST_MLA_BU | ||
972 | .word COEF_C4_C1 | ||
973 | .word CONST_MLA_RV | ||
974 | .word table_sat5 | ||
975 | |||
976 | .size const_data, .-const_data | ||
977 | |||
978 | /* saturation tables: byte lookup tables that clamp a signed index to the 5-bit / 6-bit channel range */ | ||
979 | /*.section .data*/ | ||
980 | /* aligned to cache line size to minimize cache usage */ | ||
981 | .align CACHEALIGN_BITS | ||
982 | |||
983 | saturation_tables: | ||
984 | /* 5-bit saturation table [-36..0..+67], size=104 */ | ||
985 | /* table_sat5[-36..-1]: negative indices saturate to 0 */ | ||
986 | .byte 0, 0, 0, 0 | ||
987 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
988 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
989 | table_sat5: | ||
990 | /* table_sat5[0..67]: identity for 0..31, indices above 31 saturate to 31 */ | ||
991 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | ||
992 | .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 | ||
993 | .byte 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31 | ||
994 | .byte 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31 | ||
995 | .byte 31, 31, 31, 31 | ||
996 | |||
997 | /* 6-bit saturation table [-44..0..+107], size=152 */ | ||
998 | /* table_sat6[-44..-1]: negative indices saturate to 0 */ | ||
999 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
1000 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
1001 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
1002 | table_sat6: | ||
1003 | /* table_sat6[0..107]: identity for 0..63, indices above 63 saturate to 63 */ | ||
1004 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | ||
1005 | .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 | ||
1006 | .byte 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 | ||
1007 | .byte 48, 49, 50, 51, 52, 53 ,54, 55, 56, 57, 58, 59, 60, 61, 62, 63 | ||
1008 | .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 | ||
1009 | .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 | ||
1010 | .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 | ||
1011 | |||
1012 | .size saturation_tables, .-saturation_tables | ||
1013 | #endif /* YUV2RGB_VERSION */ | ||