summaryrefslogtreecommitdiff
path: root/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S
diff options
context:
space:
mode:
Diffstat (limited to 'firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S')
-rw-r--r--firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S1013
1 files changed, 1013 insertions, 0 deletions
diff --git a/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S b/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S
new file mode 100644
index 0000000000..1ed7c4e189
--- /dev/null
+++ b/firmware/target/arm/s5l8702/ipod6g/lcd-asm-6g.S
@@ -0,0 +1,1013 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id: lcd-as-video.S 26756 2010-06-11 04:41:36Z funman $
9 *
10 * Copyright (C) 2010 by Andree Buschmann
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22/* Version history:
23 *
24 * SVN:
25 * - initial SVN version.
26 *
27 * ARMv4:
28 * - use all available registers to calculate four pixels within each
29 * loop iteration.
30 * - avoid LDR interlocks.
31 *
32 * ARMv5TE:
33 * - use ARMv5TE+ 1-cycle multiply-accumulate instructions.
34 *
35 * ARMv5TE_WST:
36 * - use data tables (256 bytes) for RGB565 saturation.
37 *
38 * All versions are based on current SVN algorithm (round->scale->add)
39 * using the same coefficients, so output results are identical.
40 *
41 * TODO?: SVN coefficients are a very nice approximation for operations
42 * with shift+add instructions. When 16x16+32 MLA instructions are used,
43 * NBR and COEF_N could probably be adjusted to slightly increase accuracy.
44 */
45#define VERSION_SVN 0
46#define VERSION_ARMV4 1
47#define VERSION_ARMV5TE 2
48#define VERSION_ARMV5TE_WST 3
49
50#define YUV2RGB_VERSION VERSION_ARMV5TE_WST
51
52
53#define ASM
54#include "config.h"
55#include "cpu.h"
56
57#if (YUV2RGB_VERSION == VERSION_SVN)
58 .section .icode, "ax", %progbits
59
60
61/****************************************************************************
62 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
63 * uint16_t* out,
64 * int width,
65 * int stride);
66 *
67 * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is:
68 * |R| |1.164 0.000 1.596| |Y' - 16|
69 * |G| = |1.164 -0.391 -0.813| |Pb - 128|
70 * |B| |1.164 2.018 0.000| |Pr - 128|
71 *
72 * Scaled, normalized, rounded and tweaked to yield RGB 565:
73 * |R| |74 0 101| |Y' - 16| >> 9
74 * |G| = |74 -24 -51| |Cb - 128| >> 8
75 * |B| |74 128 0| |Cr - 128| >> 9
76 *
77 * Converts two lines from YUV to RGB565 and writes to LCD at once. First loop
78 * loads Cb/Cr, calculates the chroma offset and saves them to buffer. Within
79 * the second loop these chroma offset are reloaded from buffer. Within each
80 * loop two pixels are calculated and written to LCD.
81 */
82        .align  2
83        .global lcd_write_yuv420_lines
84        .type   lcd_write_yuv420_lines, %function
/* NOTE(review): "10:"/"15:"/"20:" are GAS numeric local labels; "15f" branches
 * forward to the NEAREST following "15:" (each clamp block has its own),
 * "10b"/"20b" branch backward.  "ldmpc" is a Rockbox macro (restore registers
 * and return via pc) defined in the included headers, not visible in this
 * file -- TODO confirm against cpu.h/config.h. */
85lcd_write_yuv420_lines:
86        /* r0 = src = yuv_src */
87        /* r1 = dst = out */
88        /* r2 = width */
89        /* r3 = stride */
90        stmfd   sp!, { r4-r10, lr }     /* save non-scratch */
91        ldmia   r0, { r9, r10, r12 }    /* r9 = yuv_src[0] = Y'_p */
92                                        /* r10 = yuv_src[1] = Cb_p */
93                                        /* r12 = yuv_src[2] = Cr_p */
94        add     r3, r9, r3              /* r3 = &ysrc[stride] */
95        add     r4, r2, r2, asr #1      /* chroma buffer length = width/2 *3 */
96        mov     r4, r4, asl #2          /* use words for str/ldm possibility */
97        add     r4, r4, #15             /* plus room for 3 additional words, */
98        bic     r4, r4, #3              /* rounded up to multiples of 4 byte */
99        sub     sp, sp, r4              /* and allocate on stack */
100        stmia   sp, {r2-r4}             /* width, &ysrc[stride], stack_alloc */
101
102        mov     r7, r2                  /* r7 = loop count */
103        add     r8, sp, #12             /* chroma buffer */
104        mov     lr, r1                  /* RGB565 data destination buffer */
105
106        /* 1st loop start */
10710:     /* loop start */
108
109        ldrb    r0, [r10], #1           /* r0 = *usrc++ = *Cb_p++ */
110        ldrb    r1, [r12], #1           /* r1 = *vsrc++ = *Cr_p++ */
111
112        sub     r0, r0, #128            /* r0 = Cb-128 */
113        sub     r1, r1, #128            /* r1 = Cr-128 */
114
115        add     r2, r1, r1, asl #1      /* r2 = Cr*51 + Cb*24 */
116        add     r2, r2, r2, asl #4
117        add     r2, r2, r0, asl #3
118        add     r2, r2, r0, asl #4
119
120        add     r4, r1, r1, asl #2      /* r1 = Cr*101 */
121        add     r4, r4, r1, asl #5
122        add     r1, r4, r1, asl #6
123
124        add     r1, r1, #256            /* r1 = rv = (r1 + 256) >> 9 */
125        mov     r1, r1, asr #9
126        rsb     r2, r2, #128            /* r2 = guv = (-r2 + 128) >> 8 */
127        mov     r2, r2, asr #8
128        add     r0, r0, #2              /* r0 = bu = (Cb*128 + 256) >> 9 */
129        mov     r0, r0, asr #2
130        stmia   r8!, {r0-r2}            /* store r0, r1 and r2 to chroma buffer */
131
132        /* 1st loop, first pixel */
133        ldrb    r5, [r9], #1            /* r5 = *ysrc++ = *Y'_p++ */
134        sub     r5, r5, #16             /* r5 = (Y'-16) * 74 */
135        add     r3, r5, r5, asl #2
136        add     r5, r3, r5, asl #5
137
138        add     r6, r1, r5, asr #8      /* r6 = r = (Y >> 9) + rv */
139        add     r3, r2, r5, asr #7      /* r3 = g = (Y >> 8) + guv */
140        add     r4, r0, r5, asr #8      /* r4 = b = (Y >> 9) + bu */
141
142        orr     r5, r6, r4              /* check if clamping is needed... */
143        orr     r5, r5, r3, asr #1      /* ...at all */
144        cmp     r5, #31
145        bls     15f                     /* -> no clamp */
146        cmp     r6, #31                 /* clamp r */
147        mvnhi   r6, r6, asr #31
148        andhi   r6, r6, #31
149        cmp     r3, #63                 /* clamp g */
150        mvnhi   r3, r3, asr #31
151        andhi   r3, r3, #63
152        cmp     r4, #31                 /* clamp b */
153        mvnhi   r4, r4, asr #31
154        andhi   r4, r4, #31
15515:     /* no clamp */
156
157        /* calculate pixel_1 and save to r4 for later pixel packing */
158        orr     r4, r4, r3, lsl #5      /* pixel_1 = r<<11 | g<<5 | b */
159        orr     r4, r4, r6, lsl #11     /* r4 = pixel_1 */
160
161        /* 1st loop, second pixel */
162        ldrb    r5, [r9], #1            /* r5 = *ysrc++ = *Y'_p++ */
163        sub     r5, r5, #16             /* r5 = (Y'-16) * 74 */
164        add     r3, r5, r5, asl #2
165        add     r5, r3, r5, asl #5
166
167        add     r6, r1, r5, asr #8      /* r6 = r = (Y >> 9) + rv */
168        add     r3, r2, r5, asr #7      /* r3 = g = (Y >> 8) + guv */
169        add     r5, r0, r5, asr #8      /* r5 = b = (Y >> 9) + bu */
170
171        orr     r0, r6, r5              /* check if clamping is needed... */
172        orr     r0, r0, r3, asr #1      /* ...at all */
173        cmp     r0, #31
174        bls     15f                     /* -> no clamp */
175        cmp     r6, #31                 /* clamp r */
176        mvnhi   r6, r6, asr #31
177        andhi   r6, r6, #31
178        cmp     r3, #63                 /* clamp g */
179        mvnhi   r3, r3, asr #31
180        andhi   r3, r3, #63
181        cmp     r5, #31                 /* clamp b */
182        mvnhi   r5, r5, asr #31
183        andhi   r5, r5, #31
18415:     /* no clamp */
185
186        /* calculate pixel_2 and pack with pixel_1 before writing */
187        orr     r5, r5, r3, lsl #5      /* pixel_2 = r<<11 | g<<5 | b */
188        orr     r5, r5, r6, lsl #11     /* r5 = pixel_2 */
189        orr     r4, r4, r5, lsl #16
190        str     r4, [lr], #4            /* write pixel_1 and pixel_2 */
191
192        subs    r7, r7, #2              /* check for loop end */
193        bgt     10b                     /* back to beginning */
194        /* 1st loop end */
195
196        /* Reload several registers for pointer rewinding for next loop */
197        add     r8, sp, #12             /* chroma buffer */
198        ldmia   sp, {r7, r9}            /* r7 = loop count */
199                                        /* r9 = &ysrc[stride] */
200
201        /* 2nd loop start */
20220:     /* loop start */
203        /* restore r0 (bu), r1 (rv) and r2 (guv) from chroma buffer */
204        ldmia   r8!, {r0-r2}
205
206        /* 2nd loop, first pixel */
207        ldrb    r5, [r9], #1            /* r5 = *ysrc++ = *Y'_p++ */
208        sub     r5, r5, #16             /* r5 = (Y'-16) * 74 */
209        add     r3, r5, r5, asl #2
210        add     r5, r3, r5, asl #5
211
212        add     r6, r1, r5, asr #8      /* r6 = r = (Y >> 9) + rv */
213        add     r3, r2, r5, asr #7      /* r3 = g = (Y >> 8) + guv */
214        add     r4, r0, r5, asr #8      /* r4 = b = (Y >> 9) + bu */
215
216        orr     r5, r6, r4              /* check if clamping is needed... */
217        orr     r5, r5, r3, asr #1      /* ...at all */
218        cmp     r5, #31
219        bls     15f                     /* -> no clamp */
220        cmp     r6, #31                 /* clamp r */
221        mvnhi   r6, r6, asr #31
222        andhi   r6, r6, #31
223        cmp     r3, #63                 /* clamp g */
224        mvnhi   r3, r3, asr #31
225        andhi   r3, r3, #63
226        cmp     r4, #31                 /* clamp b */
227        mvnhi   r4, r4, asr #31
228        andhi   r4, r4, #31
22915:     /* no clamp */
230        /* calculate pixel_1 and save to r4 for later pixel packing */
231        orr     r4, r4, r3, lsl #5      /* pixel_1 = r<<11 | g<<5 | b */
232        orr     r4, r4, r6, lsl #11     /* r4 = pixel_1 */
233
234        /* 2nd loop, second pixel */
235        ldrb    r5, [r9], #1            /* r5 = *ysrc++ = *Y'_p++ */
236        sub     r5, r5, #16             /* r5 = (Y'-16) * 74 */
237        add     r3, r5, r5, asl #2
238        add     r5, r3, r5, asl #5
239
240        add     r6, r1, r5, asr #8      /* r6 = r = (Y >> 9) + rv */
241        add     r3, r2, r5, asr #7      /* r3 = g = (Y >> 8) + guv */
242        add     r5, r0, r5, asr #8      /* r5 = b = (Y >> 9) + bu */
243
244        orr     r0, r6, r5              /* check if clamping is needed... */
245        orr     r0, r0, r3, asr #1      /* ...at all */
246        cmp     r0, #31
247        bls     15f                     /* -> no clamp */
248        cmp     r6, #31                 /* clamp r */
249        mvnhi   r6, r6, asr #31
250        andhi   r6, r6, #31
251        cmp     r3, #63                 /* clamp g */
252        mvnhi   r3, r3, asr #31
253        andhi   r3, r3, #63
254        cmp     r5, #31                 /* clamp b */
255        mvnhi   r5, r5, asr #31
256        andhi   r5, r5, #31
25715:     /* no clamp */
258
259        /* calculate pixel_2 and pack with pixel_1 before writing */
260        orr     r5, r5, r3, lsl #5      /* pixel_2 = r<<11 | g<<5 | b */
261        orr     r5, r5, r6, lsl #11     /* r5 = pixel_2 */
262        orr     r4, r4, r5, lsl #16
263        str     r4, [lr], #4            /* write pixel_1 and pixel_2 */
264
265        subs    r7, r7, #2              /* check for loop end */
266        bgt     20b                     /* back to beginning */
267        /* 2nd loop end */
268
269        ldr     r3, [sp, #8]            /* r3 = stack_alloc (saved at entry) */
270        add     sp, sp, r3              /* deallocate buffer */
271        ldmpc   regs=r4-r10             /* restore registers */
272
273        .ltorg
274        .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
275
276
277#elif (YUV2RGB_VERSION == VERSION_ARMV4)
278/****************************************************************************
279 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
280 * uint16_t* out,
281 * int width,
282 * int stride);
283 *
284 * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is:
285 * |R| |1.164 0.000 1.596| |Y' - 16|
286 * |G| = |1.164 -0.391 -0.813| |Pb - 128|
287 * |B| |1.164 2.018 0.000| |Pr - 128|
288 *
289 * Scaled, normalized, rounded and tweaked to yield RGB 565:
290 * |R| |74 0 101| |Y' - 16| >> 9
291 * |G| = |74 -24 -51| |Cb - 128| >> 8
292 * |B| |74 128 0| |Cr - 128| >> 9
293 *
294 * Converts two lines from YUV420 to RGB565, within each iteration four
295 * pixels (2 per line) are calculated and written to destination buffer.
296 */
297        .section .icode, "ax", %progbits
298
299        .align  2
300        .global lcd_write_yuv420_lines
301        .type   lcd_write_yuv420_lines, %function
302
/* NOTE(review): processes both source rows per iteration; the second row's
 * pixels are stored at [lr + 2*width] (one RGB565 line of `width` pixels
 * ahead in bytes), so `out` holds two consecutive output lines.  "15f"
 * branches to the NEAREST following "15:"; "ldmpc" is a Rockbox pop+return
 * macro defined in the included headers (not visible here). */
303lcd_write_yuv420_lines:
304        /* r0 = src = yuv_src */
305        /* r1 = dst = out */
306        /* r2 = width */
307        /* r3 = stride */
308        stmfd   sp!, {r4-r11,lr}        /* save non-scratch */
309        ldmia   r0, {r10-r12}           /* r10 = yuv_src[0] = Y'_p */
310                                        /* r11 = yuv_src[1] = Cb_p */
311                                        /* r12 = yuv_src[2] = Cr_p */
312        mov     r9, r2, lsl #1          /* r9 = 2*width (loop count) */
313        str     r9, [sp, #-4]!          /* [--sp] = 2*width (constant) */
314        add     r8, r10, r3             /* r8 = Y'_p + stride = Y'stride_p */
315        mov     lr, r1                  /* RGB565 data destination buffer */
316
31710:     /* loop start */
318        ldrb    r0, [r11], #1           /* r0 = *Cb_p++ */
319        ldrb    r1, [r12], #1           /* r1 = *Cr_p++ */
320        ldrb    r3, [r8], #1            /* r3 = Y'3 */
321        ldrb    r4, [r8], #1            /* r4 = Y'4 */
322
323        sub     r0, r0, #128            /* r0 = Cb-128 */
324        sub     r1, r1, #128            /* r1 = Cr-128 */
325
326        add     r2, r1, r1, asl #1      /* r2 = Cr*51 + Cb*24 */
327        add     r2, r2, r2, asl #4
328        add     r2, r2, r0, asl #3
329        add     r2, r2, r0, asl #4
330
331        add     r5, r1, r1, asl #2      /* r1 = Cr*101 */
332        add     r5, r5, r1, asl #5
333        add     r1, r5, r1, asl #6
334
335        add     r1, r1, #256            /* r1 = rv = (r1 + 256) >> 9 */
336        mov     r1, r1, asr #9
337        rsb     r2, r2, #128            /* r2 = guv = (-r2 + 128) >> 8 */
338        mov     r2, r2, asr #8
339        add     r0, r0, #2              /* r0 = bu = (Cb*128 + 256) >> 9 */
340        mov     r0, r0, asr #2
341
342        /* pixel_3 */
343        sub     r3, r3, #16             /* r3 = (Y'-16) * (74/2) */
344        add     r7, r3, r3, asl #2
345        add     r3, r7, r3, asl #5
346
347        add     r6, r1, r3, asr #8      /* r6 = r = (Y >> 9) + rv */
348        add     r7, r2, r3, asr #7      /* r7 = g = (Y >> 8) + guv */
349        add     r5, r0, r3, asr #8      /* r5 = b = (Y >> 9) + bu */
350
351        orr     r3, r6, r5              /* check if clamping is needed... */
352        orr     r3, r3, r7, asr #1      /* ...at all */
353        cmp     r3, #31
354        bls     15f                     /* no clamp */
355        cmp     r6, #31                 /* clamp r */
356        mvnhi   r6, r6, asr #31
357        andhi   r6, r6, #31
358        cmp     r7, #63                 /* clamp g */
359        mvnhi   r7, r7, asr #31
360        andhi   r7, r7, #63
361        cmp     r5, #31                 /* clamp b */
362        mvnhi   r5, r5, asr #31
363        andhi   r5, r5, #31
36415:     /* no clamp */
365
366        /* calculate pixel_3 and save to r5 for later pixel packing */
367        orr     r5, r5, r7, lsl #5      /* pixel_3 = r<<11 | g<<5 | b */
368        orr     r5, r5, r6, lsl #11     /* r5 = pixel_3 */
369
370        /* pixel_4 */
371        sub     r4, r4, #16             /* r4 = (Y'-16) * (74/2) */
372        add     r7, r4, r4, asl #2
373        add     r4, r7, r4, asl #5
374
375        add     r6, r1, r4, asr #8      /* r6 = r = (Y >> 9) + rv */
376        add     r7, r2, r4, asr #7      /* r7 = g = (Y >> 8) + guv */
377        add     r4, r0, r4, asr #8      /* r4 = b = (Y >> 9) + bu */
378
379        orr     r3, r6, r4              /* check if clamping is needed... */
380        orr     r3, r3, r7, asr #1      /* ...at all */
381        cmp     r3, #31
382        bls     15f                     /* no clamp */
383        cmp     r6, #31                 /* clamp r */
384        mvnhi   r6, r6, asr #31
385        andhi   r6, r6, #31
386        cmp     r7, #63                 /* clamp g */
387        mvnhi   r7, r7, asr #31
388        andhi   r7, r7, #63
389        cmp     r4, #31                 /* clamp b */
390        mvnhi   r4, r4, asr #31
391        andhi   r4, r4, #31
39215:     /* no clamp */
393
394        /* calculate pixel_4 and pack with pixel_3 before writing */
395        orr     r4, r4, r7, lsl #5      /* pixel_4 = r<<11 | g<<5 | b */
396        orr     r4, r4, r6, lsl #11     /* r4 = pixel_4 */
397        orr     r5, r5, r4, lsl #16     /* r5 = pixel_4<<16 | pixel_3 */
398
399        ldr     r7, [sp]                /* r7 = 2*width */
400        ldrb    r3, [r10], #1           /* r3 = Y'1 */
401        ldrb    r4, [r10], #1           /* r4 = Y'2 */
402
403        str     r5, [lr, r7]            /* write pixel_3 and pixel_4 */
404
405        /* pixel_1 */
406        sub     r3, r3, #16             /* r3 = (Y'-16) * (74/2) */
407        add     r7, r3, r3, asl #2
408        add     r3, r7, r3, asl #5
409
410        add     r6, r1, r3, asr #8      /* r6 = r = (Y >> 9) + rv */
411        add     r7, r2, r3, asr #7      /* r7 = g = (Y >> 8) + guv */
412        add     r5, r0, r3, asr #8      /* r5 = b = (Y >> 9) + bu */
413
414        orr     r3, r6, r5              /* check if clamping is needed... */
415        orr     r3, r3, r7, asr #1      /* ...at all */
416        cmp     r3, #31
417        bls     15f                     /* no clamp */
418        cmp     r6, #31                 /* clamp r */
419        mvnhi   r6, r6, asr #31
420        andhi   r6, r6, #31
421        cmp     r7, #63                 /* clamp g */
422        mvnhi   r7, r7, asr #31
423        andhi   r7, r7, #63
424        cmp     r5, #31                 /* clamp b */
425        mvnhi   r5, r5, asr #31
426        andhi   r5, r5, #31
42715:     /* no clamp */
428
429        /* calculate pixel_1 and save to r5 for later pixel packing */
430        orr     r5, r5, r7, lsl #5      /* pixel_1 = r<<11 | g<<5 | b */
431        orr     r5, r5, r6, lsl #11     /* r5 = pixel_1 */
432
433        /* pixel_2 */
434        sub     r4, r4, #16             /* r4 = (Y'-16) * (74/2) */
435        add     r7, r4, r4, asl #2
436        add     r4, r7, r4, asl #5
437
438        add     r6, r1, r4, asr #8      /* r6 = r = (Y >> 9) + rv */
439        add     r7, r2, r4, asr #7      /* r7 = g = (Y >> 8) + guv */
440        add     r4, r0, r4, asr #8      /* r4 = b = (Y >> 9) + bu */
441
442        orr     r3, r6, r4              /* check if clamping is needed... */
443        orr     r3, r3, r7, asr #1      /* ...at all */
444        cmp     r3, #31
445        bls     15f                     /* no clamp */
446        cmp     r6, #31                 /* clamp r */
447        mvnhi   r6, r6, asr #31
448        andhi   r6, r6, #31
449        cmp     r7, #63                 /* clamp g */
450        mvnhi   r7, r7, asr #31
451        andhi   r7, r7, #63
452        cmp     r4, #31                 /* clamp b */
453        mvnhi   r4, r4, asr #31
454        andhi   r4, r4, #31
45515:     /* no clamp */
456
457        /* calculate pixel_2 and pack with pixel_1 before writing */
458        orr     r4, r4, r7, lsl #5      /* pixel_2 = r<<11 | g<<5 | b */
459        orr     r4, r4, r6, lsl #11     /* r4 = pixel_2 */
460        orr     r5, r5, r4, lsl #16     /* r5 = pixel_2<<16 | pixel_1 */
461
462        str     r5, [lr], #4            /* write pixel_1 and pixel_2 */
463
464        subs    r9, r9, #4              /* check for loop end */
465        bgt     10b                     /* back to beginning */
466
467        /* loop end */
468        add     sp, sp, #4              /* deallocate stack */
469        ldmpc   regs=r4-r11             /* restore registers */
470
471        .ltorg
472        .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
473
474
475#elif (YUV2RGB_VERSION == VERSION_ARMV5TE)
476/****************************************************************************
477 * How do I encode Y'CBCR components from R'G'B' in [0, +1]? (see ColorFAQ)
478 * |R| |0.00456621 0 0.00625893| |Y' - 16|
479 * |G| = |0.00456621 -0.00153632 -0.00318811| |Pb - 128|
480 * |B| |0.00456621 0.00791071 0 | |Pr - 128|
481 *
482 * Scaled, normalized, rounded and tweaked to yield RGB 565:
483 * |R| |74 0 101| |Y' - 16| >> 9
484 * |G| = |74 -24 -51| |Cb - 128| >> 8
485 * |B| |74 128 0| |Cr - 128| >> 9
486 */
487#define NBR 14 /* 14-bit resolution (SVN) */
488#define COEF_C0 74
489#define COEF_C1 101
490#define COEF_C2 -24
491#define COEF_C3 -51
492#define COEF_C4 128
493#define C4_IS_POW2
494
495/* constant for rounding a NBR number before down-scaling it to RS bits */
496#define ROUND(RS) (1 << (NBR - RS - 1))
497
498/* packed 16-bit coefficients */
499#define COEF_C4_C1 ((COEF_C4 << 16) | (COEF_C1 & 0xffff))
500#define COEF_2C3_2C2 ((COEF_C3 << 17) | ((COEF_C2 << 1) & 0xffff))
501/* 32-bit MLA constants */
502#define CONST_MLA_Y (-16 * COEF_C0)
503
504/****************************************************************************
505 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
506 * uint16_t* out,
507 * int width,
508 * int stride);
509 *
510 * Converts two lines from YUV420 to RGB565, within each iteration four
511 * pixels (2 per line) are calculated and written to destination buffer.
512 *
513 * - use ARMv5TE+ 1-cycle multiply+accumulator instructions.
514 */
515        .section .icode, "ax", %progbits
516
517        .align  2
518        .global lcd_write_yuv420_lines
519        .type   lcd_write_yuv420_lines, %function
520
/* NOTE(review): the loop counter is packed into the high halfword of r9 while
 * the low halfword keeps COEF_C0, so smlabb can use r9 directly and the
 * "subs/bge #0x10000" pair decrements the count without an extra register.
 * "15f" branches to the NEAREST following "15:"; "ldmpc" is a Rockbox
 * pop+return macro defined in the included headers (not visible here). */
521lcd_write_yuv420_lines:
522        @ r0 = src = yuv_src
523        @ r1 = out = dst_p
524        @ r2 = width
525        @ r3 = stride
526        stmfd   sp!, {r4-r11,lr}        @ save non-scratch
527        ldmia   r0, {r10-r12}           @ r10 = yuv_src[0] = Y'_p
528                                        @ r11 = yuv_src[1] = Cb_p
529                                        @ r12 = yuv_src[2] = Cr_p
530        adr     r0, const_data          @ load constants
531        ldmia   r0, {r5-r8}             @ r5 = COEF_C4_C1
532                                        @ r6 = COEF_2C3_2C2
533                                        @ r7 = COEF_C0
534                                        @ r8 = CONST_MLA_Y
535        sub     r4, r12, r11            @ r4 = Cr_p-Cb_p
536        mov     r9, r2, asl #1          @ r9 = 2*width
537        stmfd   sp!, {r4-r6,r9}         @ SP -> Cr_p-Cb_p
538                                        @       COEF_C4_C1
539                                        @       COEF_2C3_2C2
540                                        @       2*width
541        add     r12, r10, r3            @ r12 = Y'_p + stride = Y'stride_p
542        mov     lr, r1                  @ RGB565 data destination buffer
543        orr     r9, r7, r2, lsl #15     @ loop_count = width/2;
544                                        @ r9 = loop_count<<16 | COEF_C0
545        sub     r9, r9, #0x10000        @ loop_count--
546
54710:     @ loop_start
548
549        @ register usage:
550        @ r8 = CONST_MLA_Y
551        @ r9 = loop count<<16 | COEF_C0
552        @ r10 = Y'_p
553        @ r11 = Cb_p
554        @ r12 = Y'stride_p
555        @ lr = dst_p
556        @ free: r0-r7
557
558        ldmia   sp, {r2-r4}             @ r2 = Cr_p-Cb_p
559                                        @ r3 = COEF_C4_C1
560                                        @ r4 = COEF_2C3_2C2
561        mov     r5, #ROUND(5)           @ r5 = round constant
562
563        ldrb    r6, [r12], #1           @ r6 = Y'3
564        ldrb    r7, [r12], #1           @ r7 = Y'4
565
566        ldrb    r1, [r11, r2]           @ r1 = Cr = *Cr_p++
567        ldrb    r0, [r11], #1           @ r0 = Cb = *Cb_p++
568
569        /* calculate Y3 and Y4 */
570        smlabb  r6, r6, r9, r8          @ r6 = Y3 = C0*Y'3 - C0*16
571        smlabb  r7, r7, r9, r8          @ r7 = Y4 = C0*Y'4 - C0*16
572
573        /* calculate rv, guv, bu */
574        sub     r1, r1, #128            @ r1 = Cr" = Cr-128
575        sub     r0, r0, #128            @ r0 = Cb" = Cb-128
576
577        smlabt  r2, r1, r4, r5          @ r2 = guv" = Cr"*(2*C2) +
578        smlabb  r2, r0, r4, r2          @             Cb"*(2*C3) + round
579        smlabb  r1, r1, r3, r5          @ r1 = rv" = Cr"*C1 + round
580 #ifdef C4_IS_POW2
581        add     r0, r5, r0, asl #NBR-7  @ r0 = bu" = Cb"*C4 + round
582 #else
583        smlabt  r0, r0, r3, r5          @ r0 = bu" = Cb"*C4 + round
584 #endif
585
586        /* scale rv",guv",bu" */
        /* guv" was built with doubled coefficients (2*C2, 2*C3), so the
           common NBR-5 shift yields the NBR-6 scaling guv needs */
587        mov     r2, r2, asr #NBR-5      @ r2 = guv = guv" >> scale
588        mov     r1, r1, asr #NBR-5      @ r1 = rv = rv" >> scale
589        mov     r0, r0, asr #NBR-5      @ r0 = bu = bu" >> scale
590
591        @ register usage:
592        @ r8-r12,lr: pointers, counters
593        @ r0,r1,r2 = bu,rv,guv (rounded and scaled to RGB565)
594        @ r6,r7 = Y'3,Y'4
595        @ free: r3-r5
596
597        /* pixel_3 */
598        add     r5, r1, r6, asr #NBR-5  @ r5 = r = (Y3 >> scale) + rv
599        add     r4, r2, r6, asr #NBR-6  @ r4 = g = (Y3 >> scale) + guv
600        add     r3, r0, r6, asr #NBR-5  @ r3 = b = (Y3 >> scale) + bu
601
602        orr     r6, r5, r3              @ check if clamping is needed...
603        orr     r6, r6, r4, asr #1      @ ...at all
604        cmp     r6, #31
605        bls     15f                     @ no clamp
606        cmp     r5, #31                 @ clamp r
607        mvnhi   r5, r5, asr #31
608        andhi   r5, r5, #31
609        cmp     r4, #63                 @ clamp g
610        mvnhi   r4, r4, asr #31
611        andhi   r4, r4, #63
612        cmp     r3, #31                 @ clamp b
613        mvnhi   r3, r3, asr #31
614        andhi   r3, r3, #31
61515:     @ no clamp
616
617        /* calculate pixel_3 and save to r3 for later pixel packing */
618        orr     r3, r3, r4, lsl #5      @ r3 = pixel_3 = r<<11 | g<<5 | b
619        orr     r3, r3, r5, lsl #11
620
621        /* pixel_4 */
622        add     r5, r1, r7, asr #NBR-5  @ r5 = r = (Y4 >> scale) + rv
623        add     r4, r2, r7, asr #NBR-6  @ r4 = g = (Y4 >> scale) + guv
624        add     r7, r0, r7, asr #NBR-5  @ r7 = b = (Y4 >> scale) + bu
625
626        orr     r6, r5, r7              @ check if clamping is needed...
627        orr     r6, r6, r4, asr #1      @ ...at all
628        cmp     r6, #31
629        bls     15f                     @ no clamp
630        cmp     r5, #31                 @ clamp r
631        mvnhi   r5, r5, asr #31
632        andhi   r5, r5, #31
633        cmp     r4, #63                 @ clamp g
634        mvnhi   r4, r4, asr #31
635        andhi   r4, r4, #63
636        cmp     r7, #31                 @ clamp b
637        mvnhi   r7, r7, asr #31
638        andhi   r7, r7, #31
63915:     @ no clamp
640
641        /* calculate pixel_4 and pack with pixel_3 before writing */
642        orr     r7, r7, r4, lsl #5      @ r7 = pixel_4 = r<<11 | g<<5 | b
643        orr     r7, r7, r5, lsl #11
644        orr     r3, r3, r7, lsl #16     @ r3 = pixel_4<<16 | pixel_3
645
646        /* avoid interlocks when writing pixel_3 and pixel_4 */
647        ldr     r5, [sp, #12]           @ r5 = 2*width
648
649        ldrb    r6, [r10], #1           @ r6 = Y'1
650        ldrb    r7, [r10], #1           @ r7 = Y'2
651
652        /* write pixel_3 and pixel_4 */
653        str     r3, [lr, r5]            @ [dst_p + 2*width] = r3
654
655        @ register usage:
656        @ r8-r12,lr: pointers, counters
657        @ r0,r1,r2 = bu,rv,guv (rounded and scaled to RGB565)
658        @ r6,r7 = Y'1,Y'2
659        @ free: r3-r5
660
661        /* calculate Y1 and Y2 */
662        smlabb  r6, r6, r9, r8          @ r6 = Y1 = C0*Y'1 - C0*16
663        smlabb  r7, r7, r9, r8          @ r7 = Y2 = C0*Y'2 - C0*16
664
665        /* pixel_1 */
666        add     r5, r1, r6, asr #NBR-5  @ r5 = r = (Y1 >> scale) + rv
667        add     r4, r2, r6, asr #NBR-6  @ r4 = g = (Y1 >> scale) + guv
668        add     r3, r0, r6, asr #NBR-5  @ r3 = b = (Y1 >> scale) + bu
669
670        orr     r6, r5, r3              @ check if clamping is needed...
671        orr     r6, r6, r4, asr #1      @ ...at all
672        cmp     r6, #31
673        bls     15f                     @ no clamp
674        cmp     r5, #31                 @ clamp r
675        mvnhi   r5, r5, asr #31
676        andhi   r5, r5, #31
677        cmp     r4, #63                 @ clamp g
678        mvnhi   r4, r4, asr #31
679        andhi   r4, r4, #63
680        cmp     r3, #31                 @ clamp b
681        mvnhi   r3, r3, asr #31
682        andhi   r3, r3, #31
68315:     @ no clamp
684
685        /* calculate pixel_1 and save to r3 for later pixel packing */
686        orr     r3, r3, r4, lsl #5      @ r3 = pixel_1 = r<<11 | g<<5 | b
687        orr     r3, r3, r5, lsl #11
688
689        /* pixel_2 */
690        add     r5, r1, r7, asr #NBR-5  @ r5 = r = (Y2 >> scale) + rv
691        add     r4, r2, r7, asr #NBR-6  @ r4 = g = (Y2 >> scale) + guv
692        add     r7, r0, r7, asr #NBR-5  @ r7 = b = (Y2 >> scale) + bu
693
694        orr     r6, r5, r7              @ check if clamping is needed...
695        orr     r6, r6, r4, asr #1      @ ...at all
696        cmp     r6, #31
697        bls     15f                     @ no clamp
698        cmp     r5, #31                 @ clamp r
699        mvnhi   r5, r5, asr #31
700        andhi   r5, r5, #31
701        cmp     r4, #63                 @ clamp g
702        mvnhi   r4, r4, asr #31
703        andhi   r4, r4, #63
704        cmp     r7, #31                 @ clamp b
705        mvnhi   r7, r7, asr #31
706        andhi   r7, r7, #31
70715:     @ no clamp
708
709        /* calculate pixel_2 and pack with pixel_1 before writing */
710        orr     r7, r7, r4, lsl #5      @ r7 = pixel_2 = r<<11 | g<<5 | b
711        orr     r7, r7, r5, lsl #11
712        orr     r3, r3, r7, lsl #16     @ r3 = pixel_2 << 16 | pixel_1
713
714        str     r3, [lr], #4            @ write pixel_1 and pixel_2
715
716        /* check for loop end */
717        subs    r9, r9, #0x10000        @ loop_count--
718        bge     10b                     @ back to beginning
719
720        /* bye */
721        add     sp, sp, #16             @ deallocate the 4 words pushed at entry
722        ldmpc   regs=r4-r11             @ restore registers
723
724        .ltorg
725        .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
726
727/* data */
728        .align  2
729const_data:
730        .word   COEF_C4_C1
731        .word   COEF_2C3_2C2
732        .word   COEF_C0
733        .word   CONST_MLA_Y
734
735        .size   const_data, .-const_data
736
737
738#else /* YUV2RGB_VERSION == VERSION_ARMV5TE_WST */
739/****************************************************************************
740 * How do I encode Y'CBCR components from R'G'B' in [0, +1]? (see ColorFAQ)
741 * |R| |0.00456621 0 0.00625893| |Y' - 16|
742 * |G| = |0.00456621 -0.00153632 -0.00318811| |Pb - 128|
743 * |B| |0.00456621 0.00791071 0 | |Pr - 128|
744 *
745 * Scaled, normalized, rounded and tweaked to yield RGB 565:
746 * |R| |74 0 101| |Y' - 16| >> 9
747 * |G| = |74 -24 -51| |Cb - 128| >> 8
748 * |B| |74 128 0| |Cr - 128| >> 9
749 */
750#define NBR 14 /* 14-bit resolution (SVN) */
751#define COEF_C0 74
752#define COEF_C1 101
753#define COEF_C2 -24
754#define COEF_C3 -51
755#define COEF_C4 128
756#define C4_IS_POW2
757
758/* packed 16-bit coefficients */
759#define COEF_C4_C1 ((COEF_C4 << 16) | (COEF_C1 & 0xffff))
760#define COEF_C3_C2 ((COEF_C3 << 16) | (COEF_C2 & 0xffff))
761
762/* constant for rounding an NBR number before down-scaling it to RS bits */
763#define ROUND(RS) (1 << (NBR - RS - 1))
764
765/* 32-bit MLA constants */
766#define CONST_MLA_Y (-16 * COEF_C0)
767#define CONST_MLA_RV ((-128 * COEF_C1) + ROUND(5))
768#define CONST_MLA_BU ((-128 * COEF_C4) + ROUND(5))
769/* trick to save the register needed for table_sat6 reference:
770 add table_sat6-table_sat5 offset (conveniently scaled) to guv MLA */
771#define CONST_MLA_GUV (-128 * (COEF_C2 + COEF_C3) + ROUND(6) + \
772 ((table_sat6 - table_sat5) << (NBR - 6)))
773
774/****************************************************************************
775 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
776 * uint16_t* out,
777 * int width,
778 * int stride);
779 *
780 * Converts two lines from YUV420 to RGB565, within each iteration four
781 * pixels (2 per line) are calculated and written to destination buffer.
782 *
783 * - use ARMv5TE+ 1-cycle multiply+accumulator instructions.
784 * - use data tables (256 bytes) for RGB565 saturation.
785 */
786        .section .icode, "ax", %progbits
787
788        .align  2
789        .global lcd_write_yuv420_lines
790        .type   lcd_write_yuv420_lines, %function
791
/* NOTE(review): clamping is done by indexing the saturation tables below with
 * possibly out-of-range values; rv'/guv'/bu' already carry the table_sat5
 * base pointer (table_sat6 is reached via the offset folded into
 * CONST_MLA_GUV).  "ldmpc" is a Rockbox pop+return macro defined in the
 * included headers (not visible here). */
792lcd_write_yuv420_lines:
793        @ r0 = src = yuv_src
794        @ r1 = out = dst1_p
795        @ r2 = width
796        @ r3 = stride
797        stmfd   sp!, {r4-r11,lr}        @ save non-scratch
798        ldmia   r0, {r10-r12}           @ r10 = yuv_src[0] = Y'_p
799                                        @ r11 = yuv_src[1] = Cb_p
800                                        @ r12 = yuv_src[2] = Cr_p
801        /* prepare data and fill stack */
802        adr     r0, const_data          @ load constants
803        ldmia   r0, {r4-r9,lr}          @ r4 = COEF_C0
804                                        @ r5 = CONST_MLA_GUV
805                                        @ r6 = COEF_C3_C2
806                                        @ r7 = CONST_MLA_BU
807                                        @ r8 = COEF_C4_C1
808                                        @ r9 = CONST_MLA_RV
809                                        @ lr = table_sat5
810        sub     r0, r12, r11            @ r0 = Cr_p-Cb_p
811 #define STACK_SZ 28
812        stmfd   sp!, {r0,r5-r9,lr}      @ SP -> Cr_p-Cb_p
813                                        @       CONST_MLA_GUV
814                                        @       COEF_C3_C2
815                                        @       CONST_MLA_BU
816                                        @       COEF_C4_C1
817                                        @       CONST_MLA_RV
818                                        @       table_sat5
819        mov     r8, r4, lsl #4          @
820        rsb     r8, #0                  @ r8 = -16*COEF_C0 = CONST_MLA_Y
821        mov     lr, r1                  @ RGB565 data destination buffer
822        add     r9, lr, r2, asl #1      @ r9 = out + 2*width = dst2_p
823        add     r12, r3, r10            @ r12 = Y'_p + stride
824        orr     r7, r4, r2, lsl #15     @ loop_count = width/2;
825                                        @ r7 = loop_count<<16 | COEF_C0
826        sub     r7, r7, #0x10000        @ loop_count--
827
828        /* align loop code to minimize occupied lines, execution
829           time per loop is optimized ~10% on ARM926EJ-S */
830        .align  CACHEALIGN_BITS
831loop_start:
832
833        @ register usage:
834        @ r7 = loop count<<16 | COEF_C0
835        @ r8 = CONST_MLA_Y
836        @ r9 = dst2_p
837        @ r10 = Y'_p
838        @ r11 = Cb_p
839        @ r12 = Y'stride_p
840        @ lr = dst1_p
841        @ free: r0-r6
842
843        /* load constants from stack */
844        ldmia   sp, {r1-r3,r6}          @ r1 = Cr_p-Cb_p
845                                        @ r2 = CONST_MLA_GUV
846                                        @ r3 = COEF_C3_C2
847                                        @ r6 = CONST_MLA_BU
848
849        /* read Cr", Cb" */
850        ldrb    r1, [r11, r1]           @ r1 = Cr = *Cr_p++
851        ldrb    r0, [r11], #1           @ r0 = Cb = *Cb_p++
852
853        /* load more constants (avoids r1 interlock) */
        /* NOTE(review): ldrd needs a doubleword-aligned address on ARMv5TE;
           this relies on sp being 8-byte aligned here (36+28 bytes pushed
           since entry keeps [sp,#16] alignment iff entry sp was 8-aligned)
           -- TODO confirm against the platform's stack alignment rules */
854        ldrd    r4, [sp, #16]           @ r4 = COEF_C4_C1
855                                        @ r5 = CONST_MLA_RV
856
857        /* calculate rv", guv", bu" */
858        smlabt  r2, r1, r3, r2          @ r2 = guv" = Cr*C2 +
859        smlabb  r2, r0, r3, r2          @      Cb*C3 + CONST_MLA_GUV
860        smlabb  r1, r1, r4, r5          @ r1 = rv" = Cr*C1 + CONST_MLA_RV
861 #ifdef C4_IS_POW2
862        add     r0, r6, r0, asl #NBR-7  @ r0 = bu" = Cb*C4 + CONST_MLA_BU
863 #else
864        smlabt  r0, r0, r4, r6          @ r0 = bu" = Cb*C4 + CONST_MLA_BU
865 #endif
866
867        ldr     r4, [sp, #STACK_SZ-4]   @ r4 = table_sat5
868
869        /* read Y'1 and Y'2 */
870        ldrb    r5, [r10], #1           @ r5 = Y'1 = *Y'_p++
871        ldrb    r6, [r10], #1           @ r6 = Y'2 = *Y'_p++
872
873        /* scale rv",guv",bu", adding sat5_p here saves instructions later */
874        add     r1, r4, r1, asr #NBR-5  @ r1 = rv' = sat5_p + rv">>scale
875        add     r2, r4, r2, asr #NBR-6  @ r2 = guv' = sat5_p + guv">>scale
876        add     r0, r4, r0, asr #NBR-5  @ r0 = bu' = sat5_p + bu">>scale
877
878        @ register usage:
879        @ r7-r12,lr: pointers, counters, tables
880        @ r0,r1,r2 = (bu,rv,guv) rounded and RGB565 scaled
881        @ r5,r6 = Y'1,Y'2
882        @ free: r3,r4
883
884        /* calculate Y1 and Y2 */
885        smlabb  r5, r5, r7, r8          @ r5 = Y1 = C0*Y'1 - 16*C0
886        smlabb  r6, r6, r7, r8          @ r6 = Y2 = C0*Y'2 - 16*C0
887
888        /* pixel_1 */
889        ldrb    r3, [r0, r5, asr #NBR-5]   @ r3 = b = sat5[Y1>>scale + bu']
890        ldrb    r4, [r2, r5, asr #NBR-6]   @ r4 = g = sat6[Y1>>scale + guv']
891        ldrb    r5, [r1, r5, asr #NBR-5]   @ r5 = r = sat5[Y1>>scale + rv']
892
893        /* calculate pixel_1 */
894        orr     r3, r3, r4, lsl #5      @ r3 = pixel_1 = g<<5 | b
895
896        /* pixel_2 (avoid r5 interlock) */
897        ldrb    r4, [r0, r6, asr #NBR-5]   @ r4 = b = sat5[Y2>>scale + bu']
898
899        /* calculate pixel_1 and save to r3 for later pixel packing */
900        orr     r3, r3, r5, lsl #11     @ r3 = pixel_1 = r<<11 | g<<5 | b
901
902        /* pixel_2 */
903        ldrb    r5, [r2, r6, asr #NBR-6]   @ r5 = g = sat6[Y2>>scale + guv']
904        ldrb    r6, [r1, r6, asr #NBR-5]   @ r6 = r = sat5[Y2>>scale + rv']
905
906        /* calculate pixel_2 and pack with pixel_1 before writing */
907        orr     r3, r3, r4, lsl #16     @ r3 = pixel_2<<16 | pixel_1
908        orr     r3, r3, r5, lsl #21
909        orr     r3, r3, r6, lsl #27
910
911        /* read Y'3 and Y'4 */
912        ldrb    r5, [r12], #1           @ r5 = Y'3 = *Y'stride_p++
913        ldrb    r6, [r12], #1           @ r6 = Y'4 = *Y'stride_p++
914
915        /* write pixel_1 and pixel_2 */
916        str     r3, [lr], #4            @ *dst1_p++ = r3 (lr = dst1_p, see
                                          @ register-usage notes above; the
                                          @ original comment said dst2_p)
917
918        @ register usage:
919        @ r7-r12,lr: pointers, counters, tables
920        @ r0,r1,r2 = (bu,rv,guv) rounded and RGB565 scaled
921        @ r5,r6 = Y'3,Y'4
922        @ free: r3,r4
923
924        /* calculate Y3 and Y4 */
925        smlabb  r5, r5, r7, r8          @ r5 = Y3 = C0*Y'3 - 16*C0
926        smlabb  r6, r6, r7, r8          @ r6 = Y4 = C0*Y'4 - 16*C0
927
928        /* pixel_3 */
929        ldrb    r3, [r0, r5, asr #NBR-5]   @ r3 = b = sat5[Y3>>scale + bu']
930        ldrb    r4, [r2, r5, asr #NBR-6]   @ r4 = g = sat6[Y3>>scale + guv']
931        ldrb    r5, [r1, r5, asr #NBR-5]   @ r5 = r = sat5[Y3>>scale + rv']
932
933        /* calculate pixel_3 */
934        orr     r3, r3, r4, lsl #5      @ r3 = pixel_3 = g<<5 | b
935
936        /* pixel_4 (avoid r5 interlock) */
937        ldrb    r4, [r0, r6, asr #NBR-5]   @ r4 = b = sat5[Y4>>scale + bu']
938
939        /* calculate pixel_3 and save to r3 for later pixel packing */
940        orr     r3, r3, r5, lsl #11     @ r3 = pixel_3 = r<<11 | g<<5 | b
941
942        /* pixel_4 */
943        ldrb    r5, [r2, r6, asr #NBR-6]   @ r5 = g = sat6[Y4>>scale + guv']
944        ldrb    r6, [r1, r6, asr #NBR-5]   @ r6 = r = sat5[Y4>>scale + rv']
945
946        /* calculate pixel_4 and pack with pixel_3 before writing */
947        orr     r3, r3, r4, lsl #16     @ r3 = pixel_4 << 16 | pixel_3
948        orr     r3, r3, r5, lsl #21
949        orr     r3, r3, r6, lsl #27
950
951        /* write pixel_3 and pixel_4 */
952        str     r3, [r9], #4            @ *dst2_p++ = r3 (r9 = dst2_p =
                                          @ out + 2*width; the original
                                          @ comment said dst1_p)
953
954        /* check for loop end */
955        subs    r7, r7, #0x10000        @ loop_count--
956        bge     loop_start              @ back to beginning
957
958        /* bye */
959        add     sp, sp, #STACK_SZ       @ deallocate stack
960        ldmpc   regs=r4-r11             @ restore registers
961
962        .ltorg
963        .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
964
965/* data */
966        .align  2
967const_data:
968        .word   COEF_C0
969        .word   CONST_MLA_GUV
970        .word   COEF_C3_C2
971        .word   CONST_MLA_BU
972        .word   COEF_C4_C1
973        .word   CONST_MLA_RV
974        .word   table_sat5
975
976        .size   const_data, .-const_data
977
/* saturation tables */
979        /*.section .data*/
980        /* aligned to cache line size to minimize cache usage */
981        .align  CACHEALIGN_BITS
982
983saturation_tables:
984        /* 5-bit saturation table [-36..0..+67], size=104 */
985        /* table_sat5[-36..-1] */
986        .byte 0, 0, 0, 0
987        .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
988        .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
989table_sat5:
990        /* table_sat5[0..67] */
991        .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
992        .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
993        .byte 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31
994        .byte 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31
995        .byte 31, 31, 31, 31
996
997        /* 6-bit saturation table [-44..0..+107], size=152 */
998        /* table_sat6[-44..-1] */
999        .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1000        .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1001        .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1002table_sat6:
1003        /* table_sat6[0..107] */
1004        .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1005        .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1006        .byte 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
1007        .byte 48, 49, 50, 51, 52, 53 ,54, 55, 56, 57, 58, 59, 60, 61, 62, 63
1008        .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63
1009        .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63
1010        .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63
1011
1012        .size   saturation_tables, .-saturation_tables
1013#endif /* YUV2RGB_VERSION */