author     Cástor Muñoz <cmvidal@gmail.com>  2014-11-10 05:19:42 +0100
committer  Cástor Muñoz <cmvidal@gmail.com>  2014-12-10 20:39:34 +0100
commit     b320bbaf61c9f71a74d1c09e75764a3d7e92879c (patch)
tree       af43d3c1f95cb14322bb76782fa29d300f24b561
parent     a2136a811f912ac5d7a4f7f07a5c303fac6051a2 (diff)
download   rockbox-b320bbaf61c9f71a74d1c09e75764a3d7e92879c.tar.gz
           rockbox-b320bbaf61c9f71a74d1c09e75764a3d7e92879c.zip
iPod Classic: YUV to RGB optimizations for ARM v5+
Optimizes YUV to RGB conversion using ARMv5 multiply-accumulate instructions
for the arithmetic and data tables for saturation. This first patch set
includes the three versions I have developed. Although the iPod Classic needs
the latest version to reach 30 fps, the older versions may serve other
targets. All versions are based on the current SVN algorithm
(round->scale->add) using the same coefficients, so output results are
identical.

Version history:

ARMv4:
- use all available registers to calculate four pixels within each loop
  iteration.
- avoid LDR interlocks.

ARMv5TE:
- use ARMv5TE+ 1-cycle multiply-accumulate instructions.

ARMv5TE_WST:
- use data tables (256 bytes) for RGB565 saturation.

Benchmark results on iPod Classic (ARM926EJ, 216 MHz):

              size   test_fps (1)    mpegplayer (2)
              bytes  YUV    YUV1/4   average  min/max
              -----  -----  ------   -------  -----------
SVN-20141107    528   27.8   110.0     11035  10864/13397
ARMv4           480   28.8   114.0      9767   9586/12126
ARMv5TE         468   29.7   117.5      8751   8584/11118
ARMv5TE_WST     544   33.6   133.0      6355   6316/6403

(1) boosted
(2) play full elephants_dream_320x240.mpg file (15693 frames) using
    mpegplayer; a patched RB measures YUV to RGB565 frame conversion
    time (microseconds)

Compared against the WST version, the ARMv5TE version without cached
saturation tables is slower, but it is smaller and I have doubts about the
power consumption.

Change-Id: I2b6a81804636658d85a1bb104ccb2055e77ac120
Reviewed-on: http://gerrit.rockbox.org/1034
Reviewed-by: Cástor Muñoz <cmvidal@gmail.com>
Tested: Cástor Muñoz <cmvidal@gmail.com>
-rw-r--r--  firmware/target/arm/s5l8702/ipod6g/lcd-asm-ipod6g.S  772
1 file changed, 772 insertions, 0 deletions
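
For reference, here is the round->scale->add conversion the patch implements,
rendered as plain C (same coefficients, shifts and rounding as the assembly;
the function and variable names are illustrative, not part of the patch):

    #include <stdint.h>

    static inline int clamp(int x, int max)
    {
        return x < 0 ? 0 : (x > max ? max : x);
    }

    /* Convert two lines of YUV420 to RGB565, two pixels per line per
     * iteration; 'out' receives 2*width pixels. Assumes arithmetic
     * right shift of negative values, as on ARM. */
    static void yuv420_to_rgb565_lines(const uint8_t *const src[3],
                                       uint16_t *out, int width, int stride)
    {
        const uint8_t *y0 = src[0];           /* first luma line */
        const uint8_t *y1 = src[0] + stride;  /* second luma line */
        const uint8_t *cb = src[1];
        const uint8_t *cr = src[2];

        for (int x = 0; x < width; x += 2) {
            int u = *cb++ - 128;   /* one chroma pair per 2x2 block */
            int v = *cr++ - 128;

            /* round -> scale -> add, SVN coefficients */
            int rv  = (101 * v + 256) >> 9;
            int guv = (-24 * u - 51 * v + 128) >> 8;
            int bu  = (128 * u + 256) >> 9;

            for (int i = 0; i < 2; i++) {
                int y = 74 * (y0[x + i] - 16);
                out[x + i] =
                    (uint16_t)(clamp((y >> 9) + rv,  31) << 11 |
                               clamp((y >> 8) + guv, 63) << 5  |
                               clamp((y >> 9) + bu,  31));
                y = 74 * (y1[x + i] - 16);
                out[width + x + i] =
                    (uint16_t)(clamp((y >> 9) + rv,  31) << 11 |
                               clamp((y >> 8) + guv, 63) << 5  |
                               clamp((y >> 9) + bu,  31));
            }
        }
    }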
diff --git a/firmware/target/arm/s5l8702/ipod6g/lcd-asm-ipod6g.S b/firmware/target/arm/s5l8702/ipod6g/lcd-asm-ipod6g.S
index ec8a24c4e5..6ee90098af 100644
--- a/firmware/target/arm/s5l8702/ipod6g/lcd-asm-ipod6g.S
+++ b/firmware/target/arm/s5l8702/ipod6g/lcd-asm-ipod6g.S
@@ -19,9 +19,42 @@
19  *
20  ****************************************************************************/
21
22/* Version history:
23 *
24 * SVN:
25 * - initial SVN version.
26 *
27 * ARMv4:
28 * - use all available registers to calculate four pixels within each
29 * loop iteration.
30 * - avoid LDR interlocks.
31 *
32 * ARMv5TE:
33 * - use ARMv5TE+ 1-cycle multiply-accumulate instructions.
34 *
35 * ARMv5TE_WST:
36 *   - use data tables (256 bytes) for RGB565 saturation.
37 *
38 * All versions are based on current SVN algorithm (round->scale->add)
39 * using the same coefficients, so output results are identical.
40 *
41 * TODO?: SVN coefficients are a very nice approximation for operations
42 * with shift+add instructions. When 16x16+32 MLA instructions are used,
43 * NBR and COEF_N could probably be adjusted to slightly increase accuracy.
44 */
45#define VERSION_SVN 0
46#define VERSION_ARMV4 1
47#define VERSION_ARMV5TE 2
48#define VERSION_ARMV5TE_WST 3
49
50#define YUV2RGB_VERSION VERSION_ARMV5TE_WST
51
52
53#define ASM
54#include "config.h"
55#include "cpu.h"
56
57#if (YUV2RGB_VERSION == VERSION_SVN)
58    .section    .icode, "ax", %progbits
59
60
@@ -239,3 +272,742 @@ lcd_write_yuv420_lines:
272
273    .ltorg
274.size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
275
276
277#elif (YUV2RGB_VERSION == VERSION_ARMV4)
278/****************************************************************************
279 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
280 * uint16_t* out,
281 * int width,
282 * int stride);
283 *
284 * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is:
285 * |R| |1.164 0.000 1.596| |Y' - 16|
286 * |G| = |1.164 -0.391 -0.813| |Pb - 128|
287 * |B| |1.164 2.018 0.000| |Pr - 128|
288 *
289 * Scaled, normalized, rounded and tweaked to yield RGB 565:
290 * |R| |74 0 101| |Y' - 16| >> 9
291 * |G| = |74 -24 -51| |Cb - 128| >> 8
292 * |B| |74 128 0| |Cr - 128| >> 9
293 *
294 * Converts two lines from YUV420 to RGB565; within each iteration, four
295 * pixels (2 per line) are calculated and written to destination buffer.
296 */
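/* The constant multiplies below are factorized into shifts and adds:
 *   74*(Y'-16)     -> computed at half scale as 37*x, 37 = 32 + 4 + 1
 *   101*Cr"        -> 64 + 32 + 4 + 1
 *   51*Cr" + 24*Cb" -> 3*17*Cr" + (16 + 8)*Cb"
 *   128*Cb"        -> folded into bu = (Cb" + 2) >> 2
 */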
297 .section .icode, "ax", %progbits
298
299 .align 2
300 .global lcd_write_yuv420_lines
301 .type lcd_write_yuv420_lines, %function
302
303lcd_write_yuv420_lines:
304 /* r0 = src = yuv_src */
305 /* r1 = dst = out */
306 /* r2 = width */
307 /* r3 = stride */
308 stmfd sp!, {r4-r11,lr} /* save non-scratch */
309 ldmia r0, {r10-r12} /* r10 = yuv_src[0] = Y'_p */
310 /* r11 = yuv_src[1] = Cb_p */
311 /* r12 = yuv_src[2] = Cr_p */
312 mov r9, r2, lsl #1 /* r9 = 2*width (loop count) */
313 str r9, [sp, #-4]! /* [--sp] = 2*width (constant) */
314 add r8, r10, r3 /* r8 = Y'_p + stride = Y'stride_p */
315 mov lr, r1 /* RGB565 data destination buffer */
316
31710: /* loop start */
318 ldrb r0, [r11], #1 /* r0 = *Cb_p++ */
319 ldrb r1, [r12], #1 /* r1 = *Cr_p++ */
320 ldrb r3, [r8], #1 /* r3 = Y'3 */
321 ldrb r4, [r8], #1 /* r4 = Y'4 */
322
323 sub r0, r0, #128 /* r0 = Cb-128 */
324 sub r1, r1, #128 /* r1 = Cr-128 */
325
326 add r2, r1, r1, asl #1 /* r2 = Cr*51 + Cb*24 */
327 add r2, r2, r2, asl #4
328 add r2, r2, r0, asl #3
329 add r2, r2, r0, asl #4
330
331 add r5, r1, r1, asl #2 /* r1 = Cr*101 */
332 add r5, r5, r1, asl #5
333 add r1, r5, r1, asl #6
334
335 add r1, r1, #256 /* r1 = rv = (r1 + 256) >> 9 */
336 mov r1, r1, asr #9
337 rsb r2, r2, #128 /* r2 = guv = (-r2 + 128) >> 8 */
338 mov r2, r2, asr #8
339 add r0, r0, #2 /* r0 = bu = (Cb*128 + 256) >> 9 */
340 mov r0, r0, asr #2
341
342 /* pixel_3 */
343 sub r3, r3, #16 /* r3 = (Y'-16) * (74/2) */
344 add r7, r3, r3, asl #2
345 add r3, r7, r3, asl #5
346
347 add r6, r1, r3, asr #8 /* r6 = r = (Y >> 9) + rv */
348 add r7, r2, r3, asr #7 /* r7 = g = (Y >> 8) + guv */
349 add r5, r0, r3, asr #8 /* r5 = b = (Y >> 9) + bu */
350
351 orr r3, r6, r5 /* check if clamping is needed... */
352 orr r3, r3, r7, asr #1 /* ...at all */
353 cmp r3, #31
354 bls 15f /* no clamp */
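     /* (mvnhi/andhi saturate: if x < 0, x ASR #31 is all ones and MVN
        gives 0; if x > max, x ASR #31 is 0 and MVN gives all ones,
        which the AND then masks down to max) */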
355 cmp r6, #31 /* clamp r */
356 mvnhi r6, r6, asr #31
357 andhi r6, r6, #31
358 cmp r7, #63 /* clamp g */
359 mvnhi r7, r7, asr #31
360 andhi r7, r7, #63
361 cmp r5, #31 /* clamp b */
362 mvnhi r5, r5, asr #31
363 andhi r5, r5, #31
36415: /* no clamp */
365
366 /* calculate pixel_3 and save to r5 for later pixel packing */
367 orr r5, r5, r7, lsl #5 /* pixel_3 = r<<11 | g<<5 | b */
368 orr r5, r5, r6, lsl #11 /* r5 = pixel_3 */
369
370 /* pixel_4 */
371 sub r4, r4, #16 /* r4 = (Y'-16) * (74/2) */
372 add r7, r4, r4, asl #2
373 add r4, r7, r4, asl #5
374
375 add r6, r1, r4, asr #8 /* r6 = r = (Y >> 9) + rv */
376 add r7, r2, r4, asr #7 /* r7 = g = (Y >> 8) + guv */
377 add r4, r0, r4, asr #8 /* r4 = b = (Y >> 9) + bu */
378
379 orr r3, r6, r4 /* check if clamping is needed... */
380 orr r3, r3, r7, asr #1 /* ...at all */
381 cmp r3, #31
382 bls 15f /* no clamp */
383 cmp r6, #31 /* clamp r */
384 mvnhi r6, r6, asr #31
385 andhi r6, r6, #31
386 cmp r7, #63 /* clamp g */
387 mvnhi r7, r7, asr #31
388 andhi r7, r7, #63
389 cmp r4, #31 /* clamp b */
390 mvnhi r4, r4, asr #31
391 andhi r4, r4, #31
39215: /* no clamp */
393
394 /* calculate pixel_4 and pack with pixel_3 before writing */
395 orr r4, r4, r7, lsl #5 /* pixel_4 = r<<11 | g<<5 | b */
396 orr r4, r4, r6, lsl #11 /* r4 = pixel_4 */
397 orr r5, r5, r4, lsl #16 /* r5 = pixel_4<<16 | pixel_3 */
398
399 ldr r7, [sp] /* r7 = 2*width */
400 ldrb r3, [r10], #1 /* r3 = Y'1 */
401 ldrb r4, [r10], #1 /* r4 = Y'2 */
402
403 str r5, [lr, r7] /* write pixel_3 and pixel_4 */
404
405 /* pixel_1 */
406 sub r3, r3, #16 /* r3 = (Y'-16) * (74/2) */
407 add r7, r3, r3, asl #2
408 add r3, r7, r3, asl #5
409
410 add r6, r1, r3, asr #8 /* r6 = r = (Y >> 9) + rv */
411 add r7, r2, r3, asr #7 /* r7 = g = (Y >> 8) + guv */
412 add r5, r0, r3, asr #8 /* r5 = b = (Y >> 9) + bu */
413
414 orr r3, r6, r5 /* check if clamping is needed... */
415 orr r3, r3, r7, asr #1 /* ...at all */
416 cmp r3, #31
417 bls 15f /* no clamp */
418 cmp r6, #31 /* clamp r */
419 mvnhi r6, r6, asr #31
420 andhi r6, r6, #31
421 cmp r7, #63 /* clamp g */
422 mvnhi r7, r7, asr #31
423 andhi r7, r7, #63
424 cmp r5, #31 /* clamp b */
425 mvnhi r5, r5, asr #31
426 andhi r5, r5, #31
42715: /* no clamp */
428
429 /* calculate pixel_1 and save to r5 for later pixel packing */
430 orr r5, r5, r7, lsl #5 /* pixel_1 = r<<11 | g<<5 | b */
431 orr r5, r5, r6, lsl #11 /* r5 = pixel_1 */
432
433 /* pixel_2 */
434 sub r4, r4, #16 /* r4 = (Y'-16) * (74/2) */
435 add r7, r4, r4, asl #2
436 add r4, r7, r4, asl #5
437
438 add r6, r1, r4, asr #8 /* r6 = r = (Y >> 9) + rv */
439 add r7, r2, r4, asr #7 /* r7 = g = (Y >> 8) + guv */
440 add r4, r0, r4, asr #8 /* r4 = b = (Y >> 9) + bu */
441
442 orr r3, r6, r4 /* check if clamping is needed... */
443 orr r3, r3, r7, asr #1 /* ...at all */
444 cmp r3, #31
445 bls 15f /* no clamp */
446 cmp r6, #31 /* clamp r */
447 mvnhi r6, r6, asr #31
448 andhi r6, r6, #31
449 cmp r7, #63 /* clamp g */
450 mvnhi r7, r7, asr #31
451 andhi r7, r7, #63
452 cmp r4, #31 /* clamp b */
453 mvnhi r4, r4, asr #31
454 andhi r4, r4, #31
45515: /* no clamp */
456
457 /* calculate pixel_2 and pack with pixel_1 before writing */
458 orr r4, r4, r7, lsl #5 /* pixel_2 = r<<11 | g<<5 | b */
459 orr r4, r4, r6, lsl #11 /* r4 = pixel_2 */
460 orr r5, r5, r4, lsl #16 /* r5 = pixel_2<<16 | pixel_1 */
461
462 str r5, [lr], #4 /* write pixel_1 and pixel_2 */
463
464 subs r9, r9, #4 /* check for loop end */
465 bgt 10b /* back to beginning */
466
467 /* loop end */
468 add sp, sp, #4 /* deallocate stack */
469 ldmpc regs=r4-r11 /* restore registers */
470
471 .ltorg
472 .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
473
474
475#elif (YUV2RGB_VERSION == VERSION_ARMV5TE)
476/****************************************************************************
477 * Decoding Y'CbCr components to R'G'B' in [0, +1] (see ColorFAQ):
478 * |R| |0.00456621 0 0.00625893| |Y' - 16|
479 * |G| = |0.00456621 -0.00153632 -0.00318811| |Pb - 128|
480 * |B| |0.00456621 0.00791071 0 | |Pr - 128|
481 *
482 * Scaled, normalized, rounded and tweaked to yield RGB 565:
483 * |R| |74 0 101| |Y' - 16| >> 9
484 * |G| = |74 -24 -51| |Cb - 128| >> 8
485 * |B| |74 128 0| |Cr - 128| >> 9
486 */
487#define NBR 14 /* 14-bit resolution (SVN) */
488#define COEF_C0 74
489#define COEF_C1 101
490#define COEF_C2 -24
491#define COEF_C3 -51
492#define COEF_C4 128
493#define C4_IS_POW2
494
495/* constant for rounding an NBR-bit number before down-scaling it to RS bits */
496#define ROUND(RS) (1 << (NBR - RS - 1))
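/* e.g. ROUND(5) = 1 << 8 = 256 and ROUND(6) = 1 << 7 = 128, the same
   rounding constants the SVN version adds before its >> 9 and >> 8 */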
497
498/* packed 16-bit coefficients */
499#define COEF_C4_C1 ((COEF_C4 << 16) | (COEF_C1 & 0xffff))
500#define COEF_2C3_2C2 ((COEF_C3 << 17) | ((COEF_C2 << 1) & 0xffff))
501/* 32-bit MLA constants */
502#define CONST_MLA_Y (-16 * COEF_C0)
503
504/****************************************************************************
505 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
506 * uint16_t* out,
507 * int width,
508 * int stride);
509 *
510 * Converts two lines from YUV420 to RGB565; within each iteration, four
511 * pixels (2 per line) are calculated and written to destination buffer.
512 *
513 * - use ARMv5TE+ 1-cycle multiply-accumulate instructions.
514 */
515 .section .icode, "ax", %progbits
516
517 .align 2
518 .global lcd_write_yuv420_lines
519 .type lcd_write_yuv420_lines, %function
520
521lcd_write_yuv420_lines:
522 @ r0 = src = yuv_src
523 @ r1 = out = dst_p
524 @ r2 = width
525 @ r3 = stride
526 stmfd sp!, {r4-r11,lr} @ save non-scratch
527 ldmia r0, {r10-r12} @ r10 = yuv_src[0] = Y'_p
528 @ r11 = yuv_src[1] = Cb_p
529 @ r12 = yuv_src[2] = Cr_p
530 adr r0, const_data @ load constants
531 ldmia r0, {r5-r8} @ r5 = COEF_C4_C1
532 @ r6 = COEF_2C3_2C2
533 @ r7 = COEF_C0
534 @ r8 = CONST_MLA_Y
535 sub r4, r12, r11 @ r4 = Cr_p-Cb_p
536 mov r9, r2, asl #1 @ r9 = 2*width
537 stmfd sp!, {r4-r6,r9} @ SP -> Cr_p-Cb_p
538 @ COEF_C4_C1
539 @ COEF_2C3_2C2
540 @ 2*width
541 add r12, r10, r3 @ r12 = Y'_p + stride = Y'stride_p
542 mov lr, r1 @ RGB565 data destination buffer
543 orr r9, r7, r2, lsl #15 @ loop_count = width/2;
544 @ r9 = loop_count<<16 | COEF_C0
545 sub r9, r9, #0x10000 @ loop_count--
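                                    @ (packing the count into the top half
                                    @ of r9 frees a register: smlabb reads
                                    @ only the bottom 16 bits, and the
                                    @ subs #0x10000 leaves them intact)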
546
54710: @ loop_start
548
549 @ register usage:
550 @ r8 = CONST_MLA_Y
551 @ r9 = loop count<<16 | COEF_C0
552 @ r10 = Y'_p
553 @ r11 = Cb_p
554 @ r12 = Y'stride_p
555 @ lr = dst_p
556 @ free: r0-r7
557
558 ldmia sp, {r2-r4} @ r2 = Cr_p-Cb_p
559 @ r3 = COEF_C4_C1
560 @ r4 = COEF_2C3_2C2
561 mov r5, #ROUND(5) @ r5 = round constant
562
563 ldrb r6, [r12], #1 @ r6 = Y'3
564 ldrb r7, [r12], #1 @ r7 = Y'4
565
566 ldrb r1, [r11, r2] @ r1 = Cr = *Cr_p++
567 ldrb r0, [r11], #1 @ r0 = Cb = *Cb_p++
568
569 /* calculate Y3 and Y4 */
570 smlabb r6, r6, r9, r8 @ r6 = Y3 = C0*Y'3 - C0*16
571 smlabb r7, r7, r9, r8 @ r7 = Y4 = C0*Y'4 - C0*16
572
573 /* calculate rv, guv, bu */
574 sub r1, r1, #128 @ r1 = Cr" = Cr-128
575 sub r0, r0, #128 @ r0 = Cb" = Cb-128
576
577    smlabt  r2, r1, r4, r5      @ r2 = guv" = Cr"*(2*C3) +
578    smlabb  r2, r0, r4, r2      @             Cb"*(2*C2) + round
579 smlabb r1, r1, r3, r5 @ r1 = rv" = Cr"*C1 + round
580 #ifdef C4_IS_POW2
581 add r0, r5, r0, asl #NBR-7 @ r0 = bu" = Cb"*C4 + round
582 #else
583 smlabt r0, r0, r3, r5 @ r0 = bu" = Cb"*C4 + round
584 #endif
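                                    @ (C4 = 128 = 2^7, so when C4_IS_POW2 is
                                    @ defined the multiply by C4 reduces to
                                    @ a single shifted ADD)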
585
586 /* scale rv",guv",bu" */
587 mov r2, r2, asr #NBR-5 @ r2 = guv = guv" >> scale
588 mov r1, r1, asr #NBR-5 @ r1 = rv = rv" >> scale
589 mov r0, r0, asr #NBR-5 @ r0 = bu = bu" >> scale
590
591 @ register usage:
592 @ r8-r12,lr: pointers, counters
593 @ r0,r1,r2 = bu,rv,guv (rounded and scaled to RGB565)
594 @ r6,r7 = Y'3,Y'4
595 @ free: r3-r5
596
597 /* pixel_3 */
598 add r5, r1, r6, asr #NBR-5 @ r5 = r = (Y3 >> scale) + rv
599 add r4, r2, r6, asr #NBR-6 @ r4 = g = (Y3 >> scale) + guv
600 add r3, r0, r6, asr #NBR-5 @ r3 = b = (Y3 >> scale) + bu
601
602 orr r6, r5, r3 @ check if clamping is needed...
603 orr r6, r6, r4, asr #1 @ ...at all
604 cmp r6, #31
605 bls 15f @ no clamp
606 cmp r5, #31 @ clamp r
607 mvnhi r5, r5, asr #31
608 andhi r5, r5, #31
609 cmp r4, #63 @ clamp g
610 mvnhi r4, r4, asr #31
611 andhi r4, r4, #63
612 cmp r3, #31 @ clamp b
613 mvnhi r3, r3, asr #31
614 andhi r3, r3, #31
61515: @ no clamp
616
617 /* calculate pixel_3 and save to r3 for later pixel packing */
618 orr r3, r3, r4, lsl #5 @ r3 = pixel_3 = r<<11 | g<<5 | b
619 orr r3, r3, r5, lsl #11
620
621 /* pixel_4 */
622 add r5, r1, r7, asr #NBR-5 @ r5 = r = (Y4 >> scale) + rv
623 add r4, r2, r7, asr #NBR-6 @ r4 = g = (Y4 >> scale) + guv
624 add r7, r0, r7, asr #NBR-5 @ r7 = b = (Y4 >> scale) + bu
625
626 orr r6, r5, r7 @ check if clamping is needed...
627 orr r6, r6, r4, asr #1 @ ...at all
628 cmp r6, #31
629 bls 15f @ no clamp
630 cmp r5, #31 @ clamp r
631 mvnhi r5, r5, asr #31
632 andhi r5, r5, #31
633 cmp r4, #63 @ clamp g
634 mvnhi r4, r4, asr #31
635 andhi r4, r4, #63
636 cmp r7, #31 @ clamp b
637 mvnhi r7, r7, asr #31
638 andhi r7, r7, #31
63915: @ no clamp
640
641 /* calculate pixel_4 and pack with pixel_3 before writing */
642 orr r7, r7, r4, lsl #5 @ r7 = pixel_4 = r<<11 | g<<5 | b
643 orr r7, r7, r5, lsl #11
644 orr r3, r3, r7, lsl #16 @ r3 = pixel_4<<16 | pixel_3
645
646 /* avoid interlocks when writing pixel_3 and pixel_4 */
647 ldr r5, [sp, #12] @ r5 = 2*width
648
649 ldrb r6, [r10], #1 @ r6 = Y'1
650 ldrb r7, [r10], #1 @ r7 = Y'2
651
652 /* write pixel_3 and pixel_4 */
653 str r3, [lr, r5] @ [dst_p + 2*width] = r3
654
655 @ register usage:
656 @ r8-r12,lr: pointers, counters
657 @ r0,r1,r2 = bu,rv,guv (rounded and scaled to RGB565)
658 @ r6,r7 = Y'1,Y'2
659 @ free: r3-r5
660
661 /* calculate Y1 and Y2 */
662 smlabb r6, r6, r9, r8 @ r6 = Y1 = C0*Y'1 - C0*16
663 smlabb r7, r7, r9, r8 @ r7 = Y2 = C0*Y'2 - C0*16
664
665 /* pixel_1 */
666 add r5, r1, r6, asr #NBR-5 @ r5 = r = (Y1 >> scale) + rv
667 add r4, r2, r6, asr #NBR-6 @ r4 = g = (Y1 >> scale) + guv
668 add r3, r0, r6, asr #NBR-5 @ r3 = b = (Y1 >> scale) + bu
669
670 orr r6, r5, r3 @ check if clamping is needed...
671 orr r6, r6, r4, asr #1 @ ...at all
672 cmp r6, #31
673 bls 15f @ no clamp
674 cmp r5, #31 @ clamp r
675 mvnhi r5, r5, asr #31
676 andhi r5, r5, #31
677 cmp r4, #63 @ clamp g
678 mvnhi r4, r4, asr #31
679 andhi r4, r4, #63
680 cmp r3, #31 @ clamp b
681 mvnhi r3, r3, asr #31
682 andhi r3, r3, #31
68315: @ no clamp
684
685 /* calculate pixel_1 and save to r3 for later pixel packing */
686 orr r3, r3, r4, lsl #5 @ r3 = pixel_1 = r<<11 | g<<5 | b
687 orr r3, r3, r5, lsl #11
688
689 /* pixel_2 */
690 add r5, r1, r7, asr #NBR-5 @ r5 = r = (Y2 >> scale) + rv
691 add r4, r2, r7, asr #NBR-6 @ r4 = g = (Y2 >> scale) + guv
692 add r7, r0, r7, asr #NBR-5 @ r7 = b = (Y2 >> scale) + bu
693
694 orr r6, r5, r7 @ check if clamping is needed...
695 orr r6, r6, r4, asr #1 @ ...at all
696 cmp r6, #31
697 bls 15f @ no clamp
698 cmp r5, #31 @ clamp r
699 mvnhi r5, r5, asr #31
700 andhi r5, r5, #31
701 cmp r4, #63 @ clamp g
702 mvnhi r4, r4, asr #31
703 andhi r4, r4, #63
704 cmp r7, #31 @ clamp b
705 mvnhi r7, r7, asr #31
706 andhi r7, r7, #31
70715: @ no clamp
708
709 /* calculate pixel_2 and pack with pixel_1 before writing */
710 orr r7, r7, r4, lsl #5 @ r7 = pixel_2 = r<<11 | g<<5 | b
711 orr r7, r7, r5, lsl #11
712 orr r3, r3, r7, lsl #16 @ r3 = pixel_2 << 16 | pixel_1
713
714 str r3, [lr], #4 @ write pixel_1 and pixel_2
715
716 /* check for loop end */
717 subs r9, r9, #0x10000 @ loop_count--
718 bge 10b @ back to beginning
719
720 /* bye */
721 add sp, sp, #16
722 ldmpc regs=r4-r11 @ restore registers
723
724 .ltorg
725 .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
726
727/* data */
728 .align 2
729const_data:
730 .word COEF_C4_C1
731 .word COEF_2C3_2C2
732 .word COEF_C0
733 .word CONST_MLA_Y
734
735 .size const_data, .-const_data
736
737
738#else /* YUV2RGB_VERSION == VERSION_ARMV5TE_WST */
739/****************************************************************************
740 * Decoding Y'CbCr components to R'G'B' in [0, +1] (see ColorFAQ):
741 * |R| |0.00456621 0 0.00625893| |Y' - 16|
742 * |G| = |0.00456621 -0.00153632 -0.00318811| |Pb - 128|
743 * |B| |0.00456621 0.00791071 0 | |Pr - 128|
744 *
745 * Scaled, normalized, rounded and tweaked to yield RGB 565:
746 * |R| |74 0 101| |Y' - 16| >> 9
747 * |G| = |74 -24 -51| |Cb - 128| >> 8
748 * |B| |74 128 0| |Cr - 128| >> 9
749 */
750#define NBR 14 /* 14-bit resolution (SVN) */
751#define COEF_C0 74
752#define COEF_C1 101
753#define COEF_C2 -24
754#define COEF_C3 -51
755#define COEF_C4 128
756#define C4_IS_POW2
757
758/* packed 16-bit coefficients */
759#define COEF_C4_C1 ((COEF_C4 << 16) | (COEF_C1 & 0xffff))
760#define COEF_C3_C2 ((COEF_C3 << 16) | (COEF_C2 & 0xffff))
761
762/* constant for rounding an NBR-bit number before down-scaling it to RS bits */
763#define ROUND(RS) (1 << (NBR - RS - 1))
764
765/* 32-bit MLA constants */
766#define CONST_MLA_Y (-16 * COEF_C0)
767#define CONST_MLA_RV ((-128 * COEF_C1) + ROUND(5))
768#define CONST_MLA_BU ((-128 * COEF_C4) + ROUND(5))
769/* trick to save the register needed for table_sat6 reference:
770 add table_sat6-table_sat5 offset (conveniently scaled) to guv MLA */
771#define CONST_MLA_GUV (-128 * (COEF_C2 + COEF_C3) + ROUND(6) + \
772 ((table_sat6 - table_sat5) << (NBR - 6)))
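/* the offset is pre-scaled by NBR-6 bits so that the later "asr #NBR-6"
   used to build guv' shifts it back down to table_sat6 - table_sat5;
   guv-indexed loads then address table_sat6 while rv/bu loads address
   table_sat5 from the same base register */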
773
774/****************************************************************************
775 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
776 * uint16_t* out,
777 * int width,
778 * int stride);
779 *
780 * Converts two lines from YUV420 to RGB565; within each iteration, four
781 * pixels (2 per line) are calculated and written to destination buffer.
782 *
783 * - use ARMv5TE+ 1-cycle multiply-accumulate instructions.
784 * - use data tables (256 bytes) for RGB565 saturation.
785 */
786 .section .icode, "ax", %progbits
787
788 .align 2
789 .global lcd_write_yuv420_lines
790 .type lcd_write_yuv420_lines, %function
791
792lcd_write_yuv420_lines:
793 @ r0 = src = yuv_src
794 @ r1 = out = dst1_p
795 @ r2 = width
796 @ r3 = stride
797 stmfd sp!, {r4-r11,lr} @ save non-scratch
798 ldmia r0, {r10-r12} @ r10 = yuv_src[0] = Y'_p
799 @ r11 = yuv_src[1] = Cb_p
800 @ r12 = yuv_src[2] = Cr_p
801 /* prepare data and fill stack */
802 adr r0, const_data @ load constants
803 ldmia r0, {r4-r9,lr} @ r4 = COEF_C0
804 @ r5 = CONST_MLA_GUV
805 @ r6 = COEF_C3_C2
806 @ r7 = CONST_MLA_BU
807 @ r8 = COEF_C4_C1
808 @ r9 = CONST_MLA_RV
809 @ lr = table_sat5
810 sub r0, r12, r11 @ r0 = Cr_p-Cb_p
811 #define STACK_SZ 28
812 stmfd sp!, {r0,r5-r9,lr} @ SP -> Cr_p-Cb_p
813 @ CONST_MLA_GUV
814 @ COEF_C3_C2
815 @ CONST_MLA_BU
816 @ COEF_C4_C1
817 @ CONST_MLA_RV
818 @ table_sat5
819 mov r8, r4, lsl #4 @
820 rsb r8, #0 @ r8 = -16*COEF_C0 = CONST_MLA_Y
821 mov lr, r1 @ RGB565 data destination buffer
822 add r9, lr, r2, asl #1 @ r9 = out + 2*width = dst2_p
823 add r12, r3, r10 @ r12 = Y'_p + stride
824 orr r7, r4, r2, lsl #15 @ loop_count = width/2;
825 @ r7 = loop_count<<16 | COEF_C0
826 sub r7, r7, #0x10000 @ loop_count--
827
828 /* align loop code to minimize occupied lines, execution
829 time per loop is optimized ~10% on ARM926EJ-S */
830 .align CACHEALIGN_BITS
831loop_start:
832
833 @ register usage:
834 @ r7 = loop count<<16 | COEF_C0
835 @ r8 = CONST_MLA_Y
836 @ r9 = dst2_p
837 @ r10 = Y'_p
838 @ r11 = Cb_p
839 @ r12 = Y'stride_p
840 @ lr = dst1_p
841 @ free: r0-r6
842
843 /* load constants from stack */
844 ldmia sp, {r1-r3,r6} @ r1 = Cr_p-Cb_p
845 @ r2 = CONST_MLA_GUV
846 @ r3 = COEF_C3_C2
847 @ r6 = CONST_MLA_BU
848
849 /* read Cr", Cb" */
850 ldrb r1, [r11, r1] @ r1 = Cr = *Cr_p++
851 ldrb r0, [r11], #1 @ r0 = Cb = *Cb_p++
852
853 /* load more constants (avoids r1 interlock) */
854 ldrd r4, [sp, #16] @ r4 = COEF_C4_C1
855 @ r5 = CONST_MLA_RV
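                                    @ (ldrd loads a doubleword into the
                                    @ consecutive pair r4,r5 in one access)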
856
857 /* calculate rv", guv", bu" */
858    smlabt  r2, r1, r3, r2      @ r2 = guv" = Cr*C3 + Cb*C2
859    smlabb  r2, r0, r3, r2      @             + CONST_MLA_GUV
860 smlabb r1, r1, r4, r5 @ r1 = rv" = Cr*C1 + CONST_MLA_RV
861 #ifdef C4_IS_POW2
862 add r0, r6, r0, asl #NBR-7 @ r0 = bu" = Cb*C4 + CONST_MLA_BU
863 #else
864 smlabt r0, r0, r4, r6 @ r0 = bu" = Cb*C4 + CONST_MLA_BU
865 #endif
866
867 ldr r4, [sp, #STACK_SZ-4] @ r4 = table_sat5
868
869 /* read Y'1 and Y'2 */
870 ldrb r5, [r10], #1 @ r5 = Y'1 = *Y'_p++
871 ldrb r6, [r10], #1 @ r6 = Y'2 = *Y'_p++
872
873 /* scale rv",guv",bu", adding sat5_p here saves instructions later */
874 add r1, r4, r1, asr #NBR-5 @ r1 = rv' = sat5_p + rv">>scale
875 add r2, r4, r2, asr #NBR-6 @ r2 = guv' = sat5_p + guv">>scale
876 add r0, r4, r0, asr #NBR-5 @ r0 = bu' = sat5_p + bu">>scale
877
878 @ register usage:
879 @ r7-r12,lr: pointers, counters, tables
880 @ r0,r1,r2 = (bu,rv,guv) rounded and RGB565 scaled
881 @ r5,r6 = Y'1,Y'2
882 @ free: r3,r4
883
884 /* calculate Y1 and Y2 */
885 smlabb r5, r5, r7, r8 @ r5 = Y1 = C0*Y'1 - 16*C0
886 smlabb r6, r6, r7, r8 @ r6 = Y2 = C0*Y'2 - 16*C0
887
888 /* pixel_1 */
889 ldrb r3, [r0, r5, asr #NBR-5] @ r3 = b = sat5[Y1>>scale + bu']
890 ldrb r4, [r2, r5, asr #NBR-6] @ r4 = g = sat6[Y1>>scale + guv']
891 ldrb r5, [r1, r5, asr #NBR-5] @ r5 = r = sat5[Y1>>scale + rv']
892
893 /* calculate pixel_1 */
894 orr r3, r3, r4, lsl #5 @ r3 = pixel_1 = g<<5 | b
895
896 /* pixel_2 (avoid r5 interlock) */
897 ldrb r4, [r0, r6, asr #NBR-5] @ r4 = b = sat5[Y2>>scale + bu']
898
899 /* calculate pixel_1 and save to r3 for later pixel packing */
900 orr r3, r3, r5, lsl #11 @ r3 = pixel_1 = r<<11 | g<<5 | b
901
902 /* pixel_2 */
903 ldrb r5, [r2, r6, asr #NBR-6] @ r5 = g = sat6[Y2>>scale + guv']
904 ldrb r6, [r1, r6, asr #NBR-5] @ r6 = r = sat5[Y2>>scale + rv']
905
906 /* calculate pixel_2 and pack with pixel_1 before writing */
907 orr r3, r3, r4, lsl #16 @ r3 = pixel_2<<16 | pixel_1
908 orr r3, r3, r5, lsl #21
909 orr r3, r3, r6, lsl #27
910
911 /* read Y'3 and Y'4 */
912 ldrb r5, [r12], #1 @ r5 = Y'3 = *Y'stride_p++
913 ldrb r6, [r12], #1 @ r6 = Y'4 = *Y'stride_p++
914
915 /* write pixel_1 and pixel_2 */
916    str     r3, [lr], #4        @ *dst1_p++ = r3
917
918 @ register usage:
919 @ r7-r12,lr: pointers, counters, tables
920 @ r0,r1,r2 = (bu,rv,guv) rounded and RGB565 scaled
921 @ r5,r6 = Y'3,Y'4
922 @ free: r3,r4
923
924 /* calculate Y3 and Y4 */
925 smlabb r5, r5, r7, r8 @ r5 = Y3 = C0*Y'3 - 16*C0
926 smlabb r6, r6, r7, r8 @ r6 = Y4 = C0*Y'4 - 16*C0
927
928 /* pixel_3 */
929 ldrb r3, [r0, r5, asr #NBR-5] @ r3 = b = sat5[Y3>>scale + bu']
930 ldrb r4, [r2, r5, asr #NBR-6] @ r4 = g = sat6[Y3>>scale + guv']
931 ldrb r5, [r1, r5, asr #NBR-5] @ r5 = r = sat5[Y3>>scale + rv']
932
933 /* calculate pixel_3 */
934 orr r3, r3, r4, lsl #5 @ r3 = pixel_3 = g<<5 | b
935
936 /* pixel_4 (avoid r5 interlock) */
937 ldrb r4, [r0, r6, asr #NBR-5] @ r4 = b = sat5[Y4>>scale + bu']
938
939 /* calculate pixel_3 and save to r3 for later pixel packing */
940 orr r3, r3, r5, lsl #11 @ r3 = pixel_3 = r<<11 | g<<5 | b
941
942 /* pixel_4 */
943 ldrb r5, [r2, r6, asr #NBR-6] @ r5 = g = sat6[Y4>>scale + guv']
944 ldrb r6, [r1, r6, asr #NBR-5] @ r6 = r = sat5[Y4>>scale + rv']
945
946 /* calculate pixel_4 and pack with pixel_3 before writing */
947 orr r3, r3, r4, lsl #16 @ r3 = pixel_4 << 16 | pixel_3
948 orr r3, r3, r5, lsl #21
949 orr r3, r3, r6, lsl #27
950
951 /* write pixel_3 and pixel_4 */
952    str     r3, [r9], #4        @ *dst2_p++ = r3
953
954 /* check for loop end */
955 subs r7, r7, #0x10000 @ loop_count--
956 bge loop_start @ back to beginning
957
958 /* bye */
959 add sp, sp, #STACK_SZ @ deallocate stack
960 ldmpc regs=r4-r11 @ restore registers
961
962 .ltorg
963 .size lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
964
965/* data */
966 .align 2
967const_data:
968 .word COEF_C0
969 .word CONST_MLA_GUV
970 .word COEF_C3_C2
971 .word CONST_MLA_BU
972 .word COEF_C4_C1
973 .word CONST_MLA_RV
974 .word table_sat5
975
976 .size const_data, .-const_data
977
978/* saturation tables */
979 /*.section .data*/
980 /* aligned to cache line size to minimize cache usage */
981 .align CACHEALIGN_BITS
982
983saturation_tables:
984 /* 5-bit saturation table [-36..0..+67], size=104 */
985 /* table_sat5[-36..-1] */
986 .byte 0, 0, 0, 0
987 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
988 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
989 table_sat5:
990 /* table_sat5[0..67] */
991 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
992 .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
993 .byte 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31
994 .byte 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31
995 .byte 31, 31, 31, 31
996
997 /* 6-bit saturation table [-44..0..+107], size=152 */
998 /* table_sat6[-44..-1] */
999 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1000 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1001 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1002 table_sat6:
1003 /* table_sat6[0..107] */
1004 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1005 .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1006 .byte 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
1007    .byte  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
1008 .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63
1009 .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63
1010 .byte 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63
1011
1012 .size saturation_tables, .-saturation_tables
1013#endif /* YUV2RGB_VERSION */
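
As a footnote, the saturation tables above are pure clamp ramps; a small
host-side C helper like the following sketch (hypothetical, not part of the
patch) reproduces their contents and sizes (104 + 152 = 256 bytes):

    #include <stdio.h>

    /* print a clamp table mapping [lo..hi] onto [0..max] */
    static void emit_table(const char *name, int lo, int hi, int max)
    {
        printf("%s: /* [%d..%d] -> 0..%d, %d bytes */\n",
               name, lo, hi, max, hi - lo + 1);
        for (int i = lo; i <= hi; i++) {
            int v = i < 0 ? 0 : (i > max ? max : i);
            printf("%d%c", v, i == hi ? '\n' : ',');
        }
    }

    int main(void)
    {
        emit_table("table_sat5", -36, 67, 31);    /* 5-bit, 104 bytes */
        emit_table("table_sat6", -44, 107, 63);   /* 6-bit, 152 bytes */
        return 0;
    }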