summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Mahone <andrew.mahone@gmail.com>2009-06-27 09:21:22 +0000
committerAndrew Mahone <andrew.mahone@gmail.com>2009-06-27 09:21:22 +0000
commit3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6 (patch)
tree9cc77098cf188b151ef905068da6b1ae6e982df3
parent07a55a747d161f164c4cb1c73e2d697a84e4e5de (diff)
downloadrockbox-3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6.tar.gz
rockbox-3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6.zip
ARM assembly 8-point IDCT, both passes. No ARMv5/6 optimizations yet, aside from usat for final output.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21526 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/recorder/jpeg_idct_arm.S265
-rw-r--r--apps/recorder/jpeg_load.c17
2 files changed, 275 insertions, 7 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index b9c94e5639..01b08c4b5a 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -35,6 +35,10 @@
35 .type jpeg_idct4v, %function 35 .type jpeg_idct4v, %function
36 .global jpeg_idct4h 36 .global jpeg_idct4h
37 .type jpeg_idct4h, %function 37 .type jpeg_idct4h, %function
38 .global jpeg_idct8v
39 .type jpeg_idct8v, %function
40 .global jpeg_idct8h
41 .type jpeg_idct8h, %function
38 42
39jpeg_idct1h: 43jpeg_idct1h:
40/* In the common case of one pass through the loop, the extra add should be 44/* In the common case of one pass through the loop, the extra add should be
@@ -414,3 +418,264 @@ jpeg_idct4h:
414 ldmia sp!, { r4-r9, pc } 418 ldmia sp!, { r4-r9, pc }
415#endif 419#endif
416 .size jpeg_idct4h, .-jpeg_idct4h 420 .size jpeg_idct4h, .-jpeg_idct4h
421
/* jpeg_idct8v: vertical-pass 8-point IDCT.
 * C prototype (jpeg_load.c): void jpeg_idct8v(int16_t *ws, int16_t *end);
 * under the ARM procedure call standard that puts ws in r0 and end in r1.
 * Each loop iteration loads eight 16-bit coefficients d0..d7 (16 bytes) from
 * r0 and stores eight 16-bit results at r2, r2+16, ..., r2+112, where r2
 * starts at ws + 128 bytes and advances by 2 bytes per iteration. The loop
 * repeats while r0 < r1 (unsigned compare).
 * The register comments assume the even-indexed coefficient sits in the low
 * halfword of each loaded word (little-endian layout).
 */
422jpeg_idct8v:
423 stmdb sp!, { r4-r11, lr } /* save callee-saved registers and return address */
424 add r2, r0, #128 /* r2 = output pointer = ws + 128 bytes */
4251:
426 ldmia r0!, { r4-r7 } /* r4..r7 = d0..d7 packed two per word, r0 += 16 */
427 mov r8, r4, lsl #16 /* r8 = d0 << 16 */
428 orrs r9, r6, r7 /* Z set if d4..d7 are all zero */
429 orreqs r9, r5, r4, lsr #16 /* if so, also test d1..d3 */
430 bne 2f /* some of d1..d7 nonzero: do the full transform */
431 mov r8, r8, asr #14 /* only d0 can be nonzero: every output = d0 << 2 */
432 strh r8, [r2]
433 strh r8, [r2, #16]
434 strh r8, [r2, #32]
435 strh r8, [r2, #48]
436 strh r8, [r2, #64]
437 strh r8, [r2, #80]
438 strh r8, [r2, #96]
439 strh r8, [r2, #112]
440 cmp r0, r1
441 add r2, r2, #2 /* next output halfword; add leaves the cmp flags intact */
442 bcc 1b /* loop while r0 < r1 (unsigned) */
443 ldmia sp!, { r4-r11, pc } /* restore registers and return */
4442:
445 ldr r14, =4433
446 ldr r12, =-15137
447 mov r10, r5, lsl #16
448 mov r11, r7, lsl #16
449 mov r10, r10, asr #16 /* r10 = z2 = d2 */
450 mov r11, r11, asr #16 /* r11 = z3 = d6 */
451 add r8, r8, #8192 /* + 8192 here becomes + 1024 after the asr #3 below */
452 add r9, r10, r11
453 mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024; 1024 is half of 1 << 11, rounding for the final asr #11 */
454 mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
455 ldr r14, =6270
456 mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
457 mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */
458 mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */
459 add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */
460 sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */
461 add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */
462 sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */
463 add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */
464 sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */
/* NOTE(review): no writeback, so the four words are parked in the 16 bytes
 * just below sp without reserving them. This relies on nothing else writing
 * below sp on this stack before the ldmdb further down - confirm for target. */
465 stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12 */
466 mov r4, r4, asr #16 /* r4 = tmp3 = d1 */
467 mov r5, r5, asr #16 /* r5 = tmp2 = d3 */
468 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
469 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
470 ldr r10, =9633
471 ldr r11, =-16069
472 add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */
473 add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */
474 add r9, r12, r14 /* r9 = z3 + z4 */
475 mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */
476 ldr r10, =-3196
477 mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */
478 ldr r11, =-7373
479 mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */
480 ldr r10, =2446
481 add r9, r4, r7 /* r9 = tmp0 + tmp3 */
482 mla r8, r11, r9, r12 /* r8 = z1 + z3, with z1 = (tmp0 + tmp3) * -7373 */
483 mla r9, r11, r9, r14 /* r9 = z1 + z4 */
484 ldr r11, =12299
485 mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
486 ldr r10, =-20995
487 mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
488 ldr r11, =25172
489 add r9, r5, r6 /* r9 = tmp1 + tmp2 */
490 mla r12, r10, r9, r12 /* r12 = z2 + z3, with z2 = (tmp1 + tmp2) * -20995 */
491 mla r14, r10, r9, r14 /* r14 = z2 + z4 */
492 ldr r10, =16819
493 mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
494 mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
495 ldmdb sp, { r8-r11 } /* reload tmp10 tmp13 tmp11 tmp12 saved below sp */
496 add r12, r8, r4 /* o0 = tmp10 + tmp3 */
497 sub r14, r8, r4 /* o7 = tmp10 - tmp3 */
498 add r8, r9, r7 /* o3 = tmp13 + tmp0 */
499 sub r9, r9, r7 /* o4 = tmp13 - tmp0 */
500 add r4, r10, r5 /* o1 = tmp11 + tmp2 */
501 sub r5, r10, r5 /* o6 = tmp11 - tmp2 */
502 add r10, r11, r6 /* o2 = tmp12 + tmp1 */
503 sub r11, r11, r6 /* o5 = tmp12 - tmp1 */
504 /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
505 mov r12, r12, asr #11 /* descale each of the eight outputs by 11 bits */
506 mov r4, r4, asr #11
507 mov r10, r10, asr #11
508 mov r8, r8, asr #11
509 mov r9, r9, asr #11
510 mov r11, r11, asr #11
511 mov r5, r5, asr #11
512 mov r14, r14, asr #11
513 strh r12, [r2]
514 strh r4, [r2, #16]
515 strh r10, [r2, #32]
516 strh r8, [r2, #48]
517 strh r9, [r2, #64]
518 strh r11, [r2, #80]
519 strh r5, [r2, #96]
520 strh r14, [r2, #112]
521 cmp r0, r1
522 add r2, r2, #2 /* next output halfword; add leaves the cmp flags intact */
523 bcc 1b /* loop while r0 < r1 (unsigned) */
524 ldmia sp!, { r4-r11, pc } /* restore registers and return */
525 .size jpeg_idct8v, .-jpeg_idct8v
526
/* jpeg_idct8h: horizontal-pass 8-point IDCT with 8-bit output.
 * C prototype (jpeg_load.c):
 *   void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
 * under the ARM procedure call standard that puts ws in r0, out in r1,
 * end in r2 and rowstep in r3.
 * Each loop iteration loads eight 16-bit coefficients d0..d7 (16 bytes) from
 * r0, computes eight results clamped to 0..255 and stores them as bytes at
 * r1, spaced 4 bytes apart when HAVE_LCD_COLOR is defined and 1 byte apart
 * otherwise. r1 then advances by r3 and the loop repeats while r0 < r2
 * (unsigned compare).
 * The register comments assume the even-indexed coefficient sits in the low
 * halfword of each loaded word (little-endian layout).
 */
527jpeg_idct8h:
528 stmdb sp!, { r4-r11, lr } /* save callee-saved registers and return address */
5291:
530 ldmia r0!, { r4-r7 } /* r4..r7 = d0..d7 packed two per word, r0 += 16 */
531 ldr r14, =4112 /* 4112 = (128 << 5) + 16; presumably +128 level shift plus rounding for the >> 5 descale */
532 mov r8, r4, lsl #16 /* r8 = d0 << 16 */
533 add r8, r8, r14, lsl #16 /* r8 = (d0 + 4112) << 16 */
534 orrs r9, r6, r7 /* Z set if d4..d7 are all zero */
535 orreqs r9, r5, r4, lsr #16 /* if so, also test d1..d3 */
536 bne 2f /* some of d1..d7 nonzero: do the full transform */
537 mov r8, r8, asr #21 /* only d0 can be nonzero: r8 = (d0 + 4112) >> 5 */
538 cmp r8, #255 /* unsigned compare: also catches negative values */
539 mvnhi r8, r8, asr #31 /* out of range: low byte becomes 0x00 if negative, 0xff if above 255 */
540#ifdef HAVE_LCD_COLOR
541 strb r8, [r1]
542 strb r8, [r1, #4]
543 strb r8, [r1, #8]
544 strb r8, [r1, #12]
545 strb r8, [r1, #16]
546 strb r8, [r1, #20]
547 strb r8, [r1, #24]
548 strb r8, [r1, #28]
549#else
550 strb r8, [r1]
551 strb r8, [r1, #1]
552 strb r8, [r1, #2]
553 strb r8, [r1, #3]
554 strb r8, [r1, #4]
555 strb r8, [r1, #5]
556 strb r8, [r1, #6]
557 strb r8, [r1, #7]
558#endif
559 add r1, r1, r3 /* advance the output pointer by rowstep */
560 cmp r0, r2
561 bcc 1b /* loop while r0 < r2 (unsigned) */
562 ldmia sp!, { r4-r11, pc } /* restore registers and return */
5632:
564 ldr r14, =4433
565 ldr r12, =-15137
566 mov r10, r5, lsl #16
567 mov r11, r7, lsl #16
568 mov r10, r10, asr #16 /* r10 = z2 = d2 */
569 mov r11, r11, asr #16 /* r11 = z3 = d6 */
570 add r9, r10, r11
571 mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
572 mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
573 ldr r14, =6270
574 mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
575 mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */
576 mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */
577 add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */
578 sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */
579 add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */
580 sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */
581 add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */
582 sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */
/* NOTE(review): no writeback, so the four words are parked in the 16 bytes
 * just below sp without reserving them. This relies on nothing else writing
 * below sp on this stack before the ldmdb further down - confirm for target. */
583 stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12 */
584 mov r4, r4, asr #16 /* r4 = tmp3 = d1 */
585 mov r5, r5, asr #16 /* r5 = tmp2 = d3 */
586 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
587 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
588 ldr r10, =9633
589 ldr r11, =-16069
590 add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */
591 add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */
592 add r9, r12, r14 /* r9 = z3 + z4 */
593 mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */
594 ldr r10, =-3196
595 mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */
596 ldr r11, =-7373
597 mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */
598 ldr r10, =2446
599 add r9, r4, r7 /* r9 = tmp0 + tmp3 */
600 mla r8, r11, r9, r12 /* r8 = z1 + z3, with z1 = (tmp0 + tmp3) * -7373 */
601 mla r9, r11, r9, r14 /* r9 = z1 + z4 */
602 ldr r11, =12299
603 mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
604 ldr r10, =-20995
605 mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
606 ldr r11, =25172
607 add r9, r5, r6 /* r9 = tmp1 + tmp2 */
608 mla r12, r10, r9, r12 /* r12 = z2 + z3, with z2 = (tmp1 + tmp2) * -20995 */
609 mla r14, r10, r9, r14 /* r14 = z2 + z4 */
610 ldr r10, =16819
611 mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
612 mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
613 ldmdb sp, { r8-r11 } /* reload tmp10 tmp13 tmp11 tmp12 saved below sp */
614 add r12, r8, r4 /* o0 = tmp10 + tmp3 */
615 sub r14, r8, r4 /* o7 = tmp10 - tmp3 */
616 add r8, r9, r7 /* o3 = tmp13 + tmp0 */
617 sub r9, r9, r7 /* o4 = tmp13 - tmp0 */
618 add r4, r10, r5 /* o1 = tmp11 + tmp2 */
619 sub r5, r10, r5 /* o6 = tmp11 - tmp2 */
620 add r10, r11, r6 /* o2 = tmp12 + tmp1 */
621 sub r11, r11, r6 /* o5 = tmp12 - tmp1 */
622 /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
623#if ARM_ARCH < 6
/* Pre-ARMv6: descale by 18 bits, then clamp to 0..255. The unsigned cmp also
 * catches negatives; mvnhi leaves a low byte of 0x00 (negative) or 0xff. */
624 mov r12, r12, asr #18
625 cmp r12, #255
626 mvnhi r12, r12, asr #31
627 mov r4, r4, asr #18
628 cmp r4, #255
629 mvnhi r4, r4, asr #31
630 mov r10, r10, asr #18
631 cmp r10, #255
632 mvnhi r10, r10, asr #31
633 mov r8, r8, asr #18
634 cmp r8, #255
635 mvnhi r8, r8, asr #31
636 mov r9, r9, asr #18
637 cmp r9, #255
638 mvnhi r9, r9, asr #31
639 mov r11, r11, asr #18
640 cmp r11, #255
641 mvnhi r11, r11, asr #31
642 mov r5, r5, asr #18
643 cmp r5, #255
644 mvnhi r5, r5, asr #31
645 mov r14, r14, asr #18
646 cmp r14, #255
647 mvnhi r14, r14, asr #31
648#else
/* ARMv6 and later: usat does the asr #18 and the saturation to 0..255 in one
 * instruction per output. */
649 usat r12, #8, r12, asr #18
650 usat r4, #8, r4, asr #18
651 usat r10, #8, r10, asr #18
652 usat r8, #8, r8, asr #18
653 usat r9, #8, r9, asr #18
654 usat r11, #8, r11, asr #18
655 usat r5, #8, r5, asr #18
656 usat r14, #8, r14, asr #18
657#endif
658#ifdef HAVE_LCD_COLOR
659 strb r12, [r1]
660 strb r4, [r1, #4]
661 strb r10, [r1, #8]
662 strb r8, [r1, #12]
663 strb r9, [r1, #16]
664 strb r11, [r1, #20]
665 strb r5, [r1, #24]
666 strb r14, [r1, #28]
667#else
668 strb r12, [r1]
669 strb r4, [r1, #1]
670 strb r10, [r1, #2]
671 strb r8, [r1, #3]
672 strb r9, [r1, #4]
673 strb r11, [r1, #5]
674 strb r5, [r1, #6]
675 strb r14, [r1, #7]
676#endif
677 add r1, r1, r3 /* advance the output pointer by rowstep */
678 cmp r0, r2
679 bcc 1b /* loop while r0 < r2 (unsigned) */
680 ldmia sp!, { r4-r11, pc } /* restore registers and return */
681 .size jpeg_idct8h, .-jpeg_idct8h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index fa2df5b993..5ffa4a54a0 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -382,13 +382,6 @@ static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowst
382 DS_OUT)); 382 DS_OUT));
383 } 383 }
384} 384}
385#else
386extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
387extern void jpeg_idct2v(int16_t *ws, int16_t *end);
388extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
389extern void jpeg_idct4v(int16_t *ws, int16_t *end);
390extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
391#endif
392 385
393/* vertical-pass 8-point IDCT */ 386/* vertical-pass 8-point IDCT */
394static void jpeg_idct8v(int16_t *ws, int16_t *end) 387static void jpeg_idct8v(int16_t *ws, int16_t *end)
@@ -599,6 +592,16 @@ static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowst
599 } 592 }
600} 593}
601 594
595#else
596extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
597extern void jpeg_idct2v(int16_t *ws, int16_t *end);
598extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
599extern void jpeg_idct4v(int16_t *ws, int16_t *end);
600extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
601extern void jpeg_idct8v(int16_t *ws, int16_t *end);
602extern void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
603#endif
604
602#ifdef HAVE_LCD_COLOR 605#ifdef HAVE_LCD_COLOR
603/* vertical-pass 16-point IDCT */ 606/* vertical-pass 16-point IDCT */
604static void jpeg_idct16v(int16_t *ws, int16_t *end) 607static void jpeg_idct16v(int16_t *ws, int16_t *end)