summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jens Arnold <amiconn@rockbox.org> 2008-12-09 23:20:59 +0000
committer Jens Arnold <amiconn@rockbox.org> 2008-12-09 23:20:59 +0000
commit a29b659758e3d15b11a22f3ae369a9240de182b5 (patch)
tree b4f1aa3c9cfa5ad4a2475a20774e761f837174a6
parent 781421afa2085b841b52f876d80f484f565c0755 (diff)
download rockbox-a29b659758e3d15b11a22f3ae369a9240de182b5.tar.gz
rockbox-a29b659758e3d15b11a22f3ae369a9240de182b5.zip
Assembler optimised mono predictor for ARM. Speedup for -c1000 mono is ~5% on PP, ~8% on Gigabeat S (less for higher compression levels). Also fix some overlooked comments in the stereo predictor.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19375 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/demac/libdemac/predictor-arm.S 175
-rw-r--r-- apps/codecs/demac/libdemac/predictor.c 2
2 files changed, 167 insertions, 10 deletions
diff --git a/apps/codecs/demac/libdemac/predictor-arm.S b/apps/codecs/demac/libdemac/predictor-arm.S
index 1ffba75318..f1d3bc3739 100644
--- a/apps/codecs/demac/libdemac/predictor-arm.S
+++ b/apps/codecs/demac/libdemac/predictor-arm.S
@@ -27,10 +27,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
27 27
28 .align 2 28 .align 2
29 29
30 .global predictor_decode_stereo
31 .type predictor_decode_stereo,%function
32
33
34/* NOTE: The following need to be kept in sync with parser.h */ 30/* NOTE: The following need to be kept in sync with parser.h */
35 31
36#define YDELAYA 200 32#define YDELAYA 200
@@ -90,6 +86,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
90#endif 86#endif
91.endm 87.endm
92 88
89 .global predictor_decode_stereo
90 .type predictor_decode_stereo,%function
91
93@ Register usage: 92@ Register usage:
94@ 93@
95@ r0-r11 - scratch 94@ r0-r11 - scratch
@@ -221,8 +220,8 @@ loop:
221 @ r2 contains decoded0 220 @ r2 contains decoded0
222 @ r3 contains *decoded0 221 @ r3 contains *decoded0
223 222
224 @ r6, r7, r8, r9, r11 contain p->YcoeffsB[0..4] 223 @ r5, r6, r7, r8, r9 contain p->YcoeffsB[0..4]
225 @ r5, r10 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB] 224 @ r10, r11 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB]
226 225
227 str r1, [r2], #4 @ *(decoded0++) := r1 (p->YfilterA) 226 str r1, [r2], #4 @ *(decoded0++) := r1 (p->YfilterA)
228 str r2, [sp] @ save decoded0 227 str r2, [sp] @ save decoded0
@@ -407,8 +406,8 @@ loop:
407 @ r2 contains decoded1 406 @ r2 contains decoded1
408 @ r3 contains *decoded1 407 @ r3 contains *decoded1
409 408
410 @ r6, r7, r8, r9, r11 contain p->XcoeffsB[0..4] 409 @ r5, r6, r7, r8, r9 contain p->XcoeffsB[0..4]
411 @ r5, r10 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB] 410 @ r10, r11 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB]
412 411
413 str r1, [r2], #4 @ *(decoded1++) := r1 (p->XfilterA) 412 str r1, [r2], #4 @ *(decoded1++) := r1 (p->XfilterA)
414 str r2, [sp, #4] @ save decoded1 413 str r2, [sp, #4] @ save decoded1
@@ -533,3 +532,163 @@ move_hist:
533 bne loop 532 bne loop
534 533
535 b done 534 b done
535 .size predictor_decode_stereo, .-predictor_decode_stereo
536
537 .global predictor_decode_mono
538 .type predictor_decode_mono,%function
539@ Replaces each value at decoded0 with the newly computed p->YfilterA, one sample per loop pass
540@ Register usage:
541@
542@ r0-r11 - scratch
543@ r12 - struct predictor_t* p
544@ r14 - int32_t* p->buf
545
546@ void predictor_decode_mono(struct predictor_t* p,
547@ int32_t* decoded0,
548@ int count)
549@ NOTE: count is only tested after a sample has been processed, so callers must pass count >= 1
550predictor_decode_mono:
551 stmdb sp!, {r1, r2, r4-r11, lr} @ save r4-r11, lr; r1, r2 are pushed so they can be kept on the stack
552
553 @ r1 (decoded0) is [sp]
554 @ r2 (count) is [sp, #4]
555
556 mov r12, r0 @ r12 := p
557 ldr r14, [r0] @ r14 := p->buf
558
559loopm:
560
561@@@@@@@@@@@@@@@@@@@@@@@@@@@ PREDICTOR
562
563 ldr r11, [r12, #YlastA] @ r11 := p->YlastA
564
565 add r2, r14, #YDELAYA-12 @ r2 := &p->buf[YDELAYA-3]
566 ldmia r2, {r2, r3, r10} @ r2 := p->buf[YDELAYA-3]
567 @ r3 := p->buf[YDELAYA-2]
568 @ r10 := p->buf[YDELAYA-1]
569
570 add r5, r12, #YcoeffsA @ r5 := &p->YcoeffsA[0] (r5 is left untouched until the stmia at 2:)
571 ldmia r5, {r6 - r9} @ r6 := p->YcoeffsA[0]
572 @ r7 := p->YcoeffsA[1]
573 @ r8 := p->YcoeffsA[2]
574 @ r9 := p->YcoeffsA[3]
575
576 subs r10, r11, r10 @ r10 := r11 - r10 (p->YlastA - p->buf[YDELAYA-1]); flags are used for SIGN(r10) below
577
578 STR2OFS r10, r11, r14, #YDELAYA-4
579 @ p->buf[YDELAYA-1] = r10
580 @ p->buf[YDELAYA] = r11
581
582 mul r0, r11, r6 @ r0 := p->buf[YDELAYA] * p->YcoeffsA[0]
583 mla r0, r10, r7, r0 @ r0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
584 mla r0, r3, r8, r0 @ r0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
585 mla r0, r2, r9, r0 @ r0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
586
587 @ flags were set above, in the subs instruction (the mul/mla in between carry no s suffix)
588 mvngt r10, #0 @ gt: r10 := ~0 (i.e. -1)
589 movlt r10, #1 @ r10 := SIGN(r10) (see .c for SIGN macro); lt: r10 := 1, a zero r10 is left as 0
590
591 cmp r11, #0
592 mvngt r11, #0 @ gt: r11 := ~0 (i.e. -1)
593 movlt r11, #1 @ r11 := SIGN(r11) (see .c for SIGN macro); lt: r11 := 1, a zero r11 is left as 0
594
595 STR2OFS r10, r11, r14, #YADAPTCOEFFSA-4
596 @ p->buf[YADAPTCOEFFSA-1] := r10
597 @ p->buf[YADAPTCOEFFSA] := r11
598
599 ldr r2, [sp] @ r2 := decoded0
600 ldr r4, [r12, #YfilterA] @ r4 := p->YfilterA
601 ldr r3, [r2] @ r3 := *decoded0
602 rsb r4, r4, r4, lsl #5 @ r4 := r4 * 32 - r4 ( == r4*31)
603 add r1, r3, r0, asr #10 @ r1 := r3 + (r0 >> 10)
604 str r1, [r12, #YlastA] @ p->YlastA := r1
605 add r1, r1, r4, asr #5 @ r1 := r1 + (r4 >> 5)
606 str r1, [r12, #YfilterA] @ p->YfilterA := r1
607
608 @ r1 contains p->YfilterA
609 @ r2 contains decoded0
610 @ r3 contains *decoded0
611
612 @ r6, r7, r8, r9 contain p->YcoeffsA[0..3]
613 @ r10, r11 contain p->buf[YADAPTCOEFFSA-1] and p->buf[YADAPTCOEFFSA]
614
615 str r1, [r2], #4 @ *(decoded0++) := r1 (p->YfilterA)
616 str r2, [sp] @ save decoded0
617 cmp r3, #0 @ r3 still holds the input value loaded before the store above
618 beq 3f @ *decoded0 == 0: p->YcoeffsA is left unchanged
619
620 LDR2OFS r2, r3, r14, #YADAPTCOEFFSA-12
621 @ r2 := p->buf[YADAPTCOEFFSA-3]
622 @ r3 := p->buf[YADAPTCOEFFSA-2]
623 blt 1f @ tests the cmp r3, #0 above; NOTE(review): relies on LDR2OFS not altering the flags - confirm against the macro
624
625 @ *decoded0 > 0
626
627 sub r6, r6, r11 @ r6 := p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
628 sub r7, r7, r10 @ r7 := p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
629 sub r9, r9, r2 @ r9 := p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
630 sub r8, r8, r3 @ r8 := p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
631
632 b 2f
633
6341: @ *decoded0 < 0
635
636 add r6, r6, r11 @ r6 := p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
637 add r7, r7, r10 @ r7 := p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
638 add r9, r9, r2 @ r9 := p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
639 add r8, r8, r3 @ r8 := p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
640
6412:
642 stmia r5, {r6 - r9} @ Save p->YcoeffsA
643
6443:
645
646@@@@@@@@@@@@@@@@@@@@@@@@@@@ COMMON
647
648 add r14, r14, #4 @ p->buf++
649
650 add r11, r12, #historybuffer @ r11 := &p->historybuffer[0]
651
652 sub r10, r14, #PREDICTOR_HISTORY_SIZE*4
653 @ r10 := p->buf - PREDICTOR_HISTORY_SIZE
654
655 ldr r0, [sp, #4] @ r0 := remaining count (loaded before the branch so both paths can use it)
656 cmp r10, r11
657 beq move_histm @ The history buffer is full, we need to do a memmove
658
659 @ Check loop count
660 subs r0, r0, #1
661 strne r0, [sp, #4] @ write the decremented count back only if another pass follows
662 bne loopm
663
664donem:
665 str r14, [r12] @ Save value of p->buf
666 add sp, sp, #8 @ Don't bother restoring r1, r2
667 ldmia sp!, {r4 - r11, pc} @ restore r4-r11 and return by loading the saved lr into pc
668
669move_histm:
670 @ dest = r11 (p->historybuffer)
671 @ src = r14 (p->buf)
672 @ n = 200 bytes (5 blocks of 40 bytes below)
673
674 ldmia r14!, {r0-r9} @ 40 bytes
675 stmia r11!, {r0-r9}
676 ldmia r14!, {r0-r9} @ 40 bytes
677 stmia r11!, {r0-r9}
678 ldmia r14!, {r0-r9} @ 40 bytes
679 stmia r11!, {r0-r9}
680 ldmia r14!, {r0-r9} @ 40 bytes
681 stmia r11!, {r0-r9}
682 ldmia r14!, {r0-r9} @ 40 bytes
683 stmia r11!, {r0-r9}
684
685 ldr r0, [sp, #4] @ reload count: r0 was clobbered by the copy above
686 add r14, r12, #historybuffer @ p->buf = &p->historybuffer[0]
687
688 @ Check loop count
689 subs r0, r0, #1
690 strne r0, [sp, #4] @ write the decremented count back only if another pass follows
691 bne loopm
692
693 b donem
694 .size predictor_decode_mono, .-predictor_decode_mono
diff --git a/apps/codecs/demac/libdemac/predictor.c b/apps/codecs/demac/libdemac/predictor.c
index 0d03d1d2fb..45912dddbd 100644
--- a/apps/codecs/demac/libdemac/predictor.c
+++ b/apps/codecs/demac/libdemac/predictor.c
@@ -209,9 +209,7 @@ void ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
209 } 209 }
210 } 210 }
211} 211}
212#endif
213 212
214#if !defined(CPU_COLDFIRE)
215void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p, 213void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
216 int32_t* decoded0, 214 int32_t* decoded0,
217 int count) 215 int count)