From 35024bd54e0e9a75b80ab102c44da4b4f369aec5 Mon Sep 17 00:00:00 2001 From: Andree Buschmann Date: Sat, 13 Feb 2010 22:01:24 +0000 Subject: Speed up atrac codec for ARM through simple loop unrolling. Saves 9 MHz on PP5022 (14% speed up). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24637 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libatrac/atrac3_arm.S | 127 ++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 20 deletions(-) diff --git a/apps/codecs/libatrac/atrac3_arm.S b/apps/codecs/libatrac/atrac3_arm.S index be8b2a0e0e..0908d582ed 100644 --- a/apps/codecs/libatrac/atrac3_arm.S +++ b/apps/codecs/libatrac/atrac3_arm.S @@ -100,38 +100,125 @@ atrac3_iqmf_dewindowing: /* r1 = input samples */ /* r2 = window coefficients */ /* r3 = counter */ - stmfd sp!, {r4-r10, lr} /* save non-scratch registers */ + stmfd sp!, {r4-r9, lr} /* save non-scratch registers */ .iqmf_dewindow_outer_loop: /* outer loop 0...counter-1 */ - - ldmia r2!, {r5, r6} /* load win[0..1] */ - ldmia r1!, {r7, r8} /* load in[0..1] */ - smull lr , r10, r5, r7 /* s1 = win[0] * in[0] */ - smull r12, r9 , r6, r8 /* s2 = win[1] * in[1] */ - - mov r4, #46 /* r4 = 46 */ -.iqmf_dewindow_inner_loop: /* inner loop i=2...48 */ - ldmia r2!, {r5, r6} /* load win[i...i+1] */ - ldmia r1!, {r7, r8} /* load in[i...i+1] */ - smlal lr , r10, r5, r7 /* s1 = win[i ] * in[i ] */ - smlal r12, r9 , r6, r8 /* s2 = win[i+1] * in[i+1] */ - - subs r4, r4, #2 /* inner loop -= 2*/ - bgt .iqmf_dewindow_inner_loop + /* 0.. 
7 */ + ldmia r2!, {r4, r5} /* load win[0..1] */ + ldmia r1!, {r6, r7} /* load in[0..1] */ + smull lr , r9, r4, r6 /* s1 = win[0] * in[0] */ + smull r12, r8, r5, r7 /* s2 = win[1] * in[1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + /* 8..15 */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + /* 16..23 */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia 
r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + /* 24..31 */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + /* 32..39 */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + /* 
40..47 */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ + ldmia r2!, {r4, r5} /* load win[i...i+1] */ + ldmia r1!, {r6, r7} /* load in[i...i+1] */ + smlal lr , r9, r4, r6 /* s1 = win[i ] * in[i ] */ + smlal r12, r8, r5, r7 /* s2 = win[i+1] * in[i+1] */ mov lr , lr , lsr #31 - orr r10, lr , r10, lsl #1 /* s1 = low>>31 || hi<<1 */ + orr r9, lr , r9, lsl #1 /* s1 = low>>31 || hi<<1 */ mov r12, r12, lsr #31 - orr r9 , r12, r9 , lsl #1 /* s2 = low>>31 || hi<<1 */ + orr r8, r12, r8, lsl #1 /* s2 = low>>31 || hi<<1 */ - stmia r0!, {r9, r10} /* store result out[0]=s2, out[1]=s1 */ + stmia r0!, {r8, r9} /* store result out[0]=s2, out[1]=s1 */ sub r1, r1, #184 /* roll back 46 entries = 184 bytes */ sub r2, r2, #192 /* roll back 48 entries = 192 bytes = win[0] */ subs r3, r3, #1 /* outer loop -= 1 */ bgt .iqmf_dewindow_outer_loop - ldmfd sp!, {r4-r10, pc} /* restore registers */ + ldmfd sp!, {r4-r9, pc} /* restore registers */ .atrac3_iqmf_dewindowing_end: .size atrac3_iqmf_dewindowing,.atrac3_iqmf_dewindowing_end-atrac3_iqmf_dewindowing -- cgit v1.2.3