From 48bb3d00db6f8e1c628ffc517fcee01cf1d909e5 Mon Sep 17 00:00:00 2001 From: Andree Buschmann Date: Tue, 2 Nov 2010 21:00:34 +0000 Subject: Optimize mpc's synthesis filter by reducing stalls for arm9 and above. Speed up ranges from 4% (nano 2g) to 11% (beast). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28452 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmusepack/synth_filter_arm.S | 210 ++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 2 deletions(-) (limited to 'apps') diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index 731a21ce21..5bdae93561 100644 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S @@ -92,7 +92,7 @@ mpc_decoder_windowing_D: bgt .loop32 ldmpc regs=r4-r8 -#else +#elif defined(CPU_ARM7TDMI) /* arm7 only */ mpc_decoder_windowing_D: /* r0 = Data[] */ /* r1 = V[] */ @@ -106,6 +106,7 @@ mpc_decoder_windowing_D: * saved at the cost of 15 x 4 + 1 add's. * The row V[16] can be extracted as it has symmetries within this single * row. 8 smull/mlal and 8 ldr's can be saved. + * Used for arm7 only. For arm9 and above see implementation below. ***********************************************************************/ stmfd sp!, {r4-r11, lr} @@ -152,7 +153,7 @@ mpc_decoder_windowing_D: add r2, r2, #7*4 /* D+=7, r2 = D[16] */ /****************************************** - * rows 01..15 are symmetrc to rows 31..17 + * rows 01..15 are symmetric to rows 31..17 * r8 = lo, r9 = hi of 01..15 * r1 = V[01..15] * r10 = lo, r11 = hi of 31..17 @@ -289,6 +290,211 @@ mpc_decoder_windowing_D: str r8, [r0], #4 /* store Data */ add r1, r1, #4 /* V++ */ + ldmpc regs=r4-r11 +#else /* arm9 and above */ + mpc_decoder_windowing_D: + /* r0 = Data[] */ + /* r1 = V[] */ + /* r2 = D[] */ + /* lr = counter */ + /************************************************************************ + * Further speed up through making use of symmetries within D[]-window.
+ * The row V[00] can be extracted as it has symmetries within this single + * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's. + * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be + * saved at the cost of 15 x 4 + 1 add's. + * The row V[16] can be extracted as it has symmetries within this single + * row. 8 smull/mlal and 8 ldr's can be saved. + * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds + * up decoding even though several ldm-calls are replaced with ldr to free + * 2 registers. + ***********************************************************************/ + stmfd sp!, {r4-r11, lr} + + /****************************************** + * row 0 with internal symmetry + *****************************************/ + add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ + ldmia r2!, { r3-r6 } /* load D[01..04] */ + ldr r7 , [r1, #96*4] /* 1 */ + ldr r10, [r1, #992*4] /* 15 */ + ldr r11, [r1, #128*4] /* 2 */ + ldr r12, [r1, #896*4] /* 14 */ + rsb r10, r10, r7 /* V[01] - V[15] */ + smull r8, r9, r10, r3 + ldr r7 , [r1, #224*4] /* 3 */ + ldr r10, [r1, #864*4] /* 13 */ + add r12, r12, r11 /* V[02] + V[14] */ + smlal r8, r9, r12, r4 + ldr r11, [r1, #256*4] /* 4 */ + ldr r12, [r1, #768*4] /* 12 */ + rsb r10, r10, r7 /* V[03] - V[13] */ + smlal r8, r9, r10, r5 + ldr r7 , [r1, #352*4] /* 5 */ + ldr r10, [r1, #736*4] /* 11 */ + add r12, r12, r11 /* V[04] + V[12] */ + smlal r8, r9, r12, r6 + ldmia r2!, { r3-r6 } /* load D[05..08] */ + ldr r11, [r1, #384*4] /* 6 */ + ldr r12, [r1, #640*4] /* 10 */ + rsb r10, r10, r7 /* V[05] - V[11] */ + smlal r8, r9, r10, r3 + ldr r7 , [r1, #480*4] /* 7 */ + ldr r10, [r1, #608*4] /* 9 */ + add r12, r12, r11 /* V[06] + V[10] */ + smlal r8, r9, r12, r4 + ldr r11, [r1, #512*4] /* 8 */ + rsb r10, r10, r7 /* V[07] - V[09] */ + smlal r8, r9, r10, r5 + smlal r8, r9, r11, r6 /* V[08] * D[08], the only term without a partner */ + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ + str r8, [r0], #4 /* store Data */ + add r1, r1,
#4 /* V+=1, r1 = V[01] */ + add r2, r2, #7*4 /* D+=7, r2 = D[16] */ + + /****************************************** + * rows 01..15 are symmetric to rows 31..17 + * r8 = lo, r9 = hi of 01..15 + * r1 = V[01..15] + * r10 = lo, r11 = hi of 31..17 + * r12 = V[31..17] + *****************************************/ + mov lr, #15 /* lr = loop counter, counts 15 down to 1 */ + add r12, r1, #30*4 /* r12 = V[31] */ +.loop15: + ldmia r2!, { r3-r4 } /* load D[00..01] */ + ldr r7, [r12, #896*4] /* 14 */ + ldr r5, [r12, #992*4] /* 15 */ + smull r10, r11, r7, r4 + ldr r7, [r1] /* 0 */ + smlal r10, r11, r5, r3 + ldr r5, [r1, #96*4] /* 1 */ + smull r8, r9, r7, r3 + ldr r7, [r12, #768*4] /* 12 */ + smlal r8, r9, r5, r4 + ldmia r2!, { r3-r4 } /* load D[02..03] */ + ldr r5, [r12, #864*4] /* 13 */ + smlal r10, r11, r7, r4 + ldr r7, [r1, #128*4] /* 2 */ + smlal r10, r11, r5, r3 + ldr r5, [r1, #224*4] /* 3 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #256*4] /* 4 */ + smlal r8, r9, r5, r4 + ldmia r2!, { r3-r4 } /* load D[04..05] */ + ldr r5, [r1, #352*4] /* 5 */ + smlal r8, r9, r7, r3 + ldr r7, [r12, #640*4] /* 10 */ + smlal r8, r9, r5, r4 + ldr r5, [r12, #736*4] /* 11 */ + smlal r10, r11, r7, r4 + ldr r7, [r1, #384*4] /* 6 */ + smlal r10, r11, r5, r3 + ldmia r2!, { r3-r4 } /* load D[06..07] */ + ldr r5, [r1, #480*4] /* 7 */ + smlal r8, r9, r7, r3 + ldr r7, [r12, #512*4] /* 8 */ + smlal r8, r9, r5, r4 + ldr r5, [r12, #608*4] /* 9 */ + smlal r10, r11, r7, r4 + ldr r7, [r12, #384*4] /* 6 */ + smlal r10, r11, r5, r3 + ldmia r2!, { r3-r4 } /* load D[08..09] */ + ldr r5, [r12, #480*4] /* 7 */ + smlal r10, r11, r7, r4 + ldr r7, [r1, #512*4] /* 8 */ + smlal r10, r11, r5, r3 + ldr r5, [r1, #608*4] /* 9 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #640*4] /* 10 */ + smlal r8, r9, r5, r4 + ldmia r2!, { r3-r4 } /* load D[10..11] */ + ldr r5, [r1, #736*4] /* 11 */ + smlal r8, r9, r7, r3 + ldr r7, [r12, #256*4] /* 4 */ + smlal r8, r9, r5, r4 + ldr r5, [r12, #352*4] /* 5 */ + smlal r10, r11, r7, r4 + ldr r7, [r1, #768*4] /* 12 */ + smlal r10, r11, r5, r3
+ ldmia r2!, { r3-r4 } /* load D[12..13] */ + ldr r5, [r1, #864*4] /* 13 */ + smlal r8, r9, r7, r3 + ldr r7, [r12, #128*4] /* 2 */ + smlal r8, r9, r5, r4 + ldr r5, [r12, #224*4] /* 3 */ + smlal r10, r11, r7, r4 + ldr r7, [r12] /* 0 */ + smlal r10, r11, r5, r3 + ldmia r2!, { r3-r4 } /* load D[14..15] */ + ldr r5, [r12, #96*4] /* 1 */ + smlal r10, r11, r7, r4 + ldr r7, [r1, #896*4] /* 14 */ + smlal r10, r11, r5, r3 + ldr r5, [r1, #992*4] /* 15 */ + smlal r8, r9, r7, r3 + smlal r8, r9, r5, r4 + /* store Data[01..15] */ + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ + str r8, [r0] /* store Data; r0 is advanced by the post-indexed str below */ + /* store Data[31..17] */ + add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */ + mov r10, r10, lsr #16 + orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */ + rsb r10, r10, #0 /* r10 = -r10 */ + str r10, [r0], #4 /* store Data */ + sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */ + /* correct addresses for next loop */ + sub r12, r12, #4 /* r12 = V-- */ + add r1, r1, #4 /* r1 = V++ */ + /* next loop */ + subs lr, lr, #1 /* lr-- and set flags for the branch */ + bgt .loop15 /* repeat while lr > 0 */ + + /****************************************** + * V[16] with internal symmetry + *****************************************/ + ldmia r2!, { r3-r6 } /* load D[00..03] */ + ldr r7 , [r1] /* 0 */ + ldr r10, [r1, #992*4] /* 15 */ + ldr r11, [r1, #96*4] /* 1 */ + ldr r12, [r1, #896*4] /* 14 */ + rsb r10, r10, r7 /* V[00] - V[15] */ + smull r8, r9, r10, r3 + ldr r7 , [r1, #128*4] /* 2 */ + ldr r10, [r1, #864*4] /* 13 */ + rsb r12, r12, r11 /* V[01] - V[14] */ + smlal r8, r9, r12, r4 + ldr r11, [r1, #224*4] /* 3 */ + ldr r12, [r1, #768*4] /* 12 */ + rsb r10, r10, r7 /* V[02] - V[13] */ + smlal r8, r9, r10, r5 + ldr r7 , [r1, #256*4] /* 4 */ + ldr r10, [r1, #736*4] /* 11 */ + rsb r12, r12, r11 /* V[03] - V[12] */ + smlal r8, r9, r12, r6 + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldr r11, [r1, #352*4] /* 5 */ + ldr r12, [r1, #640*4] /* 10 */ + rsb r10, r10, r7 /* V[04] - V[11] */ + smlal r8, r9, r10, r3 + ldr r7 ,
[r1, #384*4] /* 6 */ + ldr r10, [r1, #608*4] /* 9 */ + rsb r12, r12, r11 /* V[05] - V[10] */ + smlal r8, r9, r12, r4 + ldr r11, [r1, #480*4] /* 7 */ + ldr r12, [r1, #512*4] /* 8 */ + rsb r10, r10, r7 /* V[06] - V[09] */ + smlal r8, r9, r10, r5 + rsb r12, r12, r11 /* V[07] - V[08] */ + smlal r8, r9, r12, r6 + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ + str r8, [r0], #4 /* store Data */ + add r1, r1, #4 /* V++ */ + ldmpc regs=r4-r11 #endif .mpc_dewindowing_end: -- cgit v1.2.3