From 16714539922fb8828180ff605df3586f3eab97c3 Mon Sep 17 00:00:00 2001 From: Andree Buschmann Date: Sun, 15 Jun 2008 12:17:22 +0000 Subject: Musepack speed optimization. Speep up 64 bit precision synthesizer by another 1.5MHz through using symmetries within D[] filter coefficients. For ARM only. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17724 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmusepack/synth_filter_arm.S | 204 ++++++++++++++++++++++++++++- 1 file changed, 203 insertions(+), 1 deletion(-) diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index 41bfda740b..83867086aa 100755 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S @@ -99,12 +99,15 @@ mpc_decoder_windowing_D: .align 2 .global mpc_decoder_windowing_D .type mpc_decoder_windowing_D, %function +#if 0 mpc_decoder_windowing_D: /* r0 = Data[] */ /* r1 = V[] */ /* r2 = D[] */ /* lr = counter */ - + /************************************************************************ + * Reference implementation. + ***********************************************************************/ stmfd sp!, {r4-r9, lr} mov lr, #32 @@ -154,6 +157,205 @@ mpc_decoder_windowing_D: bgt .loop32 ldmfd sp!, {r4-r9, pc} +#else +mpc_decoder_windowing_D: + /* r0 = Data[] (output; 32 words are stored in total, one per row) */ + /* r1 = V[] (input, only read) */ + /* r2 = D[] (window coefficients, only read) */ + /* lr = counter (set to 15 for .loop15, counts down to 1) */ + /************************************************************************ + * Further speed up through making use of symmetries within D[]-window. + * The row V[00] can be extracted as it has symmetries within this single + * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's. + * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be + * saved at the cost of 15 x 4 + 1 add's. + * The row V[16] can be extracted as it has symmetries within this single + * row. 8 smull/mlal and 8 ldr's can be saved. Every stored word is bits 16..47 of a 64 bit smull/smlal sum. 
+ ***********************************************************************/ + stmfd sp!, {r4-r12, lr} /* push 10 words; the final ldmfd pops the same list with pc in place of lr */ + + /****************************************** + * row 0 with internal symmetry: terms k and 16-k share D[k], term 8 is unpaired + *****************************************/ + add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 (no V[00] product) */ + ldmia r2!, { r3-r6 } /* load D[01..04] */ + ldr r7 , [r1, #96*4] /* 1 */ + ldr r10, [r1, #992*4] /* 15 */ + rsb r10, r10, r7 /* V[01] - V[15] */ + smull r8, r9, r10, r3 + ldr r7 , [r1, #128*4] /* 2 */ + ldr r10, [r1, #896*4] /* 14 */ + add r10, r10, r7 /* V[02] + V[14] */ + smlal r8, r9, r10, r4 + ldr r7 , [r1, #224*4] /* 3 */ + ldr r10, [r1, #864*4] /* 13 */ + rsb r10, r10, r7 /* V[03] - V[13] */ + smlal r8, r9, r10, r5 + ldr r7 , [r1, #256*4] /* 4 */ + ldr r10, [r1, #768*4] /* 12 */ + add r10, r10, r7 /* V[04] + V[12] */ + smlal r8, r9, r10, r6 + ldmia r2!, { r3-r6 } /* load D[05..08] */ + ldr r7 , [r1, #352*4] /* 5 */ + ldr r10, [r1, #736*4] /* 11 */ + rsb r10, r10, r7 /* V[05] - V[11] */ + smlal r8, r9, r10, r3 + ldr r7 , [r1, #384*4] /* 6 */ + ldr r10, [r1, #640*4] /* 10 */ + add r10, r10, r7 /* V[06] + V[10] */ + smlal r8, r9, r10, r4 + ldr r7 , [r1, #480*4] /* 7 */ + ldr r10, [r1, #608*4] /* 9 */ + rsb r10, r10, r7 /* V[07] - V[09] */ + smlal r8, r9, r10, r5 + ldr r10, [r1, #512*4] /* 8 (unpaired) */ + smlal r8, r9, r10, r6 + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ + str r8, [r0], #4 /* store Data[00], Data++ */ + add r1, r1, #4 /* V+=1, r1 = V[01] */ + add r2, r2, #7*4 /* D+=7 (skip D[09..15]), r2 = D[16] */ + + /****************************************** + * rows 01..15 are symmetric to rows 31..17 + * r8 = lo, r9 = hi of 01..15 + * r1 = V[01..15] + * r10 = lo, r11 = hi of 31..17 (column j uses the D word of column 15-j; negated on store) + * r12 = V[31..17] + *****************************************/ + mov lr, #15 /* 15 row pairs */ + add r12, r1, #30*4 /* r12 = V[31] */ +.loop15: + ldmia r2!, { r3-r6 } /* load D[00..03] (row-relative) */ + ldr r7, [r12, #768*4] /* 12 */ + smull r10, r11, r7, r6 + ldr r7, [r12, #864*4] /* 13 */ + smlal r10, r11, r7, r5 + ldr r7, [r12, #896*4] 
/* 14 */ + smlal r10, r11, r7, r4 + ldr r7, [r12, #992*4] /* 15 */ + smlal r10, r11, r7, r3 + ldr r7, [r1] /* 0 */ + smull r8, r9, r7, r3 + ldr r7, [r1, #96*4] /* 1 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #128*4] /* 2 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #224*4] /* 3 */ + smlal r8, r9, r7, r6 + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldr r7, [r1, #256*4] /* 4 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #352*4] /* 5 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #384*4] /* 6 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #480*4] /* 7 */ + smlal r8, r9, r7, r6 + ldr r7, [r12, #512*4] /* 8 */ + smlal r10, r11, r7, r6 + ldr r7, [r12, #608*4] /* 9 */ + smlal r10, r11, r7, r5 + ldr r7, [r12, #640*4] /* 10 */ + smlal r10, r11, r7, r4 + ldr r7, [r12, #736*4] /* 11 */ + smlal r10, r11, r7, r3 + ldmia r2!, { r3-r6 } /* load D[08..11] */ + ldr r7, [r12, #256*4] /* 4 */ + smlal r10, r11, r7, r6 + ldr r7, [r12, #352*4] /* 5 */ + smlal r10, r11, r7, r5 + ldr r7, [r12, #384*4] /* 6 */ + smlal r10, r11, r7, r4 + ldr r7, [r12, #480*4] /* 7 */ + smlal r10, r11, r7, r3 + ldr r7, [r1, #512*4] /* 8 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #608*4] /* 9 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #640*4] /* 10 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #736*4] /* 11 */ + smlal r8, r9, r7, r6 + ldmia r2!, { r3-r6 } /* load D[12..15] */ + ldr r7, [r1, #768*4] /* 12 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #864*4] /* 13 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #896*4] /* 14 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #992*4] /* 15 */ + smlal r8, r9, r7, r6 + ldr r7, [r12] /* 0 */ + smlal r10, r11, r7, r6 + ldr r7, [r12, #96*4] /* 1 */ + smlal r10, r11, r7, r5 + ldr r7, [r12, #128*4] /* 2 */ + smlal r10, r11, r7, r4 + ldr r7, [r12, #224*4] /* 3 */ + smlal r10, r11, r7, r3 + /* store Data[01..15] */ + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ + str r8, [r0] /* store Data, r0 unchanged here */ + /* store Data[31..17] */ + add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */ + mov r10, r10, lsr 
#16 + orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */ + rsb r10, r10, #0 /* r10 = -r10, negated after the 16 bit shift */ + str r10, [r0], #4 /* store Data, post-increment gives the net Data++ */ + sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */ + /* correct addresses for next loop */ + sub r12, r12, #4 /* r12 = V-- */ + add r1, r1, #4 /* r1 = V++ */ + /* next loop */ + subs lr, lr, #1 + bgt .loop15 + + /****************************************** + * V[16] with internal symmetry: terms k and 15-k share D[k] + *****************************************/ + ldmia r2!, { r3-r6 } /* load D[00..03] */ + ldr r7 , [r1] /* 0 */ + ldr r10, [r1, #992*4] /* 15 */ + rsb r10, r10, r7 /* V[00] - V[15] */ + smull r8, r9, r10, r3 + ldr r7 , [r1, #96*4] /* 1 */ + ldr r10, [r1, #896*4] /* 14 */ + rsb r10, r10, r7 /* V[01] - V[14] */ + smlal r8, r9, r10, r4 + ldr r7 , [r1, #128*4] /* 2 */ + ldr r10, [r1, #864*4] /* 13 */ + rsb r10, r10, r7 /* V[02] - V[13] */ + smlal r8, r9, r10, r5 + ldr r7 , [r1, #224*4] /* 3 */ + ldr r10, [r1, #768*4] /* 12 */ + rsb r10, r10, r7 /* V[03] - V[12] */ + smlal r8, r9, r10, r6 + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldr r7 , [r1, #256*4] /* 4 */ + ldr r10, [r1, #736*4] /* 11 */ + rsb r10, r10, r7 /* V[04] - V[11] */ + smlal r8, r9, r10, r3 + ldr r7 , [r1, #352*4] /* 5 */ + ldr r10, [r1, #640*4] /* 10 */ + rsb r10, r10, r7 /* V[05] - V[10] */ + smlal r8, r9, r10, r4 + ldr r7 , [r1, #384*4] /* 6 */ + ldr r10, [r1, #608*4] /* 9 */ + rsb r10, r10, r7 /* V[06] - V[09] */ + smlal r8, r9, r10, r5 + ldr r7 , [r1, #480*4] /* 7 */ + ldr r10, [r1, #512*4] /* 8 */ + rsb r10, r10, r7 /* V[07] - V[08] */ + smlal r8, r9, r10, r6 + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ + str r8, [r0], #4 /* store Data[16] */ + add r1, r1, #4 /* V++ (r1 is not read again before the return) */ + + ldmfd sp!, {r4-r12, pc} /* restore r4-r12 and return */ +#endif .mpc_dewindowing_end: .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D #endif -- cgit v1.2.3