From 2a5053f58c1a33334776cc90264c67dde815cef3 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Wed, 19 Nov 2008 21:31:33 +0000 Subject: Several tweaks and cleanups: * Use .rept instead of repeated macros for repeating blocks. * Use MUL (variant) instead of MLA (variant) in the first step of the ARM scalarproduct() if there's no loop. * Unroll ARM assembler functions to 32 where not already done, plus the generic scalarproduct(). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19144 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/vector_math16_armv5te.h | 36 +++++-- apps/codecs/demac/libdemac/vector_math16_armv6.h | 111 ++++++++++----------- apps/codecs/demac/libdemac/vector_math16_cf.h | 55 ++++------ apps/codecs/demac/libdemac/vector_math32_armv4.h | 95 +++++++++--------- apps/codecs/demac/libdemac/vector_math_generic.h | 22 +++- 5 files changed, 171 insertions(+), 148 deletions(-) diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h index a999c0333a..826aaa3f80 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h @@ -117,21 +117,35 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) * incorrect results (if ARM aligncheck is disabled). */ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) { - int res = 0; + int res; +#if ORDER > 32 + int cnt = ORDER>>5; +#endif + #if ORDER > 16 - int cnt = ORDER>>4; +#define MLA_BLOCKS "3" +#else +#define MLA_BLOCKS "1" #endif asm volatile ( +#if ORDER > 32 + "mov %[res], #0 \n" +#endif "tst %[v2], #2 \n" "beq 20f \n" "10: \n" "ldrh r7, [%[v2]], #2 \n" +#if ORDER > 32 "mov r7, r7, lsl #16 \n" "1: \n" "ldmia %[v1]!, {r0-r3} \n" "smlabt %[res], r0, r7, %[res] \n" +#else + "ldmia %[v1]!, {r0-r3} \n" + "smulbb %[res], r0, r7 \n" +#endif "ldmia %[v2]!, {r4-r7} \n" "smlatb %[res], r0, r4, %[res] \n" "smlabt %[res], r1, r4, %[res] \n" @@ -140,6 +154,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "smlatb %[res], r2, r6, %[res] \n" "smlabt %[res], r3, r6, %[res] \n" "smlatb %[res], r3, r7, %[res] \n" + + ".rept " MLA_BLOCKS "\n" "ldmia %[v1]!, {r0-r3} \n" "smlabt %[res], r0, r7, %[res] \n" "ldmia %[v2]!, {r4-r7} \n" @@ -150,7 +166,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "smlatb %[res], r2, r6, %[res] \n" "smlabt %[res], r3, r6, %[res] \n" "smlatb %[res], r3, r7, %[res] \n" -#if ORDER > 16 + ".endr \n" +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" #endif @@ -160,7 +177,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "1: \n" "ldmia %[v1]!, {r0-r3} \n" "ldmia %[v2]!, {r4-r7} \n" +#if ORDER > 32 "smlabb %[res], r0, r4, %[res] \n" +#else + "smulbb %[res], r0, r4 \n" +#endif "smlatt %[res], r0, r4, %[res] \n" "smlabb %[res], r1, r5, %[res] \n" "smlatt %[res], r1, r5, %[res] \n" @@ -168,6 +189,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "smlatt %[res], r2, r6, %[res] \n" "smlabb %[res], r3, r7, %[res] \n" "smlatt %[res], r3, r7, %[res] \n" + + ".rept " MLA_BLOCKS "\n" "ldmia %[v1]!, {r0-r3} \n" "ldmia %[v2]!, {r4-r7} \n" "smlabb %[res], r0, r4, %[res] \n" @@ -178,19 +201,20 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "smlatt %[res], r2, r6, %[res] \n" "smlabb %[res], r3, r7, %[res] \n" "smlatt %[res], r3, r7, %[res] \n" -#if ORDER > 16 + ".endr \n" +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" #endif "99: \n" : /* outputs */ -#if ORDER > 16 +#if ORDER > 32 [cnt]"+r"(cnt), 
#endif [v1] "+r"(v1), [v2] "+r"(v2), - [res]"+r"(res) + [res]"=r"(res) : /* inputs */ : /* clobbers */ "r0", "r1", "r2", "r3", diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h index 49fa2ceb7d..cd27b271af 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv6.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h @@ -29,8 +29,14 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA * incorrect results (if ARM aligncheck is disabled). */ static inline void vector_add(int16_t* v1, int16_t* v2) { +#if ORDER > 32 + int cnt = ORDER>>5; +#endif + #if ORDER > 16 - int cnt = ORDER>>4; +#define ADD_SUB_BLOCKS "4" +#else +#define ADD_SUB_BLOCKS "2" #endif asm volatile ( @@ -42,6 +48,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) "ldr r5, [%[v2]], #4 \n" "mov r4, r4, lsl #16 \n" "1: \n" + ".rept " ADD_SUB_BLOCKS "\n" "ldmia %[v2]!, {r6-r7} \n" "ldmia %[v1], {r0-r3} \n" "mov r5, r5, ror #16 \n" @@ -56,21 +63,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2) "pkhbt r7, r7, r4, lsl #16 \n" "sadd16 r3, r3, r7 \n" "stmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r6-r7} \n" - "ldmia %[v1], {r0-r3} \n" - "mov r5, r5, ror #16 \n" - "pkhtb r4, r5, r4, asr #16 \n" - "sadd16 r0, r0, r4 \n" - "pkhbt r5, r5, r6, lsl #16 \n" - "sadd16 r1, r1, r5 \n" - "ldmia %[v2]!, {r4-r5} \n" - "mov r7, r7, ror #16 \n" - "pkhtb r6, r7, r6, asr #16 \n" - "sadd16 r2, r2, r6 \n" - "pkhbt r7, r7, r4, lsl #16 \n" - "sadd16 r3, r3, r7 \n" - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 + ".endr \n" +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" #endif @@ -78,6 +72,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) "20: \n" "1: \n" + ".rept " ADD_SUB_BLOCKS "\n" "ldmia %[v2]!, {r4-r7} \n" "ldmia %[v1], {r0-r3} \n" "sadd16 r0, r0, r4 \n" @@ -85,21 +80,15 @@ static inline void vector_add(int16_t* v1, int16_t* v2) "sadd16 r2, r2, r6 \n" "sadd16 r3, r3, r7 \n" "stmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - "ldmia %[v1], {r0-r3} \n" - "sadd16 r0, r0, r4 \n" - "sadd16 r1, r1, r5 \n" - "sadd16 r2, r2, r6 \n" - "sadd16 r3, r3, r7 \n" - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 + ".endr \n" +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" #endif "99: \n" : /* outputs */ -#if ORDER > 16 +#if ORDER > 32 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), @@ -116,8 +105,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2) * incorrect results (if ARM aligncheck is disabled). 
*/ static inline void vector_sub(int16_t* v1, int16_t* v2) { -#if ORDER > 16 - int cnt = ORDER>>4; +#if ORDER > 32 + int cnt = ORDER>>5; #endif asm volatile ( @@ -129,6 +118,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "ldr r5, [%[v2]], #4 \n" "mov r4, r4, lsl #16 \n" "1: \n" + ".rept " ADD_SUB_BLOCKS "\n" "ldmia %[v2]!, {r6-r7} \n" "ldmia %[v1], {r0-r3} \n" "mov r5, r5, ror #16 \n" @@ -143,21 +133,8 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "pkhbt r7, r7, r4, lsl #16 \n" "ssub16 r3, r3, r7 \n" "stmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r6-r7} \n" - "ldmia %[v1], {r0-r3} \n" - "mov r5, r5, ror #16 \n" - "pkhtb r4, r5, r4, asr #16 \n" - "ssub16 r0, r0, r4 \n" - "pkhbt r5, r5, r6, lsl #16 \n" - "ssub16 r1, r1, r5 \n" - "ldmia %[v2]!, {r4-r5} \n" - "mov r7, r7, ror #16 \n" - "pkhtb r6, r7, r6, asr #16 \n" - "ssub16 r2, r2, r6 \n" - "pkhbt r7, r7, r4, lsl #16 \n" - "ssub16 r3, r3, r7 \n" - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 + ".endr \n" +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" #endif @@ -165,6 +142,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "20: \n" "1: \n" + ".rept " ADD_SUB_BLOCKS "\n" "ldmia %[v2]!, {r4-r7} \n" "ldmia %[v1], {r0-r3} \n" "ssub16 r0, r0, r4 \n" @@ -172,21 +150,15 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "ssub16 r2, r2, r6 \n" "ssub16 r3, r3, r7 \n" "stmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - "ldmia %[v1], {r0-r3} \n" - "ssub16 r0, r0, r4 \n" - "ssub16 r1, r1, r5 \n" - "ssub16 r2, r2, r6 \n" - "ssub16 r3, r3, r7 \n" - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 + ".endr \n" +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" #endif "99: \n" : /* outputs */ -#if ORDER > 16 +#if ORDER > 32 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), @@ -203,12 +175,21 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) * incorrect results (if ARM aligncheck is disabled). 
*/ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) { - int res = 0; + int res; +#if ORDER > 32 + int cnt = ORDER>>5; +#endif + #if ORDER > 16 - int cnt = ORDER>>4; +#define MLA_BLOCKS "3" +#else +#define MLA_BLOCKS "1" #endif asm volatile ( +#if ORDER > 32 + "mov %[res], #0 \n" +#endif "tst %[v2], #2 \n" "beq 20f \n" @@ -216,11 +197,18 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "ldrh r7, [%[v2]], #2 \n" "ldmia %[v2]!, {r4-r5} \n" "ldmia %[v1]!, {r0-r1} \n" +#if ORDER > 32 "mov r7, r7, lsl #16 \n" "1: \n" "pkhbt r8, r4, r7 \n" "ldmia %[v2]!, {r6-r7} \n" "smladx %[res], r0, r8, %[res] \n" +#else + "pkhbt r8, r4, r7, lsl #16 \n" + "ldmia %[v2]!, {r6-r7} \n" + "smuadx %[res], r0, r8 \n" +#endif + ".rept " MLA_BLOCKS "\n" "pkhbt r8, r5, r4 \n" "ldmia %[v1]!, {r2-r3} \n" "smladx %[res], r1, r8, %[res] \n" @@ -233,11 +221,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "pkhbt r8, r4, r7 \n" "ldmia %[v2]!, {r6-r7} \n" "smladx %[res], r0, r8, %[res] \n" + ".endr \n" + "pkhbt r8, r5, r4 \n" "ldmia %[v1]!, {r2-r3} \n" "smladx %[res], r1, r8, %[res] \n" "pkhbt r8, r6, r5 \n" -#if ORDER > 16 +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "ldmneia %[v2]!, {r4-r5} \n" "smladx %[res], r2, r8, %[res] \n" @@ -257,7 +247,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "ldmia %[v2]!, {r5-r7} \n" "1: \n" "ldmia %[v1]!, {r2-r3} \n" +#if ORDER > 32 "smlad %[res], r0, r5, %[res] \n" +#else + "smuad %[res], r0, r5 \n" +#endif + ".rept " MLA_BLOCKS "\n" "ldmia %[v2]!, {r4-r5} \n" "smlad %[res], r1, r6, %[res] \n" "ldmia %[v1]!, {r0-r1} \n" @@ -266,9 +261,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "smlad %[res], r3, r4, %[res] \n" "ldmia %[v1]!, {r2-r3} \n" "smlad %[res], r0, r5, %[res] \n" + ".endr \n" + "ldmia %[v2]!, {r4-r5} \n" "smlad %[res], r1, r6, %[res] \n" -#if ORDER > 16 +#if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "ldmneia %[v1]!, {r0-r1} \n" "smlad %[res], r2, r7, %[res] \n" @@ -282,12 +279,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "99: \n" : /* outputs */ -#if ORDER > 16 +#if ORDER > 32 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), [v2] "+r"(v2), - [res]"+r"(res) + [res]"=r"(res) : /* inputs */ : /* clobbers */ "r0", "r1", "r2", "r3", "r4", diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 0c3aaca223..11e7f07adf 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h @@ -67,7 +67,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) "move.l %%d3, (%[v1])+ \n" "lea.l (16, %[v2]), %[v2] \n" "move.l %%d4, %%d0 \n" - + "movem.l (%[v1]), %%a0-%%a3 \n" "movem.l (%[v2]), %%d1-%%d4 \n" ADDHALFXREGS(%%a0, %%d1, %%d0) @@ -175,7 +175,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "move.l %%d3, (%[v1])+ \n" "lea.l (16, %[v2]), %[v2] \n" "move.l %%d4, %%d0 \n" - + "movem.l (%[v2]), %%d1-%%d4 \n" "movem.l (%[v1]), %%a0-%%a3 \n" SUBHALFXREGS(%%a0, %%d1, %%d0) @@ -207,7 +207,6 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "move.l %%d2, (%[v1])+ \n" SUBHALFREGS(%%a3, %%d4, %%d3) "move.l %%d3, (%[v1])+ \n" - "lea.l (16, %[v2]), %[v2] \n" "movem.l (%[v2]), %%d1-%%d4 \n" @@ -248,22 +247,16 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) * in signed integer mode - call above macro before use. 
*/ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) { - int res = 0; + int res; #if ORDER > 32 int cnt = ORDER>>5; #endif -#define MACBLOCK4 \ - "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ - "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ - "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ - "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" - -#define MACBLOCK4_U2 \ - "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ - "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ - "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ - "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" +#if ORDER > 16 +#define MAC_BLOCKS "7" +#else +#define MAC_BLOCKS "3" +#endif asm volatile ( "move.l %[v2], %%d0 \n" @@ -274,15 +267,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "move.l (%[v1])+, %%d0 \n" "move.w (%[v2])+, %%d1 \n" "1: \n" -#if ORDER > 16 - MACBLOCK4_U2 - MACBLOCK4_U2 - MACBLOCK4_U2 - MACBLOCK4_U2 -#endif - MACBLOCK4_U2 - MACBLOCK4_U2 - MACBLOCK4_U2 + ".rept " MAC_BLOCKS "\n" + "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" + ".endr \n" + "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" @@ -299,15 +290,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "move.l (%[v1])+, %%d0 \n" "move.l (%[v2])+, %%d1 \n" "1: \n" -#if ORDER > 16 - MACBLOCK4 - MACBLOCK4 - MACBLOCK4 - MACBLOCK4 -#endif - MACBLOCK4 - MACBLOCK4 - MACBLOCK4 + ".rept " MAC_BLOCKS "\n" + "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" + "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" + "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" + "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" + ".endr \n" + "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" #if ORDER > 32 diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index b729bd3a0a..89b24f2b06 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h @@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2) int cnt = ORDER>>5; #endif -#define ADDBLOCK4 \ - "ldmia %[v1], {r0-r3} \n" \ - "ldmia %[v2]!, {r4-r7} \n" \ - "add r0, r0, r4 \n" \ - "add r1, r1, r5 \n" \ - "add r2, r2, r6 \n" \ - "add r3, r3, r7 \n" \ - "stmia %[v1]!, {r0-r3} \n" +#if ORDER > 16 +#define ADD_SUB_BLOCKS "8" +#else +#define ADD_SUB_BLOCKS "4" +#endif asm volatile ( "1: \n" - ADDBLOCK4 - ADDBLOCK4 - ADDBLOCK4 - ADDBLOCK4 -#if ORDER > 16 - ADDBLOCK4 - ADDBLOCK4 - ADDBLOCK4 - ADDBLOCK4 -#endif + ".rept " ADD_SUB_BLOCKS "\n" + "ldmia %[v1], {r0-r3} \n" + "ldmia %[v2]!, {r4-r7} \n" + "add r0, r0, r4 \n" + "add r1, r1, r5 \n" + "add r2, r2, r6 \n" + "add r3, r3, r7 \n" + "stmia %[v1]!, {r0-r3} \n" + ".endr \n" #if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" @@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2) int cnt = ORDER>>5; #endif -#define SUBBLOCK4 \ - "ldmia %[v1], {r0-r3} \n" \ - "ldmia %[v2]!, {r4-r7} \n" \ - "sub r0, r0, r4 \n" \ - "sub r1, r1, r5 \n" \ - "sub r2, r2, r6 \n" \ - "sub r3, r3, r7 \n" \ - "stmia %[v1]!, {r0-r3} \n" - asm volatile ( "1: \n" - SUBBLOCK4 - SUBBLOCK4 - SUBBLOCK4 - SUBBLOCK4 -#if ORDER > 16 - SUBBLOCK4 - SUBBLOCK4 - SUBBLOCK4 - SUBBLOCK4 -#endif + ".rept " ADD_SUB_BLOCKS "\n" + "ldmia %[v1], {r0-r3} \n" + "ldmia 
%[v2]!, {r4-r7} \n" + "sub r0, r0, r4 \n" + "sub r1, r1, r5 \n" + "sub r2, r2, r6 \n" + "sub r3, r3, r7 \n" + "stmia %[v1]!, {r0-r3} \n" + ".endr \n" #if ORDER > 32 "subs %[cnt], %[cnt], #1 \n" "bne 1b \n" @@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2) static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) { - int res = 0; + int res; #if ORDER > 32 int cnt = ORDER>>5; #endif asm volatile ( #if ORDER > 16 +#if ORDER > 32 + "mov %[res], #0 \n" +#endif "ldmia %[v2]!, {r6-r7} \n" "1: \n" "ldmia %[v1]!, {r0,r1,r3-r5} \n" +#if ORDER > 32 "mla %[res], r6, r0, %[res] \n" +#else + "mul %[res], r6, r0 \n" +#endif "mla %[res], r7, r1, %[res] \n" "ldmia %[v2]!, {r0-r2,r6-r8} \n" "mla %[res], r0, r3, %[res] \n" @@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) #endif #else /* ORDER <= 16 */ - -#define MLABLOCK4 \ - "ldmia %[v1]!, {r0-r3} \n" \ - "ldmia %[v2]!, {r4-r7} \n" \ - "mla %[res], r4, r0, %[res] \n" \ - "mla %[res], r5, r1, %[res] \n" \ - "mla %[res], r6, r2, %[res] \n" \ + "ldmia %[v1]!, {r0-r3} \n" + "ldmia %[v2]!, {r4-r7} \n" + "mul %[res], r4, r0 \n" + "mla %[res], r5, r1, %[res] \n" + "mla %[res], r6, r2, %[res] \n" "mla %[res], r7, r3, %[res] \n" - MLABLOCK4 - MLABLOCK4 - MLABLOCK4 - MLABLOCK4 + ".rept 3 \n" + "ldmia %[v1]!, {r0-r3} \n" + "ldmia %[v2]!, {r4-r7} \n" + "mla %[res], r4, r0, %[res] \n" + "mla %[res], r5, r1, %[res] \n" + "mla %[res], r6, r2, %[res] \n" + "mla %[res], r7, r3, %[res] \n" + ".endr \n" #endif /* ORDER <= 16 */ : /* outputs */ #if ORDER > 32 @@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) #endif [v1] "+r"(v1), [v2] "+r"(v2), - [res]"+r"(res) + [res]"=r"(res) : /* inputs */ : /* clobbers */ "r0", "r1", "r2", "r3", diff --git a/apps/codecs/demac/libdemac/vector_math_generic.h b/apps/codecs/demac/libdemac/vector_math_generic.h index 7b61db77be..00bf07a007 100644 --- a/apps/codecs/demac/libdemac/vector_math_generic.h +++ b/apps/codecs/demac/libdemac/vector_math_generic.h @@ -116,8 +116,8 @@ static inline int32_t scalarproduct(filter_int* v1, filter_int* v2) { int res = 0; -#if ORDER > 16 - int order = (ORDER >> 4); +#if ORDER > 32 + int order = (ORDER >> 5); while (order--) #endif { @@ -137,6 +137,24 @@ static inline int32_t scalarproduct(filter_int* v1, filter_int* v2) res += *v1++ * *v2++; res += *v1++ * *v2++; res += *v1++ * *v2++; +#if ORDER > 16 + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; + res += *v1++ * *v2++; +#endif } return res; } -- cgit v1.2.3
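
A note on the pattern used throughout this patch: GNU as expands
".rept N ... .endr" into N copies of the enclosed block at assembly
time, so the C-preprocessor macros that used to duplicate each block
textually can be dropped. Independently of that, the first block of
each scalarproduct() now uses a plain multiply (MUL/SMULxy/SMUAD
variant) instead of a multiply-accumulate, so it initialises the
result register itself; the explicit "mov %[res], #0" is only kept
where a loop branches back over the MLA-form block. The sketch below
is a hypothetical dot8() helper, not part of libdemac, showing both
ideas together on ARMv5TE (GCC inline asm assumed, little-endian,
both pointers assumed 32 bit aligned):

static inline int32_t dot8(int16_t* v1, int16_t* v2)
{
    int res;

    asm volatile (
        "ldmia %[v1]!, {r0-r1}         \n" /* v1[0..3], two packed words */
        "ldmia %[v2]!, {r2-r3}         \n" /* v2[0..3] */
        "smulbb %[res], r0, r2         \n" /* res  = v1[0]*v2[0], also inits res */
        "smlatt %[res], r0, r2, %[res] \n" /* res += v1[1]*v2[1] */
        "smlabb %[res], r1, r3, %[res] \n"
        "smlatt %[res], r1, r3, %[res] \n"
        ".rept 1                       \n" /* assembler repeats this block */
        "ldmia %[v1]!, {r0-r1}         \n"
        "ldmia %[v2]!, {r2-r3}         \n"
        "smlabb %[res], r0, r2, %[res] \n"
        "smlatt %[res], r0, r2, %[res] \n"
        "smlabb %[res], r1, r3, %[res] \n"
        "smlatt %[res], r1, r3, %[res] \n"
        ".endr                         \n"
        : /* outputs */
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"=r"(res)   /* write-only: the SMULBB defines it */
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "memory"
    );
    return res;
}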
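
The same first-step trick in portable C, for reference (a minimal
sketch with a hypothetical name, assuming n >= 1): seeding the
accumulator with the first product instead of zero is what lets the
asm versions above drop "mov %[res], #0" and declare res as a
write-only "=r" output rather than a read-modify-write "+r" one.

static inline int32_t scalarproduct_c(int16_t* v1, int16_t* v2, int n)
{
    int32_t res = v1[0] * v2[0];   /* MUL step: the multiply inits res */
    int i;

    for (i = 1; i < n; i++)
        res += v1[i] * v2[i];      /* MLA steps: multiply-accumulate */
    return res;
}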