From d456460707f79ec48d08baf5d8f28c88c9641e64 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Fri, 3 Oct 2008 12:30:18 +0000 Subject: Further speedup for ARMv6 by better pipelining in scalarproduct(). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18697 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/vector_math16_armv6.h | 80 ++++++++++++++++-------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h index e963e10ff0..bf50d9cabd 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv6.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h @@ -217,54 +217,80 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "beq 20f \n" "10: \n" - "ldrh r4, [%[v2]], #2 \n" - "mov r4, r4, lsl #16 \n" + "ldrh r2, [%[v2]], #2 \n" + "ldr r0, [%[v1]], #4 \n" + "ldr r3, [%[v2]], #4 \n" + "mov r2, r2, lsl #16 \n" "1: \n" - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r5-r8} \n" + "ldr r1, [%[v1]], #4 \n" + "smlabt %[res], r0, r2, %[res] \n" + "ldr r4, [%[v2]], #4 \n" + "smlatb %[res], r0, r3, %[res] \n" + "ldr r0, [%[v1]], #4 \n" + "smlabt %[res], r1, r3, %[res] \n" + "ldr r5, [%[v2]], #4 \n" + "smlatb %[res], r1, r4, %[res] \n" + "ldr r1, [%[v1]], #4 \n" "smlabt %[res], r0, r4, %[res] \n" + "ldr r6, [%[v2]], #4 \n" "smlatb %[res], r0, r5, %[res] \n" + "ldr r0, [%[v1]], #4 \n" "smlabt %[res], r1, r5, %[res] \n" + "ldr r3, [%[v2]], #4 \n" "smlatb %[res], r1, r6, %[res] \n" - "smlabt %[res], r2, r6, %[res] \n" - "smlatb %[res], r2, r7, %[res] \n" - "smlabt %[res], r3, r7, %[res] \n" - "smlatb %[res], r3, r8, %[res] \n" - "mov r4, r8 \n" - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r5-r8} \n" + "mov r2, r6 \n" + "ldr r1, [%[v1]], #4 \n" + "smlabt %[res], r0, r2, %[res] \n" + "ldr r4, [%[v2]], #4 \n" + "smlatb %[res], r0, r3, %[res] \n" + "ldr r0, [%[v1]], #4 \n" + "smlabt %[res], r1, r3, %[res] \n" + "ldr r5, [%[v2]], #4 \n" + "smlatb %[res], r1, r4, %[res] \n" + "ldr r1, [%[v1]], #4 \n" "smlabt %[res], r0, r4, %[res] \n" + "ldr r6, [%[v2]], #4 \n" "smlatb %[res], r0, r5, %[res] \n" - "smlabt %[res], r1, r5, %[res] \n" - "smlatb %[res], r1, r6, %[res] \n" - "smlabt %[res], r2, r6, %[res] \n" - "smlatb %[res], r2, r7, %[res] \n" - "smlabt %[res], r3, r7, %[res] \n" - "smlatb %[res], r3, r8, %[res] \n" #if ORDER > 16 - "mov r4, r8 \n" "subs %[cnt], %[cnt], #1 \n" + "ldrne r0, [%[v1]], #4 \n" + "smlabt %[res], r1, r5, %[res] \n" + "ldrne r3, [%[v2]], #4 \n" + "smlatb %[res], r1, r6, %[res] \n" + "mov r2, r6 \n" "bne 1b \n" +#else + "smlabt %[res], r1, r5, %[res] \n" + "smlatb %[res], r1, r6, %[res] \n" #endif "b 99f \n" "20: \n" + "ldmia %[v1]!, {r0-r1} \n" + "ldmia %[v2]!, {r4-r5} \n" "1: \n" - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" + "ldmia %[v1]!, {r2-r3} \n" "smlad %[res], r0, r4, %[res] \n" + "ldmia %[v2]!, {r6-r7} \n" "smlad %[res], r1, r5, %[res] \n" + "ldmia %[v1]!, {r0-r1} \n" "smlad %[res], r2, r6, %[res] \n" + "ldmia %[v2]!, {r4-r5} \n" "smlad %[res], r3, r7, %[res] \n" - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" + "ldmia %[v1]!, {r2-r3} \n" "smlad %[res], r0, r4, %[res] \n" + "ldmia %[v2]!, {r6-r7} \n" "smlad %[res], r1, r5, %[res] \n" - "smlad %[res], r2, r6, %[res] \n" - "smlad %[res], r3, r7, %[res] \n" #if ORDER > 16 "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" + "ldmneia %[v1]!, {r0-r1} \n" + "smlad %[res], r2, r6, %[res] \n" + "ldmneia %[v2]!, {r4-r5} \n" + "smlad %[res], r3, r7, %[res] \n" + "bne 1b \n" +#else + "smlad %[res], r2, r6, %[res] \n" + "smlad %[res], r3, r7, %[res] \n" #endif "99: \n" @@ -277,8 +303,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) [res]"+r"(res) : /* inputs */ : /* clobbers */ - "r0", "r1", "r2", "r3", "r4", - "r5", "r6", "r7", "r8" + "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7" ); return res; } -- cgit v1.2.3