summaryrefslogtreecommitdiff
path: root/apps/codecs/demac/libdemac/vector_math32_armv4.h
diff options
context:
space:
mode:
author: Jens Arnold <amiconn@rockbox.org> 2008-11-19 21:31:33 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2008-11-19 21:31:33 +0000
commit: 2a5053f58c1a33334776cc90264c67dde815cef3 (patch)
tree: 7acc0727874ff6b307eff293a18172a3239cd895 /apps/codecs/demac/libdemac/vector_math32_armv4.h
parent: 14d37cb4555703d216e954db15ccca2c34642dc3 (diff)
downloadrockbox-2a5053f58c1a33334776cc90264c67dde815cef3.tar.gz
rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.zip
Several tweaks and cleanups:
* Use .rept instead of repeated macros for repeating blocks.
* Use MUL (variant) instead of MLA (variant) in the first step of the ARM scalarproduct() if there's no loop.
* Unroll ARM assembler functions to 32 where not already done, plus the generic scalarproduct().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19144 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math32_armv4.h')
-rw-r--r-- apps/codecs/demac/libdemac/vector_math32_armv4.h 95
1 files changed, 45 insertions, 50 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
index b729bd3a0a..89b24f2b06 100644
--- a/apps/codecs/demac/libdemac/vector_math32_armv4.h
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2)
30 int cnt = ORDER>>5; 30 int cnt = ORDER>>5;
31#endif 31#endif
32 32
33#define ADDBLOCK4 \ 33#if ORDER > 16
34 "ldmia %[v1], {r0-r3} \n" \ 34#define ADD_SUB_BLOCKS "8"
35 "ldmia %[v2]!, {r4-r7} \n" \ 35#else
36 "add r0, r0, r4 \n" \ 36#define ADD_SUB_BLOCKS "4"
37 "add r1, r1, r5 \n" \ 37#endif
38 "add r2, r2, r6 \n" \
39 "add r3, r3, r7 \n" \
40 "stmia %[v1]!, {r0-r3} \n"
41 38
42 asm volatile ( 39 asm volatile (
43 "1: \n" 40 "1: \n"
44 ADDBLOCK4 41 ".rept " ADD_SUB_BLOCKS "\n"
45 ADDBLOCK4 42 "ldmia %[v1], {r0-r3} \n"
46 ADDBLOCK4 43 "ldmia %[v2]!, {r4-r7} \n"
47 ADDBLOCK4 44 "add r0, r0, r4 \n"
48#if ORDER > 16 45 "add r1, r1, r5 \n"
49 ADDBLOCK4 46 "add r2, r2, r6 \n"
50 ADDBLOCK4 47 "add r3, r3, r7 \n"
51 ADDBLOCK4 48 "stmia %[v1]!, {r0-r3} \n"
52 ADDBLOCK4 49 ".endr \n"
53#endif
54#if ORDER > 32 50#if ORDER > 32
55 "subs %[cnt], %[cnt], #1 \n" 51 "subs %[cnt], %[cnt], #1 \n"
56 "bne 1b \n" 52 "bne 1b \n"
@@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
74 int cnt = ORDER>>5; 70 int cnt = ORDER>>5;
75#endif 71#endif
76 72
77#define SUBBLOCK4 \
78 "ldmia %[v1], {r0-r3} \n" \
79 "ldmia %[v2]!, {r4-r7} \n" \
80 "sub r0, r0, r4 \n" \
81 "sub r1, r1, r5 \n" \
82 "sub r2, r2, r6 \n" \
83 "sub r3, r3, r7 \n" \
84 "stmia %[v1]!, {r0-r3} \n"
85
86 asm volatile ( 73 asm volatile (
87 "1: \n" 74 "1: \n"
88 SUBBLOCK4 75 ".rept " ADD_SUB_BLOCKS "\n"
89 SUBBLOCK4 76 "ldmia %[v1], {r0-r3} \n"
90 SUBBLOCK4 77 "ldmia %[v2]!, {r4-r7} \n"
91 SUBBLOCK4 78 "sub r0, r0, r4 \n"
92#if ORDER > 16 79 "sub r1, r1, r5 \n"
93 SUBBLOCK4 80 "sub r2, r2, r6 \n"
94 SUBBLOCK4 81 "sub r3, r3, r7 \n"
95 SUBBLOCK4 82 "stmia %[v1]!, {r0-r3} \n"
96 SUBBLOCK4 83 ".endr \n"
97#endif
98#if ORDER > 32 84#if ORDER > 32
99 "subs %[cnt], %[cnt], #1 \n" 85 "subs %[cnt], %[cnt], #1 \n"
100 "bne 1b \n" 86 "bne 1b \n"
@@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
114 100
115static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) 101static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
116{ 102{
117 int res = 0; 103 int res;
118#if ORDER > 32 104#if ORDER > 32
119 int cnt = ORDER>>5; 105 int cnt = ORDER>>5;
120#endif 106#endif
121 107
122 asm volatile ( 108 asm volatile (
123#if ORDER > 16 109#if ORDER > 16
110#if ORDER > 32
111 "mov %[res], #0 \n"
112#endif
124 "ldmia %[v2]!, {r6-r7} \n" 113 "ldmia %[v2]!, {r6-r7} \n"
125 "1: \n" 114 "1: \n"
126 "ldmia %[v1]!, {r0,r1,r3-r5} \n" 115 "ldmia %[v1]!, {r0,r1,r3-r5} \n"
116#if ORDER > 32
127 "mla %[res], r6, r0, %[res] \n" 117 "mla %[res], r6, r0, %[res] \n"
118#else
119 "mul %[res], r6, r0 \n"
120#endif
128 "mla %[res], r7, r1, %[res] \n" 121 "mla %[res], r7, r1, %[res] \n"
129 "ldmia %[v2]!, {r0-r2,r6-r8} \n" 122 "ldmia %[v2]!, {r0-r2,r6-r8} \n"
130 "mla %[res], r0, r3, %[res] \n" 123 "mla %[res], r0, r3, %[res] \n"
@@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
177#endif 170#endif
178 171
179#else /* ORDER <= 16 */ 172#else /* ORDER <= 16 */
180 173 "ldmia %[v1]!, {r0-r3} \n"
181#define MLABLOCK4 \ 174 "ldmia %[v2]!, {r4-r7} \n"
182 "ldmia %[v1]!, {r0-r3} \n" \ 175 "mul %[res], r4, r0 \n"
183 "ldmia %[v2]!, {r4-r7} \n" \ 176 "mla %[res], r5, r1, %[res] \n"
184 "mla %[res], r4, r0, %[res] \n" \ 177 "mla %[res], r6, r2, %[res] \n"
185 "mla %[res], r5, r1, %[res] \n" \
186 "mla %[res], r6, r2, %[res] \n" \
187 "mla %[res], r7, r3, %[res] \n" 178 "mla %[res], r7, r3, %[res] \n"
188 179
189 MLABLOCK4 180 ".rept 3 \n"
190 MLABLOCK4 181 "ldmia %[v1]!, {r0-r3} \n"
191 MLABLOCK4 182 "ldmia %[v2]!, {r4-r7} \n"
192 MLABLOCK4 183 "mla %[res], r4, r0, %[res] \n"
184 "mla %[res], r5, r1, %[res] \n"
185 "mla %[res], r6, r2, %[res] \n"
186 "mla %[res], r7, r3, %[res] \n"
187 ".endr \n"
193#endif /* ORDER <= 16 */ 188#endif /* ORDER <= 16 */
194 : /* outputs */ 189 : /* outputs */
195#if ORDER > 32 190#if ORDER > 32
@@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
197#endif 192#endif
198 [v1] "+r"(v1), 193 [v1] "+r"(v1),
199 [v2] "+r"(v2), 194 [v2] "+r"(v2),
200 [res]"+r"(res) 195 [res]"=r"(res)
201 : /* inputs */ 196 : /* inputs */
202 : /* clobbers */ 197 : /* clobbers */
203 "r0", "r1", "r2", "r3", 198 "r0", "r1", "r2", "r3",