summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jens Arnold <amiconn@rockbox.org> 2008-10-07 19:40:17 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2008-10-07 19:40:17 +0000
commit: 6b84f600466ab02f5a671d5004cc5b13f18460af (patch)
tree: 2dddfb3838ec570cd415e10f0251c3d3f0d0cf10
parent: 46573019a53dca411f754d40d0f21c1e4eafaedf (diff)
download: rockbox-6b84f600466ab02f5a671d5004cc5b13f18460af.tar.gz
download: rockbox-6b84f600466ab02f5a671d5004cc5b13f18460af.zip
APE: Further ARMv6 filter optimisations: Save 4 'ror's per round by utilising the shift feature of the 'pack halfword' instructions in the unaligned vector addition/subtraction, better pipelining in the aligned scalarproduct(), and a new method to calculate the unaligned scalarproduct().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18736 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/demac/libdemac/vector_math16_armv6.h 147
1 file changed, 66 insertions, 81 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h
index bf50d9cabd..e180429193 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv6.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h
@@ -39,36 +39,33 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
39 39
40 "10: \n" 40 "10: \n"
41 "ldrh r4, [%[v2]], #2 \n" 41 "ldrh r4, [%[v2]], #2 \n"
42 "mov r4, r4, lsl #16 \n"
42 "1: \n" 43 "1: \n"
43 "ldmia %[v2]!, {r5-r8} \n" 44 "ldmia %[v2]!, {r5-r8} \n"
44 "ldmia %[v1], {r0-r3} \n" 45 "ldmia %[v1], {r0-r3} \n"
45 "mov r5, r5, ror #16 \n" 46 "mov r5, r5, ror #16 \n"
46 "pkhbt r4, r4, r5 \n" 47 "pkhtb r4, r5, r4, asr #16 \n"
47 "sadd16 r0, r0, r4 \n" 48 "sadd16 r0, r0, r4 \n"
48 "mov r6, r6, ror #16 \n" 49 "pkhbt r5, r5, r6, lsl #16 \n"
49 "pkhbt r5, r5, r6 \n"
50 "sadd16 r1, r1, r5 \n" 50 "sadd16 r1, r1, r5 \n"
51 "mov r7, r7, ror #16 \n" 51 "mov r7, r7, ror #16 \n"
52 "pkhbt r6, r6, r7 \n" 52 "pkhtb r6, r7, r6, asr #16 \n"
53 "sadd16 r2, r2, r6 \n" 53 "sadd16 r2, r2, r6 \n"
54 "mov r8, r8, ror #16 \n" 54 "pkhbt r7, r7, r8, lsl #16 \n"
55 "pkhbt r7, r7, r8 \n"
56 "sadd16 r3, r3, r7 \n" 55 "sadd16 r3, r3, r7 \n"
57 "stmia %[v1]!, {r0-r3} \n" 56 "stmia %[v1]!, {r0-r3} \n"
58 "mov r4, r8 \n" 57 "mov r4, r8 \n"
59 "ldmia %[v2]!, {r5-r8} \n" 58 "ldmia %[v2]!, {r5-r8} \n"
60 "ldmia %[v1], {r0-r3} \n" 59 "ldmia %[v1], {r0-r3} \n"
61 "mov r5, r5, ror #16 \n" 60 "mov r5, r5, ror #16 \n"
62 "pkhbt r4, r4, r5 \n" 61 "pkhtb r4, r5, r4, asr #16 \n"
63 "sadd16 r0, r0, r4 \n" 62 "sadd16 r0, r0, r4 \n"
64 "mov r6, r6, ror #16 \n" 63 "pkhbt r5, r5, r6, lsl #16 \n"
65 "pkhbt r5, r5, r6 \n"
66 "sadd16 r1, r1, r5 \n" 64 "sadd16 r1, r1, r5 \n"
67 "mov r7, r7, ror #16 \n" 65 "mov r7, r7, ror #16 \n"
68 "pkhbt r6, r6, r7 \n" 66 "pkhtb r6, r7, r6, asr #16 \n"
69 "sadd16 r2, r2, r6 \n" 67 "sadd16 r2, r2, r6 \n"
70 "mov r8, r8, ror #16 \n" 68 "pkhbt r7, r7, r8, lsl #16 \n"
71 "pkhbt r7, r7, r8 \n"
72 "sadd16 r3, r3, r7 \n" 69 "sadd16 r3, r3, r7 \n"
73 "stmia %[v1]!, {r0-r3} \n" 70 "stmia %[v1]!, {r0-r3} \n"
74#if ORDER > 16 71#if ORDER > 16
@@ -128,36 +125,33 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
128 125
129 "10: \n" 126 "10: \n"
130 "ldrh r4, [%[v2]], #2 \n" 127 "ldrh r4, [%[v2]], #2 \n"
128 "mov r4, r4, lsl #16 \n"
131 "1: \n" 129 "1: \n"
132 "ldmia %[v2]!, {r5-r8} \n" 130 "ldmia %[v2]!, {r5-r8} \n"
133 "ldmia %[v1], {r0-r3} \n" 131 "ldmia %[v1], {r0-r3} \n"
134 "mov r5, r5, ror #16 \n" 132 "mov r5, r5, ror #16 \n"
135 "pkhbt r4, r4, r5 \n" 133 "pkhtb r4, r5, r4, asr #16 \n"
136 "ssub16 r0, r0, r4 \n" 134 "ssub16 r0, r0, r4 \n"
137 "mov r6, r6, ror #16 \n" 135 "pkhbt r5, r5, r6, lsl #16 \n"
138 "pkhbt r5, r5, r6 \n"
139 "ssub16 r1, r1, r5 \n" 136 "ssub16 r1, r1, r5 \n"
140 "mov r7, r7, ror #16 \n" 137 "mov r7, r7, ror #16 \n"
141 "pkhbt r6, r6, r7 \n" 138 "pkhtb r6, r7, r6, asr #16 \n"
142 "ssub16 r2, r2, r6 \n" 139 "ssub16 r2, r2, r6 \n"
143 "mov r8, r8, ror #16 \n" 140 "pkhbt r7, r7, r8, lsl #16 \n"
144 "pkhbt r7, r7, r8 \n"
145 "ssub16 r3, r3, r7 \n" 141 "ssub16 r3, r3, r7 \n"
146 "stmia %[v1]!, {r0-r3} \n" 142 "stmia %[v1]!, {r0-r3} \n"
147 "mov r4, r8 \n" 143 "mov r4, r8 \n"
148 "ldmia %[v2]!, {r5-r8} \n" 144 "ldmia %[v2]!, {r5-r8} \n"
149 "ldmia %[v1], {r0-r3} \n" 145 "ldmia %[v1], {r0-r3} \n"
150 "mov r5, r5, ror #16 \n" 146 "mov r5, r5, ror #16 \n"
151 "pkhbt r4, r4, r5 \n" 147 "pkhtb r4, r5, r4, asr #16 \n"
152 "ssub16 r0, r0, r4 \n" 148 "ssub16 r0, r0, r4 \n"
153 "mov r6, r6, ror #16 \n" 149 "pkhbt r5, r5, r6, lsl #16 \n"
154 "pkhbt r5, r5, r6 \n"
155 "ssub16 r1, r1, r5 \n" 150 "ssub16 r1, r1, r5 \n"
156 "mov r7, r7, ror #16 \n" 151 "mov r7, r7, ror #16 \n"
157 "pkhbt r6, r6, r7 \n" 152 "pkhtb r6, r7, r6, asr #16 \n"
158 "ssub16 r2, r2, r6 \n" 153 "ssub16 r2, r2, r6 \n"
159 "mov r8, r8, ror #16 \n" 154 "pkhbt r7, r7, r8, lsl #16 \n"
160 "pkhbt r7, r7, r8 \n"
161 "ssub16 r3, r3, r7 \n" 155 "ssub16 r3, r3, r7 \n"
162 "stmia %[v1]!, {r0-r3} \n" 156 "stmia %[v1]!, {r0-r3} \n"
163#if ORDER > 16 157#if ORDER > 16
@@ -217,80 +211,71 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
217 "beq 20f \n" 211 "beq 20f \n"
218 212
219 "10: \n" 213 "10: \n"
220 "ldrh r2, [%[v2]], #2 \n" 214 "ldrh r7, [%[v2]], #2 \n"
221 "ldr r0, [%[v1]], #4 \n" 215 "ldmia %[v2]!, {r4-r5} \n"
222 "ldr r3, [%[v2]], #4 \n" 216 "ldmia %[v1]!, {r0-r1} \n"
223 "mov r2, r2, lsl #16 \n" 217 "mov r7, r7, lsl #16 \n"
224 "1: \n" 218 "1: \n"
225 "ldr r1, [%[v1]], #4 \n" 219 "pkhbt r8, r4, r7 \n"
226 "smlabt %[res], r0, r2, %[res] \n" 220 "ldmia %[v2]!, {r6-r7} \n"
227 "ldr r4, [%[v2]], #4 \n" 221 "smladx %[res], r0, r8, %[res] \n"
228 "smlatb %[res], r0, r3, %[res] \n" 222 "pkhbt r8, r5, r4 \n"
229 "ldr r0, [%[v1]], #4 \n" 223 "ldmia %[v1]!, {r2-r3} \n"
230 "smlabt %[res], r1, r3, %[res] \n" 224 "smladx %[res], r1, r8, %[res] \n"
231 "ldr r5, [%[v2]], #4 \n" 225 "pkhbt r8, r6, r5 \n"
232 "smlatb %[res], r1, r4, %[res] \n" 226 "ldmia %[v2]!, {r4-r5} \n"
233 "ldr r1, [%[v1]], #4 \n" 227 "smladx %[res], r2, r8, %[res] \n"
234 "smlabt %[res], r0, r4, %[res] \n" 228 "pkhbt r8, r7, r6 \n"
235 "ldr r6, [%[v2]], #4 \n" 229 "ldmia %[v1]!, {r0-r1} \n"
236 "smlatb %[res], r0, r5, %[res] \n" 230 "smladx %[res], r3, r8, %[res] \n"
237 "ldr r0, [%[v1]], #4 \n" 231 "pkhbt r8, r4, r7 \n"
238 "smlabt %[res], r1, r5, %[res] \n" 232 "ldmia %[v2]!, {r6-r7} \n"
239 "ldr r3, [%[v2]], #4 \n" 233 "smladx %[res], r0, r8, %[res] \n"
240 "smlatb %[res], r1, r6, %[res] \n" 234 "pkhbt r8, r5, r4 \n"
241 "mov r2, r6 \n" 235 "ldmia %[v1]!, {r2-r3} \n"
242 "ldr r1, [%[v1]], #4 \n" 236 "smladx %[res], r1, r8, %[res] \n"
243 "smlabt %[res], r0, r2, %[res] \n" 237 "pkhbt r8, r6, r5 \n"
244 "ldr r4, [%[v2]], #4 \n"
245 "smlatb %[res], r0, r3, %[res] \n"
246 "ldr r0, [%[v1]], #4 \n"
247 "smlabt %[res], r1, r3, %[res] \n"
248 "ldr r5, [%[v2]], #4 \n"
249 "smlatb %[res], r1, r4, %[res] \n"
250 "ldr r1, [%[v1]], #4 \n"
251 "smlabt %[res], r0, r4, %[res] \n"
252 "ldr r6, [%[v2]], #4 \n"
253 "smlatb %[res], r0, r5, %[res] \n"
254#if ORDER > 16 238#if ORDER > 16
255 "subs %[cnt], %[cnt], #1 \n" 239 "subs %[cnt], %[cnt], #1 \n"
256 "ldrne r0, [%[v1]], #4 \n" 240 "ldmneia %[v2]!, {r4-r5} \n"
257 "smlabt %[res], r1, r5, %[res] \n" 241 "smladx %[res], r2, r8, %[res] \n"
258 "ldrne r3, [%[v2]], #4 \n" 242 "pkhbt r8, r7, r6 \n"
259 "smlatb %[res], r1, r6, %[res] \n" 243 "ldmneia %[v1]!, {r0-r1} \n"
260 "mov r2, r6 \n" 244 "smladx %[res], r3, r8, %[res] \n"
261 "bne 1b \n" 245 "bne 1b \n"
262#else 246#else
263 "smlabt %[res], r1, r5, %[res] \n" 247 "pkhbt r7, r7, r6 \n"
264 "smlatb %[res], r1, r6, %[res] \n" 248 "smladx %[res], r2, r8, %[res] \n"
249 "smladx %[res], r3, r7, %[res] \n"
265#endif 250#endif
266 "b 99f \n" 251 "b 99f \n"
267 252
268 "20: \n" 253 "20: \n"
269 "ldmia %[v1]!, {r0-r1} \n" 254 "ldmia %[v1]!, {r0-r1} \n"
270 "ldmia %[v2]!, {r4-r5} \n" 255 "ldmia %[v2]!, {r5-r7} \n"
271 "1: \n" 256 "1: \n"
272 "ldmia %[v1]!, {r2-r3} \n" 257 "ldmia %[v1]!, {r2-r3} \n"
273 "smlad %[res], r0, r4, %[res] \n" 258 "smlad %[res], r0, r5, %[res] \n"
274 "ldmia %[v2]!, {r6-r7} \n"
275 "smlad %[res], r1, r5, %[res] \n"
276 "ldmia %[v1]!, {r0-r1} \n"
277 "smlad %[res], r2, r6, %[res] \n"
278 "ldmia %[v2]!, {r4-r5} \n" 259 "ldmia %[v2]!, {r4-r5} \n"
279 "smlad %[res], r3, r7, %[res] \n" 260 "smlad %[res], r1, r6, %[res] \n"
280 "ldmia %[v1]!, {r2-r3} \n" 261 "ldmia %[v1]!, {r0-r1} \n"
281 "smlad %[res], r0, r4, %[res] \n" 262 "smlad %[res], r2, r7, %[res] \n"
282 "ldmia %[v2]!, {r6-r7} \n" 263 "ldmia %[v2]!, {r6-r7} \n"
283 "smlad %[res], r1, r5, %[res] \n" 264 "smlad %[res], r3, r4, %[res] \n"
265 "ldmia %[v1]!, {r2-r3} \n"
266 "smlad %[res], r0, r5, %[res] \n"
267 "ldmia %[v2]!, {r4-r5} \n"
268 "smlad %[res], r1, r6, %[res] \n"
284#if ORDER > 16 269#if ORDER > 16
285 "subs %[cnt], %[cnt], #1 \n" 270 "subs %[cnt], %[cnt], #1 \n"
286 "ldmneia %[v1]!, {r0-r1} \n" 271 "ldmneia %[v1]!, {r0-r1} \n"
287 "smlad %[res], r2, r6, %[res] \n" 272 "smlad %[res], r2, r7, %[res] \n"
288 "ldmneia %[v2]!, {r4-r5} \n" 273 "ldmneia %[v2]!, {r6-r7} \n"
289 "smlad %[res], r3, r7, %[res] \n" 274 "smlad %[res], r3, r4, %[res] \n"
290 "bne 1b \n" 275 "bne 1b \n"
291#else 276#else
292 "smlad %[res], r2, r6, %[res] \n" 277 "smlad %[res], r2, r7, %[res] \n"
293 "smlad %[res], r3, r7, %[res] \n" 278 "smlad %[res], r3, r4, %[res] \n"
294#endif 279#endif
295 280
296 "99: \n" 281 "99: \n"
@@ -303,8 +288,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
303 [res]"+r"(res) 288 [res]"+r"(res)
304 : /* inputs */ 289 : /* inputs */
305 : /* clobbers */ 290 : /* clobbers */
306 "r0", "r1", "r2", "r3", 291 "r0", "r1", "r2", "r3", "r4",
307 "r4", "r5", "r6", "r7" 292 "r5", "r6", "r7", "r8"
308 ); 293 );
309 return res; 294 return res;
310} 295}