diff options
author | Jens Arnold <amiconn@rockbox.org> | 2008-10-03 12:30:18 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2008-10-03 12:30:18 +0000 |
commit | d456460707f79ec48d08baf5d8f28c88c9641e64 (patch) | |
tree | a922a1829e3a90886f4eacc698f92c6b749dcd93 /apps/codecs/demac/libdemac/vector_math16_armv6.h | |
parent | 7fc446263f99aad5f0b2f9f674fde02e6eac4d5c (diff) | |
download | rockbox-d456460707f79ec48d08baf5d8f28c88c9641e64.tar.gz rockbox-d456460707f79ec48d08baf5d8f28c88c9641e64.zip |
Further speedup for ARMv6 by better pipelining in scalarproduct().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18697 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_armv6.h')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv6.h | 80 |
1 file changed, 53 insertions, 27 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h index e963e10ff0..bf50d9cabd 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv6.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h | |||
@@ -217,54 +217,80 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
217 | "beq 20f \n" | 217 | "beq 20f \n" |
218 | 218 | ||
219 | "10: \n" | 219 | "10: \n" |
220 | "ldrh r4, [%[v2]], #2 \n" | 220 | "ldrh r2, [%[v2]], #2 \n" |
221 | "mov r4, r4, lsl #16 \n" | 221 | "ldr r0, [%[v1]], #4 \n" |
222 | "ldr r3, [%[v2]], #4 \n" | ||
223 | "mov r2, r2, lsl #16 \n" | ||
222 | "1: \n" | 224 | "1: \n" |
223 | "ldmia %[v1]!, {r0-r3} \n" | 225 | "ldr r1, [%[v1]], #4 \n" |
224 | "ldmia %[v2]!, {r5-r8} \n" | 226 | "smlabt %[res], r0, r2, %[res] \n" |
227 | "ldr r4, [%[v2]], #4 \n" | ||
228 | "smlatb %[res], r0, r3, %[res] \n" | ||
229 | "ldr r0, [%[v1]], #4 \n" | ||
230 | "smlabt %[res], r1, r3, %[res] \n" | ||
231 | "ldr r5, [%[v2]], #4 \n" | ||
232 | "smlatb %[res], r1, r4, %[res] \n" | ||
233 | "ldr r1, [%[v1]], #4 \n" | ||
225 | "smlabt %[res], r0, r4, %[res] \n" | 234 | "smlabt %[res], r0, r4, %[res] \n" |
235 | "ldr r6, [%[v2]], #4 \n" | ||
226 | "smlatb %[res], r0, r5, %[res] \n" | 236 | "smlatb %[res], r0, r5, %[res] \n" |
237 | "ldr r0, [%[v1]], #4 \n" | ||
227 | "smlabt %[res], r1, r5, %[res] \n" | 238 | "smlabt %[res], r1, r5, %[res] \n" |
239 | "ldr r3, [%[v2]], #4 \n" | ||
228 | "smlatb %[res], r1, r6, %[res] \n" | 240 | "smlatb %[res], r1, r6, %[res] \n" |
229 | "smlabt %[res], r2, r6, %[res] \n" | 241 | "mov r2, r6 \n" |
230 | "smlatb %[res], r2, r7, %[res] \n" | 242 | "ldr r1, [%[v1]], #4 \n" |
231 | "smlabt %[res], r3, r7, %[res] \n" | 243 | "smlabt %[res], r0, r2, %[res] \n" |
232 | "smlatb %[res], r3, r8, %[res] \n" | 244 | "ldr r4, [%[v2]], #4 \n" |
233 | "mov r4, r8 \n" | 245 | "smlatb %[res], r0, r3, %[res] \n" |
234 | "ldmia %[v1]!, {r0-r3} \n" | 246 | "ldr r0, [%[v1]], #4 \n" |
235 | "ldmia %[v2]!, {r5-r8} \n" | 247 | "smlabt %[res], r1, r3, %[res] \n" |
248 | "ldr r5, [%[v2]], #4 \n" | ||
249 | "smlatb %[res], r1, r4, %[res] \n" | ||
250 | "ldr r1, [%[v1]], #4 \n" | ||
236 | "smlabt %[res], r0, r4, %[res] \n" | 251 | "smlabt %[res], r0, r4, %[res] \n" |
252 | "ldr r6, [%[v2]], #4 \n" | ||
237 | "smlatb %[res], r0, r5, %[res] \n" | 253 | "smlatb %[res], r0, r5, %[res] \n" |
238 | "smlabt %[res], r1, r5, %[res] \n" | ||
239 | "smlatb %[res], r1, r6, %[res] \n" | ||
240 | "smlabt %[res], r2, r6, %[res] \n" | ||
241 | "smlatb %[res], r2, r7, %[res] \n" | ||
242 | "smlabt %[res], r3, r7, %[res] \n" | ||
243 | "smlatb %[res], r3, r8, %[res] \n" | ||
244 | #if ORDER > 16 | 254 | #if ORDER > 16 |
245 | "mov r4, r8 \n" | ||
246 | "subs %[cnt], %[cnt], #1 \n" | 255 | "subs %[cnt], %[cnt], #1 \n" |
256 | "ldrne r0, [%[v1]], #4 \n" | ||
257 | "smlabt %[res], r1, r5, %[res] \n" | ||
258 | "ldrne r3, [%[v2]], #4 \n" | ||
259 | "smlatb %[res], r1, r6, %[res] \n" | ||
260 | "mov r2, r6 \n" | ||
247 | "bne 1b \n" | 261 | "bne 1b \n" |
262 | #else | ||
263 | "smlabt %[res], r1, r5, %[res] \n" | ||
264 | "smlatb %[res], r1, r6, %[res] \n" | ||
248 | #endif | 265 | #endif |
249 | "b 99f \n" | 266 | "b 99f \n" |
250 | 267 | ||
251 | "20: \n" | 268 | "20: \n" |
269 | "ldmia %[v1]!, {r0-r1} \n" | ||
270 | "ldmia %[v2]!, {r4-r5} \n" | ||
252 | "1: \n" | 271 | "1: \n" |
253 | "ldmia %[v1]!, {r0-r3} \n" | 272 | "ldmia %[v1]!, {r2-r3} \n" |
254 | "ldmia %[v2]!, {r4-r7} \n" | ||
255 | "smlad %[res], r0, r4, %[res] \n" | 273 | "smlad %[res], r0, r4, %[res] \n" |
274 | "ldmia %[v2]!, {r6-r7} \n" | ||
256 | "smlad %[res], r1, r5, %[res] \n" | 275 | "smlad %[res], r1, r5, %[res] \n" |
276 | "ldmia %[v1]!, {r0-r1} \n" | ||
257 | "smlad %[res], r2, r6, %[res] \n" | 277 | "smlad %[res], r2, r6, %[res] \n" |
278 | "ldmia %[v2]!, {r4-r5} \n" | ||
258 | "smlad %[res], r3, r7, %[res] \n" | 279 | "smlad %[res], r3, r7, %[res] \n" |
259 | "ldmia %[v1]!, {r0-r3} \n" | 280 | "ldmia %[v1]!, {r2-r3} \n" |
260 | "ldmia %[v2]!, {r4-r7} \n" | ||
261 | "smlad %[res], r0, r4, %[res] \n" | 281 | "smlad %[res], r0, r4, %[res] \n" |
282 | "ldmia %[v2]!, {r6-r7} \n" | ||
262 | "smlad %[res], r1, r5, %[res] \n" | 283 | "smlad %[res], r1, r5, %[res] \n" |
263 | "smlad %[res], r2, r6, %[res] \n" | ||
264 | "smlad %[res], r3, r7, %[res] \n" | ||
265 | #if ORDER > 16 | 284 | #if ORDER > 16 |
266 | "subs %[cnt], %[cnt], #1 \n" | 285 | "subs %[cnt], %[cnt], #1 \n" |
267 | "bne 1b \n" | 286 | "ldmneia %[v1]!, {r0-r1} \n" |
287 | "smlad %[res], r2, r6, %[res] \n" | ||
288 | "ldmneia %[v2]!, {r4-r5} \n" | ||
289 | "smlad %[res], r3, r7, %[res] \n" | ||
290 | "bne 1b \n" | ||
291 | #else | ||
292 | "smlad %[res], r2, r6, %[res] \n" | ||
293 | "smlad %[res], r3, r7, %[res] \n" | ||
268 | #endif | 294 | #endif |
269 | 295 | ||
270 | "99: \n" | 296 | "99: \n" |
@@ -277,8 +303,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
277 | [res]"+r"(res) | 303 | [res]"+r"(res) |
278 | : /* inputs */ | 304 | : /* inputs */ |
279 | : /* clobbers */ | 305 | : /* clobbers */ |
280 | "r0", "r1", "r2", "r3", "r4", | 306 | "r0", "r1", "r2", "r3", |
281 | "r5", "r6", "r7", "r8" | 307 | "r4", "r5", "r6", "r7" |
282 | ); | 308 | ); |
283 | return res; | 309 | return res; |
284 | } | 310 | } |