summary refs log tree commit diff
diff options
context:
space:
mode:
author Jens Arnold <amiconn@rockbox.org> 2010-02-08 21:59:24 +0000
committer Jens Arnold <amiconn@rockbox.org> 2010-02-08 21:59:24 +0000
commit 1cc4bd8f86cf013813d52aeb2c8aa37989026dfc (patch)
tree 41fe60252fd1c90971bd6e7ed84b55d557a3b0cf
parent 9955e9a7df32418f20a8de27a3787d35bb9436f4 (diff)
download rockbox-1cc4bd8f86cf013813d52aeb2c8aa37989026dfc.tar.gz
rockbox-1cc4bd8f86cf013813d52aeb2c8aa37989026dfc.zip
APE: Fused vector math for the filters on ARMv6. Speedup is ~2.5% for -c2000, ~7% for -c3000 and higher.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24569 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/demac/libdemac/vector_math16_armv6.h 495
1 file changed, 348 insertions, 147 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h
index 61471103bd..0ace6c5811 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv6.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h
@@ -24,148 +24,350 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
24 24
25*/ 25*/
26 26
27/* This version fetches data as 32 bit words, and *requires* v1 to be 27#define FUSED_VECTOR_MATH
28 * 32 bit aligned, otherwise it will result either in a data abort, or 28
29 * incorrect results (if ARM aligncheck is disabled). */ 29#if ORDER > 16
30static inline void vector_add(int16_t* v1, int16_t* v2) 30#define BLOCK_REPEAT "3"
31#else
32#define BLOCK_REPEAT "1"
33#endif
34
35/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
36 * This version fetches data as 32 bit words, and *requires* v1 to be
37 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
38 * aligned or both unaligned. If either condition isn't met, it will either
39 * result in a data abort or incorrect results. */
40static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
31{ 41{
42 int res;
32#if ORDER > 32 43#if ORDER > 32
33 int cnt = ORDER>>5; 44 int cnt = ORDER>>5;
34#endif 45#endif
35 46
36#if ORDER > 16 47 asm volatile (
37#define ADD_SUB_BLOCKS "4" 48#if ORDER > 32
49 "mov %[res], #0 \n"
50#endif
51 "tst %[f2], #2 \n"
52 "beq 20f \n"
53
54 "10: \n"
55 "ldrh r3, [%[f2]], #2 \n"
56 "ldrh r6, [%[s2]], #2 \n"
57 "ldmia %[f2]!, {r2,r4} \n"
58 "mov r3, r3, lsl #16 \n"
59 "mov r6, r6, lsl #16 \n"
60
61 "1: \n"
62 "ldmia %[s2]!, {r5,r7} \n"
63 "pkhtb r3, r3, r2 \n"
64 "pkhtb r2, r2, r4 \n"
65 "ldrd r0, [%[v1]] \n"
66 "mov r5, r5, ror #16 \n"
67 "pkhtb r6, r5, r6, asr #16 \n"
68 "pkhbt r5, r5, r7, lsl #16 \n"
69#if ORDER > 32
70 "smladx %[res], r0, r3, %[res] \n"
38#else 71#else
39#define ADD_SUB_BLOCKS "2" 72 "smuadx %[res], r0, r3 \n"
40#endif 73#endif
74 "smladx %[res], r1, r2, %[res] \n"
75 "ldmia %[f2]!, {r2,r3} \n"
76 "sadd16 r0, r0, r6 \n"
77 "sadd16 r1, r1, r5 \n"
78 "strd r0, [%[v1]], #8 \n"
79
80 ".rept " BLOCK_REPEAT "\n"
81 "ldmia %[s2]!, {r5,r6} \n"
82 "pkhtb r4, r4, r2 \n"
83 "pkhtb r2, r2, r3 \n"
84 "ldrd r0, [%[v1]] \n"
85 "mov r5, r5, ror #16 \n"
86 "pkhtb r7, r5, r7, asr #16 \n"
87 "pkhbt r5, r5, r6, lsl #16 \n"
88 "smladx %[res], r0, r4, %[res] \n"
89 "smladx %[res], r1, r2, %[res] \n"
90 "ldmia %[f2]!, {r2,r4} \n"
91 "sadd16 r0, r0, r7 \n"
92 "sadd16 r1, r1, r5 \n"
93 "strd r0, [%[v1]], #8 \n"
94 "ldmia %[s2]!, {r5,r7} \n"
95 "pkhtb r3, r3, r2 \n"
96 "pkhtb r2, r2, r4 \n"
97 "ldrd r0, [%[v1]] \n"
98 "mov r5, r5, ror #16 \n"
99 "pkhtb r6, r5, r6, asr #16 \n"
100 "pkhbt r5, r5, r7, lsl #16 \n"
101 "smladx %[res], r0, r3, %[res] \n"
102 "smladx %[res], r1, r2, %[res] \n"
103 "ldmia %[f2]!, {r2,r3} \n"
104 "sadd16 r0, r0, r6 \n"
105 "sadd16 r1, r1, r5 \n"
106 "strd r0, [%[v1]], #8 \n"
107 ".endr \n"
108
109 "ldmia %[s2]!, {r5,r6} \n"
110 "pkhtb r4, r4, r2 \n"
111 "pkhtb r2, r2, r3 \n"
112 "ldrd r0, [%[v1]] \n"
113 "mov r5, r5, ror #16 \n"
114 "pkhtb r7, r5, r7, asr #16 \n"
115 "pkhbt r5, r5, r6, lsl #16 \n"
116 "smladx %[res], r0, r4, %[res] \n"
117 "smladx %[res], r1, r2, %[res] \n"
118#if ORDER > 32
119 "subs %[cnt], %[cnt], #1 \n"
120 "ldmneia %[f2]!, {r2,r4} \n"
121 "sadd16 r0, r0, r7 \n"
122 "sadd16 r1, r1, r5 \n"
123 "strd r0, [%[v1]], #8 \n"
124 "bne 1b \n"
125#else
126 "sadd16 r0, r0, r7 \n"
127 "sadd16 r1, r1, r5 \n"
128 "strd r0, [%[v1]], #8 \n"
129#endif
130
131 "b 99f \n"
132
133 "20: \n"
134 "ldrd r4, [%[f2]], #8 \n"
135 "ldrd r0, [%[v1]] \n"
41 136
42 asm volatile (
43 "tst %[v2], #2 \n"
44 "beq 20f \n"
45
46 "10: \n"
47 "bic %[v2], %[v2], #2 \n"
48 "ldmia %[v2]!, {r4-r5} \n"
49 "1: \n"
50 ".rept " ADD_SUB_BLOCKS "\n"
51 "ldmia %[v2]!, {r6-r7} \n"
52 "ldmia %[v1], {r0-r3} \n"
53 "mov r5, r5, ror #16 \n"
54 "pkhtb r4, r5, r4, asr #16 \n"
55 "sadd16 r0, r0, r4 \n"
56 "pkhbt r5, r5, r6, lsl #16 \n"
57 "sadd16 r1, r1, r5 \n"
58 "ldmia %[v2]!, {r4-r5} \n"
59 "mov r7, r7, ror #16 \n"
60 "pkhtb r6, r7, r6, asr #16 \n"
61 "sadd16 r2, r2, r6 \n"
62 "pkhbt r7, r7, r4, lsl #16 \n"
63 "sadd16 r3, r3, r7 \n"
64 "stmia %[v1]!, {r0-r3} \n"
65 ".endr \n"
66#if ORDER > 32 137#if ORDER > 32
67 "subs %[cnt], %[cnt], #1 \n" 138 "1: \n"
68 "bne 1b \n" 139 "smlad %[res], r0, r4, %[res] \n"
140#else
141 "smuad %[res], r0, r4 \n"
69#endif 142#endif
70 "b 99f \n" 143 "ldrd r6, [%[s2]], #8 \n"
71 144 "smlad %[res], r1, r5, %[res] \n"
72 "20: \n" 145 "ldrd r4, [%[f2]], #8 \n"
73 "1: \n" 146 "ldrd r2, [%[v1], #8] \n"
74 ".rept " ADD_SUB_BLOCKS "\n" 147 "sadd16 r0, r0, r6 \n"
75 "ldmia %[v2]!, {r4-r7} \n" 148 "sadd16 r1, r1, r7 \n"
76 "ldmia %[v1], {r0-r3} \n" 149 "strd r0, [%[v1]], #8 \n"
77 "sadd16 r0, r0, r4 \n" 150
78 "sadd16 r1, r1, r5 \n" 151 ".rept " BLOCK_REPEAT "\n"
79 "sadd16 r2, r2, r6 \n" 152 "smlad %[res], r2, r4, %[res] \n"
80 "sadd16 r3, r3, r7 \n" 153 "ldrd r6, [%[s2]], #8 \n"
81 "stmia %[v1]!, {r0-r3} \n" 154 "smlad %[res], r3, r5, %[res] \n"
82 ".endr \n" 155 "ldrd r4, [%[f2]], #8 \n"
156 "ldrd r0, [%[v1], #8] \n"
157 "sadd16 r2, r2, r6 \n"
158 "sadd16 r3, r3, r7 \n"
159 "strd r2, [%[v1]], #8 \n"
160 "smlad %[res], r0, r4, %[res] \n"
161 "ldrd r6, [%[s2]], #8 \n"
162 "smlad %[res], r1, r5, %[res] \n"
163 "ldrd r4, [%[f2]], #8 \n"
164 "ldrd r2, [%[v1], #8] \n"
165 "sadd16 r0, r0, r6 \n"
166 "sadd16 r1, r1, r7 \n"
167 "strd r0, [%[v1]], #8 \n"
168 ".endr \n"
169
170 "smlad %[res], r2, r4, %[res] \n"
171 "ldrd r6, [%[s2]], #8 \n"
172 "smlad %[res], r3, r5, %[res] \n"
83#if ORDER > 32 173#if ORDER > 32
84 "subs %[cnt], %[cnt], #1 \n" 174 "subs %[cnt], %[cnt], #1 \n"
85 "bne 1b \n" 175 "ldrned r4, [%[f2]], #8 \n"
176 "ldrned r0, [%[v1], #8] \n"
177 "sadd16 r2, r2, r6 \n"
178 "sadd16 r3, r3, r7 \n"
179 "strd r2, [%[v1]], #8 \n"
180 "bne 1b \n"
181#else
182 "sadd16 r2, r2, r6 \n"
183 "sadd16 r3, r3, r7 \n"
184 "strd r2, [%[v1]], #8 \n"
86#endif 185#endif
87 186
88 "99: \n" 187 "99: \n"
89 : /* outputs */ 188 : /* outputs */
90#if ORDER > 32 189#if ORDER > 32
91 [cnt]"+r"(cnt), 190 [cnt]"+r"(cnt),
92#endif 191#endif
93 [v1] "+r"(v1), 192 [v1] "+r"(v1),
94 [v2] "+r"(v2) 193 [f2] "+r"(f2),
194 [s2] "+r"(s2),
195 [res]"=r"(res)
95 : /* inputs */ 196 : /* inputs */
96 : /* clobbers */ 197 : /* clobbers */
97 "r0", "r1", "r2", "r3", "r4", 198 "r0", "r1", "r2", "r3", "r4",
98 "r5", "r6", "r7", "memory" 199 "r5", "r6", "r7", "memory"
99 ); 200 );
201 return res;
100} 202}
101 203
102/* This version fetches data as 32 bit words, and *requires* v1 to be 204/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
103 * 32 bit aligned, otherwise it will result either in a data abort, or 205 * This version fetches data as 32 bit words, and *requires* v1 to be
104 * incorrect results (if ARM aligncheck is disabled). */ 206 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
105static inline void vector_sub(int16_t* v1, int16_t* v2) 207 * aligned or both unaligned. If either condition isn't met, it will either
208 * result in a data abort or incorrect results. */
209static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
106{ 210{
211 int res;
107#if ORDER > 32 212#if ORDER > 32
108 int cnt = ORDER>>5; 213 int cnt = ORDER>>5;
109#endif 214#endif
110 215
111 asm volatile ( 216 asm volatile (
112 "tst %[v2], #2 \n"
113 "beq 20f \n"
114
115 "10: \n"
116 "bic %[v2], %[v2], #2 \n"
117 "ldmia %[v2]!, {r4-r5} \n"
118 "1: \n"
119 ".rept " ADD_SUB_BLOCKS "\n"
120 "ldmia %[v2]!, {r6-r7} \n"
121 "ldmia %[v1], {r0-r3} \n"
122 "mov r5, r5, ror #16 \n"
123 "pkhtb r4, r5, r4, asr #16 \n"
124 "ssub16 r0, r0, r4 \n"
125 "pkhbt r5, r5, r6, lsl #16 \n"
126 "ssub16 r1, r1, r5 \n"
127 "ldmia %[v2]!, {r4-r5} \n"
128 "mov r7, r7, ror #16 \n"
129 "pkhtb r6, r7, r6, asr #16 \n"
130 "ssub16 r2, r2, r6 \n"
131 "pkhbt r7, r7, r4, lsl #16 \n"
132 "ssub16 r3, r3, r7 \n"
133 "stmia %[v1]!, {r0-r3} \n"
134 ".endr \n"
135#if ORDER > 32 217#if ORDER > 32
136 "subs %[cnt], %[cnt], #1 \n" 218 "mov %[res], #0 \n"
137 "bne 1b \n" 219#endif
220 "tst %[f2], #2 \n"
221 "beq 20f \n"
222
223 "10: \n"
224 "ldrh r3, [%[f2]], #2 \n"
225 "ldrh r6, [%[s2]], #2 \n"
226 "ldmia %[f2]!, {r2,r4} \n"
227 "mov r3, r3, lsl #16 \n"
228 "mov r6, r6, lsl #16 \n"
229
230 "1: \n"
231 "ldmia %[s2]!, {r5,r7} \n"
232 "pkhtb r3, r3, r2 \n"
233 "pkhtb r2, r2, r4 \n"
234 "ldrd r0, [%[v1]] \n"
235 "mov r5, r5, ror #16 \n"
236 "pkhtb r6, r5, r6, asr #16 \n"
237 "pkhbt r5, r5, r7, lsl #16 \n"
238#if ORDER > 32
239 "smladx %[res], r0, r3, %[res] \n"
240#else
241 "smuadx %[res], r0, r3 \n"
242#endif
243 "smladx %[res], r1, r2, %[res] \n"
244 "ldmia %[f2]!, {r2,r3} \n"
245 "ssub16 r0, r0, r6 \n"
246 "ssub16 r1, r1, r5 \n"
247 "strd r0, [%[v1]], #8 \n"
248
249 ".rept " BLOCK_REPEAT "\n"
250 "ldmia %[s2]!, {r5,r6} \n"
251 "pkhtb r4, r4, r2 \n"
252 "pkhtb r2, r2, r3 \n"
253 "ldrd r0, [%[v1]] \n"
254 "mov r5, r5, ror #16 \n"
255 "pkhtb r7, r5, r7, asr #16 \n"
256 "pkhbt r5, r5, r6, lsl #16 \n"
257 "smladx %[res], r0, r4, %[res] \n"
258 "smladx %[res], r1, r2, %[res] \n"
259 "ldmia %[f2]!, {r2,r4} \n"
260 "ssub16 r0, r0, r7 \n"
261 "ssub16 r1, r1, r5 \n"
262 "strd r0, [%[v1]], #8 \n"
263 "ldmia %[s2]!, {r5,r7} \n"
264 "pkhtb r3, r3, r2 \n"
265 "pkhtb r2, r2, r4 \n"
266 "ldrd r0, [%[v1]] \n"
267 "mov r5, r5, ror #16 \n"
268 "pkhtb r6, r5, r6, asr #16 \n"
269 "pkhbt r5, r5, r7, lsl #16 \n"
270 "smladx %[res], r0, r3, %[res] \n"
271 "smladx %[res], r1, r2, %[res] \n"
272 "ldmia %[f2]!, {r2,r3} \n"
273 "ssub16 r0, r0, r6 \n"
274 "ssub16 r1, r1, r5 \n"
275 "strd r0, [%[v1]], #8 \n"
276 ".endr \n"
277
278 "ldmia %[s2]!, {r5,r6} \n"
279 "pkhtb r4, r4, r2 \n"
280 "pkhtb r2, r2, r3 \n"
281 "ldrd r0, [%[v1]] \n"
282 "mov r5, r5, ror #16 \n"
283 "pkhtb r7, r5, r7, asr #16 \n"
284 "pkhbt r5, r5, r6, lsl #16 \n"
285 "smladx %[res], r0, r4, %[res] \n"
286 "smladx %[res], r1, r2, %[res] \n"
287#if ORDER > 32
288 "subs %[cnt], %[cnt], #1 \n"
289 "ldmneia %[f2]!, {r2,r4} \n"
290 "ssub16 r0, r0, r7 \n"
291 "ssub16 r1, r1, r5 \n"
292 "strd r0, [%[v1]], #8 \n"
293 "bne 1b \n"
294#else
295 "ssub16 r0, r0, r7 \n"
296 "ssub16 r1, r1, r5 \n"
297 "strd r0, [%[v1]], #8 \n"
298#endif
299
300 "b 99f \n"
301
302 "20: \n"
303 "ldrd r4, [%[f2]], #8 \n"
304 "ldrd r0, [%[v1]] \n"
305
306#if ORDER > 32
307 "1: \n"
308 "smlad %[res], r0, r4, %[res] \n"
309#else
310 "smuad %[res], r0, r4 \n"
138#endif 311#endif
139 "b 99f \n" 312 "ldrd r6, [%[s2]], #8 \n"
140 313 "smlad %[res], r1, r5, %[res] \n"
141 "20: \n" 314 "ldrd r4, [%[f2]], #8 \n"
142 "1: \n" 315 "ldrd r2, [%[v1], #8] \n"
143 ".rept " ADD_SUB_BLOCKS "\n" 316 "ssub16 r0, r0, r6 \n"
144 "ldmia %[v2]!, {r4-r7} \n" 317 "ssub16 r1, r1, r7 \n"
145 "ldmia %[v1], {r0-r3} \n" 318 "strd r0, [%[v1]], #8 \n"
146 "ssub16 r0, r0, r4 \n" 319
147 "ssub16 r1, r1, r5 \n" 320 ".rept " BLOCK_REPEAT "\n"
148 "ssub16 r2, r2, r6 \n" 321 "smlad %[res], r2, r4, %[res] \n"
149 "ssub16 r3, r3, r7 \n" 322 "ldrd r6, [%[s2]], #8 \n"
150 "stmia %[v1]!, {r0-r3} \n" 323 "smlad %[res], r3, r5, %[res] \n"
151 ".endr \n" 324 "ldrd r4, [%[f2]], #8 \n"
325 "ldrd r0, [%[v1], #8] \n"
326 "ssub16 r2, r2, r6 \n"
327 "ssub16 r3, r3, r7 \n"
328 "strd r2, [%[v1]], #8 \n"
329 "smlad %[res], r0, r4, %[res] \n"
330 "ldrd r6, [%[s2]], #8 \n"
331 "smlad %[res], r1, r5, %[res] \n"
332 "ldrd r4, [%[f2]], #8 \n"
333 "ldrd r2, [%[v1], #8] \n"
334 "ssub16 r0, r0, r6 \n"
335 "ssub16 r1, r1, r7 \n"
336 "strd r0, [%[v1]], #8 \n"
337 ".endr \n"
338
339 "smlad %[res], r2, r4, %[res] \n"
340 "ldrd r6, [%[s2]], #8 \n"
341 "smlad %[res], r3, r5, %[res] \n"
152#if ORDER > 32 342#if ORDER > 32
153 "subs %[cnt], %[cnt], #1 \n" 343 "subs %[cnt], %[cnt], #1 \n"
154 "bne 1b \n" 344 "ldrned r4, [%[f2]], #8 \n"
345 "ldrned r0, [%[v1], #8] \n"
346 "ssub16 r2, r2, r6 \n"
347 "ssub16 r3, r3, r7 \n"
348 "strd r2, [%[v1]], #8 \n"
349 "bne 1b \n"
350#else
351 "ssub16 r2, r2, r6 \n"
352 "ssub16 r3, r3, r7 \n"
353 "strd r2, [%[v1]], #8 \n"
155#endif 354#endif
156 355
157 "99: \n" 356 "99: \n"
158 : /* outputs */ 357 : /* outputs */
159#if ORDER > 32 358#if ORDER > 32
160 [cnt]"+r"(cnt), 359 [cnt]"+r"(cnt),
161#endif 360#endif
162 [v1] "+r"(v1), 361 [v1] "+r"(v1),
163 [v2] "+r"(v2) 362 [f2] "+r"(f2),
363 [s2] "+r"(s2),
364 [res]"=r"(res)
164 : /* inputs */ 365 : /* inputs */
165 : /* clobbers */ 366 : /* clobbers */
166 "r0", "r1", "r2", "r3", "r4", 367 "r0", "r1", "r2", "r3", "r4",
167 "r5", "r6", "r7", "memory" 368 "r5", "r6", "r7", "memory"
168 ); 369 );
370 return res;
169} 371}
170 372
171/* This version fetches data as 32 bit words, and *requires* v1 to be 373/* This version fetches data as 32 bit words, and *requires* v1 to be
@@ -178,12 +380,6 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
178 int cnt = ORDER>>5; 380 int cnt = ORDER>>5;
179#endif 381#endif
180 382
181#if ORDER > 16
182#define MLA_BLOCKS "3"
183#else
184#define MLA_BLOCKS "1"
185#endif
186
187 asm volatile ( 383 asm volatile (
188#if ORDER > 32 384#if ORDER > 32
189 "mov %[res], #0 \n" 385 "mov %[res], #0 \n"
@@ -194,80 +390,85 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
194 "10: \n" 390 "10: \n"
195 "bic %[v2], %[v2], #2 \n" 391 "bic %[v2], %[v2], #2 \n"
196 "ldmia %[v2]!, {r5-r7} \n" 392 "ldmia %[v2]!, {r5-r7} \n"
197 "ldmia %[v1]!, {r0-r1} \n" 393 "ldrd r0, [%[v1]], #8 \n"
394
198 "1: \n" 395 "1: \n"
199 "pkhbt r8, r6, r5 \n" 396 "pkhtb r3, r5, r6 \n"
200 "ldmia %[v2]!, {r4-r5} \n" 397 "ldrd r4, [%[v2]], #8 \n"
201#if ORDER > 32 398#if ORDER > 32
202 "smladx %[res], r0, r8, %[res] \n" 399 "smladx %[res], r0, r3, %[res] \n"
203#else 400#else
204 "smuadx %[res], r0, r8 \n" 401 "smuadx %[res], r0, r3 \n"
205#endif 402#endif
206 ".rept " MLA_BLOCKS "\n" 403 ".rept " BLOCK_REPEAT "\n"
207 "pkhbt r8, r7, r6 \n" 404 "pkhtb r0, r6, r7 \n"
208 "ldmia %[v1]!, {r2-r3} \n" 405 "ldrd r2, [%[v1]], #8 \n"
209 "smladx %[res], r1, r8, %[res] \n" 406 "smladx %[res], r1, r0, %[res] \n"
210 "pkhbt r8, r4, r7 \n" 407 "pkhtb r1, r7, r4 \n"
211 "ldmia %[v2]!, {r6-r7} \n" 408 "ldrd r6, [%[v2]], #8 \n"
212 "smladx %[res], r2, r8, %[res] \n" 409 "smladx %[res], r2, r1, %[res] \n"
213 "pkhbt r8, r5, r4 \n" 410 "pkhtb r2, r4, r5 \n"
214 "ldmia %[v1]!, {r0-r1} \n" 411 "ldrd r0, [%[v1]], #8 \n"
215 "smladx %[res], r3, r8, %[res] \n" 412 "smladx %[res], r3, r2, %[res] \n"
216 "pkhbt r8, r6, r5 \n" 413 "pkhtb r3, r5, r6 \n"
217 "ldmia %[v2]!, {r4-r5} \n" 414 "ldrd r4, [%[v2]], #8 \n"
218 "smladx %[res], r0, r8, %[res] \n" 415 "smladx %[res], r0, r3, %[res] \n"
219 ".endr \n" 416 ".endr \n"
220 417
221 "pkhbt r8, r7, r6 \n" 418 "pkhtb r0, r6, r7 \n"
222 "ldmia %[v1]!, {r2-r3} \n" 419 "ldrd r2, [%[v1]], #8 \n"
223 "smladx %[res], r1, r8, %[res] \n" 420 "smladx %[res], r1, r0, %[res] \n"
224 "pkhbt r8, r4, r7 \n" 421 "pkhtb r1, r7, r4 \n"
225#if ORDER > 32 422#if ORDER > 32
226 "subs %[cnt], %[cnt], #1 \n" 423 "subs %[cnt], %[cnt], #1 \n"
227 "ldmneia %[v2]!, {r6-r7} \n" 424 "ldrned r6, [%[v2]], #8 \n"
228 "smladx %[res], r2, r8, %[res] \n" 425 "smladx %[res], r2, r1, %[res] \n"
229 "pkhbt r8, r5, r4 \n" 426 "pkhtb r2, r4, r5 \n"
230 "ldmneia %[v1]!, {r0-r1} \n" 427 "ldrned r0, [%[v1]], #8 \n"
231 "smladx %[res], r3, r8, %[res] \n" 428 "smladx %[res], r3, r2, %[res] \n"
232 "bne 1b \n" 429 "bne 1b \n"
233#else 430#else
234 "pkhbt r5, r5, r4 \n" 431 "pkhtb r4, r4, r5 \n"
235 "smladx %[res], r2, r8, %[res] \n" 432 "smladx %[res], r2, r1, %[res] \n"
236 "smladx %[res], r3, r5, %[res] \n" 433 "smladx %[res], r3, r4, %[res] \n"
237#endif 434#endif
238 "b 99f \n"
239 435
436 "b 99f \n"
437
240 "20: \n" 438 "20: \n"
241 "ldmia %[v1]!, {r0-r1} \n" 439 "ldrd r0, [%[v1]], #8 \n"
242 "ldmia %[v2]!, {r5-r7} \n" 440 "ldmia %[v2]!, {r5-r7} \n"
441
243 "1: \n" 442 "1: \n"
244 "ldmia %[v1]!, {r2-r3} \n" 443 "ldrd r2, [%[v1]], #8 \n"
245#if ORDER > 32 444#if ORDER > 32
246 "smlad %[res], r0, r5, %[res] \n" 445 "smlad %[res], r0, r5, %[res] \n"
247#else 446#else
248 "smuad %[res], r0, r5 \n" 447 "smuad %[res], r0, r5 \n"
249#endif 448#endif
250 ".rept " MLA_BLOCKS "\n" 449 ".rept " BLOCK_REPEAT "\n"
251 "ldmia %[v2]!, {r4-r5} \n" 450 "ldrd r4, [%[v2]], #8 \n"
252 "smlad %[res], r1, r6, %[res] \n" 451 "smlad %[res], r1, r6, %[res] \n"
253 "ldmia %[v1]!, {r0-r1} \n" 452 "ldrd r0, [%[v1]], #8 \n"
254 "smlad %[res], r2, r7, %[res] \n" 453 "smlad %[res], r2, r7, %[res] \n"
255 "ldmia %[v2]!, {r6-r7} \n" 454 "ldrd r6, [%[v2]], #8 \n"
256 "smlad %[res], r3, r4, %[res] \n" 455 "smlad %[res], r3, r4, %[res] \n"
257 "ldmia %[v1]!, {r2-r3} \n" 456 "ldrd r2, [%[v1]], #8 \n"
258 "smlad %[res], r0, r5, %[res] \n" 457 "smlad %[res], r0, r5, %[res] \n"
259 ".endr \n" 458 ".endr \n"
260 459
261 "ldmia %[v2]!, {r4-r5} \n"
262 "smlad %[res], r1, r6, %[res] \n"
263#if ORDER > 32 460#if ORDER > 32
461 "ldrd r4, [%[v2]], #8 \n"
462 "smlad %[res], r1, r6, %[res] \n"
264 "subs %[cnt], %[cnt], #1 \n" 463 "subs %[cnt], %[cnt], #1 \n"
265 "ldmneia %[v1]!, {r0-r1} \n" 464 "ldrned r0, [%[v1]], #8 \n"
266 "smlad %[res], r2, r7, %[res] \n" 465 "smlad %[res], r2, r7, %[res] \n"
267 "ldmneia %[v2]!, {r6-r7} \n" 466 "ldrned r6, [%[v2]], #8 \n"
268 "smlad %[res], r3, r4, %[res] \n" 467 "smlad %[res], r3, r4, %[res] \n"
269 "bne 1b \n" 468 "bne 1b \n"
270#else 469#else
470 "ldr r4, [%[v2]], #4 \n"
471 "smlad %[res], r1, r6, %[res] \n"
271 "smlad %[res], r2, r7, %[res] \n" 472 "smlad %[res], r2, r7, %[res] \n"
272 "smlad %[res], r3, r4, %[res] \n" 473 "smlad %[res], r3, r4, %[res] \n"
273#endif 474#endif
@@ -282,8 +483,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
282 [res]"=r"(res) 483 [res]"=r"(res)
283 : /* inputs */ 484 : /* inputs */
284 : /* clobbers */ 485 : /* clobbers */
285 "r0", "r1", "r2", "r3", "r4", 486 "r0", "r1", "r2", "r3",
286 "r5", "r6", "r7", "r8" 487 "r4", "r5", "r6", "r7"
287 ); 488 );
288 return res; 489 return res;
289} 490}