diff options
author | Sean Bartell <wingedtachikoma@gmail.com> | 2011-06-25 21:32:25 -0400 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2012-04-25 22:13:20 +0200 |
commit | f40bfc9267b13b54e6379dfe7539447662879d24 (patch) | |
tree | 9b20069d5e62809ff434061ad730096836f916f2 /lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h | |
parent | a0009907de7a0107d49040d8a180f140e2eff299 (diff) | |
download | rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.tar.gz rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.zip |
Add codecs to librbcodec.
Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97
Reviewed-on: http://gerrit.rockbox.org/137
Reviewed-by: Nils Wallménius <nils@rockbox.org>
Tested-by: Nils Wallménius <nils@rockbox.org>
Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h')
-rw-r--r-- | lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h | 490 |
1 files changed, 490 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h new file mode 100644 index 0000000000..8d27331b62 --- /dev/null +++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h | |||
@@ -0,0 +1,490 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | ARMv6 vector math copyright (C) 2008 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
/* Marker macro without a value. Presumably tested by the including code to
 * pick the fused vector_sp_add()/vector_sp_sub() routines defined below --
 * TODO confirm against the users of this header. */
27 | #define FUSED_VECTOR_MATH | ||
28 | |||
/* REPEAT_BLOCK(x) pastes its argument three times when ORDER > 16 and once
 * otherwise. The functions below use it to unroll their inner assembly
 * sequences. ORDER itself is not defined in this header. */
29 | #if ORDER > 16 | ||
30 | #define REPEAT_BLOCK(x) x x x | ||
31 | #else | ||
32 | #define REPEAT_BLOCK(x) x | ||
33 | #endif | ||
34 | |||
35 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) | ||
36 | * This version fetches data as 32 bit words, and *requires* v1 to be | ||
37 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit | ||
38 | * aligned or both unaligned. If either condition isn't met, it will either | ||
39 | * result in a data abort or incorrect results. | ||
 *
 * v1: samples multiplied with f2; overwritten in place with v1 + s2
 * f2: second factor of the scalar product (only loaded, never stored)
 * s2: vector added to v1 with sadd16, i.e. per 16 bit lane (only loaded)
 * Returns the 32 bit sum of products, formed from the v1 values as they
 * were before s2 was added.
 * Counting the unrolled blocks, one call covers 16 samples when ORDER <= 16,
 * 32 samples when ORDER is 17..32, and (ORDER >> 5) * 32 samples otherwise. */ | ||
40 | static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) | ||
41 | { | ||
42 | int res; | ||
/* Number of passes through the asm loop; each pass handles 32 samples. */
43 | #if ORDER > 32 | ||
44 | int cnt = ORDER>>5; | ||
45 | #endif | ||
46 | |||
47 | asm volatile ( | ||
/* res accumulates across loop passes, so the looping variant clears it. */
48 | #if ORDER > 32 | ||
49 | "mov %[res], #0 \n" | ||
50 | #endif | ||
/* Bit 1 of f2 clear: f2 is 32 bit aligned, so take the path at label 20. */
51 | "tst %[f2], #2 \n" | ||
52 | "beq 20f \n" | ||
53 | |||
/* f2/s2 are not word aligned: load the leading halfword of each on its own
 * (advancing the pointers by 2 so the following ldmia loads are word based)
 * and keep it in the upper half of r3 / r6. pkhtb/pkhbt then re-pack
 * halfwords from neighbouring registers, and the exchanging multiply
 * (smladx/smuadx) is used on this path. */
54 | "10: \n" | ||
55 | "ldrh r3, [%[f2]], #2 \n" | ||
56 | "ldrh r6, [%[s2]], #2 \n" | ||
57 | "ldmia %[f2]!, {r2,r4} \n" | ||
58 | "mov r3, r3, lsl #16 \n" | ||
59 | "mov r6, r6, lsl #16 \n" | ||
60 | |||
61 | "1: \n" | ||
62 | "ldmia %[s2]!, {r5,r7} \n" | ||
63 | "pkhtb r3, r3, r2 \n" | ||
64 | "pkhtb r2, r2, r4 \n" | ||
65 | "ldrd r0, [%[v1]] \n" | ||
66 | "mov r5, r5, ror #16 \n" | ||
67 | "pkhtb r6, r5, r6, asr #16 \n" | ||
68 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
/* Without the loop, the non-accumulating smuadx initialises res, which is
 * why no explicit clear is emitted in that configuration. */
69 | #if ORDER > 32 | ||
70 | "smladx %[res], r0, r3, %[res] \n" | ||
71 | #else | ||
72 | "smuadx %[res], r0, r3 \n" | ||
73 | #endif | ||
74 | "smladx %[res], r1, r2, %[res] \n" | ||
75 | "ldmia %[f2]!, {r2,r3} \n" | ||
76 | "sadd16 r0, r0, r6 \n" | ||
77 | "sadd16 r1, r1, r5 \n" | ||
78 | "strd r0, [%[v1]], #8 \n" | ||
79 | |||
80 | REPEAT_BLOCK( | ||
81 | "ldmia %[s2]!, {r5,r6} \n" | ||
82 | "pkhtb r4, r4, r2 \n" | ||
83 | "pkhtb r2, r2, r3 \n" | ||
84 | "ldrd r0, [%[v1]] \n" | ||
85 | "mov r5, r5, ror #16 \n" | ||
86 | "pkhtb r7, r5, r7, asr #16 \n" | ||
87 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
88 | "smladx %[res], r0, r4, %[res] \n" | ||
89 | "smladx %[res], r1, r2, %[res] \n" | ||
90 | "ldmia %[f2]!, {r2,r4} \n" | ||
91 | "sadd16 r0, r0, r7 \n" | ||
92 | "sadd16 r1, r1, r5 \n" | ||
93 | "strd r0, [%[v1]], #8 \n" | ||
94 | "ldmia %[s2]!, {r5,r7} \n" | ||
95 | "pkhtb r3, r3, r2 \n" | ||
96 | "pkhtb r2, r2, r4 \n" | ||
97 | "ldrd r0, [%[v1]] \n" | ||
98 | "mov r5, r5, ror #16 \n" | ||
99 | "pkhtb r6, r5, r6, asr #16 \n" | ||
100 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
101 | "smladx %[res], r0, r3, %[res] \n" | ||
102 | "smladx %[res], r1, r2, %[res] \n" | ||
103 | "ldmia %[f2]!, {r2,r3} \n" | ||
104 | "sadd16 r0, r0, r6 \n" | ||
105 | "sadd16 r1, r1, r5 \n" | ||
106 | "strd r0, [%[v1]], #8 \n" | ||
107 | ) | ||
108 | |||
109 | "ldmia %[s2]!, {r5,r6} \n" | ||
110 | "pkhtb r4, r4, r2 \n" | ||
111 | "pkhtb r2, r2, r3 \n" | ||
112 | "ldrd r0, [%[v1]] \n" | ||
113 | "mov r5, r5, ror #16 \n" | ||
114 | "pkhtb r7, r5, r7, asr #16 \n" | ||
115 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
116 | "smladx %[res], r0, r4, %[res] \n" | ||
117 | "smladx %[res], r1, r2, %[res] \n" | ||
/* Loop tail: the next two f2 words are fetched only when another pass
 * follows (ldmneia is conditional on the result of subs). */
118 | #if ORDER > 32 | ||
119 | "subs %[cnt], %[cnt], #1 \n" | ||
120 | "ldmneia %[f2]!, {r2,r4} \n" | ||
121 | "sadd16 r0, r0, r7 \n" | ||
122 | "sadd16 r1, r1, r5 \n" | ||
123 | "strd r0, [%[v1]], #8 \n" | ||
124 | "bne 1b \n" | ||
125 | #else | ||
126 | "sadd16 r0, r0, r7 \n" | ||
127 | "sadd16 r1, r1, r5 \n" | ||
128 | "strd r0, [%[v1]], #8 \n" | ||
129 | #endif | ||
130 | |||
131 | "b 99f \n" | ||
132 | |||
/* f2/s2 are word aligned: plain doubleword loads, non-exchanging multiplies.
 * The v1 pair for the next step is loaded from [v1, #8] before the current
 * pair is stored back with post-increment. */
133 | "20: \n" | ||
134 | "ldrd r4, [%[f2]], #8 \n" | ||
135 | "ldrd r0, [%[v1]] \n" | ||
136 | |||
137 | #if ORDER > 32 | ||
138 | "1: \n" | ||
139 | "smlad %[res], r0, r4, %[res] \n" | ||
140 | #else | ||
141 | "smuad %[res], r0, r4 \n" | ||
142 | #endif | ||
143 | "ldrd r6, [%[s2]], #8 \n" | ||
144 | "smlad %[res], r1, r5, %[res] \n" | ||
145 | "ldrd r4, [%[f2]], #8 \n" | ||
146 | "ldrd r2, [%[v1], #8] \n" | ||
147 | "sadd16 r0, r0, r6 \n" | ||
148 | "sadd16 r1, r1, r7 \n" | ||
149 | "strd r0, [%[v1]], #8 \n" | ||
150 | |||
151 | REPEAT_BLOCK( | ||
152 | "smlad %[res], r2, r4, %[res] \n" | ||
153 | "ldrd r6, [%[s2]], #8 \n" | ||
154 | "smlad %[res], r3, r5, %[res] \n" | ||
155 | "ldrd r4, [%[f2]], #8 \n" | ||
156 | "ldrd r0, [%[v1], #8] \n" | ||
157 | "sadd16 r2, r2, r6 \n" | ||
158 | "sadd16 r3, r3, r7 \n" | ||
159 | "strd r2, [%[v1]], #8 \n" | ||
160 | "smlad %[res], r0, r4, %[res] \n" | ||
161 | "ldrd r6, [%[s2]], #8 \n" | ||
162 | "smlad %[res], r1, r5, %[res] \n" | ||
163 | "ldrd r4, [%[f2]], #8 \n" | ||
164 | "ldrd r2, [%[v1], #8] \n" | ||
165 | "sadd16 r0, r0, r6 \n" | ||
166 | "sadd16 r1, r1, r7 \n" | ||
167 | "strd r0, [%[v1]], #8 \n" | ||
168 | ) | ||
169 | |||
170 | "smlad %[res], r2, r4, %[res] \n" | ||
171 | "ldrd r6, [%[s2]], #8 \n" | ||
172 | "smlad %[res], r3, r5, %[res] \n" | ||
173 | #if ORDER > 32 | ||
174 | "subs %[cnt], %[cnt], #1 \n" | ||
175 | "ldrned r4, [%[f2]], #8 \n" | ||
176 | "ldrned r0, [%[v1], #8] \n" | ||
177 | "sadd16 r2, r2, r6 \n" | ||
178 | "sadd16 r3, r3, r7 \n" | ||
179 | "strd r2, [%[v1]], #8 \n" | ||
180 | "bne 1b \n" | ||
181 | #else | ||
182 | "sadd16 r2, r2, r6 \n" | ||
183 | "sadd16 r3, r3, r7 \n" | ||
184 | "strd r2, [%[v1]], #8 \n" | ||
185 | #endif | ||
186 | |||
187 | "99: \n" | ||
/* cnt and the three pointers are read-write operands because the assembly
 * changes them (subs, post-indexed loads and stores); res is write-only. */
188 | : /* outputs */ | ||
189 | #if ORDER > 32 | ||
190 | [cnt]"+r"(cnt), | ||
191 | #endif | ||
192 | [v1] "+r"(v1), | ||
193 | [f2] "+r"(f2), | ||
194 | [s2] "+r"(s2), | ||
195 | [res]"=r"(res) | ||
196 | : /* inputs */ | ||
197 | : /* clobbers */ | ||
198 | "r0", "r1", "r2", "r3", "r4", | ||
199 | "r5", "r6", "r7", "cc", "memory" | ||
200 | ); | ||
201 | return res; | ||
202 | } | ||
203 | |||
204 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) | ||
205 | * This version fetches data as 32 bit words, and *requires* v1 to be | ||
206 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit | ||
207 | * aligned or both unaligned. If either condition isn't met, it will either | ||
208 | * result in a data abort or incorrect results. | ||
 *
 * v1: samples multiplied with f2; overwritten in place with v1 - s2
 * f2: second factor of the scalar product (only loaded, never stored)
 * s2: vector subtracted from v1 with ssub16, i.e. per 16 bit lane
 * Returns the 32 bit sum of products, formed from the v1 values as they
 * were before s2 was subtracted.
 * Counting the unrolled blocks, one call covers 16 samples when ORDER <= 16,
 * 32 samples when ORDER is 17..32, and (ORDER >> 5) * 32 samples otherwise. */ | ||
209 | static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) | ||
210 | { | ||
211 | int res; | ||
/* Number of passes through the asm loop; each pass handles 32 samples. */
212 | #if ORDER > 32 | ||
213 | int cnt = ORDER>>5; | ||
214 | #endif | ||
215 | |||
216 | asm volatile ( | ||
/* res accumulates across loop passes, so the looping variant clears it. */
217 | #if ORDER > 32 | ||
218 | "mov %[res], #0 \n" | ||
219 | #endif | ||
/* Bit 1 of f2 clear: f2 is 32 bit aligned, so take the path at label 20. */
220 | "tst %[f2], #2 \n" | ||
221 | "beq 20f \n" | ||
222 | |||
/* f2/s2 are not word aligned: load the leading halfword of each on its own
 * (advancing the pointers by 2 so the following ldmia loads are word based)
 * and keep it in the upper half of r3 / r6. pkhtb/pkhbt then re-pack
 * halfwords from neighbouring registers, and the exchanging multiply
 * (smladx/smuadx) is used on this path. */
223 | "10: \n" | ||
224 | "ldrh r3, [%[f2]], #2 \n" | ||
225 | "ldrh r6, [%[s2]], #2 \n" | ||
226 | "ldmia %[f2]!, {r2,r4} \n" | ||
227 | "mov r3, r3, lsl #16 \n" | ||
228 | "mov r6, r6, lsl #16 \n" | ||
229 | |||
230 | "1: \n" | ||
231 | "ldmia %[s2]!, {r5,r7} \n" | ||
232 | "pkhtb r3, r3, r2 \n" | ||
233 | "pkhtb r2, r2, r4 \n" | ||
234 | "ldrd r0, [%[v1]] \n" | ||
235 | "mov r5, r5, ror #16 \n" | ||
236 | "pkhtb r6, r5, r6, asr #16 \n" | ||
237 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
/* Without the loop, the non-accumulating smuadx initialises res, which is
 * why no explicit clear is emitted in that configuration. */
238 | #if ORDER > 32 | ||
239 | "smladx %[res], r0, r3, %[res] \n" | ||
240 | #else | ||
241 | "smuadx %[res], r0, r3 \n" | ||
242 | #endif | ||
243 | "smladx %[res], r1, r2, %[res] \n" | ||
244 | "ldmia %[f2]!, {r2,r3} \n" | ||
245 | "ssub16 r0, r0, r6 \n" | ||
246 | "ssub16 r1, r1, r5 \n" | ||
247 | "strd r0, [%[v1]], #8 \n" | ||
248 | |||
249 | REPEAT_BLOCK( | ||
250 | "ldmia %[s2]!, {r5,r6} \n" | ||
251 | "pkhtb r4, r4, r2 \n" | ||
252 | "pkhtb r2, r2, r3 \n" | ||
253 | "ldrd r0, [%[v1]] \n" | ||
254 | "mov r5, r5, ror #16 \n" | ||
255 | "pkhtb r7, r5, r7, asr #16 \n" | ||
256 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
257 | "smladx %[res], r0, r4, %[res] \n" | ||
258 | "smladx %[res], r1, r2, %[res] \n" | ||
259 | "ldmia %[f2]!, {r2,r4} \n" | ||
260 | "ssub16 r0, r0, r7 \n" | ||
261 | "ssub16 r1, r1, r5 \n" | ||
262 | "strd r0, [%[v1]], #8 \n" | ||
263 | "ldmia %[s2]!, {r5,r7} \n" | ||
264 | "pkhtb r3, r3, r2 \n" | ||
265 | "pkhtb r2, r2, r4 \n" | ||
266 | "ldrd r0, [%[v1]] \n" | ||
267 | "mov r5, r5, ror #16 \n" | ||
268 | "pkhtb r6, r5, r6, asr #16 \n" | ||
269 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
270 | "smladx %[res], r0, r3, %[res] \n" | ||
271 | "smladx %[res], r1, r2, %[res] \n" | ||
272 | "ldmia %[f2]!, {r2,r3} \n" | ||
273 | "ssub16 r0, r0, r6 \n" | ||
274 | "ssub16 r1, r1, r5 \n" | ||
275 | "strd r0, [%[v1]], #8 \n" | ||
276 | ) | ||
277 | |||
278 | "ldmia %[s2]!, {r5,r6} \n" | ||
279 | "pkhtb r4, r4, r2 \n" | ||
280 | "pkhtb r2, r2, r3 \n" | ||
281 | "ldrd r0, [%[v1]] \n" | ||
282 | "mov r5, r5, ror #16 \n" | ||
283 | "pkhtb r7, r5, r7, asr #16 \n" | ||
284 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
285 | "smladx %[res], r0, r4, %[res] \n" | ||
286 | "smladx %[res], r1, r2, %[res] \n" | ||
/* Loop tail: the next two f2 words are fetched only when another pass
 * follows (ldmneia is conditional on the result of subs). */
287 | #if ORDER > 32 | ||
288 | "subs %[cnt], %[cnt], #1 \n" | ||
289 | "ldmneia %[f2]!, {r2,r4} \n" | ||
290 | "ssub16 r0, r0, r7 \n" | ||
291 | "ssub16 r1, r1, r5 \n" | ||
292 | "strd r0, [%[v1]], #8 \n" | ||
293 | "bne 1b \n" | ||
294 | #else | ||
295 | "ssub16 r0, r0, r7 \n" | ||
296 | "ssub16 r1, r1, r5 \n" | ||
297 | "strd r0, [%[v1]], #8 \n" | ||
298 | #endif | ||
299 | |||
300 | "b 99f \n" | ||
301 | |||
/* f2/s2 are word aligned: plain doubleword loads, non-exchanging multiplies.
 * The v1 pair for the next step is loaded from [v1, #8] before the current
 * pair is stored back with post-increment. */
302 | "20: \n" | ||
303 | "ldrd r4, [%[f2]], #8 \n" | ||
304 | "ldrd r0, [%[v1]] \n" | ||
305 | |||
306 | #if ORDER > 32 | ||
307 | "1: \n" | ||
308 | "smlad %[res], r0, r4, %[res] \n" | ||
309 | #else | ||
310 | "smuad %[res], r0, r4 \n" | ||
311 | #endif | ||
312 | "ldrd r6, [%[s2]], #8 \n" | ||
313 | "smlad %[res], r1, r5, %[res] \n" | ||
314 | "ldrd r4, [%[f2]], #8 \n" | ||
315 | "ldrd r2, [%[v1], #8] \n" | ||
316 | "ssub16 r0, r0, r6 \n" | ||
317 | "ssub16 r1, r1, r7 \n" | ||
318 | "strd r0, [%[v1]], #8 \n" | ||
319 | |||
320 | REPEAT_BLOCK( | ||
321 | "smlad %[res], r2, r4, %[res] \n" | ||
322 | "ldrd r6, [%[s2]], #8 \n" | ||
323 | "smlad %[res], r3, r5, %[res] \n" | ||
324 | "ldrd r4, [%[f2]], #8 \n" | ||
325 | "ldrd r0, [%[v1], #8] \n" | ||
326 | "ssub16 r2, r2, r6 \n" | ||
327 | "ssub16 r3, r3, r7 \n" | ||
328 | "strd r2, [%[v1]], #8 \n" | ||
329 | "smlad %[res], r0, r4, %[res] \n" | ||
330 | "ldrd r6, [%[s2]], #8 \n" | ||
331 | "smlad %[res], r1, r5, %[res] \n" | ||
332 | "ldrd r4, [%[f2]], #8 \n" | ||
333 | "ldrd r2, [%[v1], #8] \n" | ||
334 | "ssub16 r0, r0, r6 \n" | ||
335 | "ssub16 r1, r1, r7 \n" | ||
336 | "strd r0, [%[v1]], #8 \n" | ||
337 | ) | ||
338 | |||
339 | "smlad %[res], r2, r4, %[res] \n" | ||
340 | "ldrd r6, [%[s2]], #8 \n" | ||
341 | "smlad %[res], r3, r5, %[res] \n" | ||
342 | #if ORDER > 32 | ||
343 | "subs %[cnt], %[cnt], #1 \n" | ||
344 | "ldrned r4, [%[f2]], #8 \n" | ||
345 | "ldrned r0, [%[v1], #8] \n" | ||
346 | "ssub16 r2, r2, r6 \n" | ||
347 | "ssub16 r3, r3, r7 \n" | ||
348 | "strd r2, [%[v1]], #8 \n" | ||
349 | "bne 1b \n" | ||
350 | #else | ||
351 | "ssub16 r2, r2, r6 \n" | ||
352 | "ssub16 r3, r3, r7 \n" | ||
353 | "strd r2, [%[v1]], #8 \n" | ||
354 | #endif | ||
355 | |||
356 | "99: \n" | ||
/* cnt and the three pointers are read-write operands because the assembly
 * changes them (subs, post-indexed loads and stores); res is write-only. */
357 | : /* outputs */ | ||
358 | #if ORDER > 32 | ||
359 | [cnt]"+r"(cnt), | ||
360 | #endif | ||
361 | [v1] "+r"(v1), | ||
362 | [f2] "+r"(f2), | ||
363 | [s2] "+r"(s2), | ||
364 | [res]"=r"(res) | ||
365 | : /* inputs */ | ||
366 | : /* clobbers */ | ||
367 | "r0", "r1", "r2", "r3", "r4", | ||
368 | "r5", "r6", "r7", "cc", "memory" | ||
369 | ); | ||
370 | return res; | ||
371 | } | ||
372 | |||
373 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
374 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
375 | * incorrect results (if ARM aligncheck is disabled). | ||
 *
 * Returns the 32 bit sum of the products of corresponding 16 bit samples
 * of v1 and v2. Neither vector is written by the assembly.
 * v2 may be word aligned or offset by one halfword; bit 1 of its address
 * selects between the two code paths below.
 * Counting the unrolled multiplies, one call covers 16 samples when
 * ORDER <= 16, 32 samples when ORDER is 17..32, and (ORDER >> 5) * 32
 * samples otherwise. */ | ||
376 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | ||
377 | { | ||
378 | int res; | ||
/* Number of passes through the asm loop; each pass handles 32 samples. */
379 | #if ORDER > 32 | ||
380 | int cnt = ORDER>>5; | ||
381 | #endif | ||
382 | |||
383 | asm volatile ( | ||
/* res accumulates across loop passes, so the looping variant clears it. */
384 | #if ORDER > 32 | ||
385 | "mov %[res], #0 \n" | ||
386 | #endif | ||
/* Bit 1 of v2 clear: v2 is 32 bit aligned, so take the path at label 20. */
387 | "tst %[v2], #2 \n" | ||
388 | "beq 20f \n" | ||
389 | |||
/* v2 is not word aligned: clear bit 1 to obtain a word aligned pointer,
 * fetch three words, and let pkhtb combine halfwords taken from two
 * neighbouring registers. The exchanging multiply (smladx/smuadx) is used
 * on this path. */
390 | "10: \n" | ||
391 | "bic %[v2], %[v2], #2 \n" | ||
392 | "ldmia %[v2]!, {r5-r7} \n" | ||
393 | "ldrd r0, [%[v1]], #8 \n" | ||
394 | |||
395 | "1: \n" | ||
396 | "pkhtb r3, r5, r6 \n" | ||
397 | "ldrd r4, [%[v2]], #8 \n" | ||
/* Without the loop, the non-accumulating smuadx initialises res, which is
 * why no explicit clear is emitted in that configuration. */
398 | #if ORDER > 32 | ||
399 | "smladx %[res], r0, r3, %[res] \n" | ||
400 | #else | ||
401 | "smuadx %[res], r0, r3 \n" | ||
402 | #endif | ||
403 | REPEAT_BLOCK( | ||
404 | "pkhtb r0, r6, r7 \n" | ||
405 | "ldrd r2, [%[v1]], #8 \n" | ||
406 | "smladx %[res], r1, r0, %[res] \n" | ||
407 | "pkhtb r1, r7, r4 \n" | ||
408 | "ldrd r6, [%[v2]], #8 \n" | ||
409 | "smladx %[res], r2, r1, %[res] \n" | ||
410 | "pkhtb r2, r4, r5 \n" | ||
411 | "ldrd r0, [%[v1]], #8 \n" | ||
412 | "smladx %[res], r3, r2, %[res] \n" | ||
413 | "pkhtb r3, r5, r6 \n" | ||
414 | "ldrd r4, [%[v2]], #8 \n" | ||
415 | "smladx %[res], r0, r3, %[res] \n" | ||
416 | ) | ||
417 | |||
418 | "pkhtb r0, r6, r7 \n" | ||
419 | "ldrd r2, [%[v1]], #8 \n" | ||
420 | "smladx %[res], r1, r0, %[res] \n" | ||
421 | "pkhtb r1, r7, r4 \n" | ||
/* Tail: the looping variant reloads data for the next pass conditionally
 * (ldrned depends on the result of subs); the straight-line variant just
 * finishes the remaining two multiplies. */
422 | #if ORDER > 32 | ||
423 | "subs %[cnt], %[cnt], #1 \n" | ||
424 | "ldrned r6, [%[v2]], #8 \n" | ||
425 | "smladx %[res], r2, r1, %[res] \n" | ||
426 | "pkhtb r2, r4, r5 \n" | ||
427 | "ldrned r0, [%[v1]], #8 \n" | ||
428 | "smladx %[res], r3, r2, %[res] \n" | ||
429 | "bne 1b \n" | ||
430 | #else | ||
431 | "pkhtb r4, r4, r5 \n" | ||
432 | "smladx %[res], r2, r1, %[res] \n" | ||
433 | "smladx %[res], r3, r4, %[res] \n" | ||
434 | #endif | ||
435 | |||
436 | "b 99f \n" | ||
437 | |||
/* v2 is word aligned: v1 and v2 words are multiplied directly with the
 * non-exchanging smlad/smuad. */
438 | "20: \n" | ||
439 | "ldrd r0, [%[v1]], #8 \n" | ||
440 | "ldmia %[v2]!, {r5-r7} \n" | ||
441 | |||
442 | "1: \n" | ||
443 | "ldrd r2, [%[v1]], #8 \n" | ||
444 | #if ORDER > 32 | ||
445 | "smlad %[res], r0, r5, %[res] \n" | ||
446 | #else | ||
447 | "smuad %[res], r0, r5 \n" | ||
448 | #endif | ||
449 | REPEAT_BLOCK( | ||
450 | "ldrd r4, [%[v2]], #8 \n" | ||
451 | "smlad %[res], r1, r6, %[res] \n" | ||
452 | "ldrd r0, [%[v1]], #8 \n" | ||
453 | "smlad %[res], r2, r7, %[res] \n" | ||
454 | "ldrd r6, [%[v2]], #8 \n" | ||
455 | "smlad %[res], r3, r4, %[res] \n" | ||
456 | "ldrd r2, [%[v1]], #8 \n" | ||
457 | "smlad %[res], r0, r5, %[res] \n" | ||
458 | ) | ||
459 | |||
/* Tail: the looping variant loads r4/r5 with ldrd because r5 feeds the first
 * multiply of the next pass; the straight-line variant needs only r4 for its
 * last multiply and therefore uses a single-word ldr. */
460 | #if ORDER > 32 | ||
461 | "ldrd r4, [%[v2]], #8 \n" | ||
462 | "smlad %[res], r1, r6, %[res] \n" | ||
463 | "subs %[cnt], %[cnt], #1 \n" | ||
464 | "ldrned r0, [%[v1]], #8 \n" | ||
465 | "smlad %[res], r2, r7, %[res] \n" | ||
466 | "ldrned r6, [%[v2]], #8 \n" | ||
467 | "smlad %[res], r3, r4, %[res] \n" | ||
468 | "bne 1b \n" | ||
469 | #else | ||
470 | "ldr r4, [%[v2]], #4 \n" | ||
471 | "smlad %[res], r1, r6, %[res] \n" | ||
472 | "smlad %[res], r2, r7, %[res] \n" | ||
473 | "smlad %[res], r3, r4, %[res] \n" | ||
474 | #endif | ||
475 | |||
476 | "99: \n" | ||
/* cnt and both pointers are read-write operands because the assembly changes
 * them (subs, bic, post-indexed loads); res is write-only. */
477 | : /* outputs */ | ||
478 | #if ORDER > 32 | ||
479 | [cnt]"+r"(cnt), | ||
480 | #endif | ||
481 | [v1] "+r"(v1), | ||
482 | [v2] "+r"(v2), | ||
483 | [res]"=r"(res) | ||
484 | : /* inputs */ | ||
485 | : /* clobbers */ | ||
486 | "r0", "r1", "r2", "r3", | ||
487 | "r4", "r5", "r6", "r7", "cc", "memory" | ||
488 | ); | ||
489 | return res; | ||
490 | } | ||