diff options
Diffstat (limited to 'apps/codecs/demac')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv6.h | 495 |
1 files changed, 348 insertions, 147 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h index 61471103bd..0ace6c5811 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv6.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h | |||
@@ -24,148 +24,350 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 27 | #define FUSED_VECTOR_MATH |
28 | * 32 bit aligned, otherwise it will result either in a data abort, or | 28 | |
29 | * incorrect results (if ARM aligncheck is disabled). */ | 29 | #if ORDER > 16 |
30 | static inline void vector_add(int16_t* v1, int16_t* v2) | 30 | #define BLOCK_REPEAT "3" |
31 | #else | ||
32 | #define BLOCK_REPEAT "1" | ||
33 | #endif | ||
34 | |||
35 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) | ||
36 | * This version fetches data as 32 bit words, and *requires* v1 to be | ||
37 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit | ||
38 | * aligned or both unaligned. If either condition isn't met, it will either | ||
39 | * result in a data abort or incorrect results. */ | ||
40 | static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) | ||
31 | { | 41 | { |
42 | int res; | ||
32 | #if ORDER > 32 | 43 | #if ORDER > 32 |
33 | int cnt = ORDER>>5; | 44 | int cnt = ORDER>>5; |
34 | #endif | 45 | #endif |
35 | 46 | ||
36 | #if ORDER > 16 | 47 | asm volatile ( |
37 | #define ADD_SUB_BLOCKS "4" | 48 | #if ORDER > 32 |
49 | "mov %[res], #0 \n" | ||
50 | #endif | ||
51 | "tst %[f2], #2 \n" | ||
52 | "beq 20f \n" | ||
53 | |||
54 | "10: \n" | ||
55 | "ldrh r3, [%[f2]], #2 \n" | ||
56 | "ldrh r6, [%[s2]], #2 \n" | ||
57 | "ldmia %[f2]!, {r2,r4} \n" | ||
58 | "mov r3, r3, lsl #16 \n" | ||
59 | "mov r6, r6, lsl #16 \n" | ||
60 | |||
61 | "1: \n" | ||
62 | "ldmia %[s2]!, {r5,r7} \n" | ||
63 | "pkhtb r3, r3, r2 \n" | ||
64 | "pkhtb r2, r2, r4 \n" | ||
65 | "ldrd r0, [%[v1]] \n" | ||
66 | "mov r5, r5, ror #16 \n" | ||
67 | "pkhtb r6, r5, r6, asr #16 \n" | ||
68 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
69 | #if ORDER > 32 | ||
70 | "smladx %[res], r0, r3, %[res] \n" | ||
38 | #else | 71 | #else |
39 | #define ADD_SUB_BLOCKS "2" | 72 | "smuadx %[res], r0, r3 \n" |
40 | #endif | 73 | #endif |
74 | "smladx %[res], r1, r2, %[res] \n" | ||
75 | "ldmia %[f2]!, {r2,r3} \n" | ||
76 | "sadd16 r0, r0, r6 \n" | ||
77 | "sadd16 r1, r1, r5 \n" | ||
78 | "strd r0, [%[v1]], #8 \n" | ||
79 | |||
80 | ".rept " BLOCK_REPEAT "\n" | ||
81 | "ldmia %[s2]!, {r5,r6} \n" | ||
82 | "pkhtb r4, r4, r2 \n" | ||
83 | "pkhtb r2, r2, r3 \n" | ||
84 | "ldrd r0, [%[v1]] \n" | ||
85 | "mov r5, r5, ror #16 \n" | ||
86 | "pkhtb r7, r5, r7, asr #16 \n" | ||
87 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
88 | "smladx %[res], r0, r4, %[res] \n" | ||
89 | "smladx %[res], r1, r2, %[res] \n" | ||
90 | "ldmia %[f2]!, {r2,r4} \n" | ||
91 | "sadd16 r0, r0, r7 \n" | ||
92 | "sadd16 r1, r1, r5 \n" | ||
93 | "strd r0, [%[v1]], #8 \n" | ||
94 | "ldmia %[s2]!, {r5,r7} \n" | ||
95 | "pkhtb r3, r3, r2 \n" | ||
96 | "pkhtb r2, r2, r4 \n" | ||
97 | "ldrd r0, [%[v1]] \n" | ||
98 | "mov r5, r5, ror #16 \n" | ||
99 | "pkhtb r6, r5, r6, asr #16 \n" | ||
100 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
101 | "smladx %[res], r0, r3, %[res] \n" | ||
102 | "smladx %[res], r1, r2, %[res] \n" | ||
103 | "ldmia %[f2]!, {r2,r3} \n" | ||
104 | "sadd16 r0, r0, r6 \n" | ||
105 | "sadd16 r1, r1, r5 \n" | ||
106 | "strd r0, [%[v1]], #8 \n" | ||
107 | ".endr \n" | ||
108 | |||
109 | "ldmia %[s2]!, {r5,r6} \n" | ||
110 | "pkhtb r4, r4, r2 \n" | ||
111 | "pkhtb r2, r2, r3 \n" | ||
112 | "ldrd r0, [%[v1]] \n" | ||
113 | "mov r5, r5, ror #16 \n" | ||
114 | "pkhtb r7, r5, r7, asr #16 \n" | ||
115 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
116 | "smladx %[res], r0, r4, %[res] \n" | ||
117 | "smladx %[res], r1, r2, %[res] \n" | ||
118 | #if ORDER > 32 | ||
119 | "subs %[cnt], %[cnt], #1 \n" | ||
120 | "ldmneia %[f2]!, {r2,r4} \n" | ||
121 | "sadd16 r0, r0, r7 \n" | ||
122 | "sadd16 r1, r1, r5 \n" | ||
123 | "strd r0, [%[v1]], #8 \n" | ||
124 | "bne 1b \n" | ||
125 | #else | ||
126 | "sadd16 r0, r0, r7 \n" | ||
127 | "sadd16 r1, r1, r5 \n" | ||
128 | "strd r0, [%[v1]], #8 \n" | ||
129 | #endif | ||
130 | |||
131 | "b 99f \n" | ||
132 | |||
133 | "20: \n" | ||
134 | "ldrd r4, [%[f2]], #8 \n" | ||
135 | "ldrd r0, [%[v1]] \n" | ||
41 | 136 | ||
42 | asm volatile ( | ||
43 | "tst %[v2], #2 \n" | ||
44 | "beq 20f \n" | ||
45 | |||
46 | "10: \n" | ||
47 | "bic %[v2], %[v2], #2 \n" | ||
48 | "ldmia %[v2]!, {r4-r5} \n" | ||
49 | "1: \n" | ||
50 | ".rept " ADD_SUB_BLOCKS "\n" | ||
51 | "ldmia %[v2]!, {r6-r7} \n" | ||
52 | "ldmia %[v1], {r0-r3} \n" | ||
53 | "mov r5, r5, ror #16 \n" | ||
54 | "pkhtb r4, r5, r4, asr #16 \n" | ||
55 | "sadd16 r0, r0, r4 \n" | ||
56 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
57 | "sadd16 r1, r1, r5 \n" | ||
58 | "ldmia %[v2]!, {r4-r5} \n" | ||
59 | "mov r7, r7, ror #16 \n" | ||
60 | "pkhtb r6, r7, r6, asr #16 \n" | ||
61 | "sadd16 r2, r2, r6 \n" | ||
62 | "pkhbt r7, r7, r4, lsl #16 \n" | ||
63 | "sadd16 r3, r3, r7 \n" | ||
64 | "stmia %[v1]!, {r0-r3} \n" | ||
65 | ".endr \n" | ||
66 | #if ORDER > 32 | 137 | #if ORDER > 32 |
67 | "subs %[cnt], %[cnt], #1 \n" | 138 | "1: \n" |
68 | "bne 1b \n" | 139 | "smlad %[res], r0, r4, %[res] \n" |
140 | #else | ||
141 | "smuad %[res], r0, r4 \n" | ||
69 | #endif | 142 | #endif |
70 | "b 99f \n" | 143 | "ldrd r6, [%[s2]], #8 \n" |
71 | 144 | "smlad %[res], r1, r5, %[res] \n" | |
72 | "20: \n" | 145 | "ldrd r4, [%[f2]], #8 \n" |
73 | "1: \n" | 146 | "ldrd r2, [%[v1], #8] \n" |
74 | ".rept " ADD_SUB_BLOCKS "\n" | 147 | "sadd16 r0, r0, r6 \n" |
75 | "ldmia %[v2]!, {r4-r7} \n" | 148 | "sadd16 r1, r1, r7 \n" |
76 | "ldmia %[v1], {r0-r3} \n" | 149 | "strd r0, [%[v1]], #8 \n" |
77 | "sadd16 r0, r0, r4 \n" | 150 | |
78 | "sadd16 r1, r1, r5 \n" | 151 | ".rept " BLOCK_REPEAT "\n" |
79 | "sadd16 r2, r2, r6 \n" | 152 | "smlad %[res], r2, r4, %[res] \n" |
80 | "sadd16 r3, r3, r7 \n" | 153 | "ldrd r6, [%[s2]], #8 \n" |
81 | "stmia %[v1]!, {r0-r3} \n" | 154 | "smlad %[res], r3, r5, %[res] \n" |
82 | ".endr \n" | 155 | "ldrd r4, [%[f2]], #8 \n" |
156 | "ldrd r0, [%[v1], #8] \n" | ||
157 | "sadd16 r2, r2, r6 \n" | ||
158 | "sadd16 r3, r3, r7 \n" | ||
159 | "strd r2, [%[v1]], #8 \n" | ||
160 | "smlad %[res], r0, r4, %[res] \n" | ||
161 | "ldrd r6, [%[s2]], #8 \n" | ||
162 | "smlad %[res], r1, r5, %[res] \n" | ||
163 | "ldrd r4, [%[f2]], #8 \n" | ||
164 | "ldrd r2, [%[v1], #8] \n" | ||
165 | "sadd16 r0, r0, r6 \n" | ||
166 | "sadd16 r1, r1, r7 \n" | ||
167 | "strd r0, [%[v1]], #8 \n" | ||
168 | ".endr \n" | ||
169 | |||
170 | "smlad %[res], r2, r4, %[res] \n" | ||
171 | "ldrd r6, [%[s2]], #8 \n" | ||
172 | "smlad %[res], r3, r5, %[res] \n" | ||
83 | #if ORDER > 32 | 173 | #if ORDER > 32 |
84 | "subs %[cnt], %[cnt], #1 \n" | 174 | "subs %[cnt], %[cnt], #1 \n" |
85 | "bne 1b \n" | 175 | "ldrned r4, [%[f2]], #8 \n" |
176 | "ldrned r0, [%[v1], #8] \n" | ||
177 | "sadd16 r2, r2, r6 \n" | ||
178 | "sadd16 r3, r3, r7 \n" | ||
179 | "strd r2, [%[v1]], #8 \n" | ||
180 | "bne 1b \n" | ||
181 | #else | ||
182 | "sadd16 r2, r2, r6 \n" | ||
183 | "sadd16 r3, r3, r7 \n" | ||
184 | "strd r2, [%[v1]], #8 \n" | ||
86 | #endif | 185 | #endif |
87 | 186 | ||
88 | "99: \n" | 187 | "99: \n" |
89 | : /* outputs */ | 188 | : /* outputs */ |
90 | #if ORDER > 32 | 189 | #if ORDER > 32 |
91 | [cnt]"+r"(cnt), | 190 | [cnt]"+r"(cnt), |
92 | #endif | 191 | #endif |
93 | [v1] "+r"(v1), | 192 | [v1] "+r"(v1), |
94 | [v2] "+r"(v2) | 193 | [f2] "+r"(f2), |
194 | [s2] "+r"(s2), | ||
195 | [res]"=r"(res) | ||
95 | : /* inputs */ | 196 | : /* inputs */ |
96 | : /* clobbers */ | 197 | : /* clobbers */ |
97 | "r0", "r1", "r2", "r3", "r4", | 198 | "r0", "r1", "r2", "r3", "r4", |
98 | "r5", "r6", "r7", "memory" | 199 | "r5", "r6", "r7", "memory" |
99 | ); | 200 | ); |
201 | return res; | ||
100 | } | 202 | } |
101 | 203 | ||
102 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 204 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) |
103 | * 32 bit aligned, otherwise it will result either in a data abort, or | 205 | * This version fetches data as 32 bit words, and *requires* v1 to be |
104 | * incorrect results (if ARM aligncheck is disabled). */ | 206 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit |
105 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 207 | * aligned or both unaligned. If either condition isn't met, it will either |
208 | * result in a data abort or incorrect results. */ | ||
209 | static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) | ||
106 | { | 210 | { |
211 | int res; | ||
107 | #if ORDER > 32 | 212 | #if ORDER > 32 |
108 | int cnt = ORDER>>5; | 213 | int cnt = ORDER>>5; |
109 | #endif | 214 | #endif |
110 | 215 | ||
111 | asm volatile ( | 216 | asm volatile ( |
112 | "tst %[v2], #2 \n" | ||
113 | "beq 20f \n" | ||
114 | |||
115 | "10: \n" | ||
116 | "bic %[v2], %[v2], #2 \n" | ||
117 | "ldmia %[v2]!, {r4-r5} \n" | ||
118 | "1: \n" | ||
119 | ".rept " ADD_SUB_BLOCKS "\n" | ||
120 | "ldmia %[v2]!, {r6-r7} \n" | ||
121 | "ldmia %[v1], {r0-r3} \n" | ||
122 | "mov r5, r5, ror #16 \n" | ||
123 | "pkhtb r4, r5, r4, asr #16 \n" | ||
124 | "ssub16 r0, r0, r4 \n" | ||
125 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
126 | "ssub16 r1, r1, r5 \n" | ||
127 | "ldmia %[v2]!, {r4-r5} \n" | ||
128 | "mov r7, r7, ror #16 \n" | ||
129 | "pkhtb r6, r7, r6, asr #16 \n" | ||
130 | "ssub16 r2, r2, r6 \n" | ||
131 | "pkhbt r7, r7, r4, lsl #16 \n" | ||
132 | "ssub16 r3, r3, r7 \n" | ||
133 | "stmia %[v1]!, {r0-r3} \n" | ||
134 | ".endr \n" | ||
135 | #if ORDER > 32 | 217 | #if ORDER > 32 |
136 | "subs %[cnt], %[cnt], #1 \n" | 218 | "mov %[res], #0 \n" |
137 | "bne 1b \n" | 219 | #endif |
220 | "tst %[f2], #2 \n" | ||
221 | "beq 20f \n" | ||
222 | |||
223 | "10: \n" | ||
224 | "ldrh r3, [%[f2]], #2 \n" | ||
225 | "ldrh r6, [%[s2]], #2 \n" | ||
226 | "ldmia %[f2]!, {r2,r4} \n" | ||
227 | "mov r3, r3, lsl #16 \n" | ||
228 | "mov r6, r6, lsl #16 \n" | ||
229 | |||
230 | "1: \n" | ||
231 | "ldmia %[s2]!, {r5,r7} \n" | ||
232 | "pkhtb r3, r3, r2 \n" | ||
233 | "pkhtb r2, r2, r4 \n" | ||
234 | "ldrd r0, [%[v1]] \n" | ||
235 | "mov r5, r5, ror #16 \n" | ||
236 | "pkhtb r6, r5, r6, asr #16 \n" | ||
237 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
238 | #if ORDER > 32 | ||
239 | "smladx %[res], r0, r3, %[res] \n" | ||
240 | #else | ||
241 | "smuadx %[res], r0, r3 \n" | ||
242 | #endif | ||
243 | "smladx %[res], r1, r2, %[res] \n" | ||
244 | "ldmia %[f2]!, {r2,r3} \n" | ||
245 | "ssub16 r0, r0, r6 \n" | ||
246 | "ssub16 r1, r1, r5 \n" | ||
247 | "strd r0, [%[v1]], #8 \n" | ||
248 | |||
249 | ".rept " BLOCK_REPEAT "\n" | ||
250 | "ldmia %[s2]!, {r5,r6} \n" | ||
251 | "pkhtb r4, r4, r2 \n" | ||
252 | "pkhtb r2, r2, r3 \n" | ||
253 | "ldrd r0, [%[v1]] \n" | ||
254 | "mov r5, r5, ror #16 \n" | ||
255 | "pkhtb r7, r5, r7, asr #16 \n" | ||
256 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
257 | "smladx %[res], r0, r4, %[res] \n" | ||
258 | "smladx %[res], r1, r2, %[res] \n" | ||
259 | "ldmia %[f2]!, {r2,r4} \n" | ||
260 | "ssub16 r0, r0, r7 \n" | ||
261 | "ssub16 r1, r1, r5 \n" | ||
262 | "strd r0, [%[v1]], #8 \n" | ||
263 | "ldmia %[s2]!, {r5,r7} \n" | ||
264 | "pkhtb r3, r3, r2 \n" | ||
265 | "pkhtb r2, r2, r4 \n" | ||
266 | "ldrd r0, [%[v1]] \n" | ||
267 | "mov r5, r5, ror #16 \n" | ||
268 | "pkhtb r6, r5, r6, asr #16 \n" | ||
269 | "pkhbt r5, r5, r7, lsl #16 \n" | ||
270 | "smladx %[res], r0, r3, %[res] \n" | ||
271 | "smladx %[res], r1, r2, %[res] \n" | ||
272 | "ldmia %[f2]!, {r2,r3} \n" | ||
273 | "ssub16 r0, r0, r6 \n" | ||
274 | "ssub16 r1, r1, r5 \n" | ||
275 | "strd r0, [%[v1]], #8 \n" | ||
276 | ".endr \n" | ||
277 | |||
278 | "ldmia %[s2]!, {r5,r6} \n" | ||
279 | "pkhtb r4, r4, r2 \n" | ||
280 | "pkhtb r2, r2, r3 \n" | ||
281 | "ldrd r0, [%[v1]] \n" | ||
282 | "mov r5, r5, ror #16 \n" | ||
283 | "pkhtb r7, r5, r7, asr #16 \n" | ||
284 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
285 | "smladx %[res], r0, r4, %[res] \n" | ||
286 | "smladx %[res], r1, r2, %[res] \n" | ||
287 | #if ORDER > 32 | ||
288 | "subs %[cnt], %[cnt], #1 \n" | ||
289 | "ldmneia %[f2]!, {r2,r4} \n" | ||
290 | "ssub16 r0, r0, r7 \n" | ||
291 | "ssub16 r1, r1, r5 \n" | ||
292 | "strd r0, [%[v1]], #8 \n" | ||
293 | "bne 1b \n" | ||
294 | #else | ||
295 | "ssub16 r0, r0, r7 \n" | ||
296 | "ssub16 r1, r1, r5 \n" | ||
297 | "strd r0, [%[v1]], #8 \n" | ||
298 | #endif | ||
299 | |||
300 | "b 99f \n" | ||
301 | |||
302 | "20: \n" | ||
303 | "ldrd r4, [%[f2]], #8 \n" | ||
304 | "ldrd r0, [%[v1]] \n" | ||
305 | |||
306 | #if ORDER > 32 | ||
307 | "1: \n" | ||
308 | "smlad %[res], r0, r4, %[res] \n" | ||
309 | #else | ||
310 | "smuad %[res], r0, r4 \n" | ||
138 | #endif | 311 | #endif |
139 | "b 99f \n" | 312 | "ldrd r6, [%[s2]], #8 \n" |
140 | 313 | "smlad %[res], r1, r5, %[res] \n" | |
141 | "20: \n" | 314 | "ldrd r4, [%[f2]], #8 \n" |
142 | "1: \n" | 315 | "ldrd r2, [%[v1], #8] \n" |
143 | ".rept " ADD_SUB_BLOCKS "\n" | 316 | "ssub16 r0, r0, r6 \n" |
144 | "ldmia %[v2]!, {r4-r7} \n" | 317 | "ssub16 r1, r1, r7 \n" |
145 | "ldmia %[v1], {r0-r3} \n" | 318 | "strd r0, [%[v1]], #8 \n" |
146 | "ssub16 r0, r0, r4 \n" | 319 | |
147 | "ssub16 r1, r1, r5 \n" | 320 | ".rept " BLOCK_REPEAT "\n" |
148 | "ssub16 r2, r2, r6 \n" | 321 | "smlad %[res], r2, r4, %[res] \n" |
149 | "ssub16 r3, r3, r7 \n" | 322 | "ldrd r6, [%[s2]], #8 \n" |
150 | "stmia %[v1]!, {r0-r3} \n" | 323 | "smlad %[res], r3, r5, %[res] \n" |
151 | ".endr \n" | 324 | "ldrd r4, [%[f2]], #8 \n" |
325 | "ldrd r0, [%[v1], #8] \n" | ||
326 | "ssub16 r2, r2, r6 \n" | ||
327 | "ssub16 r3, r3, r7 \n" | ||
328 | "strd r2, [%[v1]], #8 \n" | ||
329 | "smlad %[res], r0, r4, %[res] \n" | ||
330 | "ldrd r6, [%[s2]], #8 \n" | ||
331 | "smlad %[res], r1, r5, %[res] \n" | ||
332 | "ldrd r4, [%[f2]], #8 \n" | ||
333 | "ldrd r2, [%[v1], #8] \n" | ||
334 | "ssub16 r0, r0, r6 \n" | ||
335 | "ssub16 r1, r1, r7 \n" | ||
336 | "strd r0, [%[v1]], #8 \n" | ||
337 | ".endr \n" | ||
338 | |||
339 | "smlad %[res], r2, r4, %[res] \n" | ||
340 | "ldrd r6, [%[s2]], #8 \n" | ||
341 | "smlad %[res], r3, r5, %[res] \n" | ||
152 | #if ORDER > 32 | 342 | #if ORDER > 32 |
153 | "subs %[cnt], %[cnt], #1 \n" | 343 | "subs %[cnt], %[cnt], #1 \n" |
154 | "bne 1b \n" | 344 | "ldrned r4, [%[f2]], #8 \n" |
345 | "ldrned r0, [%[v1], #8] \n" | ||
346 | "ssub16 r2, r2, r6 \n" | ||
347 | "ssub16 r3, r3, r7 \n" | ||
348 | "strd r2, [%[v1]], #8 \n" | ||
349 | "bne 1b \n" | ||
350 | #else | ||
351 | "ssub16 r2, r2, r6 \n" | ||
352 | "ssub16 r3, r3, r7 \n" | ||
353 | "strd r2, [%[v1]], #8 \n" | ||
155 | #endif | 354 | #endif |
156 | 355 | ||
157 | "99: \n" | 356 | "99: \n" |
158 | : /* outputs */ | 357 | : /* outputs */ |
159 | #if ORDER > 32 | 358 | #if ORDER > 32 |
160 | [cnt]"+r"(cnt), | 359 | [cnt]"+r"(cnt), |
161 | #endif | 360 | #endif |
162 | [v1] "+r"(v1), | 361 | [v1] "+r"(v1), |
163 | [v2] "+r"(v2) | 362 | [f2] "+r"(f2), |
363 | [s2] "+r"(s2), | ||
364 | [res]"=r"(res) | ||
164 | : /* inputs */ | 365 | : /* inputs */ |
165 | : /* clobbers */ | 366 | : /* clobbers */ |
166 | "r0", "r1", "r2", "r3", "r4", | 367 | "r0", "r1", "r2", "r3", "r4", |
167 | "r5", "r6", "r7", "memory" | 368 | "r5", "r6", "r7", "memory" |
168 | ); | 369 | ); |
370 | return res; | ||
169 | } | 371 | } |
170 | 372 | ||
171 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 373 | /* This version fetches data as 32 bit words, and *requires* v1 to be |
@@ -178,12 +380,6 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
178 | int cnt = ORDER>>5; | 380 | int cnt = ORDER>>5; |
179 | #endif | 381 | #endif |
180 | 382 | ||
181 | #if ORDER > 16 | ||
182 | #define MLA_BLOCKS "3" | ||
183 | #else | ||
184 | #define MLA_BLOCKS "1" | ||
185 | #endif | ||
186 | |||
187 | asm volatile ( | 383 | asm volatile ( |
188 | #if ORDER > 32 | 384 | #if ORDER > 32 |
189 | "mov %[res], #0 \n" | 385 | "mov %[res], #0 \n" |
@@ -194,80 +390,85 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
194 | "10: \n" | 390 | "10: \n" |
195 | "bic %[v2], %[v2], #2 \n" | 391 | "bic %[v2], %[v2], #2 \n" |
196 | "ldmia %[v2]!, {r5-r7} \n" | 392 | "ldmia %[v2]!, {r5-r7} \n" |
197 | "ldmia %[v1]!, {r0-r1} \n" | 393 | "ldrd r0, [%[v1]], #8 \n" |
394 | |||
198 | "1: \n" | 395 | "1: \n" |
199 | "pkhbt r8, r6, r5 \n" | 396 | "pkhtb r3, r5, r6 \n" |
200 | "ldmia %[v2]!, {r4-r5} \n" | 397 | "ldrd r4, [%[v2]], #8 \n" |
201 | #if ORDER > 32 | 398 | #if ORDER > 32 |
202 | "smladx %[res], r0, r8, %[res] \n" | 399 | "smladx %[res], r0, r3, %[res] \n" |
203 | #else | 400 | #else |
204 | "smuadx %[res], r0, r8 \n" | 401 | "smuadx %[res], r0, r3 \n" |
205 | #endif | 402 | #endif |
206 | ".rept " MLA_BLOCKS "\n" | 403 | ".rept " BLOCK_REPEAT "\n" |
207 | "pkhbt r8, r7, r6 \n" | 404 | "pkhtb r0, r6, r7 \n" |
208 | "ldmia %[v1]!, {r2-r3} \n" | 405 | "ldrd r2, [%[v1]], #8 \n" |
209 | "smladx %[res], r1, r8, %[res] \n" | 406 | "smladx %[res], r1, r0, %[res] \n" |
210 | "pkhbt r8, r4, r7 \n" | 407 | "pkhtb r1, r7, r4 \n" |
211 | "ldmia %[v2]!, {r6-r7} \n" | 408 | "ldrd r6, [%[v2]], #8 \n" |
212 | "smladx %[res], r2, r8, %[res] \n" | 409 | "smladx %[res], r2, r1, %[res] \n" |
213 | "pkhbt r8, r5, r4 \n" | 410 | "pkhtb r2, r4, r5 \n" |
214 | "ldmia %[v1]!, {r0-r1} \n" | 411 | "ldrd r0, [%[v1]], #8 \n" |
215 | "smladx %[res], r3, r8, %[res] \n" | 412 | "smladx %[res], r3, r2, %[res] \n" |
216 | "pkhbt r8, r6, r5 \n" | 413 | "pkhtb r3, r5, r6 \n" |
217 | "ldmia %[v2]!, {r4-r5} \n" | 414 | "ldrd r4, [%[v2]], #8 \n" |
218 | "smladx %[res], r0, r8, %[res] \n" | 415 | "smladx %[res], r0, r3, %[res] \n" |
219 | ".endr \n" | 416 | ".endr \n" |
220 | 417 | ||
221 | "pkhbt r8, r7, r6 \n" | 418 | "pkhtb r0, r6, r7 \n" |
222 | "ldmia %[v1]!, {r2-r3} \n" | 419 | "ldrd r2, [%[v1]], #8 \n" |
223 | "smladx %[res], r1, r8, %[res] \n" | 420 | "smladx %[res], r1, r0, %[res] \n" |
224 | "pkhbt r8, r4, r7 \n" | 421 | "pkhtb r1, r7, r4 \n" |
225 | #if ORDER > 32 | 422 | #if ORDER > 32 |
226 | "subs %[cnt], %[cnt], #1 \n" | 423 | "subs %[cnt], %[cnt], #1 \n" |
227 | "ldmneia %[v2]!, {r6-r7} \n" | 424 | "ldrned r6, [%[v2]], #8 \n" |
228 | "smladx %[res], r2, r8, %[res] \n" | 425 | "smladx %[res], r2, r1, %[res] \n" |
229 | "pkhbt r8, r5, r4 \n" | 426 | "pkhtb r2, r4, r5 \n" |
230 | "ldmneia %[v1]!, {r0-r1} \n" | 427 | "ldrned r0, [%[v1]], #8 \n" |
231 | "smladx %[res], r3, r8, %[res] \n" | 428 | "smladx %[res], r3, r2, %[res] \n" |
232 | "bne 1b \n" | 429 | "bne 1b \n" |
233 | #else | 430 | #else |
234 | "pkhbt r5, r5, r4 \n" | 431 | "pkhtb r4, r4, r5 \n" |
235 | "smladx %[res], r2, r8, %[res] \n" | 432 | "smladx %[res], r2, r1, %[res] \n" |
236 | "smladx %[res], r3, r5, %[res] \n" | 433 | "smladx %[res], r3, r4, %[res] \n" |
237 | #endif | 434 | #endif |
238 | "b 99f \n" | ||
239 | 435 | ||
436 | "b 99f \n" | ||
437 | |||
240 | "20: \n" | 438 | "20: \n" |
241 | "ldmia %[v1]!, {r0-r1} \n" | 439 | "ldrd r0, [%[v1]], #8 \n" |
242 | "ldmia %[v2]!, {r5-r7} \n" | 440 | "ldmia %[v2]!, {r5-r7} \n" |
441 | |||
243 | "1: \n" | 442 | "1: \n" |
244 | "ldmia %[v1]!, {r2-r3} \n" | 443 | "ldrd r2, [%[v1]], #8 \n" |
245 | #if ORDER > 32 | 444 | #if ORDER > 32 |
246 | "smlad %[res], r0, r5, %[res] \n" | 445 | "smlad %[res], r0, r5, %[res] \n" |
247 | #else | 446 | #else |
248 | "smuad %[res], r0, r5 \n" | 447 | "smuad %[res], r0, r5 \n" |
249 | #endif | 448 | #endif |
250 | ".rept " MLA_BLOCKS "\n" | 449 | ".rept " BLOCK_REPEAT "\n" |
251 | "ldmia %[v2]!, {r4-r5} \n" | 450 | "ldrd r4, [%[v2]], #8 \n" |
252 | "smlad %[res], r1, r6, %[res] \n" | 451 | "smlad %[res], r1, r6, %[res] \n" |
253 | "ldmia %[v1]!, {r0-r1} \n" | 452 | "ldrd r0, [%[v1]], #8 \n" |
254 | "smlad %[res], r2, r7, %[res] \n" | 453 | "smlad %[res], r2, r7, %[res] \n" |
255 | "ldmia %[v2]!, {r6-r7} \n" | 454 | "ldrd r6, [%[v2]], #8 \n" |
256 | "smlad %[res], r3, r4, %[res] \n" | 455 | "smlad %[res], r3, r4, %[res] \n" |
257 | "ldmia %[v1]!, {r2-r3} \n" | 456 | "ldrd r2, [%[v1]], #8 \n" |
258 | "smlad %[res], r0, r5, %[res] \n" | 457 | "smlad %[res], r0, r5, %[res] \n" |
259 | ".endr \n" | 458 | ".endr \n" |
260 | 459 | ||
261 | "ldmia %[v2]!, {r4-r5} \n" | ||
262 | "smlad %[res], r1, r6, %[res] \n" | ||
263 | #if ORDER > 32 | 460 | #if ORDER > 32 |
461 | "ldrd r4, [%[v2]], #8 \n" | ||
462 | "smlad %[res], r1, r6, %[res] \n" | ||
264 | "subs %[cnt], %[cnt], #1 \n" | 463 | "subs %[cnt], %[cnt], #1 \n" |
265 | "ldmneia %[v1]!, {r0-r1} \n" | 464 | "ldrned r0, [%[v1]], #8 \n" |
266 | "smlad %[res], r2, r7, %[res] \n" | 465 | "smlad %[res], r2, r7, %[res] \n" |
267 | "ldmneia %[v2]!, {r6-r7} \n" | 466 | "ldrned r6, [%[v2]], #8 \n" |
268 | "smlad %[res], r3, r4, %[res] \n" | 467 | "smlad %[res], r3, r4, %[res] \n" |
269 | "bne 1b \n" | 468 | "bne 1b \n" |
270 | #else | 469 | #else |
470 | "ldr r4, [%[v2]], #4 \n" | ||
471 | "smlad %[res], r1, r6, %[res] \n" | ||
271 | "smlad %[res], r2, r7, %[res] \n" | 472 | "smlad %[res], r2, r7, %[res] \n" |
272 | "smlad %[res], r3, r4, %[res] \n" | 473 | "smlad %[res], r3, r4, %[res] \n" |
273 | #endif | 474 | #endif |
@@ -282,8 +483,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
282 | [res]"=r"(res) | 483 | [res]"=r"(res) |
283 | : /* inputs */ | 484 | : /* inputs */ |
284 | : /* clobbers */ | 485 | : /* clobbers */ |
285 | "r0", "r1", "r2", "r3", "r4", | 486 | "r0", "r1", "r2", "r3", |
286 | "r5", "r6", "r7", "r8" | 487 | "r4", "r5", "r6", "r7" |
287 | ); | 488 | ); |
288 | return res; | 489 | return res; |
289 | } | 490 | } |