diff options
Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h')
-rw-r--r-- | lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h | 404 |
1 files changed, 404 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h new file mode 100644 index 0000000000..ae7427c137 --- /dev/null +++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h | |||
@@ -0,0 +1,404 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | ARMv5te vector math copyright (C) 2008 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
/* Marker macro. Presumably tested by the including code to find out that
 * the fused vector_sp_add()/vector_sp_sub() routines are provided here --
 * NOTE(review): confirm against the file that includes this header. */
27 | #define FUSED_VECTOR_MATH | ||
28 | |||
/* REPEAT_3(x) pastes its argument three times in a row. The fused routines
 * use it after one hand written group to get four groups per pass. */
29 | #define REPEAT_3(x) x x x | ||
/* REPEAT_MLA(x) pastes its argument seven times when ORDER > 16 and three
 * times otherwise. scalarproduct() uses it after one hand written group, so
 * a pass consists of eight or four groups of four elements respectively. */
30 | #if ORDER > 16 | ||
31 | #define REPEAT_MLA(x) x x x x x x x | ||
32 | #else | ||
33 | #define REPEAT_MLA(x) x x x | ||
34 | #endif | ||
35 | |||
36 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) | ||
37 | * This version fetches data as 32 bit words, and *requires* v1 to be | ||
38 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit | ||
39 | * aligned or both unaligned. If either condition isn't met, it will either | ||
40 | * result in a data abort or incorrect results. */ | ||
/* Additional notes, taken from the code below:
 * - The return value is the sum of v1[i] * f2[i], formed from the v1
 *   values as loaded, i.e. before s2 is added to them.
 * - Afterwards every processed v1[i] has been replaced by v1[i] + s2[i],
 *   with each 16 bit element wrapping independently.
 * - One pass handles 16 elements (4 groups of 4). With ORDER > 16 the pass
 *   runs ORDER>>4 times, otherwise exactly once.
 * - If bit 1 of f2 is set, the word loads of that path fetch one halfword
 *   beyond the last f2 and s2 element that is actually used.
 * - That path pairs the bottom half of a v1 word with the earlier f2/s2
 *   halfword -- NOTE(review): this looks like a little-endian assumption,
 *   confirm for the targets in use.
 * - r0-r5 serve as fixed scratch registers and are listed as clobbers. */
41 | static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) | ||
42 | { | ||
43 | int res; | ||
44 | #if ORDER > 16 | ||
45 | int cnt = ORDER>>4; | ||
46 | #endif | ||
47 | |||
/* ADDHALFREGS: sum = s1 + s2, computed separately on the low and on the
 * high 16 bit halves (each half wraps on its own). s1 is overwritten. */
48 | #define ADDHALFREGS(sum, s1, s2) /* Adds register */ \ | ||
49 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight */ \ | ||
50 | "add " #sum ", " #s1 ", " #s2 ", lsl #16 \n" /* Clobbers 's1' */ \ | ||
51 | "add " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \ | ||
52 | "mov " #s1 ", " #s1 ", lsl #16 \n" \ | ||
53 | "orr " #sum ", " #s1 ", " #sum ", lsr #16 \n" | ||
54 | |||
/* ADDHALFXREGS: 'sum' holds a data word on entry and the result on exit:
 * low half  = low half of sum  + high half of s1,
 * high half = high half of sum + low half of s2.
 * s1 is overwritten; s2 is only read, so its high half can act as 's1' of
 * the following invocation. */
55 | #define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ | ||
56 | "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ | ||
57 | "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ | ||
58 | "mov " #sum ", " #sum ", lsl #16 \n" \ | ||
59 | "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" | ||
60 | |||
61 | asm volatile ( | ||
/* With ORDER > 16 every multiply accumulates into res, so clear it first;
 * otherwise the first multiply is a plain smulbb that initialises res. */
62 | #if ORDER > 16 | ||
63 | "mov %[res], #0 \n" | ||
64 | #endif | ||
/* Bit 1 of f2 clear: f2 is word aligned, continue at 20:. */
65 | "tst %[f2], #2 \n" | ||
66 | "beq 20f \n" | ||
67 | |||
/* Path for f2 (and s2) not word aligned: fetch one halfword of each with
 * ldrh, then carry on with word loads. The high half of r3 (f2) and of
 * r4/r5 (s2, used alternately) carries the pending halfword from one
 * group of four elements to the next. */
68 | "10: \n" | ||
69 | "ldrh r4, [%[s2]], #2 \n" | ||
70 | "mov r4, r4, lsl #16 \n" | ||
71 | "ldrh r3, [%[f2]], #2 \n" | ||
72 | #if ORDER > 16 | ||
73 | "mov r3, r3, lsl #16 \n" | ||
74 | "1: \n" | ||
75 | "ldmia %[v1], {r0,r1} \n" | ||
76 | "smlabt %[res], r0, r3, %[res] \n" | ||
77 | #else | ||
78 | "ldmia %[v1], {r0,r1} \n" | ||
79 | "smulbb %[res], r0, r3 \n" | ||
80 | #endif | ||
81 | "ldmia %[f2]!, {r2,r3} \n" | ||
82 | "smlatb %[res], r0, r2, %[res] \n" | ||
83 | "smlabt %[res], r1, r2, %[res] \n" | ||
84 | "smlatb %[res], r1, r3, %[res] \n" | ||
85 | "ldmia %[s2]!, {r2,r5} \n" | ||
86 | ADDHALFXREGS(r0, r4, r2) | ||
87 | ADDHALFXREGS(r1, r2, r5) | ||
88 | "stmia %[v1]!, {r0,r1} \n" | ||
89 | "ldmia %[v1], {r0,r1} \n" | ||
90 | "smlabt %[res], r0, r3, %[res] \n" | ||
91 | "ldmia %[f2]!, {r2,r3} \n" | ||
92 | "smlatb %[res], r0, r2, %[res] \n" | ||
93 | "smlabt %[res], r1, r2, %[res] \n" | ||
94 | "smlatb %[res], r1, r3, %[res] \n" | ||
95 | "ldmia %[s2]!, {r2,r4} \n" | ||
96 | ADDHALFXREGS(r0, r5, r2) | ||
97 | ADDHALFXREGS(r1, r2, r4) | ||
98 | "stmia %[v1]!, {r0,r1} \n" | ||
99 | |||
100 | "ldmia %[v1], {r0,r1} \n" | ||
101 | "smlabt %[res], r0, r3, %[res] \n" | ||
102 | "ldmia %[f2]!, {r2,r3} \n" | ||
103 | "smlatb %[res], r0, r2, %[res] \n" | ||
104 | "smlabt %[res], r1, r2, %[res] \n" | ||
105 | "smlatb %[res], r1, r3, %[res] \n" | ||
106 | "ldmia %[s2]!, {r2,r5} \n" | ||
107 | ADDHALFXREGS(r0, r4, r2) | ||
108 | ADDHALFXREGS(r1, r2, r5) | ||
109 | "stmia %[v1]!, {r0,r1} \n" | ||
110 | "ldmia %[v1], {r0,r1} \n" | ||
111 | "smlabt %[res], r0, r3, %[res] \n" | ||
112 | "ldmia %[f2]!, {r2,r3} \n" | ||
113 | "smlatb %[res], r0, r2, %[res] \n" | ||
114 | "smlabt %[res], r1, r2, %[res] \n" | ||
115 | "smlatb %[res], r1, r3, %[res] \n" | ||
116 | "ldmia %[s2]!, {r2,r4} \n" | ||
117 | ADDHALFXREGS(r0, r5, r2) | ||
118 | ADDHALFXREGS(r1, r2, r4) | ||
119 | "stmia %[v1]!, {r0,r1} \n" | ||
120 | #if ORDER > 16 | ||
121 | "subs %[cnt], %[cnt], #1 \n" | ||
122 | "bne 1b \n" | ||
123 | #endif | ||
124 | "b 99f \n" | ||
125 | |||
/* Path for word aligned f2 (and s2): matching halves of the v1, f2 and s2
 * words pair up directly. */
126 | "20: \n" | ||
127 | "1: \n" | ||
128 | "ldmia %[v1], {r1,r2} \n" | ||
129 | "ldmia %[f2]!, {r3,r4} \n" | ||
130 | #if ORDER > 16 | ||
131 | "smlabb %[res], r1, r3, %[res] \n" | ||
132 | #else | ||
133 | "smulbb %[res], r1, r3 \n" | ||
134 | #endif | ||
135 | "smlatt %[res], r1, r3, %[res] \n" | ||
136 | "smlabb %[res], r2, r4, %[res] \n" | ||
137 | "smlatt %[res], r2, r4, %[res] \n" | ||
138 | "ldmia %[s2]!, {r3,r4} \n" | ||
139 | ADDHALFREGS(r0, r1, r3) | ||
140 | ADDHALFREGS(r1, r2, r4) | ||
141 | "stmia %[v1]!, {r0,r1} \n" | ||
142 | |||
143 | REPEAT_3( | ||
144 | "ldmia %[v1], {r1,r2} \n" | ||
145 | "ldmia %[f2]!, {r3,r4} \n" | ||
146 | "smlabb %[res], r1, r3, %[res] \n" | ||
147 | "smlatt %[res], r1, r3, %[res] \n" | ||
148 | "smlabb %[res], r2, r4, %[res] \n" | ||
149 | "smlatt %[res], r2, r4, %[res] \n" | ||
150 | "ldmia %[s2]!, {r3,r4} \n" | ||
151 | ADDHALFREGS(r0, r1, r3) | ||
152 | ADDHALFREGS(r1, r2, r4) | ||
153 | "stmia %[v1]!, {r0,r1} \n" | ||
154 | ) | ||
155 | #if ORDER > 16 | ||
156 | "subs %[cnt], %[cnt], #1 \n" | ||
157 | "bne 1b \n" | ||
158 | #endif | ||
159 | |||
/* Common exit of both paths. */
160 | "99: \n" | ||
/* The pointers (and cnt) are read-write operands because the asm advances
 * or decrements them; res is write-only. */
161 | : /* outputs */ | ||
162 | #if ORDER > 16 | ||
163 | [cnt]"+r"(cnt), | ||
164 | #endif | ||
165 | [v1] "+r"(v1), | ||
166 | [f2] "+r"(f2), | ||
167 | [s2] "+r"(s2), | ||
168 | [res]"=r"(res) | ||
169 | : /* inputs */ | ||
170 | : /* clobbers */ | ||
171 | "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory" | ||
172 | ); | ||
173 | return res; | ||
174 | } | ||
175 | |||
176 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) | ||
177 | * This version fetches data as 32 bit words, and *requires* v1 to be | ||
178 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit | ||
179 | * aligned or both unaligned. If either condition isn't met, it will either | ||
180 | * result in a data abort or incorrect results. */ | ||
/* Additional notes, taken from the code below:
 * - The return value is the sum of v1[i] * f2[i], formed from the v1
 *   values as loaded, i.e. before s2 is subtracted from them.
 * - Afterwards every processed v1[i] has been replaced by v1[i] - s2[i],
 *   with each 16 bit element wrapping independently.
 * - One pass handles 16 elements (4 groups of 4). With ORDER > 16 the pass
 *   runs ORDER>>4 times, otherwise exactly once.
 * - If bit 1 of f2 is set, the word loads of that path fetch one halfword
 *   beyond the last f2 and s2 element that is actually used.
 * - r0-r6 serve as fixed scratch registers and are listed as clobbers. */
181 | static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) | ||
182 | { | ||
183 | int res; | ||
184 | #if ORDER > 16 | ||
185 | int cnt = ORDER>>4; | ||
186 | #endif | ||
187 | |||
/* SUBHALFREGS: dif = s1 - s2, computed separately on the low and on the
 * high 16 bit halves (each half wraps on its own). s1 is overwritten. */
188 | #define SUBHALFREGS(dif, s1, s2) /* Subtracts reg. */ \ | ||
189 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight */ \ | ||
190 | "sub " #dif ", " #s1 ", " #s2 ", lsl #16 \n" /* Clobbers 's1' */ \ | ||
191 | "sub " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \ | ||
192 | "mov " #s1 ", " #s1 ", lsl #16 \n" \ | ||
193 | "orr " #dif ", " #s1 ", " #dif ", lsr #16 \n" | ||
194 | |||
/* SUBHALFXREGS: 'dif' holds a data word on entry and the result on exit:
 * low half  = low half of dif  - high half of s1,
 * high half = high half of dif - low half of s2.
 * s1 is overwritten; s2 is only read, so its high half can act as 's1' of
 * the following invocation. */
195 | #define SUBHALFXREGS(dif, s1, s2, msk) /* Subtracts reg. */ \ | ||
196 | "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ | ||
197 | "and " #s1 ", " #s1 ", " #msk " \n" /* Needs msk = */ \ | ||
198 | "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* 0x0000ffff, */ \ | ||
199 | "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" /* clobbers 's1'. */ | ||
200 | |||
201 | asm volatile ( | ||
/* With ORDER > 16 every multiply accumulates into res, so clear it first;
 * otherwise the first multiply is a plain smulbb that initialises res. */
202 | #if ORDER > 16 | ||
203 | "mov %[res], #0 \n" | ||
204 | #endif | ||
/* Bit 1 of f2 clear: f2 is word aligned, continue at 20:. */
205 | "tst %[f2], #2 \n" | ||
206 | "beq 20f \n" | ||
207 | |||
/* Path for f2 (and s2) not word aligned. r6 is set to 0x0000ffff, the mask
 * SUBHALFXREGS needs. One halfword of s2 and f2 is fetched with ldrh, then
 * word loads follow. The high half of r3 (f2) and of r4/r5 (s2, used
 * alternately) carries the pending halfword from one group to the next. */
208 | "10: \n" | ||
209 | "mov r6, #0xff \n" | ||
210 | "orr r6, r6, #0xff00 \n" | ||
211 | "ldrh r4, [%[s2]], #2 \n" | ||
212 | "mov r4, r4, lsl #16 \n" | ||
213 | "ldrh r3, [%[f2]], #2 \n" | ||
214 | #if ORDER > 16 | ||
215 | "mov r3, r3, lsl #16 \n" | ||
216 | "1: \n" | ||
217 | "ldmia %[v1], {r0,r1} \n" | ||
218 | "smlabt %[res], r0, r3, %[res] \n" | ||
219 | #else | ||
220 | "ldmia %[v1], {r0,r1} \n" | ||
221 | "smulbb %[res], r0, r3 \n" | ||
222 | #endif | ||
223 | "ldmia %[f2]!, {r2,r3} \n" | ||
224 | "smlatb %[res], r0, r2, %[res] \n" | ||
225 | "smlabt %[res], r1, r2, %[res] \n" | ||
226 | "smlatb %[res], r1, r3, %[res] \n" | ||
227 | "ldmia %[s2]!, {r2,r5} \n" | ||
228 | SUBHALFXREGS(r0, r4, r2, r6) | ||
229 | SUBHALFXREGS(r1, r2, r5, r6) | ||
230 | "stmia %[v1]!, {r0,r1} \n" | ||
231 | "ldmia %[v1], {r0,r1} \n" | ||
232 | "smlabt %[res], r0, r3, %[res] \n" | ||
233 | "ldmia %[f2]!, {r2,r3} \n" | ||
234 | "smlatb %[res], r0, r2, %[res] \n" | ||
235 | "smlabt %[res], r1, r2, %[res] \n" | ||
236 | "smlatb %[res], r1, r3, %[res] \n" | ||
237 | "ldmia %[s2]!, {r2,r4} \n" | ||
238 | SUBHALFXREGS(r0, r5, r2, r6) | ||
239 | SUBHALFXREGS(r1, r2, r4, r6) | ||
240 | "stmia %[v1]!, {r0,r1} \n" | ||
241 | |||
242 | "ldmia %[v1], {r0,r1} \n" | ||
243 | "smlabt %[res], r0, r3, %[res] \n" | ||
244 | "ldmia %[f2]!, {r2,r3} \n" | ||
245 | "smlatb %[res], r0, r2, %[res] \n" | ||
246 | "smlabt %[res], r1, r2, %[res] \n" | ||
247 | "smlatb %[res], r1, r3, %[res] \n" | ||
248 | "ldmia %[s2]!, {r2,r5} \n" | ||
249 | SUBHALFXREGS(r0, r4, r2, r6) | ||
250 | SUBHALFXREGS(r1, r2, r5, r6) | ||
251 | "stmia %[v1]!, {r0,r1} \n" | ||
252 | "ldmia %[v1], {r0,r1} \n" | ||
253 | "smlabt %[res], r0, r3, %[res] \n" | ||
254 | "ldmia %[f2]!, {r2,r3} \n" | ||
255 | "smlatb %[res], r0, r2, %[res] \n" | ||
256 | "smlabt %[res], r1, r2, %[res] \n" | ||
257 | "smlatb %[res], r1, r3, %[res] \n" | ||
258 | "ldmia %[s2]!, {r2,r4} \n" | ||
259 | SUBHALFXREGS(r0, r5, r2, r6) | ||
260 | SUBHALFXREGS(r1, r2, r4, r6) | ||
261 | "stmia %[v1]!, {r0,r1} \n" | ||
262 | #if ORDER > 16 | ||
263 | "subs %[cnt], %[cnt], #1 \n" | ||
264 | "bne 1b \n" | ||
265 | #endif | ||
266 | "b 99f \n" | ||
267 | |||
/* Path for word aligned f2 (and s2): matching halves of the v1, f2 and s2
 * words pair up directly. */
268 | "20: \n" | ||
269 | "1: \n" | ||
270 | "ldmia %[v1], {r1,r2} \n" | ||
271 | "ldmia %[f2]!, {r3,r4} \n" | ||
272 | #if ORDER > 16 | ||
273 | "smlabb %[res], r1, r3, %[res] \n" | ||
274 | #else | ||
275 | "smulbb %[res], r1, r3 \n" | ||
276 | #endif | ||
277 | "smlatt %[res], r1, r3, %[res] \n" | ||
278 | "smlabb %[res], r2, r4, %[res] \n" | ||
279 | "smlatt %[res], r2, r4, %[res] \n" | ||
280 | "ldmia %[s2]!, {r3,r4} \n" | ||
281 | SUBHALFREGS(r0, r1, r3) | ||
282 | SUBHALFREGS(r1, r2, r4) | ||
283 | "stmia %[v1]!, {r0,r1} \n" | ||
284 | |||
285 | REPEAT_3( | ||
286 | "ldmia %[v1], {r1,r2} \n" | ||
287 | "ldmia %[f2]!, {r3,r4} \n" | ||
288 | "smlabb %[res], r1, r3, %[res] \n" | ||
289 | "smlatt %[res], r1, r3, %[res] \n" | ||
290 | "smlabb %[res], r2, r4, %[res] \n" | ||
291 | "smlatt %[res], r2, r4, %[res] \n" | ||
292 | "ldmia %[s2]!, {r3,r4} \n" | ||
293 | SUBHALFREGS(r0, r1, r3) | ||
294 | SUBHALFREGS(r1, r2, r4) | ||
295 | "stmia %[v1]!, {r0,r1} \n" | ||
296 | ) | ||
297 | #if ORDER > 16 | ||
298 | "subs %[cnt], %[cnt], #1 \n" | ||
299 | "bne 1b \n" | ||
300 | #endif | ||
301 | |||
/* Common exit of both paths. */
302 | "99: \n" | ||
/* The pointers (and cnt) are read-write operands because the asm advances
 * or decrements them; res is write-only. */
303 | : /* outputs */ | ||
304 | #if ORDER > 16 | ||
305 | [cnt]"+r"(cnt), | ||
306 | #endif | ||
307 | [v1] "+r"(v1), | ||
308 | [f2] "+r"(f2), | ||
309 | [s2] "+r"(s2), | ||
310 | [res]"=r"(res) | ||
311 | : /* inputs */ | ||
312 | : /* clobbers */ | ||
313 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "cc", "memory" | ||
314 | ); | ||
315 | return res; | ||
316 | } | ||
317 | |||
318 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
319 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
320 | * incorrect results (if ARM aligncheck is disabled). */ | ||
/* Additional notes, taken from the code below:
 * - The return value is the sum of v1[i] * v2[i]. The asm only loads from
 *   v1 and v2; it contains no store instructions.
 * - One pass handles one hand written group plus REPEAT_MLA groups of four
 *   elements each: 32 elements when ORDER > 16, 16 otherwise. With
 *   ORDER > 32 the pass runs ORDER>>5 times, otherwise exactly once.
 * - If bit 1 of v2 is set, one halfword is fetched with ldrh first and the
 *   following word loads end up fetching one halfword beyond the last v2
 *   element that is actually used.
 * - r0-r3 serve as fixed scratch registers and are listed as clobbers. */
321 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | ||
322 | { | ||
323 | int res; | ||
324 | #if ORDER > 32 | ||
325 | int cnt = ORDER>>5; | ||
326 | #endif | ||
327 | |||
328 | asm volatile ( | ||
/* With ORDER > 32 every multiply accumulates into res, so clear it first;
 * otherwise the first multiply is a plain smulbb that initialises res. */
329 | #if ORDER > 32 | ||
330 | "mov %[res], #0 \n" | ||
331 | #endif | ||
/* Bit 1 of v2 clear: v2 is word aligned, continue at 20:. */
332 | "tst %[v2], #2 \n" | ||
333 | "beq 20f \n" | ||
334 | |||
/* Path for v2 not word aligned: the high half of r3 carries the pending
 * v2 halfword from one group of four elements to the next. */
335 | "10: \n" | ||
336 | "ldrh r3, [%[v2]], #2 \n" | ||
337 | #if ORDER > 32 | ||
338 | "mov r3, r3, lsl #16 \n" | ||
339 | "1: \n" | ||
340 | "ldmia %[v1]!, {r0,r1} \n" | ||
341 | "smlabt %[res], r0, r3, %[res] \n" | ||
342 | #else | ||
343 | "ldmia %[v1]!, {r0,r1} \n" | ||
344 | "smulbb %[res], r0, r3 \n" | ||
345 | #endif | ||
346 | "ldmia %[v2]!, {r2,r3} \n" | ||
347 | "smlatb %[res], r0, r2, %[res] \n" | ||
348 | "smlabt %[res], r1, r2, %[res] \n" | ||
349 | "smlatb %[res], r1, r3, %[res] \n" | ||
350 | |||
351 | REPEAT_MLA( | ||
352 | "ldmia %[v1]!, {r0,r1} \n" | ||
353 | "smlabt %[res], r0, r3, %[res] \n" | ||
354 | "ldmia %[v2]!, {r2,r3} \n" | ||
355 | "smlatb %[res], r0, r2, %[res] \n" | ||
356 | "smlabt %[res], r1, r2, %[res] \n" | ||
357 | "smlatb %[res], r1, r3, %[res] \n" | ||
358 | ) | ||
359 | #if ORDER > 32 | ||
360 | "subs %[cnt], %[cnt], #1 \n" | ||
361 | "bne 1b \n" | ||
362 | #endif | ||
363 | "b 99f \n" | ||
364 | |||
/* Path for word aligned v2: matching halves of the v1 and v2 words pair
 * up directly. */
365 | "20: \n" | ||
366 | "1: \n" | ||
367 | "ldmia %[v1]!, {r0,r1} \n" | ||
368 | "ldmia %[v2]!, {r2,r3} \n" | ||
369 | #if ORDER > 32 | ||
370 | "smlabb %[res], r0, r2, %[res] \n" | ||
371 | #else | ||
372 | "smulbb %[res], r0, r2 \n" | ||
373 | #endif | ||
374 | "smlatt %[res], r0, r2, %[res] \n" | ||
375 | "smlabb %[res], r1, r3, %[res] \n" | ||
376 | "smlatt %[res], r1, r3, %[res] \n" | ||
377 | |||
378 | REPEAT_MLA( | ||
379 | "ldmia %[v1]!, {r0,r1} \n" | ||
380 | "ldmia %[v2]!, {r2,r3} \n" | ||
381 | "smlabb %[res], r0, r2, %[res] \n" | ||
382 | "smlatt %[res], r0, r2, %[res] \n" | ||
383 | "smlabb %[res], r1, r3, %[res] \n" | ||
384 | "smlatt %[res], r1, r3, %[res] \n" | ||
385 | ) | ||
386 | #if ORDER > 32 | ||
387 | "subs %[cnt], %[cnt], #1 \n" | ||
388 | "bne 1b \n" | ||
389 | #endif | ||
390 | |||
/* Common exit of both paths. */
391 | "99: \n" | ||
/* The pointers (and cnt) are read-write operands because the asm advances
 * or decrements them; res is write-only. */
392 | : /* outputs */ | ||
393 | #if ORDER > 32 | ||
394 | [cnt]"+r"(cnt), | ||
395 | #endif | ||
396 | [v1] "+r"(v1), | ||
397 | [v2] "+r"(v2), | ||
398 | [res]"=r"(res) | ||
399 | : /* inputs */ | ||
400 | : /* clobbers */ | ||
401 | "r0", "r1", "r2", "r3", "cc", "memory" | ||
402 | ); | ||
403 | return res; | ||
404 | } | ||