Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_arm7.h')
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_arm7.h  293
1 file changed, 0 insertions, 293 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
deleted file mode 100644
index 653bb1f53f..0000000000
--- a/apps/codecs/demac/libdemac/vector_math16_arm7.h
+++ /dev/null
@@ -1,293 +0,0 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

ARM7 vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned; otherwise it will either cause a data abort or
 * produce incorrect results (if ARM alignment checking is disabled). */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
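    /* Each pass of the main loops below does two 4-word LDM bursts per
     * vector, i.e. 8 words = 16 int16_t elements, hence ORDER>>4
     * iterations. */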

#define ADDHALFREGS(sum, s1)                          /* Adds register    */ \
    "mov  " #s1 ",  " #s1 ",  ror #16           \n"   /* halves straight. */ \
    "add  r8     , " #s1 ",  " #sum ", lsl #16  \n"   /* Clobbers 's1'    */ \
    "add  " #sum ", " #s1 ",  " #sum ", lsr #16 \n"   /* and r8.          */ \
    "mov  " #sum ", " #sum ", lsl #16           \n"                          \
    "orr  " #sum ", " #sum ", r8     , lsr #16  \n"

#define ADDHALFXREGS(sum, s1, s2)                     /* Adds register    */ \
    "add  " #s1 ",  " #s1 ",  " #sum ", lsl #16 \n"   /* halves across.   */ \
    "add  " #sum ", " #s2 ",  " #sum ", lsr #16 \n"   /* Clobbers 's1'.   */ \
    "mov  " #sum ", " #sum ", lsl #16           \n"                          \
    "orr  " #sum ", " #sum ", " #s1 ",  lsr #16 \n"

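    /* Worked example (added illustration, not in the original source):
     * with sum = 0x00030004 and s1 = 0x00050006, ADDHALFREGS leaves
     * sum = 0x0008000a -- the two packed 16-bit halves are added
     * independently, and routing the low half through r8 keeps its
     * carry from spilling into the high half. ADDHALFXREGS does the
     * same but pairs the high half of one source word with the low half
     * of the next, which is what the halfword-misaligned path needs. */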
    asm volatile (
        "tst     %[v2], #2            \n"
        "beq     20f                  \n"
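        /* Bit 1 of the v2 pointer distinguishes the two paths: if set,
         * v2 is halfword-offset from word alignment, so the 10: path
         * first fetches the stray leading halfword into the top of r4
         * and combines straddling halves with ADDHALFXREGS; otherwise
         * execution branches to the word-aligned loop at 20:. */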

        "10:                          \n"
        "ldrh    r4, [%[v2]], #2      \n"
        "mov     r4, r4, lsl #16      \n"
        "1:                           \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r5-r8}      \n"
        ADDHALFXREGS(r0, r4, r5)
        ADDHALFXREGS(r1, r5, r6)
        ADDHALFXREGS(r2, r6, r7)
        ADDHALFXREGS(r3, r7, r8)
        "stmia   %[v1]!, {r0-r3}      \n"
        "mov     r4, r8               \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r5-r8}      \n"
        ADDHALFXREGS(r0, r4, r5)
        ADDHALFXREGS(r1, r5, r6)
        ADDHALFXREGS(r2, r6, r7)
        ADDHALFXREGS(r3, r7, r8)
        "stmia   %[v1]!, {r0-r3}      \n"
#if ORDER > 16
        "mov     r4, r8               \n"
        "subs    %[cnt], %[cnt], #1   \n"
        "bne     1b                   \n"
#endif
        "b       99f                  \n"

        "20:                          \n"
        "1:                           \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        ADDHALFREGS(r0, r4)
        ADDHALFREGS(r1, r5)
        ADDHALFREGS(r2, r6)
        ADDHALFREGS(r3, r7)
        "stmia   %[v1]!, {r0-r3}      \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        ADDHALFREGS(r0, r4)
        ADDHALFREGS(r1, r5)
        ADDHALFREGS(r2, r6)
        ADDHALFREGS(r3, r7)
        "stmia   %[v1]!, {r0-r3}      \n"
#if ORDER > 16
        "subs    %[cnt], %[cnt], #1   \n"
        "bne     1b                   \n"
#endif

        "99:                          \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4",
        "r5", "r6", "r7", "r8", "memory"
    );
}
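
/* For reference, a portable C sketch of what vector_add computes
 * (vector_add_c is an illustrative helper added here, not part of the
 * original libdemac source): element-wise 16-bit addition over ORDER
 * elements, with no alignment requirement on v1. */
static inline void vector_add_c(int16_t* v1, int16_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] += v2[i];   /* wraps modulo 2^16, like the asm above */
}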

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned; otherwise it will either cause a data abort or
 * produce incorrect results (if ARM alignment checking is disabled). */
static inline void vector_sub(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define SUBHALFREGS(dif, s1)                          /* Subtracts register     */ \
    "sub  r8     , " #dif ", " #s1 "            \n"   /* halves straight.       */ \
    "and  r8     , r8     , r9                  \n"   /* Needs r9 = 0x0000ffff, */ \
    "mov  " #dif ", " #dif ", lsr #16           \n"   /* clobbers r8.           */ \
    "sub  " #dif ", " #dif ", " #s1 ",  lsr #16 \n"                                \
    "orr  " #dif ", r8     , " #dif ",  lsl #16 \n"

#define SUBHALFXREGS(dif, s1, s2)                     /* Subtracts register     */ \
    "sub  " #s1 ",  " #dif ", " #s1 ",  lsr #16 \n"   /* halves across.         */ \
    "and  " #s1 ",  " #s1 ",  r9                \n"   /* Needs r9 = 0x0000ffff, */ \
    "rsb  " #dif ", " #s2 ",  " #dif ", lsr #16 \n"   /* clobbers 's1'.         */ \
    "orr  " #dif ", " #s1 ",  " #dif ", lsl #16 \n"

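    /* Worked example (added illustration): with dif = 0x00080003,
     * s1 = 0x00050006 and r9 preloaded with 0x0000ffff, SUBHALFREGS
     * leaves dif = 0x0003fffd -- each packed 16-bit half is subtracted
     * independently, the AND with r9 masking off the borrow that the
     * full 32-bit SUB lets ripple into the high half. SUBHALFXREGS is
     * the cross-half variant used on the misaligned path. */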
    asm volatile (
        "mov     r9, #0xff            \n"
        "orr     r9, r9, #0xff00      \n"
        "tst     %[v2], #2            \n"
        "beq     20f                  \n"

        "10:                          \n"
        "ldrh    r4, [%[v2]], #2      \n"
        "mov     r4, r4, lsl #16      \n"
        "1:                           \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r5-r8}      \n"
        SUBHALFXREGS(r0, r4, r5)
        SUBHALFXREGS(r1, r5, r6)
        SUBHALFXREGS(r2, r6, r7)
        SUBHALFXREGS(r3, r7, r8)
        "stmia   %[v1]!, {r0-r3}      \n"
        "mov     r4, r8               \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r5-r8}      \n"
        SUBHALFXREGS(r0, r4, r5)
        SUBHALFXREGS(r1, r5, r6)
        SUBHALFXREGS(r2, r6, r7)
        SUBHALFXREGS(r3, r7, r8)
        "stmia   %[v1]!, {r0-r3}      \n"
#if ORDER > 16
        "mov     r4, r8               \n"
        "subs    %[cnt], %[cnt], #1   \n"
        "bne     1b                   \n"
#endif
        "b       99f                  \n"

        "20:                          \n"
        "1:                           \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        SUBHALFREGS(r0, r4)
        SUBHALFREGS(r1, r5)
        SUBHALFREGS(r2, r6)
        SUBHALFREGS(r3, r7)
        "stmia   %[v1]!, {r0-r3}      \n"
        "ldmia   %[v1], {r0-r3}       \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        SUBHALFREGS(r0, r4)
        SUBHALFREGS(r1, r5)
        SUBHALFREGS(r2, r6)
        SUBHALFREGS(r3, r7)
        "stmia   %[v1]!, {r0-r3}      \n"
#if ORDER > 16
        "subs    %[cnt], %[cnt], #1   \n"
        "bne     1b                   \n"
#endif

        "99:                          \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4", "r5",
        "r6", "r7", "r8", "r9", "memory"
    );
}
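
/* Again for reference, a portable C sketch of vector_sub
 * (vector_sub_c is an illustrative helper, not part of the original
 * source): v1[i] -= v2[i] over ORDER elements, wrapping modulo 2^16. */
static inline void vector_sub_c(int16_t* v1, int16_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] -= v2[i];
}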

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned; otherwise it will either cause a data abort or
 * produce incorrect results (if ARM alignment checking is disabled).
 * It is optimised for ARM7TDMI. Using it on ARM9 or higher gives worse
 * performance than the C version. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res = 0;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define MLABLOCK2(f1, f2)                 \
    "mov  r8, " #f1 ", lsl #16      \n"   \
    "mov  r8, r8     , asr #16      \n"   \
    "mov  r9, " #f2 ", lsl #16      \n"   \
    "mov  r9, r9     , asr #16      \n"   \
    "mla  %[res], r9, r8, %[res]    \n"   \
    "mov  r8, " #f1 ", asr #16      \n"   \
    "mov  r9, " #f2 ", asr #16      \n"   \
    "mla  %[res], r9, r8, %[res]    \n"

#define MLABLOCK2_U2(f1, f2)              \
    "mov  r8, " #f1 ", lsl #16      \n"   \
    "mov  r8, r8     , asr #16      \n"   \
    "mla  %[res], r9, r8, %[res]    \n"   \
    "mov  r8, " #f1 ", asr #16      \n"   \
    "mov  r9, " #f2 ", lsl #16      \n"   \
    "mov  r9, r9     , asr #16      \n"   \
    "mla  %[res], r9, r8, %[res]    \n"   \
    "mov  r9, " #f2 ", asr #16      \n"

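    /* Each MLABLOCK2 consumes one 32-bit word from each vector, i.e.
     * two int16_t elements: lsl #16 followed by asr #16 sign-extends
     * the low halves, a plain asr #16 extracts the high halves, and two
     * MLAs accumulate both products into res. MLABLOCK2_U2 is the
     * variant for a halfword-misaligned v2: on entry r9 already holds
     * the previous sign-extended v2 halfword, and on exit it leaves the
     * next one there for the following block. */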
    asm volatile (
        "tst     %[v2], #2            \n"
        "beq     20f                  \n"

        "10:                          \n"
        "ldrsh   r9, [%[v2]], #2      \n"
        "1:                           \n"
        "ldmia   %[v1]!, {r0-r3}      \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        MLABLOCK2_U2(r0, r4)
        MLABLOCK2_U2(r1, r5)
        MLABLOCK2_U2(r2, r6)
        MLABLOCK2_U2(r3, r7)
        "ldmia   %[v1]!, {r0-r3}      \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        MLABLOCK2_U2(r0, r4)
        MLABLOCK2_U2(r1, r5)
        MLABLOCK2_U2(r2, r6)
        MLABLOCK2_U2(r3, r7)
#if ORDER > 16
        "subs    %[cnt], %[cnt], #1   \n"
        "bne     1b                   \n"
#endif
        "b       99f                  \n"

        "20:                          \n"
        "1:                           \n"
        "ldmia   %[v1]!, {r0-r3}      \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        MLABLOCK2(r0, r4)
        MLABLOCK2(r1, r5)
        MLABLOCK2(r2, r6)
        MLABLOCK2(r3, r7)
        "ldmia   %[v1]!, {r0-r3}      \n"
        "ldmia   %[v2]!, {r4-r7}      \n"
        MLABLOCK2(r0, r4)
        MLABLOCK2(r1, r5)
        MLABLOCK2(r2, r6)
        MLABLOCK2(r3, r7)
#if ORDER > 16
        "subs    %[cnt], %[cnt], #1   \n"
        "bne     1b                   \n"
#endif

        "99:                          \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"+r"(res)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4",
        "r5", "r6", "r7", "r8", "r9"
    );
    return res;
}
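
/* And a portable C sketch of scalarproduct (scalarproduct_c is an
 * illustrative helper, not part of the original source): the sum of
 * products accumulates in 32 bits, as the MLAs above do. */
static inline int32_t scalarproduct_c(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i];
    return res;
}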