diff options
author | Sean Bartell <wingedtachikoma@gmail.com> | 2011-06-25 21:32:25 -0400 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2012-04-25 22:13:20 +0200 |
commit | f40bfc9267b13b54e6379dfe7539447662879d24 (patch) | |
tree | 9b20069d5e62809ff434061ad730096836f916f2 /lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h | |
parent | a0009907de7a0107d49040d8a180f140e2eff299 (diff) | |
download | rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.tar.gz rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.zip |
Add codecs to librbcodec.
Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97
Reviewed-on: http://gerrit.rockbox.org/137
Reviewed-by: Nils Wallménius <nils@rockbox.org>
Tested-by: Nils Wallménius <nils@rockbox.org>
Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h')
-rw-r--r-- | lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h | 364 |
1 file changed, 364 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h new file mode 100644 index 0000000000..4d77d3be31 --- /dev/null +++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -0,0 +1,364 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | Coldfire vector math copyright (C) 2007 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
/* Advertise to callers that this platform provides the fused
 * scalarproduct-plus-vector-add/sub routines (vector_sp_add /
 * vector_sp_sub) in addition to plain scalarproduct(). */
#define FUSED_VECTOR_MATH

/* Must run before the routines below: put the ColdFire EMAC unit into
 * signed integer mode (MACSR = 0), which the mac.w accumulation in this
 * file depends on. */
#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */

/* Textual repetition helpers used to unroll the asm loop bodies below. */
#define REPEAT_2(x) x x
#define REPEAT_3(x) x x x
#define REPEAT_7(x) x x x x x x x
34 | |||
35 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) | ||
36 | * This version fetches data as 32 bit words, and *recommends* v1 to be | ||
37 | * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit | ||
38 | * aligned or both unaligned. Performance will suffer if either condition | ||
39 | * isn't met. It also needs EMAC in signed integer mode. */ | ||
/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode.
 *
 * Effect (as read from the asm): accumulates sum(v1[i] * f2[i]) into
 * %acc0 and, in the same pass, stores v1[i] + s2[i] back into v1[i]
 * (two packed 16-bit adds per 32-bit store, via the ADDHALF* helpers).
 * Processes 16 elements per loop iteration, so ORDER is assumed to be
 * a multiple of 16 -- NOTE(review): confirm against the generic C
 * reference implementation.
 *
 * v1: coefficient vector, updated in place.
 * f2: vector multiplied element-wise with v1.
 * s2: vector added element-wise to v1.
 * Returns the 32-bit scalar product. */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;   /* loop count: 16 elements per iteration */
#endif

/* Adds the high halves and the low halves of 's1' and 's2' as two
 * independent 16-bit sums, packed into 'sum'. */
#define ADDHALFREGS(s1, s2, sum)      /* Add register halves straight. */ \
        "move.l " #s1 ", " #sum "\n"  /* 's1' and 's2' can be A or D   */ \
        "add.l  " #s2 ", " #s1  "\n"  /* regs, 'sum' must be a D reg.  */ \
        "clr.w  " #sum "         \n"  /* 's1' is clobbered!            */ \
        "add.l  " #s2 ", " #sum "\n"                                      \
        "move.w " #s1 ", " #sum "\n"

/* Same, but pairs the high half of one operand with the low half of the
 * other (used on the halfword-misaligned path, where loaded words
 * straddle the element boundaries). */
#define ADDHALFXREGS(s1, s2, sum)     /* Add register halves across.     */ \
        "clr.w  " #sum "         \n"  /* Needs 'sum' pre-swapped, swaps  */ \
        "add.l  " #s1 ", " #sum "\n"  /* 's2', and clobbers 's1'.        */ \
        "swap   " #s2 "          \n"  /* 's1' can be an A or D reg.      */ \
        "add.l  " #s2 ", " #s1  "\n"  /* 'sum' and 's2' must be D regs.  */ \
        "move.w " #s1 ", " #sum "\n"

    asm volatile (
        /* Dispatch on halfword (mis)alignment of f2 (s2 is assumed to
         * have the same alignment, per the function comment). */
        "move.l  %[f2], %%d0                        \n"
        "and.l   #2, %%d0                           \n"
        "jeq     20f                                \n" /* aligned -> 20: */

        /* 10: misaligned path -- prime one halfword from each of f2/s2
         * so the subsequent 32-bit fetches stay aligned, then pair
         * halves "across" with ADDHALFXREGS. */
        "10:                                        \n"
        "move.w  (%[f2])+, %%d0                     \n"
        "move.w  (%[s2])+, %%d1                     \n"
        "swap    %%d1                               \n"
        "1:                                         \n"
        REPEAT_2(                                   /* 2 x 8 = 16 elements */
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1       \n" /* 8 v1 elements */
        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" /* MAC + parallel load */
        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
        ADDHALFXREGS(%%d6, %%d2, %%d1)
        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
        "move.l  %%d1, (%[v1])+                     \n" /* store v1 += s2 */
        ADDHALFXREGS(%%d7, %%d6, %%d2)
        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFXREGS(%%a0, %%d7, %%d6)
        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
        "move.l  %%d6, (%[v1])+                     \n"
        ADDHALFXREGS(%%a1, %%d1, %%d7)
        "move.l  %%d7, (%[v1])+                     \n"
        )

#if ORDER > 16
        "subq.l  #1, %[res]                         \n" /* %[res] doubles as counter */
        "bne.w   1b                                 \n"
#endif
        "jra     99f                                \n"

        /* 20: aligned path -- straight 32-bit fetches, halves paired
         * directly with ADDHALFREGS; two 8-element groups per pass. */
        "20:                                        \n"
        "move.l  (%[f2])+, %%d0                     \n"
        "1:                                         \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1       \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+                     \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1       \n" /* second 8 elements */
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w   %%d0l, %%a1l, %%acc0               \n" /* last MAC: no prefetch past end */
#endif
        "move.l  %%d2, (%[v1])+                     \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+                     \n"
#if ORDER > 16
        "subq.l  #1, %[res]                         \n"
        "bne.w   1b                                 \n"
#endif

        "99:                                        \n"
        "movclr.l %%acc0, %[res]                    \n" /* fetch & clear accumulator */
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)   /* matching constraint: cnt enters in the res reg */
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"

    );
    return res;
}
162 | |||
163 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) | ||
164 | * This version fetches data as 32 bit words, and *recommends* v1 to be | ||
165 | * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit | ||
166 | * aligned or both unaligned. Performance will suffer if either condition | ||
167 | * isn't met. It also needs EMAC in signed integer mode. */ | ||
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode.
 *
 * Effect (as read from the asm): accumulates sum(v1[i] * f2[i]) into
 * %acc0 and, in the same pass, stores v1[i] - s2[i] back into v1[i]
 * (two packed 16-bit subtractions per 32-bit store, via the SUBHALF*
 * helpers). Processes 16 elements per loop iteration, so ORDER is
 * assumed to be a multiple of 16 -- NOTE(review): confirm against the
 * generic C reference implementation.
 *
 * v1: coefficient vector, updated in place.
 * f2: vector multiplied element-wise with v1.
 * s2: vector subtracted element-wise from v1.
 * Returns the 32-bit scalar product. */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;   /* loop count: 16 elements per iteration */
#endif

/* Subtracts the high halves and the low halves of 'sub' from 'min' as
 * two independent 16-bit differences, packed into 'dif'. */
#define SUBHALFREGS(min, sub, dif)    /* Subtract register halves straight. */ \
        "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg         */ \
        "sub.l  " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs     */ \
        "clr.w  " #sub           "\n" /* 'min' and 'sub' are clobbered!     */ \
        "sub.l  " #sub ", " #dif "\n"                                          \
        "move.w " #min ", " #dif "\n"

/* Same, but pairs the high half of one operand with the low half of the
 * other (used on the halfword-misaligned path, where loaded words
 * straddle the element boundaries). */
#define SUBHALFXREGS(min, s2, s1d)    /* Subtract register halves across.  */ \
        "clr.w  " #s1d           "\n" /* Needs 's1d' pre-swapped, swaps    */ \
        "sub.l  " #s1d ", " #min "\n" /* 's2' and clobbers 'min'.          */ \
        "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg,       */ \
        "swap   " #s2            "\n" /* 's2' and 's1d' must be D regs.    */ \
        "sub.l  " #s2  ", " #min "\n"                                         \
        "move.w " #min ", " #s1d "\n"

    asm volatile (
        /* Dispatch on halfword (mis)alignment of f2 (s2 is assumed to
         * have the same alignment, per the function comment). */
        "move.l  %[f2], %%d0                        \n"
        "and.l   #2, %%d0                           \n"
        "jeq     20f                                \n" /* aligned -> 20: */

        /* 10: misaligned path -- prime one halfword from each of f2/s2
         * so the subsequent 32-bit fetches stay aligned, then pair
         * halves "across" with SUBHALFXREGS. */
        "10:                                        \n"
        "move.w  (%[f2])+, %%d0                     \n"
        "move.w  (%[s2])+, %%d1                     \n"
        "swap    %%d1                               \n"
        "1:                                         \n"
        REPEAT_2(                                   /* 2 x 8 = 16 elements */
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1       \n" /* 8 v1 elements */
        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" /* MAC + parallel load */
        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
        SUBHALFXREGS(%%d6, %%d2, %%d1)
        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
        "move.l  %%d1, (%[v1])+                     \n" /* store v1 -= s2 */
        SUBHALFXREGS(%%d7, %%d6, %%d2)
        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFXREGS(%%a0, %%d7, %%d6)
        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
        "move.l  %%d6, (%[v1])+                     \n"
        SUBHALFXREGS(%%a1, %%d1, %%d7)
        "move.l  %%d7, (%[v1])+                     \n"
        )

#if ORDER > 16
        "subq.l  #1, %[res]                         \n" /* %[res] doubles as counter */
        "bne.w   1b                                 \n"
#endif

        "jra     99f                                \n"

        /* 20: aligned path -- straight 32-bit fetches, halves paired
         * directly with SUBHALFREGS; two 8-element groups per pass. */
        "20:                                        \n"
        "move.l  (%[f2])+, %%d0                     \n"
        "1:                                         \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1       \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+                     \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1       \n" /* second 8 elements */
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w   %%d0l, %%a1l, %%acc0               \n" /* last MAC: no prefetch past end */
#endif
        "move.l  %%d2, (%[v1])+                     \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+                     \n"
#if ORDER > 16
        "subq.l  #1, %[res]                         \n"
        "bne.w   1b                                 \n"
#endif

        "99:                                        \n"
        "movclr.l %%acc0, %[res]                    \n" /* fetch & clear accumulator */
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)   /* matching constraint: cnt enters in the res reg */
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"

    );
    return res;
}
292 | |||
293 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
294 | * 32 bit aligned, otherwise performance will suffer. It also needs EMAC | ||
295 | * in signed integer mode. */ | ||
/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 * in signed integer mode.
 *
 * Plain (non-fused) scalar product: accumulates sum(v1[i] * v2[i]) into
 * %acc0 and returns it as a 32-bit integer. Neither vector is modified.
 * Processes 16 elements per loop iteration, so ORDER is assumed to be a
 * multiple of 16 -- NOTE(review): confirm against the generic C
 * reference implementation. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;   /* loop count: 16 elements per iteration */
#endif

    asm volatile (
        /* Dispatch on halfword (mis)alignment of v2. */
        "move.l  %[v2], %%d0                        \n"
        "and.l   #2, %%d0                           \n"
        "jeq     20f                                \n" /* aligned -> 20: */

        /* 10: misaligned path -- prime one v2 halfword so subsequent
         * 32-bit fetches stay aligned; halves are then paired "across"
         * (u with l) in each mac.w pair. */
        "10:                                        \n"
        "move.l  (%[v1])+, %%d0                     \n"
        "move.w  (%[v2])+, %%d1                     \n"
        "1:                                         \n"
        REPEAT_7(                                   /* 7 x 2 + 2 = 16 elements */
        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" /* MAC + parallel load */
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        )

        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "subq.l  #1, %[res]                         \n" /* %[res] doubles as counter */
        "bne.b   1b                                 \n"
#endif
        "mac.w   %%d0l, %%d1u, %%acc0               \n" /* last MAC: no prefetch past end */
#endif
        "jra     99f                                \n"

        /* 20: aligned path -- straight 32-bit fetches, halves paired
         * directly (u with u, l with l). */
        "20:                                        \n"
        "move.l  (%[v1])+, %%d0                     \n"
        "move.l  (%[v2])+, %%d1                     \n"
        "1:                                         \n"
        REPEAT_3(                                   /* 3 x 4 + 4 = 16 elements */
        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        )

        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "subq.l  #1, %[res]                         \n"
        "bne.b   1b                                 \n"
#else
        "mac.w   %%d2u, %%d1u, %%acc0               \n" /* last MACs: no prefetch past end */
        "mac.w   %%d2l, %%d1l, %%acc0               \n"
#endif

        "99:                                        \n"
        "movclr.l %%acc0, %[res]                    \n" /* fetch & clear accumulator */
        : /* outputs */
        [v1]"+a"(v1),
        [v2]"+a"(v2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)   /* matching constraint: cnt enters in the res reg */
#endif
        : /* clobbers */
        "d0", "d1", "d2"
    );
    return res;
}