Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h')
-rw-r--r--  lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h | 364
1 file changed, 364 insertions(+), 0 deletions(-)
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h
new file mode 100644
index 0000000000..4d77d3be31
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h
@@ -0,0 +1,364 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

Coldfire vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

#define FUSED_VECTOR_MATH

#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
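
/* Usage sketch (illustrative addition, not part of the original file): the
 * caller is expected to run PREPARE_SCALARPRODUCT once before invoking any
 * of the routines below, so that the EMAC status register is in the signed
 * integer configuration they assume, e.g.
 *
 *     PREPARE_SCALARPRODUCT
 *     sum = vector_sp_add(v1, f2, s2);
 */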

#define REPEAT_2(x) x x
#define REPEAT_3(x) x x x
#define REPEAT_7(x) x x x x x x x

/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define ADDHALFREGS(s1, s2, sum)  /* Add register halves straight. */ \
    "move.l " #s1 ", " #sum "\n"  /* 's1' and 's2' can be A or D */ \
    "add.l " #s2 ", " #s1 "\n"    /* regs, 'sum' must be a D reg. */ \
    "clr.w " #sum " \n"           /* 's1' is clobbered! */ \
    "add.l " #s2 ", " #sum "\n" \
    "move.w " #s1 ", " #sum "\n"

#define ADDHALFXREGS(s1, s2, sum)  /* Add register halves across. */ \
    "clr.w " #sum " \n"            /* Needs 'sum' pre-swapped, swaps */ \
    "add.l " #s1 ", " #sum "\n"    /* 's2', and clobbers 's1'. */ \
    "swap " #s2 " \n"              /* 's1' can be an A or D reg. */ \
    "add.l " #s2 ", " #s1 "\n"     /* 'sum' and 's2' must be D regs. */ \
    "move.w " #s1 ", " #sum "\n"

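    /* Illustrative note (added; not in the original source): both ADDHALF*
     * macros above implement a packed addition of two 16-bit lanes held in
     * 32-bit registers, with no carry propagating between the lanes, i.e.
     * roughly
     *
     *     sum_hi = (s1_hi + s2_hi) & 0xffff;
     *     sum_lo = (s1_lo + s2_lo) & 0xffff;
     *
     * ADDHALFREGS pairs the halves straight, while ADDHALFXREGS pairs them
     * "across" (high half of one operand with low half of the other), which
     * is what the halfword-misaligned path below requires. */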
    asm volatile (
        "move.l %[f2], %%d0 \n"
        "and.l #2, %%d0 \n"
        "jeq 20f \n"

        "10: \n"
        "move.w (%[f2])+, %%d0 \n"
        "move.w (%[s2])+, %%d1 \n"
        "swap %%d1 \n"
        "1: \n"
        REPEAT_2(
            "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
            "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
            ADDHALFXREGS(%%d6, %%d2, %%d1)
            "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
            "move.l %%d1, (%[v1])+ \n"
            ADDHALFXREGS(%%d7, %%d6, %%d2)
            "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
            "move.l %%d2, (%[v1])+ \n"
            ADDHALFXREGS(%%a0, %%d7, %%d6)
            "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
            "move.l %%d6, (%[v1])+ \n"
            ADDHALFXREGS(%%a1, %%d1, %%d7)
            "move.l %%d7, (%[v1])+ \n"
        )

#if ORDER > 16
        "subq.l #1, %[res] \n"
        "bne.w 1b \n"
#endif
        "jra 99f \n"

        "20: \n"
        "move.l (%[f2])+, %%d0 \n"
        "1: \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l %%d2, (%[v1])+ \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w %%d0l, %%a1l, %%acc0 \n"
#endif
        "move.l %%d2, (%[v1])+ \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l %%d2, (%[v1])+ \n"
#if ORDER > 16
        "subq.l #1, %[res] \n"
        "bne.w 1b \n"
#endif

        "99: \n"
        "movclr.l %%acc0, %[res] \n"
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"

    );
    return res;
}
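
/* For reference only (illustrative addition, not part of the original file):
 * a plain-C sketch of what vector_sp_add() computes, assuming ORDER is the
 * vector length in 16-bit samples as used elsewhere in libdemac. The name
 * vector_sp_add_ref is hypothetical. */
static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* scalar product uses the old v1 */
        v1[i] += s2[i];                /* fused in-place vector add */
    }
    return res;
}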

/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define SUBHALFREGS(min, sub, dif)  /* Subtract register halves straight. */ \
    "move.l " #min ", " #dif "\n"   /* 'min' can be an A or D reg */ \
    "sub.l " #sub ", " #min "\n"    /* 'sub' and 'dif' must be D regs */ \
    "clr.w " #sub "\n"              /* 'min' and 'sub' are clobbered! */ \
    "sub.l " #sub ", " #dif "\n" \
    "move.w " #min ", " #dif "\n"

#define SUBHALFXREGS(min, s2, s1d)  /* Subtract register halves across. */ \
    "clr.w " #s1d "\n"              /* Needs 's1d' pre-swapped, swaps */ \
    "sub.l " #s1d ", " #min "\n"    /* 's2' and clobbers 'min'. */ \
    "move.l " #min ", " #s1d "\n"   /* 'min' can be an A or D reg, */ \
    "swap " #s2 "\n"                /* 's2' and 's1d' must be D regs. */ \
    "sub.l " #s2 ", " #min "\n" \
    "move.w " #min ", " #s1d "\n"

    asm volatile (
        "move.l %[f2], %%d0 \n"
        "and.l #2, %%d0 \n"
        "jeq 20f \n"

        "10: \n"
        "move.w (%[f2])+, %%d0 \n"
        "move.w (%[s2])+, %%d1 \n"
        "swap %%d1 \n"
        "1: \n"
        REPEAT_2(
            "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
            "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
            SUBHALFXREGS(%%d6, %%d2, %%d1)
            "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
            "move.l %%d1, (%[v1])+ \n"
            SUBHALFXREGS(%%d7, %%d6, %%d2)
            "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
            "move.l %%d2, (%[v1])+ \n"
            SUBHALFXREGS(%%a0, %%d7, %%d6)
            "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
            "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
            "move.l %%d6, (%[v1])+ \n"
            SUBHALFXREGS(%%a1, %%d1, %%d7)
            "move.l %%d7, (%[v1])+ \n"
        )

#if ORDER > 16
        "subq.l #1, %[res] \n"
        "bne.w 1b \n"
#endif

        "jra 99f \n"

        "20: \n"
        "move.l (%[f2])+, %%d0 \n"
        "1: \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l %%d2, (%[v1])+ \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l %%d2, (%[v1])+ \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w %%d0l, %%a1l, %%acc0 \n"
#endif
        "move.l %%d2, (%[v1])+ \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l %%d2, (%[v1])+ \n"
#if ORDER > 16
        "subq.l #1, %[res] \n"
        "bne.w 1b \n"
#endif

        "99: \n"
        "movclr.l %%acc0, %[res] \n"
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"

    );
    return res;
}
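
/* For reference only (illustrative addition, not part of the original file):
 * a plain-C sketch of what vector_sp_sub() computes, under the same ORDER
 * assumption as the sketch above; the fused second vector is subtracted
 * instead of added. The name vector_sp_sub_ref is hypothetical. */
static inline int32_t vector_sp_sub_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* scalar product uses the old v1 */
        v1[i] -= s2[i];                /* fused in-place vector subtract */
    }
    return res;
}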

/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 * in signed integer mode. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

    asm volatile (
        "move.l %[v2], %%d0 \n"
        "and.l #2, %%d0 \n"
        "jeq 20f \n"

        "10: \n"
        "move.l (%[v1])+, %%d0 \n"
        "move.w (%[v2])+, %%d1 \n"
        "1: \n"
        REPEAT_7(
            "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
            "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        )

        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "subq.l #1, %[res] \n"
        "bne.b 1b \n"
#else
        "mac.w %%d0l, %%d1u, %%acc0 \n"
#endif
        "jra 99f \n"

        "20: \n"
        "move.l (%[v1])+, %%d0 \n"
        "move.l (%[v2])+, %%d1 \n"
        "1: \n"
        REPEAT_3(
            "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
            "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
            "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
            "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        )

        "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "subq.l #1, %[res] \n"
        "bne.b 1b \n"
#else
        "mac.w %%d2u, %%d1u, %%acc0 \n"
        "mac.w %%d2l, %%d1l, %%acc0 \n"
#endif

        "99: \n"
        "movclr.l %%acc0, %[res] \n"
        : /* outputs */
        [v1]"+a"(v1),
        [v2]"+a"(v2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2"
    );
    return res;
}
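
/* For reference only (illustrative addition, not part of the original file):
 * a plain-C sketch of the scalar product computed above, again assuming ORDER
 * 16-bit samples per vector. The name scalarproduct_ref is hypothetical. */
static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i];
    return res;
}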