Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_cf.h')
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h | 230
1 file changed, 190 insertions(+), 40 deletions(-)
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 937462c293..0c3aaca223 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 */
 
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
 static inline void vector_add(int16_t* v1, int16_t* v2)
 {
-#define ADDHALFREGS(s1, sum)           /* 's1' can be an A or D reg */     \
-        "move.l " #s1  ", %%d4   \n"   /* 'sum' must be a D reg */         \
-        "add.l  " #sum ", " #s1 "\n"   /* 's1' and %%d4 are clobbered! */  \
-        "clr.w    %%d4           \n"                                       \
-        "add.l    %%d4 , " #sum "\n"                                       \
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(s1, sum)           /* Add register halves straight. */ \
+        "move.l " #s1  ", %%d4   \n"   /* 's1' can be an A or D reg. */    \
+        "add.l  " #sum ", " #s1 "\n"   /* 'sum' must be a D reg. */        \
+        "clr.w    %%d4           \n"   /* 's1' and %%d4 are clobbered! */  \
+        "add.l    %%d4 , " #sum "\n"                                       \
+        "move.w " #s1  ", " #sum "\n"
+
+#define ADDHALFXREGS(s1, s2, sum)      /* Add register halves across. */    \
+        "clr.w  " #sum "         \n"   /* Needs 'sum' pre-swapped, swaps */ \
+        "add.l  " #s1 ", " #sum "\n"   /* 's2', and clobbers 's1'. */       \
+        "swap   " #s2 "          \n"   /* 's1' can be an A or D reg. */     \
+        "add.l  " #s2 ", " #s1  "\n"   /* 'sum' and 's2' must be D regs. */ \
         "move.w " #s1  ", " #sum "\n"
 
     asm volatile (
-#if ORDER > 16
-        "moveq.l %[cnt], %%d5    \n"
+        "move.l  %[v2], %%d0     \n"
+        "and.l   #2, %%d0        \n"
+        "jeq     20f             \n"
+
+    "10:                         \n"
+        "move.w  (%[v2])+, %%d0  \n"
+        "swap    %%d0            \n"
     "1:                          \n"
+        "movem.l (%[v1]), %%a0-%%a3  \n"
+        "movem.l (%[v2]), %%d1-%%d4  \n"
+        ADDHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l  %%d0, (%[v1])+  \n"
+        ADDHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l  %%d1, (%[v1])+  \n"
+        ADDHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l  %%d2, (%[v1])+  \n"
+        ADDHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l  %%d3, (%[v1])+  \n"
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "move.l  %%d4, %%d0      \n"
+
+        "movem.l (%[v1]), %%a0-%%a3  \n"
+        "movem.l (%[v2]), %%d1-%%d4  \n"
+        ADDHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l  %%d0, (%[v1])+  \n"
+        ADDHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l  %%d1, (%[v1])+  \n"
+        ADDHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l  %%d2, (%[v1])+  \n"
+        ADDHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l  %%d3, (%[v1])+  \n"
+#if ORDER > 16
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "move.l  %%d4, %%d0      \n"
+
+        "subq.l  #1, %[cnt]      \n"
+        "jne     1b              \n"
 #endif
+        "jra     99f             \n"
+
+    "20:                         \n"
+    "1:                          \n"
         "movem.l (%[v2]), %%a0-%%a3  \n"
         "movem.l (%[v1]), %%d0-%%d3  \n"
         ADDHALFREGS(%%a0, %%d0)
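In both the old and the new macros, the two int16_t values packed into one 32-bit word
are added lane by lane, so a carry out of the low half never reaches the high half.
A minimal C sketch of what ADDHALFREGS leaves in 'sum' (the helper name
add_halves_straight is mine, purely illustrative, not part of the patch):

    #include <stdint.h>

    /* Add the 16-bit lanes of 's1' and 'sum' independently; any carry
     * between lanes is discarded, matching ADDHALFREGS above. */
    static uint32_t add_halves_straight(uint32_t s1, uint32_t sum)
    {
        uint16_t lo = (uint16_t)(s1 + sum);                  /* low lanes  */
        uint16_t hi = (uint16_t)((s1 >> 16) + (sum >> 16));  /* high lanes */
        return ((uint32_t)hi << 16) | lo;
    }

ADDHALFXREGS performs the same lane-wise add, but combines the v1 word with one v2
halfword carried over from the previous 32-bit fetch and one from the current fetch,
which is what the misaligned path below needs.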
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
48 "move.l %%d2, (%[v1])+ \n" 99 "move.l %%d2, (%[v1])+ \n"
49 ADDHALFREGS(%%a3, %%d3) 100 ADDHALFREGS(%%a3, %%d3)
50 "move.l %%d3, (%[v1])+ \n" 101 "move.l %%d3, (%[v1])+ \n"
51
52 "lea.l (16, %[v2]), %[v2] \n" 102 "lea.l (16, %[v2]), %[v2] \n"
53 103
54 "movem.l (%[v2]), %%a0-%%a3 \n" 104 "movem.l (%[v2]), %%a0-%%a3 \n"
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
 #if ORDER > 16
         "lea.l   (16, %[v2]), %[v2]  \n"
 
-        "subq.l  #1, %%d5        \n"
-        "bne.w   1b              \n"
+        "subq.l  #1, %[cnt]      \n"
+        "jne     1b              \n"
 #endif
+    "99:                         \n"
         : /* outputs */
-        [v1]"+a"(v1),
-        [v2]"+a"(v2)
+#if ORDER > 16
+        [cnt]"+d"(cnt),
+#endif
+        [v1] "+a"(v1),
+        [v2] "+a"(v2)
         : /* inputs */
-        [cnt]"n"(ORDER>>4)
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4", "d5",
+        "d0", "d1", "d2", "d3", "d4",
         "a0", "a1", "a2", "a3", "memory"
     );
 }
 
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
 static inline void vector_sub(int16_t* v1, int16_t* v2)
 {
-#define SUBHALFREGS(min, sub, dif)     /* 'min' can be an A or D reg */      \
-        "move.l " #min ", " #dif "\n"  /* 'sub' and 'dif' must be D regs */  \
-        "sub.l  " #sub ", " #min "\n"  /* 'min' and 'sub' are clobbered! */  \
-        "clr.w  " #sub "          \n"                                        \
-        "sub.l  " #sub ", " #dif "\n"                                        \
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(min, sub, dif)     /* Subtract register halves straight. */ \
+        "move.l " #min ", " #dif "\n"  /* 'min' can be an A or D reg */         \
+        "sub.l  " #sub ", " #min "\n"  /* 'sub' and 'dif' must be D regs */     \
+        "clr.w  " #sub "          \n"  /* 'min' and 'sub' are clobbered! */     \
+        "sub.l  " #sub ", " #dif "\n"                                           \
         "move.w " #min ", " #dif "\n"
+
+#define SUBHALFXREGS(min, s2, s1d)     /* Subtract register halves across. */ \
+        "clr.w  " #s1d "          \n"  /* Needs 's1d' pre-swapped, swaps */    \
+        "sub.l  " #s1d ", " #min "\n"  /* 's2' and clobbers 'min'. */          \
+        "move.l " #min ", " #s1d "\n"  /* 'min' can be an A or D reg, */       \
+        "swap   " #s2 "           \n"  /* 's2' and 's1d' must be D regs. */    \
+        "sub.l  " #s2 ", " #min "\n"                                           \
+        "move.w " #min ", " #s1d "\n"
 
     asm volatile (
-#if ORDER > 16
-        "moveq.l %[cnt], %%d5    \n"
+        "move.l  %[v2], %%d0     \n"
+        "and.l   #2, %%d0        \n"
+        "jeq     20f             \n"
+
+    "10:                         \n"
+        "move.w  (%[v2])+, %%d0  \n"
+        "swap    %%d0            \n"
     "1:                          \n"
+        "movem.l (%[v2]), %%d1-%%d4  \n"
+        "movem.l (%[v1]), %%a0-%%a3  \n"
+        SUBHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l  %%d0, (%[v1])+  \n"
+        SUBHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l  %%d1, (%[v1])+  \n"
+        SUBHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l  %%d2, (%[v1])+  \n"
+        SUBHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l  %%d3, (%[v1])+  \n"
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "move.l  %%d4, %%d0      \n"
+
+        "movem.l (%[v2]), %%d1-%%d4  \n"
+        "movem.l (%[v1]), %%a0-%%a3  \n"
+        SUBHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l  %%d0, (%[v1])+  \n"
+        SUBHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l  %%d1, (%[v1])+  \n"
+        SUBHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l  %%d2, (%[v1])+  \n"
+        SUBHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l  %%d3, (%[v1])+  \n"
+#if ORDER > 16
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "move.l  %%d4, %%d0      \n"
+
+        "subq.l  #1, %[cnt]      \n"
+        "bne.w   1b              \n"
 #endif
+        "jra     99f             \n"
+
+    "20:                         \n"
+    "1:                          \n"
         "movem.l (%[v2]), %%d1-%%d4  \n"
         "movem.l (%[v1]), %%a0-%%a3  \n"
         SUBHALFREGS(%%a0, %%d1, %%d0)
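Both vector_add() and vector_sub() now begin with the same alignment test: v1 is
assumed to be 32-bit aligned, while v2 may be offset by one int16_t. A hedged C
sketch of that dispatch (v2_is_halfword_offset is my own name, not part of the patch):

    #include <stdint.h>

    /* Mirrors the "and.l #2, %%d0" / "jeq 20f" test at the top of the asm:
     * bit 1 of the v2 address selects the halfword-offset path ("10:"),
     * otherwise the word-aligned path ("20:") is taken. */
    static int v2_is_halfword_offset(const int16_t *v2)
    {
        return ((uintptr_t)v2 & 2) != 0;
    }

When the offset path is taken, one leading halfword of v2 is pre-loaded
("move.w (%[v2])+") so that all remaining v2 loads are word-aligned and the
*XREGS macros can recombine the halves.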
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
 #if ORDER > 16
         "lea.l   (16, %[v2]), %[v2]  \n"
 
-        "subq.l  #1, %%d5        \n"
+        "subq.l  #1, %[cnt]      \n"
         "bne.w   1b              \n"
 #endif
+
+    "99:                         \n"
         : /* outputs */
-        [v1]"+a"(v1),
-        [v2]"+a"(v2)
+#if ORDER > 16
+        [cnt]"+d"(cnt),
+#endif
+        [v1] "+a"(v1),
+        [v2] "+a"(v2)
         : /* inputs */
-        [cnt]"n"(ORDER>>4)
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4", "d5",
+        "d0", "d1", "d2", "d3", "d4",
         "a0", "a1", "a2", "a3", "memory"
     );
 }
 
 #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
 
-/* Needs EMAC in signed integer mode! */
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
+ * in signed integer mode - call above macro before use. */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
     int res = 0;
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
 
 #define MACBLOCK4                                      \
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \
-        "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \
-        "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \
-        "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+
+#define MACBLOCK4_U2                                   \
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 
     asm volatile (
+        "move.l  %[v2], %%d0     \n"
+        "and.l   #2, %%d0        \n"
+        "jeq     20f             \n"
+
+    "10:                         \n"
+        "move.l  (%[v1])+, %%d0  \n"
+        "move.w  (%[v2])+, %%d1  \n"
+    "1:                          \n"
+#if ORDER > 16
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+#endif
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
-        "moveq.l %[cnt], %[res]  \n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "subq.l  #1, %[res]      \n"
+        "bne.w   1b              \n"
+#else
+        "mac.w %%d0l, %%d1u, %%acc0  \n"
 #endif
+        "jra     99f             \n"
+
+    "20:                         \n"
         "move.l  (%[v1])+, %%d0  \n"
         "move.l  (%[v2])+, %%d1  \n"
     "1:                          \n"
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         MACBLOCK4
         MACBLOCK4
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
-        "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n"
+        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
-        "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n"
-        "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
-
+        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "subq.l  #1, %[res]      \n"
         "bne.w   1b              \n"
 #else
-        "mac.w %%d2u, %%d3u, %%acc0  \n"
-        "mac.w %%d2l, %%d3l, %%acc0  \n"
+        "mac.w %%d2u, %%d1u, %%acc0  \n"
+        "mac.w %%d2l, %%d1l, %%acc0  \n"
 #endif
+
+    "99:                         \n"
         "movclr.l %%acc0, %[res] \n"
         : /* outputs */
         [v1]"+a"(v1),
         [v2]"+a"(v2),
-        [res]"=&d"(res)
+        [res]"=d"(res)
         : /* inputs */
-        [cnt]"n"(ORDER>>5)
+#if ORDER > 32
+        [cnt]"[res]"(cnt)
+#endif
         : /* clobbers */
-        "d0", "d1", "d2", "d3"
+        "d0", "d1", "d2"
     );
     return res;
 }