diff options
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_cf.h')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_cf.h | 230 |
1 files changed, 190 insertions, 40 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 937462c293..0c3aaca223 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
28 | * 32 bit aligned, otherwise performance will suffer. */ | ||
27 | static inline void vector_add(int16_t* v1, int16_t* v2) | 29 | static inline void vector_add(int16_t* v1, int16_t* v2) |
28 | { | 30 | { |
29 | #define ADDHALFREGS(s1, sum) /* 's1' can be an A or D reg */ \ | 31 | #if ORDER > 16 |
30 | "move.l " #s1 ", %%d4 \n" /* 'sum' must be a D reg */ \ | 32 | int cnt = ORDER>>4; |
31 | "add.l " #sum ", " #s1 "\n" /* 's1' and %%d4 are clobbered! */ \ | 33 | #endif |
32 | "clr.w %%d4 \n" \ | 34 | |
33 | "add.l %%d4 , " #sum "\n" \ | 35 | #define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \ |
36 | "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \ | ||
37 | "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \ | ||
38 | "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \ | ||
39 | "add.l %%d4 , " #sum "\n" \ | ||
40 | "move.w " #s1 ", " #sum "\n" | ||
41 | |||
42 | #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \ | ||
43 | "clr.w " #sum " \n" /* Needs 'sum' pre-swapped, swaps */ \ | ||
44 | "add.l " #s1 ", " #sum "\n" /* 's2', and clobbers 's1'. */ \ | ||
45 | "swap " #s2 " \n" /* 's1' can be an A or D reg. */ \ | ||
46 | "add.l " #s2 ", " #s1 "\n" /* 'sum' and 's2' must be D regs. */ \ | ||
34 | "move.w " #s1 ", " #sum "\n" | 47 | "move.w " #s1 ", " #sum "\n" |
35 | 48 | ||
36 | asm volatile ( | 49 | asm volatile ( |
37 | #if ORDER > 16 | 50 | "move.l %[v2], %%d0 \n" |
38 | "moveq.l %[cnt], %%d5 \n" | 51 | "and.l #2, %%d0 \n" |
52 | "jeq 20f \n" | ||
53 | |||
54 | "10: \n" | ||
55 | "move.w (%[v2])+, %%d0 \n" | ||
56 | "swap %%d0 \n" | ||
39 | "1: \n" | 57 | "1: \n" |
58 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
59 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
60 | ADDHALFXREGS(%%a0, %%d1, %%d0) | ||
61 | "move.l %%d0, (%[v1])+ \n" | ||
62 | ADDHALFXREGS(%%a1, %%d2, %%d1) | ||
63 | "move.l %%d1, (%[v1])+ \n" | ||
64 | ADDHALFXREGS(%%a2, %%d3, %%d2) | ||
65 | "move.l %%d2, (%[v1])+ \n" | ||
66 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
67 | "move.l %%d3, (%[v1])+ \n" | ||
68 | "lea.l (16, %[v2]), %[v2] \n" | ||
69 | "move.l %%d4, %%d0 \n" | ||
70 | |||
71 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
72 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
73 | ADDHALFXREGS(%%a0, %%d1, %%d0) | ||
74 | "move.l %%d0, (%[v1])+ \n" | ||
75 | ADDHALFXREGS(%%a1, %%d2, %%d1) | ||
76 | "move.l %%d1, (%[v1])+ \n" | ||
77 | ADDHALFXREGS(%%a2, %%d3, %%d2) | ||
78 | "move.l %%d2, (%[v1])+ \n" | ||
79 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
80 | "move.l %%d3, (%[v1])+ \n" | ||
81 | #if ORDER > 16 | ||
82 | "lea.l (16, %[v2]), %[v2] \n" | ||
83 | "move.l %%d4, %%d0 \n" | ||
84 | |||
85 | "subq.l #1, %[cnt] \n" | ||
86 | "jne 1b \n" | ||
40 | #endif | 87 | #endif |
88 | "jra 99f \n" | ||
89 | |||
90 | "20: \n" | ||
91 | "1: \n" | ||
41 | "movem.l (%[v2]), %%a0-%%a3 \n" | 92 | "movem.l (%[v2]), %%a0-%%a3 \n" |
42 | "movem.l (%[v1]), %%d0-%%d3 \n" | 93 | "movem.l (%[v1]), %%d0-%%d3 \n" |
43 | ADDHALFREGS(%%a0, %%d0) | 94 | ADDHALFREGS(%%a0, %%d0) |
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
48 | "move.l %%d2, (%[v1])+ \n" | 99 | "move.l %%d2, (%[v1])+ \n" |
49 | ADDHALFREGS(%%a3, %%d3) | 100 | ADDHALFREGS(%%a3, %%d3) |
50 | "move.l %%d3, (%[v1])+ \n" | 101 | "move.l %%d3, (%[v1])+ \n" |
51 | |||
52 | "lea.l (16, %[v2]), %[v2] \n" | 102 | "lea.l (16, %[v2]), %[v2] \n" |
53 | 103 | ||
54 | "movem.l (%[v2]), %%a0-%%a3 \n" | 104 | "movem.l (%[v2]), %%a0-%%a3 \n" |
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
64 | #if ORDER > 16 | 114 | #if ORDER > 16 |
65 | "lea.l (16, %[v2]), %[v2] \n" | 115 | "lea.l (16, %[v2]), %[v2] \n" |
66 | 116 | ||
67 | "subq.l #1, %%d5 \n" | 117 | "subq.l #1, %[cnt] \n" |
68 | "bne.w 1b \n" | 118 | "jne 1b \n" |
69 | #endif | 119 | #endif |
120 | "99: \n" | ||
70 | : /* outputs */ | 121 | : /* outputs */ |
71 | [v1]"+a"(v1), | 122 | #if ORDER > 16 |
72 | [v2]"+a"(v2) | 123 | [cnt]"+d"(cnt), |
124 | #endif | ||
125 | [v1] "+a"(v1), | ||
126 | [v2] "+a"(v2) | ||
73 | : /* inputs */ | 127 | : /* inputs */ |
74 | [cnt]"n"(ORDER>>4) | ||
75 | : /* clobbers */ | 128 | : /* clobbers */ |
76 | "d0", "d1", "d2", "d3", "d4", "d5", | 129 | "d0", "d1", "d2", "d3", "d4", |
77 | "a0", "a1", "a2", "a3", "memory" | 130 | "a0", "a1", "a2", "a3", "memory" |
78 | ); | 131 | ); |
79 | } | 132 | } |
80 | 133 | ||
134 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
135 | * 32 bit aligned, otherwise performance will suffer. */ | ||
81 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 136 | static inline void vector_sub(int16_t* v1, int16_t* v2) |
82 | { | 137 | { |
83 | #define SUBHALFREGS(min, sub, dif) /* 'min' can be an A or D reg */ \ | 138 | #if ORDER > 16 |
84 | "move.l " #min ", " #dif "\n" /* 'sub' and 'dif' must be D regs */ \ | 139 | int cnt = ORDER>>4; |
85 | "sub.l " #sub ", " #min "\n" /* 'min' and 'sub' are clobbered! */ \ | 140 | #endif |
86 | "clr.w " #sub "\n" \ | 141 | |
87 | "sub.l " #sub ", " #dif "\n" \ | 142 | #define SUBHALFREGS(min, sub, dif) /* Subtract register halves straight. */ \ |
143 | "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */ \ | ||
144 | "sub.l " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */ \ | ||
145 | "clr.w " #sub "\n" /* 'min' and 'sub' are clobbered! */ \ | ||
146 | "sub.l " #sub ", " #dif "\n" \ | ||
88 | "move.w " #min ", " #dif "\n" | 147 | "move.w " #min ", " #dif "\n" |
148 | |||
149 | #define SUBHALFXREGS(min, s2, s1d) /* Subtract register halves across. */ \ | ||
150 | "clr.w " #s1d "\n" /* Needs 's1d' pre-swapped, swaps */ \ | ||
151 | "sub.l " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */ \ | ||
152 | "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */ \ | ||
153 | "swap " #s2 "\n" /* 's2' and 's1d' must be D regs. */ \ | ||
154 | "sub.l " #s2 ", " #min "\n" \ | ||
155 | "move.w " #min ", " #s1d "\n" | ||
89 | 156 | ||
90 | asm volatile ( | 157 | asm volatile ( |
91 | #if ORDER > 16 | 158 | "move.l %[v2], %%d0 \n" |
92 | "moveq.l %[cnt], %%d5 \n" | 159 | "and.l #2, %%d0 \n" |
160 | "jeq 20f \n" | ||
161 | |||
162 | "10: \n" | ||
163 | "move.w (%[v2])+, %%d0 \n" | ||
164 | "swap %%d0 \n" | ||
93 | "1: \n" | 165 | "1: \n" |
166 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
167 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
168 | SUBHALFXREGS(%%a0, %%d1, %%d0) | ||
169 | "move.l %%d0, (%[v1])+ \n" | ||
170 | SUBHALFXREGS(%%a1, %%d2, %%d1) | ||
171 | "move.l %%d1, (%[v1])+ \n" | ||
172 | SUBHALFXREGS(%%a2, %%d3, %%d2) | ||
173 | "move.l %%d2, (%[v1])+ \n" | ||
174 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
175 | "move.l %%d3, (%[v1])+ \n" | ||
176 | "lea.l (16, %[v2]), %[v2] \n" | ||
177 | "move.l %%d4, %%d0 \n" | ||
178 | |||
179 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
180 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
181 | SUBHALFXREGS(%%a0, %%d1, %%d0) | ||
182 | "move.l %%d0, (%[v1])+ \n" | ||
183 | SUBHALFXREGS(%%a1, %%d2, %%d1) | ||
184 | "move.l %%d1, (%[v1])+ \n" | ||
185 | SUBHALFXREGS(%%a2, %%d3, %%d2) | ||
186 | "move.l %%d2, (%[v1])+ \n" | ||
187 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
188 | "move.l %%d3, (%[v1])+ \n" | ||
189 | #if ORDER > 16 | ||
190 | "lea.l (16, %[v2]), %[v2] \n" | ||
191 | "move.l %%d4, %%d0 \n" | ||
192 | |||
193 | "subq.l #1, %[cnt] \n" | ||
194 | "bne.w 1b \n" | ||
94 | #endif | 195 | #endif |
196 | "jra 99f \n" | ||
197 | |||
198 | "20: \n" | ||
199 | "1: \n" | ||
95 | "movem.l (%[v2]), %%d1-%%d4 \n" | 200 | "movem.l (%[v2]), %%d1-%%d4 \n" |
96 | "movem.l (%[v1]), %%a0-%%a3 \n" | 201 | "movem.l (%[v1]), %%a0-%%a3 \n" |
97 | SUBHALFREGS(%%a0, %%d1, %%d0) | 202 | SUBHALFREGS(%%a0, %%d1, %%d0) |
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
118 | #if ORDER > 16 | 223 | #if ORDER > 16 |
119 | "lea.l (16, %[v2]), %[v2] \n" | 224 | "lea.l (16, %[v2]), %[v2] \n" |
120 | 225 | ||
121 | "subq.l #1, %%d5 \n" | 226 | "subq.l #1, %[cnt] \n" |
122 | "bne.w 1b \n" | 227 | "bne.w 1b \n" |
123 | #endif | 228 | #endif |
229 | |||
230 | "99: \n" | ||
124 | : /* outputs */ | 231 | : /* outputs */ |
125 | [v1]"+a"(v1), | 232 | #if ORDER > 16 |
126 | [v2]"+a"(v2) | 233 | [cnt]"+d"(cnt), |
234 | #endif | ||
235 | [v1] "+a"(v1), | ||
236 | [v2] "+a"(v2) | ||
127 | : /* inputs */ | 237 | : /* inputs */ |
128 | [cnt]"n"(ORDER>>4) | ||
129 | : /* clobbers */ | 238 | : /* clobbers */ |
130 | "d0", "d1", "d2", "d3", "d4", "d5", | 239 | "d0", "d1", "d2", "d3", "d4", |
131 | "a0", "a1", "a2", "a3", "memory" | 240 | "a0", "a1", "a2", "a3", "memory" |
132 | ); | 241 | ); |
133 | } | 242 | } |
134 | 243 | ||
135 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ | 244 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ |
136 | 245 | ||
137 | /* Needs EMAC in signed integer mode! */ | 246 | /* This version fetches data as 32 bit words, and *recommends* v1 to be |
247 | * 32 bit aligned, otherwise performance will suffer. It also needs EMAC | ||
248 | * in signed integer mode - call above macro before use. */ | ||
138 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 249 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
139 | { | 250 | { |
140 | int res = 0; | 251 | int res = 0; |
252 | #if ORDER > 32 | ||
253 | int cnt = ORDER>>5; | ||
254 | #endif | ||
141 | 255 | ||
142 | #define MACBLOCK4 \ | 256 | #define MACBLOCK4 \ |
143 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ | 257 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ |
144 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \ | 258 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ |
145 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \ | 259 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ |
146 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | 260 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
261 | |||
262 | #define MACBLOCK4_U2 \ | ||
263 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
264 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ | ||
265 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
266 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
147 | 267 | ||
148 | asm volatile ( | 268 | asm volatile ( |
269 | "move.l %[v2], %%d0 \n" | ||
270 | "and.l #2, %%d0 \n" | ||
271 | "jeq 20f \n" | ||
272 | |||
273 | "10: \n" | ||
274 | "move.l (%[v1])+, %%d0 \n" | ||
275 | "move.w (%[v2])+, %%d1 \n" | ||
276 | "1: \n" | ||
277 | #if ORDER > 16 | ||
278 | MACBLOCK4_U2 | ||
279 | MACBLOCK4_U2 | ||
280 | MACBLOCK4_U2 | ||
281 | MACBLOCK4_U2 | ||
282 | #endif | ||
283 | MACBLOCK4_U2 | ||
284 | MACBLOCK4_U2 | ||
285 | MACBLOCK4_U2 | ||
286 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
287 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
288 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
149 | #if ORDER > 32 | 289 | #if ORDER > 32 |
150 | "moveq.l %[cnt], %[res] \n" | 290 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
291 | "subq.l #1, %[res] \n" | ||
292 | "bne.w 1b \n" | ||
293 | #else | ||
294 | "mac.w %%d0l, %%d1u, %%acc0 \n" | ||
151 | #endif | 295 | #endif |
296 | "jra 99f \n" | ||
297 | |||
298 | "20: \n" | ||
152 | "move.l (%[v1])+, %%d0 \n" | 299 | "move.l (%[v1])+, %%d0 \n" |
153 | "move.l (%[v2])+, %%d1 \n" | 300 | "move.l (%[v2])+, %%d1 \n" |
154 | "1: \n" | 301 | "1: \n" |
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
162 | MACBLOCK4 | 309 | MACBLOCK4 |
163 | MACBLOCK4 | 310 | MACBLOCK4 |
164 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | 311 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
165 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" | 312 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
166 | #if ORDER > 32 | 313 | #if ORDER > 32 |
167 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" | 314 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
168 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | 315 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
169 | |||
170 | "subq.l #1, %[res] \n" | 316 | "subq.l #1, %[res] \n" |
171 | "bne.w 1b \n" | 317 | "bne.w 1b \n" |
172 | #else | 318 | #else |
173 | "mac.w %%d2u, %%d3u, %%acc0 \n" | 319 | "mac.w %%d2u, %%d1u, %%acc0 \n" |
174 | "mac.w %%d2l, %%d3l, %%acc0 \n" | 320 | "mac.w %%d2l, %%d1l, %%acc0 \n" |
175 | #endif | 321 | #endif |
322 | |||
323 | "99: \n" | ||
176 | "movclr.l %%acc0, %[res] \n" | 324 | "movclr.l %%acc0, %[res] \n" |
177 | : /* outputs */ | 325 | : /* outputs */ |
178 | [v1]"+a"(v1), | 326 | [v1]"+a"(v1), |
179 | [v2]"+a"(v2), | 327 | [v2]"+a"(v2), |
180 | [res]"=&d"(res) | 328 | [res]"=d"(res) |
181 | : /* inputs */ | 329 | : /* inputs */ |
182 | [cnt]"n"(ORDER>>5) | 330 | #if ORDER > 32 |
331 | [cnt]"[res]"(cnt) | ||
332 | #endif | ||
183 | : /* clobbers */ | 333 | : /* clobbers */ |
184 | "d0", "d1", "d2", "d3" | 334 | "d0", "d1", "d2" |
185 | ); | 335 | ); |
186 | return res; | 336 | return res; |
187 | } | 337 | } |