diff options
Diffstat (limited to 'apps/codecs/demac')
-rw-r--r-- | apps/codecs/demac/libdemac/filter.c | 2 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_arm7.h | 293 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_cf.h | 230 |
3 files changed, 485 insertions, 40 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index e85e42fb00..92d86edd7d 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c | |||
@@ -31,6 +31,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
31 | 31 | ||
32 | #ifdef CPU_COLDFIRE | 32 | #ifdef CPU_COLDFIRE |
33 | #include "vector_math16_cf.h" | 33 | #include "vector_math16_cf.h" |
34 | #elif defined CPU_ARM7TDMI | ||
35 | #include "vector_math16_arm7.h" | ||
34 | #else | 36 | #else |
35 | #include "vector_math16.h" | 37 | #include "vector_math16.h" |
36 | #endif | 38 | #endif |
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h new file mode 100644 index 0000000000..1565ca9602 --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math16_arm7.h | |||
@@ -0,0 +1,293 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | ARM7 vector math copyright (C) 2007 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
27 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
28 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
29 | * incorrect results (if ARM aligncheck is disabled). */ | ||
30 | static inline void vector_add(int16_t* v1, int16_t* v2) | ||
31 | { | ||
32 | #if ORDER > 16 | ||
33 | int cnt = ORDER>>4; | ||
34 | #endif | ||
35 | |||
36 | #define ADDHALFREGS(sum, s1) /* Adds register */ \ | ||
37 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \ | ||
38 | "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \ | ||
39 | "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \ | ||
40 | "mov " #sum ", " #sum ", lsl #16 \n" \ | ||
41 | "orr " #sum ", " #sum ", r8 , lsr #16 \n" | ||
42 | |||
43 | #define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ | ||
44 | "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ | ||
45 | "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ | ||
46 | "mov " #sum ", " #sum ", lsl #16 \n" \ | ||
47 | "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" | ||
48 | |||
49 | asm volatile ( | ||
50 | "tst %[v2], #2 \n" | ||
51 | "beq 20f \n" | ||
52 | |||
53 | "10: \n" | ||
54 | "ldrh r4, [%[v2]], #2 \n" | ||
55 | "mov r4, r4, lsl #16 \n" | ||
56 | "1: \n" | ||
57 | "ldmia %[v1], {r0-r3} \n" | ||
58 | "ldmia %[v2]!, {r5-r8} \n" | ||
59 | ADDHALFXREGS(r0, r4, r5) | ||
60 | ADDHALFXREGS(r1, r5, r6) | ||
61 | ADDHALFXREGS(r2, r6, r7) | ||
62 | ADDHALFXREGS(r3, r7, r8) | ||
63 | "stmia %[v1]!, {r0-r3} \n" | ||
64 | "mov r4, r8 \n" | ||
65 | "ldmia %[v1], {r0-r3} \n" | ||
66 | "ldmia %[v2]!, {r5-r8} \n" | ||
67 | ADDHALFXREGS(r0, r4, r5) | ||
68 | ADDHALFXREGS(r1, r5, r6) | ||
69 | ADDHALFXREGS(r2, r6, r7) | ||
70 | ADDHALFXREGS(r3, r7, r8) | ||
71 | "stmia %[v1]!, {r0-r3} \n" | ||
72 | #if ORDER > 16 | ||
73 | "mov r4, r8 \n" | ||
74 | "subs %[cnt], %[cnt], #1 \n" | ||
75 | "bne 1b \n" | ||
76 | #endif | ||
77 | "b 99f \n" | ||
78 | |||
79 | "20: \n" | ||
80 | "1: \n" | ||
81 | "ldmia %[v1], {r0-r3} \n" | ||
82 | "ldmia %[v2]!, {r4-r7} \n" | ||
83 | ADDHALFREGS(r0, r4) | ||
84 | ADDHALFREGS(r1, r5) | ||
85 | ADDHALFREGS(r2, r6) | ||
86 | ADDHALFREGS(r3, r7) | ||
87 | "stmia %[v1]!, {r0-r3} \n" | ||
88 | "ldmia %[v1], {r0-r3} \n" | ||
89 | "ldmia %[v2]!, {r4-r7} \n" | ||
90 | ADDHALFREGS(r0, r4) | ||
91 | ADDHALFREGS(r1, r5) | ||
92 | ADDHALFREGS(r2, r6) | ||
93 | ADDHALFREGS(r3, r7) | ||
94 | "stmia %[v1]!, {r0-r3} \n" | ||
95 | #if ORDER > 16 | ||
96 | "subs %[cnt], %[cnt], #1 \n" | ||
97 | "bne 1b \n" | ||
98 | #endif | ||
99 | |||
100 | "99: \n" | ||
101 | : /* outputs */ | ||
102 | #if ORDER > 16 | ||
103 | [cnt]"+r"(cnt), | ||
104 | #endif | ||
105 | [v1] "+r"(v1), | ||
106 | [v2] "+r"(v2) | ||
107 | : /* inputs */ | ||
108 | : /* clobbers */ | ||
109 | "r0", "r1", "r2", "r3", "r4", | ||
110 | "r5", "r6", "r7", "r8", "memory" | ||
111 | ); | ||
112 | } | ||
113 | |||
114 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
115 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
116 | * incorrect results (if ARM aligncheck is disabled). */ | ||
117 | static inline void vector_sub(int16_t* v1, int16_t* v2) | ||
118 | { | ||
119 | #if ORDER > 16 | ||
120 | int cnt = ORDER>>4; | ||
121 | #endif | ||
122 | |||
123 | #define SUBHALFREGS(dif, s1) /* Subtracts register */ \ | ||
124 | "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \ | ||
125 | "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \ | ||
126 | "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \ | ||
127 | "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \ | ||
128 | "orr " #dif ", r8 , " #dif ", lsl #16 \n" | ||
129 | |||
130 | #define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \ | ||
131 | "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ | ||
132 | "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \ | ||
133 | "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \ | ||
134 | "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" | ||
135 | |||
136 | asm volatile ( | ||
137 | "mov r9, #0xff \n" | ||
138 | "orr r9, r9, #0xff00 \n" | ||
139 | "tst %[v2], #2 \n" | ||
140 | "beq 20f \n" | ||
141 | |||
142 | "10: \n" | ||
143 | "ldrh r4, [%[v2]], #2 \n" | ||
144 | "mov r4, r4, lsl #16 \n" | ||
145 | "1: \n" | ||
146 | "ldmia %[v1], {r0-r3} \n" | ||
147 | "ldmia %[v2]!, {r5-r8} \n" | ||
148 | SUBHALFXREGS(r0, r4, r5) | ||
149 | SUBHALFXREGS(r1, r5, r6) | ||
150 | SUBHALFXREGS(r2, r6, r7) | ||
151 | SUBHALFXREGS(r3, r7, r8) | ||
152 | "stmia %[v1]!, {r0-r3} \n" | ||
153 | "mov r4, r8 \n" | ||
154 | "ldmia %[v1], {r0-r3} \n" | ||
155 | "ldmia %[v2]!, {r5-r8} \n" | ||
156 | SUBHALFXREGS(r0, r4, r5) | ||
157 | SUBHALFXREGS(r1, r5, r6) | ||
158 | SUBHALFXREGS(r2, r6, r7) | ||
159 | SUBHALFXREGS(r3, r7, r8) | ||
160 | "stmia %[v1]!, {r0-r3} \n" | ||
161 | #if ORDER > 16 | ||
162 | "mov r4, r8 \n" | ||
163 | "subs %[cnt], %[cnt], #1 \n" | ||
164 | "bne 1b \n" | ||
165 | #endif | ||
166 | "b 99f \n" | ||
167 | |||
168 | "20: \n" | ||
169 | "1: \n" | ||
170 | "ldmia %[v1], {r0-r3} \n" | ||
171 | "ldmia %[v2]!, {r4-r7} \n" | ||
172 | SUBHALFREGS(r0, r4) | ||
173 | SUBHALFREGS(r1, r5) | ||
174 | SUBHALFREGS(r2, r6) | ||
175 | SUBHALFREGS(r3, r7) | ||
176 | "stmia %[v1]!, {r0-r3} \n" | ||
177 | "ldmia %[v1], {r0-r3} \n" | ||
178 | "ldmia %[v2]!, {r4-r7} \n" | ||
179 | SUBHALFREGS(r0, r4) | ||
180 | SUBHALFREGS(r1, r5) | ||
181 | SUBHALFREGS(r2, r6) | ||
182 | SUBHALFREGS(r3, r7) | ||
183 | "stmia %[v1]!, {r0-r3} \n" | ||
184 | #if ORDER > 16 | ||
185 | "subs %[cnt], %[cnt], #1 \n" | ||
186 | "bne 1b \n" | ||
187 | #endif | ||
188 | |||
189 | "99: \n" | ||
190 | : /* outputs */ | ||
191 | #if ORDER > 16 | ||
192 | [cnt]"+r"(cnt), | ||
193 | #endif | ||
194 | [v1] "+r"(v1), | ||
195 | [v2] "+r"(v2) | ||
196 | : /* inputs */ | ||
197 | : /* clobbers */ | ||
198 | "r0", "r1", "r2", "r3", "r4", "r5", | ||
199 | "r6", "r7", "r8", "r9", "memory" | ||
200 | ); | ||
201 | } | ||
202 | |||
203 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
204 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
205 | * incorrect results (if ARM aligncheck is disabled). It is optimised | ||
206 | * for ARM7TDMI. Using it for ARM9 or higher results in worse performance | ||
207 | * than the C version. */ | ||
208 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | ||
209 | { | ||
210 | int res = 0; | ||
211 | #if ORDER > 16 | ||
212 | int cnt = ORDER>>4; | ||
213 | #endif | ||
214 | |||
215 | #define MLABLOCK2(f1, f2) \ | ||
216 | "mov r8, " #f1 ", lsl #16 \n" \ | ||
217 | "mov r8, r8 , asr #16 \n" \ | ||
218 | "mov r9, " #f2 ", lsl #16 \n" \ | ||
219 | "mov r9, r9 , asr #16 \n" \ | ||
220 | "mla %[res], r8, r9, %[res] \n" \ | ||
221 | "mov r8, " #f1 ", asr #16 \n" \ | ||
222 | "mov r9, " #f2 ", asr #16 \n" \ | ||
223 | "mla %[res], r8, r9, %[res] \n" | ||
224 | |||
225 | #define MLABLOCK2_U2(f1, f2) \ | ||
226 | "mov r8, " #f1 ", lsl #16 \n" \ | ||
227 | "mov r8, r8 , asr #16 \n" \ | ||
228 | "mla %[res], r8, r9, %[res] \n" \ | ||
229 | "mov r8, " #f1 ", asr #16 \n" \ | ||
230 | "mov r9, " #f2 ", lsl #16 \n" \ | ||
231 | "mov r9, r9 , asr #16 \n" \ | ||
232 | "mla %[res], r8, r9, %[res] \n" \ | ||
233 | "mov r9, " #f2 ", asr #16 \n" | ||
234 | |||
235 | asm volatile ( | ||
236 | "tst %[v2], #2 \n" | ||
237 | "beq 20f \n" | ||
238 | |||
239 | "10: \n" | ||
240 | "ldrsh r9, [%[v2]], #2 \n" | ||
241 | "1: \n" | ||
242 | "ldmia %[v1]!, {r0-r3} \n" | ||
243 | "ldmia %[v2]!, {r4-r7} \n" | ||
244 | MLABLOCK2_U2(r0, r4) | ||
245 | MLABLOCK2_U2(r1, r5) | ||
246 | MLABLOCK2_U2(r2, r6) | ||
247 | MLABLOCK2_U2(r3, r7) | ||
248 | "ldmia %[v1]!, {r0-r3} \n" | ||
249 | "ldmia %[v2]!, {r4-r7} \n" | ||
250 | MLABLOCK2_U2(r0, r4) | ||
251 | MLABLOCK2_U2(r1, r5) | ||
252 | MLABLOCK2_U2(r2, r6) | ||
253 | MLABLOCK2_U2(r3, r7) | ||
254 | #if ORDER > 16 | ||
255 | "subs %[cnt], %[cnt], #1 \n" | ||
256 | "bne 1b \n" | ||
257 | #endif | ||
258 | "b 99f \n" | ||
259 | |||
260 | "20: \n" | ||
261 | "1: \n" | ||
262 | "ldmia %[v1]!, {r0-r3} \n" | ||
263 | "ldmia %[v2]!, {r4-r7} \n" | ||
264 | MLABLOCK2(r0, r4) | ||
265 | MLABLOCK2(r1, r5) | ||
266 | MLABLOCK2(r2, r6) | ||
267 | MLABLOCK2(r3, r7) | ||
268 | "ldmia %[v1]!, {r0-r3} \n" | ||
269 | "ldmia %[v2]!, {r4-r7} \n" | ||
270 | MLABLOCK2(r0, r4) | ||
271 | MLABLOCK2(r1, r5) | ||
272 | MLABLOCK2(r2, r6) | ||
273 | MLABLOCK2(r3, r7) | ||
274 | #if ORDER > 16 | ||
275 | "subs %[cnt], %[cnt], #1 \n" | ||
276 | "bne 1b \n" | ||
277 | #endif | ||
278 | |||
279 | "99: \n" | ||
280 | : /* outputs */ | ||
281 | #if ORDER > 16 | ||
282 | [cnt]"+r"(cnt), | ||
283 | #endif | ||
284 | [v1] "+r"(v1), | ||
285 | [v2] "+r"(v2), | ||
286 | [res]"+r"(res) | ||
287 | : /* inputs */ | ||
288 | : /* clobbers */ | ||
289 | "r0", "r1", "r2", "r3", "r4", | ||
290 | "r5", "r6", "r7", "r8", "r9" | ||
291 | ); | ||
292 | return res; | ||
293 | } | ||
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 937462c293..0c3aaca223 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
28 | * 32 bit aligned, otherwise performance will suffer. */ | ||
27 | static inline void vector_add(int16_t* v1, int16_t* v2) | 29 | static inline void vector_add(int16_t* v1, int16_t* v2) |
28 | { | 30 | { |
29 | #define ADDHALFREGS(s1, sum) /* 's1' can be an A or D reg */ \ | 31 | #if ORDER > 16 |
30 | "move.l " #s1 ", %%d4 \n" /* 'sum' must be a D reg */ \ | 32 | int cnt = ORDER>>4; |
31 | "add.l " #sum ", " #s1 "\n" /* 's1' and %%d4 are clobbered! */ \ | 33 | #endif |
32 | "clr.w %%d4 \n" \ | 34 | |
33 | "add.l %%d4 , " #sum "\n" \ | 35 | #define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \ |
36 | "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \ | ||
37 | "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \ | ||
38 | "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \ | ||
39 | "add.l %%d4 , " #sum "\n" \ | ||
40 | "move.w " #s1 ", " #sum "\n" | ||
41 | |||
42 | #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \ | ||
43 | "clr.w " #sum " \n" /* Needs 'sum' pre-swapped, swaps */ \ | ||
44 | "add.l " #s1 ", " #sum "\n" /* 's2', and clobbers 's1'. */ \ | ||
45 | "swap " #s2 " \n" /* 's1' can be an A or D reg. */ \ | ||
46 | "add.l " #s2 ", " #s1 "\n" /* 'sum' and 's2' must be D regs. */ \ | ||
34 | "move.w " #s1 ", " #sum "\n" | 47 | "move.w " #s1 ", " #sum "\n" |
35 | 48 | ||
36 | asm volatile ( | 49 | asm volatile ( |
37 | #if ORDER > 16 | 50 | "move.l %[v2], %%d0 \n" |
38 | "moveq.l %[cnt], %%d5 \n" | 51 | "and.l #2, %%d0 \n" |
52 | "jeq 20f \n" | ||
53 | |||
54 | "10: \n" | ||
55 | "move.w (%[v2])+, %%d0 \n" | ||
56 | "swap %%d0 \n" | ||
39 | "1: \n" | 57 | "1: \n" |
58 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
59 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
60 | ADDHALFXREGS(%%a0, %%d1, %%d0) | ||
61 | "move.l %%d0, (%[v1])+ \n" | ||
62 | ADDHALFXREGS(%%a1, %%d2, %%d1) | ||
63 | "move.l %%d1, (%[v1])+ \n" | ||
64 | ADDHALFXREGS(%%a2, %%d3, %%d2) | ||
65 | "move.l %%d2, (%[v1])+ \n" | ||
66 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
67 | "move.l %%d3, (%[v1])+ \n" | ||
68 | "lea.l (16, %[v2]), %[v2] \n" | ||
69 | "move.l %%d4, %%d0 \n" | ||
70 | |||
71 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
72 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
73 | ADDHALFXREGS(%%a0, %%d1, %%d0) | ||
74 | "move.l %%d0, (%[v1])+ \n" | ||
75 | ADDHALFXREGS(%%a1, %%d2, %%d1) | ||
76 | "move.l %%d1, (%[v1])+ \n" | ||
77 | ADDHALFXREGS(%%a2, %%d3, %%d2) | ||
78 | "move.l %%d2, (%[v1])+ \n" | ||
79 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
80 | "move.l %%d3, (%[v1])+ \n" | ||
81 | #if ORDER > 16 | ||
82 | "lea.l (16, %[v2]), %[v2] \n" | ||
83 | "move.l %%d4, %%d0 \n" | ||
84 | |||
85 | "subq.l #1, %[cnt] \n" | ||
86 | "jne 1b \n" | ||
40 | #endif | 87 | #endif |
88 | "jra 99f \n" | ||
89 | |||
90 | "20: \n" | ||
91 | "1: \n" | ||
41 | "movem.l (%[v2]), %%a0-%%a3 \n" | 92 | "movem.l (%[v2]), %%a0-%%a3 \n" |
42 | "movem.l (%[v1]), %%d0-%%d3 \n" | 93 | "movem.l (%[v1]), %%d0-%%d3 \n" |
43 | ADDHALFREGS(%%a0, %%d0) | 94 | ADDHALFREGS(%%a0, %%d0) |
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
48 | "move.l %%d2, (%[v1])+ \n" | 99 | "move.l %%d2, (%[v1])+ \n" |
49 | ADDHALFREGS(%%a3, %%d3) | 100 | ADDHALFREGS(%%a3, %%d3) |
50 | "move.l %%d3, (%[v1])+ \n" | 101 | "move.l %%d3, (%[v1])+ \n" |
51 | |||
52 | "lea.l (16, %[v2]), %[v2] \n" | 102 | "lea.l (16, %[v2]), %[v2] \n" |
53 | 103 | ||
54 | "movem.l (%[v2]), %%a0-%%a3 \n" | 104 | "movem.l (%[v2]), %%a0-%%a3 \n" |
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
64 | #if ORDER > 16 | 114 | #if ORDER > 16 |
65 | "lea.l (16, %[v2]), %[v2] \n" | 115 | "lea.l (16, %[v2]), %[v2] \n" |
66 | 116 | ||
67 | "subq.l #1, %%d5 \n" | 117 | "subq.l #1, %[cnt] \n" |
68 | "bne.w 1b \n" | 118 | "jne 1b \n" |
69 | #endif | 119 | #endif |
120 | "99: \n" | ||
70 | : /* outputs */ | 121 | : /* outputs */ |
71 | [v1]"+a"(v1), | 122 | #if ORDER > 16 |
72 | [v2]"+a"(v2) | 123 | [cnt]"+d"(cnt), |
124 | #endif | ||
125 | [v1] "+a"(v1), | ||
126 | [v2] "+a"(v2) | ||
73 | : /* inputs */ | 127 | : /* inputs */ |
74 | [cnt]"n"(ORDER>>4) | ||
75 | : /* clobbers */ | 128 | : /* clobbers */ |
76 | "d0", "d1", "d2", "d3", "d4", "d5", | 129 | "d0", "d1", "d2", "d3", "d4", |
77 | "a0", "a1", "a2", "a3", "memory" | 130 | "a0", "a1", "a2", "a3", "memory" |
78 | ); | 131 | ); |
79 | } | 132 | } |
80 | 133 | ||
134 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
135 | * 32 bit aligned, otherwise performance will suffer. */ | ||
81 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 136 | static inline void vector_sub(int16_t* v1, int16_t* v2) |
82 | { | 137 | { |
83 | #define SUBHALFREGS(min, sub, dif) /* 'min' can be an A or D reg */ \ | 138 | #if ORDER > 16 |
84 | "move.l " #min ", " #dif "\n" /* 'sub' and 'dif' must be D regs */ \ | 139 | int cnt = ORDER>>4; |
85 | "sub.l " #sub ", " #min "\n" /* 'min' and 'sub' are clobbered! */ \ | 140 | #endif |
86 | "clr.w " #sub "\n" \ | 141 | |
87 | "sub.l " #sub ", " #dif "\n" \ | 142 | #define SUBHALFREGS(min, sub, dif) /* Subtract register halves straight. */ \ |
143 | "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */ \ | ||
144 | "sub.l " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */ \ | ||
145 | "clr.w " #sub "\n" /* 'min' and 'sub' are clobbered! */ \ | ||
146 | "sub.l " #sub ", " #dif "\n" \ | ||
88 | "move.w " #min ", " #dif "\n" | 147 | "move.w " #min ", " #dif "\n" |
148 | |||
149 | #define SUBHALFXREGS(min, s2, s1d) /* Subtract register halves across. */ \ | ||
150 | "clr.w " #s1d "\n" /* Needs 's1d' pre-swapped, swaps */ \ | ||
151 | "sub.l " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */ \ | ||
152 | "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */ \ | ||
153 | "swap " #s2 "\n" /* 's2' and 's1d' must be D regs. */ \ | ||
154 | "sub.l " #s2 ", " #min "\n" \ | ||
155 | "move.w " #min ", " #s1d "\n" | ||
89 | 156 | ||
90 | asm volatile ( | 157 | asm volatile ( |
91 | #if ORDER > 16 | 158 | "move.l %[v2], %%d0 \n" |
92 | "moveq.l %[cnt], %%d5 \n" | 159 | "and.l #2, %%d0 \n" |
160 | "jeq 20f \n" | ||
161 | |||
162 | "10: \n" | ||
163 | "move.w (%[v2])+, %%d0 \n" | ||
164 | "swap %%d0 \n" | ||
93 | "1: \n" | 165 | "1: \n" |
166 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
167 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
168 | SUBHALFXREGS(%%a0, %%d1, %%d0) | ||
169 | "move.l %%d0, (%[v1])+ \n" | ||
170 | SUBHALFXREGS(%%a1, %%d2, %%d1) | ||
171 | "move.l %%d1, (%[v1])+ \n" | ||
172 | SUBHALFXREGS(%%a2, %%d3, %%d2) | ||
173 | "move.l %%d2, (%[v1])+ \n" | ||
174 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
175 | "move.l %%d3, (%[v1])+ \n" | ||
176 | "lea.l (16, %[v2]), %[v2] \n" | ||
177 | "move.l %%d4, %%d0 \n" | ||
178 | |||
179 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
180 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
181 | SUBHALFXREGS(%%a0, %%d1, %%d0) | ||
182 | "move.l %%d0, (%[v1])+ \n" | ||
183 | SUBHALFXREGS(%%a1, %%d2, %%d1) | ||
184 | "move.l %%d1, (%[v1])+ \n" | ||
185 | SUBHALFXREGS(%%a2, %%d3, %%d2) | ||
186 | "move.l %%d2, (%[v1])+ \n" | ||
187 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
188 | "move.l %%d3, (%[v1])+ \n" | ||
189 | #if ORDER > 16 | ||
190 | "lea.l (16, %[v2]), %[v2] \n" | ||
191 | "move.l %%d4, %%d0 \n" | ||
192 | |||
193 | "subq.l #1, %[cnt] \n" | ||
194 | "bne.w 1b \n" | ||
94 | #endif | 195 | #endif |
196 | "jra 99f \n" | ||
197 | |||
198 | "20: \n" | ||
199 | "1: \n" | ||
95 | "movem.l (%[v2]), %%d1-%%d4 \n" | 200 | "movem.l (%[v2]), %%d1-%%d4 \n" |
96 | "movem.l (%[v1]), %%a0-%%a3 \n" | 201 | "movem.l (%[v1]), %%a0-%%a3 \n" |
97 | SUBHALFREGS(%%a0, %%d1, %%d0) | 202 | SUBHALFREGS(%%a0, %%d1, %%d0) |
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
118 | #if ORDER > 16 | 223 | #if ORDER > 16 |
119 | "lea.l (16, %[v2]), %[v2] \n" | 224 | "lea.l (16, %[v2]), %[v2] \n" |
120 | 225 | ||
121 | "subq.l #1, %%d5 \n" | 226 | "subq.l #1, %[cnt] \n" |
122 | "bne.w 1b \n" | 227 | "bne.w 1b \n" |
123 | #endif | 228 | #endif |
229 | |||
230 | "99: \n" | ||
124 | : /* outputs */ | 231 | : /* outputs */ |
125 | [v1]"+a"(v1), | 232 | #if ORDER > 16 |
126 | [v2]"+a"(v2) | 233 | [cnt]"+d"(cnt), |
234 | #endif | ||
235 | [v1] "+a"(v1), | ||
236 | [v2] "+a"(v2) | ||
127 | : /* inputs */ | 237 | : /* inputs */ |
128 | [cnt]"n"(ORDER>>4) | ||
129 | : /* clobbers */ | 238 | : /* clobbers */ |
130 | "d0", "d1", "d2", "d3", "d4", "d5", | 239 | "d0", "d1", "d2", "d3", "d4", |
131 | "a0", "a1", "a2", "a3", "memory" | 240 | "a0", "a1", "a2", "a3", "memory" |
132 | ); | 241 | ); |
133 | } | 242 | } |
134 | 243 | ||
135 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ | 244 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ |
136 | 245 | ||
137 | /* Needs EMAC in signed integer mode! */ | 246 | /* This version fetches data as 32 bit words, and *recommends* v1 to be |
247 | * 32 bit aligned, otherwise performance will suffer. It also needs EMAC | ||
248 | * in signed integer mode - call above macro before use. */ | ||
138 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 249 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
139 | { | 250 | { |
140 | int res = 0; | 251 | int res = 0; |
252 | #if ORDER > 32 | ||
253 | int cnt = ORDER>>5; | ||
254 | #endif | ||
141 | 255 | ||
142 | #define MACBLOCK4 \ | 256 | #define MACBLOCK4 \ |
143 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ | 257 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ |
144 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \ | 258 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ |
145 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \ | 259 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ |
146 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | 260 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
261 | |||
262 | #define MACBLOCK4_U2 \ | ||
263 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
264 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ | ||
265 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
266 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
147 | 267 | ||
148 | asm volatile ( | 268 | asm volatile ( |
269 | "move.l %[v2], %%d0 \n" | ||
270 | "and.l #2, %%d0 \n" | ||
271 | "jeq 20f \n" | ||
272 | |||
273 | "10: \n" | ||
274 | "move.l (%[v1])+, %%d0 \n" | ||
275 | "move.w (%[v2])+, %%d1 \n" | ||
276 | "1: \n" | ||
277 | #if ORDER > 16 | ||
278 | MACBLOCK4_U2 | ||
279 | MACBLOCK4_U2 | ||
280 | MACBLOCK4_U2 | ||
281 | MACBLOCK4_U2 | ||
282 | #endif | ||
283 | MACBLOCK4_U2 | ||
284 | MACBLOCK4_U2 | ||
285 | MACBLOCK4_U2 | ||
286 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
287 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
288 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
149 | #if ORDER > 32 | 289 | #if ORDER > 32 |
150 | "moveq.l %[cnt], %[res] \n" | 290 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
291 | "subq.l #1, %[res] \n" | ||
292 | "bne.w 1b \n" | ||
293 | #else | ||
294 | "mac.w %%d0l, %%d1u, %%acc0 \n" | ||
151 | #endif | 295 | #endif |
296 | "jra 99f \n" | ||
297 | |||
298 | "20: \n" | ||
152 | "move.l (%[v1])+, %%d0 \n" | 299 | "move.l (%[v1])+, %%d0 \n" |
153 | "move.l (%[v2])+, %%d1 \n" | 300 | "move.l (%[v2])+, %%d1 \n" |
154 | "1: \n" | 301 | "1: \n" |
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
162 | MACBLOCK4 | 309 | MACBLOCK4 |
163 | MACBLOCK4 | 310 | MACBLOCK4 |
164 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | 311 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
165 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" | 312 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
166 | #if ORDER > 32 | 313 | #if ORDER > 32 |
167 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" | 314 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
168 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | 315 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
169 | |||
170 | "subq.l #1, %[res] \n" | 316 | "subq.l #1, %[res] \n" |
171 | "bne.w 1b \n" | 317 | "bne.w 1b \n" |
172 | #else | 318 | #else |
173 | "mac.w %%d2u, %%d3u, %%acc0 \n" | 319 | "mac.w %%d2u, %%d1u, %%acc0 \n" |
174 | "mac.w %%d2l, %%d3l, %%acc0 \n" | 320 | "mac.w %%d2l, %%d1l, %%acc0 \n" |
175 | #endif | 321 | #endif |
322 | |||
323 | "99: \n" | ||
176 | "movclr.l %%acc0, %[res] \n" | 324 | "movclr.l %%acc0, %[res] \n" |
177 | : /* outputs */ | 325 | : /* outputs */ |
178 | [v1]"+a"(v1), | 326 | [v1]"+a"(v1), |
179 | [v2]"+a"(v2), | 327 | [v2]"+a"(v2), |
180 | [res]"=&d"(res) | 328 | [res]"=d"(res) |
181 | : /* inputs */ | 329 | : /* inputs */ |
182 | [cnt]"n"(ORDER>>5) | 330 | #if ORDER > 32 |
331 | [cnt]"[res]"(cnt) | ||
332 | #endif | ||
183 | : /* clobbers */ | 333 | : /* clobbers */ |
184 | "d0", "d1", "d2", "d3" | 334 | "d0", "d1", "d2" |
185 | ); | 335 | ); |
186 | return res; | 336 | return res; |
187 | } | 337 | } |