-rw-r--r--  apps/codecs/demac/libdemac/filter.c               |   2
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_arm7.h   | 293
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h     | 230
3 files changed, 485 insertions, 40 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index e85e42fb00..92d86edd7d 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -31,6 +31,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
+#elif defined CPU_ARM7TDMI
+#include "vector_math16_arm7.h"
 #else
 #include "vector_math16.h"
 #endif
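The vector math headers selected here are compiled under #if ORDER > 16 / #if ORDER > 32 guards (see the hunks below), so they assume ORDER is already defined as a preprocessor constant by the including file. A minimal, hypothetical sketch of that include pattern; the ORDER value and the surrounding layout are illustrative only and not part of this commit:

/* Hypothetical including file - ORDER's value is illustrative only. */
#define ORDER 32                        /* filter order, must be known here */

#ifdef CPU_COLDFIRE
#include "vector_math16_cf.h"
#elif defined CPU_ARM7TDMI
#include "vector_math16_arm7.h"
#else
#include "vector_math16.h"
#endif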
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
new file mode 100644
index 0000000000..1565ca9602
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_arm7.h
@@ -0,0 +1,293 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+ARM7 vector math copyright (C) 2007 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline void vector_add(int16_t* v1, int16_t* v2)
+{
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(sum, s1) /* Adds register */ \
+    "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \
+    "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \
+    "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \
+    "mov " #sum ", " #sum ", lsl #16 \n" \
+    "orr " #sum ", " #sum ", r8 , lsr #16 \n"
+
+#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \
+    "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \
+    "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
+    "mov " #sum ", " #sum ", lsl #16 \n" \
+    "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"
+
+    asm volatile (
+        "tst %[v2], #2 \n"
+        "beq 20f \n"
+
+        "10: \n"
+        "ldrh r4, [%[v2]], #2 \n"
+        "mov r4, r4, lsl #16 \n"
+        "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        ADDHALFXREGS(r0, r4, r5)
+        ADDHALFXREGS(r1, r5, r6)
+        ADDHALFXREGS(r2, r6, r7)
+        ADDHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+        "mov r4, r8 \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        ADDHALFXREGS(r0, r4, r5)
+        ADDHALFXREGS(r1, r5, r6)
+        ADDHALFXREGS(r2, r6, r7)
+        ADDHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "mov r4, r8 \n"
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+        "b 99f \n"
+
+        "20: \n"
+        "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        ADDHALFREGS(r0, r4)
+        ADDHALFREGS(r1, r5)
+        ADDHALFREGS(r2, r6)
+        ADDHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        ADDHALFREGS(r0, r4)
+        ADDHALFREGS(r1, r5)
+        ADDHALFREGS(r2, r6)
+        ADDHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+
+        "99: \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "r8", "memory"
+    );
+}
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline void vector_sub(int16_t* v1, int16_t* v2)
+{
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(dif, s1) /* Subtracts register */ \
+    "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \
+    "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \
+    "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \
+    "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \
+    "orr " #dif ", r8 , " #dif ", lsl #16 \n"
+
+#define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \
+    "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \
+    "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \
+    "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \
+    "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"
+
+    asm volatile (
+        "mov r9, #0xff \n"
+        "orr r9, r9, #0xff00 \n"
+        "tst %[v2], #2 \n"
+        "beq 20f \n"
+
+        "10: \n"
+        "ldrh r4, [%[v2]], #2 \n"
+        "mov r4, r4, lsl #16 \n"
+        "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        SUBHALFXREGS(r0, r4, r5)
+        SUBHALFXREGS(r1, r5, r6)
+        SUBHALFXREGS(r2, r6, r7)
+        SUBHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+        "mov r4, r8 \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        SUBHALFXREGS(r0, r4, r5)
+        SUBHALFXREGS(r1, r5, r6)
+        SUBHALFXREGS(r2, r6, r7)
+        SUBHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "mov r4, r8 \n"
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+        "b 99f \n"
+
+        "20: \n"
+        "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        SUBHALFREGS(r0, r4)
+        SUBHALFREGS(r1, r5)
+        SUBHALFREGS(r2, r6)
+        SUBHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        SUBHALFREGS(r0, r4)
+        SUBHALFREGS(r1, r5)
+        SUBHALFREGS(r2, r6)
+        SUBHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+
+        "99: \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4", "r5",
+        "r6", "r7", "r8", "r9", "memory"
+    );
+}
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). It is optimised
+ * for ARM7TDMI. Using it for ARM9 or higher results in worse performance
+ * than the C version. */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res = 0;
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define MLABLOCK2(f1, f2) \
+    "mov r8, " #f1 ", lsl #16 \n" \
+    "mov r8, r8 , asr #16 \n" \
+    "mov r9, " #f2 ", lsl #16 \n" \
+    "mov r9, r9 , asr #16 \n" \
+    "mla %[res], r8, r9, %[res] \n" \
+    "mov r8, " #f1 ", asr #16 \n" \
+    "mov r9, " #f2 ", asr #16 \n" \
+    "mla %[res], r8, r9, %[res] \n"
+
+#define MLABLOCK2_U2(f1, f2) \
+    "mov r8, " #f1 ", lsl #16 \n" \
+    "mov r8, r8 , asr #16 \n" \
+    "mla %[res], r8, r9, %[res] \n" \
+    "mov r8, " #f1 ", asr #16 \n" \
+    "mov r9, " #f2 ", lsl #16 \n" \
+    "mov r9, r9 , asr #16 \n" \
+    "mla %[res], r8, r9, %[res] \n" \
+    "mov r9, " #f2 ", asr #16 \n"
+
+    asm volatile (
+        "tst %[v2], #2 \n"
+        "beq 20f \n"
+
+        "10: \n"
+        "ldrsh r9, [%[v2]], #2 \n"
+        "1: \n"
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2_U2(r0, r4)
+        MLABLOCK2_U2(r1, r5)
+        MLABLOCK2_U2(r2, r6)
+        MLABLOCK2_U2(r3, r7)
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2_U2(r0, r4)
+        MLABLOCK2_U2(r1, r5)
+        MLABLOCK2_U2(r2, r6)
+        MLABLOCK2_U2(r3, r7)
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+        "b 99f \n"
+
+        "20: \n"
+        "1: \n"
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2(r0, r4)
+        MLABLOCK2(r1, r5)
+        MLABLOCK2(r2, r6)
+        MLABLOCK2(r3, r7)
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2(r0, r4)
+        MLABLOCK2(r1, r5)
+        MLABLOCK2(r2, r6)
+        MLABLOCK2(r3, r7)
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+
+        "99: \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"+r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "r8", "r9"
+    );
+    return res;
+}
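For orientation, the three routines in this new header compute the same results as the plain C fallback in vector_math16.h: an element-wise 16 bit add, an element-wise 16 bit subtract, and a 32 bit dot product over ORDER samples. The word-wise loads, the unaligned-v2 entry points (the 10: labels) and the unrolling are purely optimisations. A minimal reference sketch of those semantics; the _ref names are illustrative and not part of the commit:

/* Reference semantics only - not part of this commit. */
static inline void vector_add_ref(int16_t* v1, int16_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] += v2[i];                 /* each 16 bit lane wraps on overflow */
}

static inline void vector_sub_ref(int16_t* v1, int16_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] -= v2[i];
}

static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i];  /* 16x16 -> 32 bit multiply-accumulate */
    return res;
}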
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 937462c293..0c3aaca223 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 */
 
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
 static inline void vector_add(int16_t* v1, int16_t* v2)
 {
-#define ADDHALFREGS(s1, sum) /* 's1' can be an A or D reg */ \
-    "move.l " #s1 ", %%d4 \n" /* 'sum' must be a D reg */ \
-    "add.l " #sum ", " #s1 "\n" /* 's1' and %%d4 are clobbered! */ \
-    "clr.w %%d4 \n" \
-    "add.l %%d4 , " #sum "\n" \
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \
+    "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \
+    "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \
+    "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \
+    "add.l %%d4 , " #sum "\n" \
+    "move.w " #s1 ", " #sum "\n"
+
+#define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \
+    "clr.w " #sum " \n" /* Needs 'sum' pre-swapped, swaps */ \
+    "add.l " #s1 ", " #sum "\n" /* 's2', and clobbers 's1'. */ \
+    "swap " #s2 " \n" /* 's1' can be an A or D reg. */ \
+    "add.l " #s2 ", " #s1 "\n" /* 'sum' and 's2' must be D regs. */ \
     "move.w " #s1 ", " #sum "\n"
 
     asm volatile (
-#if ORDER > 16
-        "moveq.l %[cnt], %%d5 \n"
+        "move.l %[v2], %%d0 \n"
+        "and.l #2, %%d0 \n"
+        "jeq 20f \n"
+
+        "10: \n"
+        "move.w (%[v2])+, %%d0 \n"
+        "swap %%d0 \n"
         "1: \n"
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        ADDHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        ADDHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        ADDHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        ADDHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        ADDHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        ADDHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+#if ORDER > 16
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "subq.l #1, %[cnt] \n"
+        "jne 1b \n"
 #endif
+        "jra 99f \n"
+
+        "20: \n"
+        "1: \n"
         "movem.l (%[v2]), %%a0-%%a3 \n"
         "movem.l (%[v1]), %%d0-%%d3 \n"
         ADDHALFREGS(%%a0, %%d0)
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
48 "move.l %%d2, (%[v1])+ \n" 99 "move.l %%d2, (%[v1])+ \n"
49 ADDHALFREGS(%%a3, %%d3) 100 ADDHALFREGS(%%a3, %%d3)
50 "move.l %%d3, (%[v1])+ \n" 101 "move.l %%d3, (%[v1])+ \n"
51
52 "lea.l (16, %[v2]), %[v2] \n" 102 "lea.l (16, %[v2]), %[v2] \n"
53 103
54 "movem.l (%[v2]), %%a0-%%a3 \n" 104 "movem.l (%[v2]), %%a0-%%a3 \n"
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
 #if ORDER > 16
         "lea.l (16, %[v2]), %[v2] \n"
 
-        "subq.l #1, %%d5 \n"
-        "bne.w 1b \n"
+        "subq.l #1, %[cnt] \n"
+        "jne 1b \n"
 #endif
+        "99: \n"
         : /* outputs */
-        [v1]"+a"(v1),
-        [v2]"+a"(v2)
+#if ORDER > 16
+        [cnt]"+d"(cnt),
+#endif
+        [v1] "+a"(v1),
+        [v2] "+a"(v2)
         : /* inputs */
-        [cnt]"n"(ORDER>>4)
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4", "d5",
+        "d0", "d1", "d2", "d3", "d4",
         "a0", "a1", "a2", "a3", "memory"
     );
 }
 
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
 static inline void vector_sub(int16_t* v1, int16_t* v2)
 {
-#define SUBHALFREGS(min, sub, dif) /* 'min' can be an A or D reg */ \
-    "move.l " #min ", " #dif "\n" /* 'sub' and 'dif' must be D regs */ \
-    "sub.l " #sub ", " #min "\n" /* 'min' and 'sub' are clobbered! */ \
-    "clr.w " #sub "\n" \
-    "sub.l " #sub ", " #dif "\n" \
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(min, sub, dif) /* Subtract register halves straight. */ \
+    "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */ \
+    "sub.l " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */ \
+    "clr.w " #sub "\n" /* 'min' and 'sub' are clobbered! */ \
+    "sub.l " #sub ", " #dif "\n" \
     "move.w " #min ", " #dif "\n"
+
+#define SUBHALFXREGS(min, s2, s1d) /* Subtract register halves across. */ \
+    "clr.w " #s1d "\n" /* Needs 's1d' pre-swapped, swaps */ \
+    "sub.l " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */ \
+    "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */ \
+    "swap " #s2 "\n" /* 's2' and 's1d' must be D regs. */ \
+    "sub.l " #s2 ", " #min "\n" \
+    "move.w " #min ", " #s1d "\n"
 
     asm volatile (
-#if ORDER > 16
-        "moveq.l %[cnt], %%d5 \n"
+        "move.l %[v2], %%d0 \n"
+        "and.l #2, %%d0 \n"
+        "jeq 20f \n"
+
+        "10: \n"
+        "move.w (%[v2])+, %%d0 \n"
+        "swap %%d0 \n"
         "1: \n"
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        SUBHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        SUBHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        SUBHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        SUBHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        SUBHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        SUBHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+#if ORDER > 16
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "subq.l #1, %[cnt] \n"
+        "bne.w 1b \n"
 #endif
+        "jra 99f \n"
+
+        "20: \n"
+        "1: \n"
         "movem.l (%[v2]), %%d1-%%d4 \n"
         "movem.l (%[v1]), %%a0-%%a3 \n"
         SUBHALFREGS(%%a0, %%d1, %%d0)
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
 #if ORDER > 16
         "lea.l (16, %[v2]), %[v2] \n"
 
-        "subq.l #1, %%d5 \n"
+        "subq.l #1, %[cnt] \n"
         "bne.w 1b \n"
 #endif
+
+        "99: \n"
         : /* outputs */
-        [v1]"+a"(v1),
-        [v2]"+a"(v2)
+#if ORDER > 16
+        [cnt]"+d"(cnt),
+#endif
+        [v1] "+a"(v1),
+        [v2] "+a"(v2)
         : /* inputs */
-        [cnt]"n"(ORDER>>4)
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4", "d5",
+        "d0", "d1", "d2", "d3", "d4",
         "a0", "a1", "a2", "a3", "memory"
     );
 }
 
 #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
 
-/* Needs EMAC in signed integer mode! */
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
+ * in signed integer mode - call above macro before use. */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
     int res = 0;
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
 
 #define MACBLOCK4 \
     "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \
-    "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \
-    "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \
-    "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
+    "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+    "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+    "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+
+#define MACBLOCK4_U2 \
+    "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+    "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+    "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+    "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 
     asm volatile (
+        "move.l %[v2], %%d0 \n"
+        "and.l #2, %%d0 \n"
+        "jeq 20f \n"
+
+        "10: \n"
+        "move.l (%[v1])+, %%d0 \n"
+        "move.w (%[v2])+, %%d1 \n"
+        "1: \n"
+#if ORDER > 16
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+#endif
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
-        "moveq.l %[cnt], %[res] \n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "subq.l #1, %[res] \n"
+        "bne.w 1b \n"
+#else
+        "mac.w %%d0l, %%d1u, %%acc0 \n"
 #endif
+        "jra 99f \n"
+
+        "20: \n"
         "move.l (%[v1])+, %%d0 \n"
         "move.l (%[v2])+, %%d1 \n"
         "1: \n"
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         MACBLOCK4
         MACBLOCK4
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
-        "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n"
+        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
-        "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n"
-        "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
-
+        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "subq.l #1, %[res] \n"
         "bne.w 1b \n"
 #else
-        "mac.w %%d2u, %%d3u, %%acc0 \n"
-        "mac.w %%d2l, %%d3l, %%acc0 \n"
+        "mac.w %%d2u, %%d1u, %%acc0 \n"
+        "mac.w %%d2l, %%d1l, %%acc0 \n"
 #endif
+
+        "99: \n"
         "movclr.l %%acc0, %[res] \n"
         : /* outputs */
         [v1]"+a"(v1),
         [v2]"+a"(v2),
-        [res]"=&d"(res)
+        [res]"=d"(res)
         : /* inputs */
-        [cnt]"n"(ORDER>>5)
+#if ORDER > 32
+        [cnt]"[res]"(cnt)
+#endif
         : /* clobbers */
-        "d0", "d1", "d2", "d3"
+        "d0", "d1", "d2"
     );
     return res;
 }
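One usage detail specific to the ColdFire version: scalarproduct() accumulates in %acc0 and assumes the EMAC unit is in signed integer mode, which is what the PREPARE_SCALARPRODUCT macro above arranges via coldfire_set_macsr(0). A minimal, hypothetical calling sketch; the function name and arguments are illustrative only, the real caller lives in filter.c, which this commit only touches for the include selection:

/* Hypothetical caller - name and arguments are illustrative only. */
static int32_t filter_dot(int16_t* coeffs, int16_t* delay)
{
    PREPARE_SCALARPRODUCT               /* on ColdFire: coldfire_set_macsr(0),
                                           i.e. EMAC signed integer mode */
    return scalarproduct(coeffs, delay);
}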