author    Jens Arnold <amiconn@rockbox.org>    2007-10-25 18:58:16 +0000
committer Jens Arnold <amiconn@rockbox.org>    2007-10-25 18:58:16 +0000
commit    35f23267bfc97d070284a03e4adaa2c6b7bb6852 (patch)
tree      c42fe719f16e68512b0575bfa581105cfa8170bc /apps/codecs/demac
parent    3ea3caf34165ddc8114ecf3cd39ed0016192b1d7 (diff)
Further optimised the filter vector math assembly for ColdFire, and added assembly filter vector math for ARM. Both make use of the fact that the first argument of the vector functions is longword aligned. The ARM version is tailored for ARM7TDMI and would slow down ARM9 or higher, so a new CPU_ macro for ARM7TDMI was introduced. Speedup for ColdFire: -c3000 104%->109%, -c4000 43%->46%, -c5000 1.7%->2.0%. Speedup for PP502x: -c2000 66%->75%, -c3000 37%->48%, -c4000 11%->18%, -c5000 2.5%->3.7%.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15302 a1c6a512-1295-4272-9138-f99709370657
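For reference, the three primitives touched by this commit operate on vectors of ORDER packed 16-bit elements (filter coefficients and history samples). The generic C fallback (vector_math16.h) is not part of this diff, so the following is only a hedged sketch of the semantics the assembly versions implement, not a copy of that file:

#include <stdint.h>

#ifndef ORDER
#define ORDER 16   /* placeholder; libdemac defines the real filter length,
                    * which the asm assumes is at least 16 and unrolls by
                    * 16 (or 32) elements per pass */
#endif

/* Sketch of vector_add: element-wise add, each lane wraps mod 2^16. */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] += v2[i];
}

/* Sketch of vector_sub: element-wise subtract, same wrapping behaviour. */
static inline void vector_sub(int16_t* v1, int16_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] -= v2[i];
}

/* Sketch of scalarproduct: 16x16->32 multiply-accumulate over the vector. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += v1[i] * v2[i];
    return res;
}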
Diffstat (limited to 'apps/codecs/demac')
-rw-r--r--  apps/codecs/demac/libdemac/filter.c              |   2
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_arm7.h  | 293
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h    | 230
3 files changed, 485 insertions(+), 40 deletions(-)
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index e85e42fb00..92d86edd7d 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -31,6 +31,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
+#elif defined CPU_ARM7TDMI
+#include "vector_math16_arm7.h"
 #else
 #include "vector_math16.h"
 #endif
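Only ARM7TDMI targets take the new header; per the commit message, the shift-based halfword handling would be slower than the C version on ARM9 and newer, which is why the dispatch keys on a dedicated CPU_ARM7TDMI macro rather than on a generic ARM one. Both assembly headers also depend on the alignment contract stated in their comments: v1 must be longword (32 bit) aligned, while v2 may be halfword aligned. The helper below is only an illustration of that contract (its name is made up; the asm performs the equivalent test with "tst %[v2], #2" on ARM and "and.l #2" on ColdFire):

#include <stdint.h>

/* Illustrative helper, not part of the diff: true if a 16-bit pointer is also
 * aligned to a 32-bit boundary. v1 must always pass this test; v2 may fail it,
 * in which case the asm consumes one halfword first and then continues with
 * aligned 32-bit loads. */
static inline int longword_aligned(const int16_t* p)
{
    return ((uintptr_t)p & 2) == 0;
}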
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
new file mode 100644
index 0000000000..1565ca9602
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_arm7.h
@@ -0,0 +1,293 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+ARM7 vector math copyright (C) 2007 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline void vector_add(int16_t* v1, int16_t* v2)
+{
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(sum, s1)                          /* Adds register */    \
+        "mov " #s1 ", " #s1 ", ror #16 \n"            /* halves straight. */ \
+        "add r8 , " #s1 ", " #sum ", lsl #16 \n"      /* Clobbers 's1' */    \
+        "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */          \
+        "mov " #sum ", " #sum ", lsl #16 \n"                                 \
+        "orr " #sum ", " #sum ", r8 , lsr #16 \n"
+
+#define ADDHALFXREGS(sum, s1, s2)                     /* Adds register */    \
+        "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n"  /* halves across. */   \
+        "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */   \
+        "mov " #sum ", " #sum ", lsl #16 \n"                                 \
+        "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"
+
+    asm volatile (
+        "tst %[v2], #2 \n"
+        "beq 20f \n"
+
+    "10: \n"
+        "ldrh r4, [%[v2]], #2 \n"
+        "mov r4, r4, lsl #16 \n"
+    "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        ADDHALFXREGS(r0, r4, r5)
+        ADDHALFXREGS(r1, r5, r6)
+        ADDHALFXREGS(r2, r6, r7)
+        ADDHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+        "mov r4, r8 \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        ADDHALFXREGS(r0, r4, r5)
+        ADDHALFXREGS(r1, r5, r6)
+        ADDHALFXREGS(r2, r6, r7)
+        ADDHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "mov r4, r8 \n"
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+        "b 99f \n"
+
+    "20: \n"
+    "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        ADDHALFREGS(r0, r4)
+        ADDHALFREGS(r1, r5)
+        ADDHALFREGS(r2, r6)
+        ADDHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        ADDHALFREGS(r0, r4)
+        ADDHALFREGS(r1, r5)
+        ADDHALFREGS(r2, r6)
+        ADDHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+
+    "99: \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "r8", "memory"
+    );
+}
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline void vector_sub(int16_t* v1, int16_t* v2)
+{
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(dif, s1)                          /* Subtracts register */     \
+        "sub r8 , " #dif ", " #s1 "\n"                /* halves straight. */       \
+        "and r8 , r8 , r9 \n"                         /* Needs r9 = 0x0000ffff, */ \
+        "mov " #dif ", " #dif ", lsr #16 \n"          /* clobbers r8. */           \
+        "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n"                              \
+        "orr " #dif ", r8 , " #dif ", lsl #16 \n"
+
+#define SUBHALFXREGS(dif, s1, s2)                     /* Subtracts register */     \
+        "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n"  /* halves across. */         \
+        "and " #s1 ", " #s1 ", r9 \n"                 /* Needs r9 = 0x0000ffff, */ \
+        "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */         \
+        "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"
+
+    asm volatile (
+        "mov r9, #0xff \n"
+        "orr r9, r9, #0xff00 \n"
+        "tst %[v2], #2 \n"
+        "beq 20f \n"
+
+    "10: \n"
+        "ldrh r4, [%[v2]], #2 \n"
+        "mov r4, r4, lsl #16 \n"
+    "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        SUBHALFXREGS(r0, r4, r5)
+        SUBHALFXREGS(r1, r5, r6)
+        SUBHALFXREGS(r2, r6, r7)
+        SUBHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+        "mov r4, r8 \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r5-r8} \n"
+        SUBHALFXREGS(r0, r4, r5)
+        SUBHALFXREGS(r1, r5, r6)
+        SUBHALFXREGS(r2, r6, r7)
+        SUBHALFXREGS(r3, r7, r8)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "mov r4, r8 \n"
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+        "b 99f \n"
+
+    "20: \n"
+    "1: \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        SUBHALFREGS(r0, r4)
+        SUBHALFREGS(r1, r5)
+        SUBHALFREGS(r2, r6)
+        SUBHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        SUBHALFREGS(r0, r4)
+        SUBHALFREGS(r1, r5)
+        SUBHALFREGS(r2, r6)
+        SUBHALFREGS(r3, r7)
+        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+
+    "99: \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4", "r5",
+        "r6", "r7", "r8", "r9", "memory"
+    );
+}
+
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). It is optimised
+ * for ARM7TDMI. Using it for ARM9 or higher results in worse performance
+ * than the C version. */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res = 0;
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define MLABLOCK2(f1, f2)               \
+        "mov r8, " #f1 ", lsl #16 \n"   \
+        "mov r8, r8 , asr #16 \n"       \
+        "mov r9, " #f2 ", lsl #16 \n"   \
+        "mov r9, r9 , asr #16 \n"       \
+        "mla %[res], r8, r9, %[res] \n" \
+        "mov r8, " #f1 ", asr #16 \n"   \
+        "mov r9, " #f2 ", asr #16 \n"   \
+        "mla %[res], r8, r9, %[res] \n"
+
+#define MLABLOCK2_U2(f1, f2)            \
+        "mov r8, " #f1 ", lsl #16 \n"   \
+        "mov r8, r8 , asr #16 \n"       \
+        "mla %[res], r8, r9, %[res] \n" \
+        "mov r8, " #f1 ", asr #16 \n"   \
+        "mov r9, " #f2 ", lsl #16 \n"   \
+        "mov r9, r9 , asr #16 \n"       \
+        "mla %[res], r8, r9, %[res] \n" \
+        "mov r9, " #f2 ", asr #16 \n"
+
+    asm volatile (
+        "tst %[v2], #2 \n"
+        "beq 20f \n"
+
+    "10: \n"
+        "ldrsh r9, [%[v2]], #2 \n"
+    "1: \n"
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2_U2(r0, r4)
+        MLABLOCK2_U2(r1, r5)
+        MLABLOCK2_U2(r2, r6)
+        MLABLOCK2_U2(r3, r7)
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2_U2(r0, r4)
+        MLABLOCK2_U2(r1, r5)
+        MLABLOCK2_U2(r2, r6)
+        MLABLOCK2_U2(r3, r7)
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+        "b 99f \n"
+
+    "20: \n"
+    "1: \n"
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2(r0, r4)
+        MLABLOCK2(r1, r5)
+        MLABLOCK2(r2, r6)
+        MLABLOCK2(r3, r7)
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        MLABLOCK2(r0, r4)
+        MLABLOCK2(r1, r5)
+        MLABLOCK2(r2, r6)
+        MLABLOCK2(r3, r7)
+#if ORDER > 16
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
+
+    "99: \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"+r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "r8", "r9"
+    );
+    return res;
+}
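The ADDHALFREGS/SUBHALFREGS macros above (and their ColdFire counterparts below) all implement the same idea: treat each 32-bit word as two packed 16-bit lanes and add or subtract lane-wise, making sure a carry or borrow from the low lane never leaks into the high lane. A hedged C equivalent of the packed add, for illustration only (the function name is invented here):

#include <stdint.h>

/* What ADDHALFREGS(sum, s1) computes, expressed in C: each 16-bit lane is
 * added modulo 2^16. A plain 32-bit add would be wrong because a carry out
 * of the low lane would spill into the high lane. */
static inline uint32_t add_packed_halves(uint32_t sum, uint32_t s1)
{
    uint32_t lo = (sum + s1) & 0xffffu;               /* low lane, wrapped  */
    uint32_t hi = ((sum >> 16) + (s1 >> 16)) << 16;   /* high lane, wrapped */
    return hi | lo;
}

The *XREGS variants perform the same arithmetic, but pair each v1 word with the high halfword of the previous v2 load and the low halfword of the next one, which is how the misaligned-v2 path keeps using full 32-bit loads.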
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 937462c293..0c3aaca223 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 */
 
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
 static inline void vector_add(int16_t* v1, int16_t* v2)
 {
-#define ADDHALFREGS(s1, sum)                  /* 's1' can be an A or D reg */       \
-        "move.l " #s1 ", %%d4 \n"             /* 'sum' must be a D reg */           \
-        "add.l " #sum ", " #s1 "\n"           /* 's1' and %%d4 are clobbered! */    \
-        "clr.w %%d4 \n"                                                             \
-        "add.l %%d4 , " #sum "\n"                                                   \
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(s1, sum)                  /* Add register halves straight. */   \
+        "move.l " #s1 ", %%d4 \n"             /* 's1' can be an A or D reg. */      \
+        "add.l " #sum ", " #s1 "\n"           /* 'sum' must be a D reg. */          \
+        "clr.w %%d4 \n"                       /* 's1' and %%d4 are clobbered! */    \
+        "add.l %%d4 , " #sum "\n"                                                   \
+        "move.w " #s1 ", " #sum "\n"
+
+#define ADDHALFXREGS(s1, s2, sum)             /* Add register halves across. */     \
+        "clr.w " #sum " \n"                   /* Needs 'sum' pre-swapped, swaps */  \
+        "add.l " #s1 ", " #sum "\n"           /* 's2', and clobbers 's1'. */        \
+        "swap " #s2 " \n"                     /* 's1' can be an A or D reg. */      \
+        "add.l " #s2 ", " #s1 "\n"            /* 'sum' and 's2' must be D regs. */  \
         "move.w " #s1 ", " #sum "\n"
 
     asm volatile (
-#if ORDER > 16
-        "moveq.l %[cnt], %%d5 \n"
+        "move.l %[v2], %%d0 \n"
+        "and.l #2, %%d0 \n"
+        "jeq 20f \n"
+
+    "10: \n"
+        "move.w (%[v2])+, %%d0 \n"
+        "swap %%d0 \n"
     "1: \n"
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        ADDHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        ADDHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        ADDHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        ADDHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        ADDHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        ADDHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+#if ORDER > 16
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "subq.l #1, %[cnt] \n"
+        "jne 1b \n"
 #endif
+        "jra 99f \n"
+
+    "20: \n"
+    "1: \n"
         "movem.l (%[v2]), %%a0-%%a3 \n"
         "movem.l (%[v1]), %%d0-%%d3 \n"
         ADDHALFREGS(%%a0, %%d0)
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "move.l %%d2, (%[v1])+ \n"
         ADDHALFREGS(%%a3, %%d3)
         "move.l %%d3, (%[v1])+ \n"
-
         "lea.l (16, %[v2]), %[v2] \n"
 
         "movem.l (%[v2]), %%a0-%%a3 \n"
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
 #if ORDER > 16
         "lea.l (16, %[v2]), %[v2] \n"
 
-        "subq.l #1, %%d5 \n"
-        "bne.w 1b \n"
+        "subq.l #1, %[cnt] \n"
+        "jne 1b \n"
 #endif
+    "99: \n"
         : /* outputs */
-        [v1]"+a"(v1),
-        [v2]"+a"(v2)
+#if ORDER > 16
+        [cnt]"+d"(cnt),
+#endif
+        [v1] "+a"(v1),
+        [v2] "+a"(v2)
         : /* inputs */
-        [cnt]"n"(ORDER>>4)
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4", "d5",
+        "d0", "d1", "d2", "d3", "d4",
         "a0", "a1", "a2", "a3", "memory"
     );
 }
 
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. */
 static inline void vector_sub(int16_t* v1, int16_t* v2)
 {
-#define SUBHALFREGS(min, sub, dif)            /* 'min' can be an A or D reg */        \
-        "move.l " #min ", " #dif "\n"         /* 'sub' and 'dif' must be D regs */    \
-        "sub.l " #sub ", " #min "\n"          /* 'min' and 'sub' are clobbered! */    \
-        "clr.w " #sub "\n"                                                            \
-        "sub.l " #sub ", " #dif "\n"                                                  \
+#if ORDER > 16
+    int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(min, sub, dif)            /* Subtract register halves straight. */ \
+        "move.l " #min ", " #dif "\n"         /* 'min' can be an A or D reg */         \
+        "sub.l " #sub ", " #min "\n"          /* 'sub' and 'dif' must be D regs */     \
+        "clr.w " #sub "\n"                    /* 'min' and 'sub' are clobbered! */     \
+        "sub.l " #sub ", " #dif "\n"                                                   \
         "move.w " #min ", " #dif "\n"
+
+#define SUBHALFXREGS(min, s2, s1d)            /* Subtract register halves across. */   \
+        "clr.w " #s1d "\n"                    /* Needs 's1d' pre-swapped, swaps */     \
+        "sub.l " #s1d ", " #min "\n"          /* 's2' and clobbers 'min'. */           \
+        "move.l " #min ", " #s1d "\n"         /* 'min' can be an A or D reg, */        \
+        "swap " #s2 "\n"                      /* 's2' and 's1d' must be D regs. */     \
+        "sub.l " #s2 ", " #min "\n"                                                    \
+        "move.w " #min ", " #s1d "\n"
 
     asm volatile (
-#if ORDER > 16
-        "moveq.l %[cnt], %%d5 \n"
+        "move.l %[v2], %%d0 \n"
+        "and.l #2, %%d0 \n"
+        "jeq 20f \n"
+
+    "10: \n"
+        "move.w (%[v2])+, %%d0 \n"
+        "swap %%d0 \n"
     "1: \n"
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        SUBHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        SUBHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        SUBHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "movem.l (%[v2]), %%d1-%%d4 \n"
+        "movem.l (%[v1]), %%a0-%%a3 \n"
+        SUBHALFXREGS(%%a0, %%d1, %%d0)
+        "move.l %%d0, (%[v1])+ \n"
+        SUBHALFXREGS(%%a1, %%d2, %%d1)
+        "move.l %%d1, (%[v1])+ \n"
+        SUBHALFXREGS(%%a2, %%d3, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFXREGS(%%a3, %%d4, %%d3)
+        "move.l %%d3, (%[v1])+ \n"
+#if ORDER > 16
+        "lea.l (16, %[v2]), %[v2] \n"
+        "move.l %%d4, %%d0 \n"
+
+        "subq.l #1, %[cnt] \n"
+        "bne.w 1b \n"
 #endif
+        "jra 99f \n"
+
+    "20: \n"
+    "1: \n"
         "movem.l (%[v2]), %%d1-%%d4 \n"
         "movem.l (%[v1]), %%a0-%%a3 \n"
         SUBHALFREGS(%%a0, %%d1, %%d0)
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
 #if ORDER > 16
         "lea.l (16, %[v2]), %[v2] \n"
 
-        "subq.l #1, %%d5 \n"
+        "subq.l #1, %[cnt] \n"
         "bne.w 1b \n"
 #endif
+
+    "99: \n"
         : /* outputs */
-        [v1]"+a"(v1),
-        [v2]"+a"(v2)
+#if ORDER > 16
+        [cnt]"+d"(cnt),
+#endif
+        [v1] "+a"(v1),
+        [v2] "+a"(v2)
         : /* inputs */
-        [cnt]"n"(ORDER>>4)
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4", "d5",
+        "d0", "d1", "d2", "d3", "d4",
         "a0", "a1", "a2", "a3", "memory"
     );
 }
 
 #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
 
-/* Needs EMAC in signed integer mode! */
+/* This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
+ * in signed integer mode - call above macro before use. */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
     int res = 0;
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
 
 #define MACBLOCK4 \
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \
-        "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \
-        "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \
-        "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+
+#define MACBLOCK4_U2 \
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 
     asm volatile (
+        "move.l %[v2], %%d0 \n"
+        "and.l #2, %%d0 \n"
+        "jeq 20f \n"
+
+    "10: \n"
+        "move.l (%[v1])+, %%d0 \n"
+        "move.w (%[v2])+, %%d1 \n"
+    "1: \n"
+#if ORDER > 16
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+#endif
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        MACBLOCK4_U2
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
-        "moveq.l %[cnt], %[res] \n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "subq.l #1, %[res] \n"
+        "bne.w 1b \n"
+#else
+        "mac.w %%d0l, %%d1u, %%acc0 \n"
 #endif
+        "jra 99f \n"
+
+    "20: \n"
         "move.l (%[v1])+, %%d0 \n"
         "move.l (%[v2])+, %%d1 \n"
     "1: \n"
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         MACBLOCK4
         MACBLOCK4
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
-        "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n"
+        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
-        "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n"
-        "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
-
+        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "subq.l #1, %[res] \n"
         "bne.w 1b \n"
 #else
-        "mac.w %%d2u, %%d3u, %%acc0 \n"
-        "mac.w %%d2l, %%d3l, %%acc0 \n"
+        "mac.w %%d2u, %%d1u, %%acc0 \n"
+        "mac.w %%d2l, %%d1l, %%acc0 \n"
 #endif
+
+    "99: \n"
         "movclr.l %%acc0, %[res] \n"
         : /* outputs */
         [v1]"+a"(v1),
         [v2]"+a"(v2),
-        [res]"=&d"(res)
+        [res]"=d"(res)
         : /* inputs */
-        [cnt]"n"(ORDER>>5)
+#if ORDER > 32
+        [cnt]"[res]"(cnt)
+#endif
         : /* clobbers */
-        "d0", "d1", "d2", "d3"
+        "d0", "d1", "d2"
     );
     return res;
 }
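Finally, the prologue shared by all six routines ("tst %[v2], #2" on ARM, "and.l #2, %%d0" on ColdFire) exists only to realign v2: if v2 sits on an odd halfword boundary, one element is consumed up front so that every following load is a full, aligned 32-bit word, and the leftover halfword is carried across iterations in a register (r4 on ARM, %%d0 on ColdFire). A hedged C sketch of that idea; the function name and the explicit order parameter are illustrative, and both branches compute the same result:

#include <stdint.h>

static void vector_add_sketch(int16_t* v1, int16_t* v2, int order)
{
    int i;
    if (((uintptr_t)v2 & 2) != 0) {
        /* v2 is only halfword aligned: pull one element off so the rest of
         * the walk reads aligned 32-bit words; 'carry' plays the role of
         * r4 / %%d0 in the asm. */
        int16_t carry = *v2++;
        v1[0] += carry;
        for (i = 1; i < order; i++)
            v1[i] += v2[i - 1];
    } else {
        /* Fully aligned: the straight ADDHALFREGS-style loop. */
        for (i = 0; i < order; i++)
            v1[i] += v2[i];
    }
}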