diff options
author | Jens Arnold <amiconn@rockbox.org> | 2007-10-25 18:58:16 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2007-10-25 18:58:16 +0000 |
commit | 35f23267bfc97d070284a03e4adaa2c6b7bb6852 (patch) | |
tree | c42fe719f16e68512b0575bfa581105cfa8170bc /apps/codecs/demac/libdemac/vector_math16_cf.h | |
parent | 3ea3caf34165ddc8114ecf3cd39ed0016192b1d7 (diff) | |
download | rockbox-35f23267bfc97d070284a03e4adaa2c6b7bb6852.tar.gz rockbox-35f23267bfc97d070284a03e4adaa2c6b7bb6852.zip |
Further optimised the filter vector math assembly for Coldfire, and added assembly filter vector math for ARM. Both make use of the fact that the first argument of the vector functions is longword aligned. The ARM version is tailored for ARM7TDMI, and would slow down ARM9 or higher; a new CPU_ macro was introduced for ARM7TDMI. Speedup for Coldfire: -c3000 104%->109%, -c4000 43%->46%, -c5000 1.7%->2.0%. Speedup for PP502x: -c2000 66%->75%, -c3000 37%->48%, -c4000 11%->18%, -c5000 2.5%->3.7%.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15302 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_cf.h')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_cf.h | 230 |
1 files changed, 190 insertions, 40 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 937462c293..0c3aaca223 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -24,20 +24,71 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
28 | * 32 bit aligned, otherwise performance will suffer. */ | ||
27 | static inline void vector_add(int16_t* v1, int16_t* v2) | 29 | static inline void vector_add(int16_t* v1, int16_t* v2) |
28 | { | 30 | { |
29 | #define ADDHALFREGS(s1, sum) /* 's1' can be an A or D reg */ \ | 31 | #if ORDER > 16 |
30 | "move.l " #s1 ", %%d4 \n" /* 'sum' must be a D reg */ \ | 32 | int cnt = ORDER>>4; |
31 | "add.l " #sum ", " #s1 "\n" /* 's1' and %%d4 are clobbered! */ \ | 33 | #endif |
32 | "clr.w %%d4 \n" \ | 34 | |
33 | "add.l %%d4 , " #sum "\n" \ | 35 | #define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \ |
36 | "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \ | ||
37 | "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \ | ||
38 | "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \ | ||
39 | "add.l %%d4 , " #sum "\n" \ | ||
40 | "move.w " #s1 ", " #sum "\n" | ||
41 | |||
42 | #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \ | ||
43 | "clr.w " #sum " \n" /* Needs 'sum' pre-swapped, swaps */ \ | ||
44 | "add.l " #s1 ", " #sum "\n" /* 's2', and clobbers 's1'. */ \ | ||
45 | "swap " #s2 " \n" /* 's1' can be an A or D reg. */ \ | ||
46 | "add.l " #s2 ", " #s1 "\n" /* 'sum' and 's2' must be D regs. */ \ | ||
34 | "move.w " #s1 ", " #sum "\n" | 47 | "move.w " #s1 ", " #sum "\n" |
35 | 48 | ||
36 | asm volatile ( | 49 | asm volatile ( |
37 | #if ORDER > 16 | 50 | "move.l %[v2], %%d0 \n" |
38 | "moveq.l %[cnt], %%d5 \n" | 51 | "and.l #2, %%d0 \n" |
52 | "jeq 20f \n" | ||
53 | |||
54 | "10: \n" | ||
55 | "move.w (%[v2])+, %%d0 \n" | ||
56 | "swap %%d0 \n" | ||
39 | "1: \n" | 57 | "1: \n" |
58 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
59 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
60 | ADDHALFXREGS(%%a0, %%d1, %%d0) | ||
61 | "move.l %%d0, (%[v1])+ \n" | ||
62 | ADDHALFXREGS(%%a1, %%d2, %%d1) | ||
63 | "move.l %%d1, (%[v1])+ \n" | ||
64 | ADDHALFXREGS(%%a2, %%d3, %%d2) | ||
65 | "move.l %%d2, (%[v1])+ \n" | ||
66 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
67 | "move.l %%d3, (%[v1])+ \n" | ||
68 | "lea.l (16, %[v2]), %[v2] \n" | ||
69 | "move.l %%d4, %%d0 \n" | ||
70 | |||
71 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
72 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
73 | ADDHALFXREGS(%%a0, %%d1, %%d0) | ||
74 | "move.l %%d0, (%[v1])+ \n" | ||
75 | ADDHALFXREGS(%%a1, %%d2, %%d1) | ||
76 | "move.l %%d1, (%[v1])+ \n" | ||
77 | ADDHALFXREGS(%%a2, %%d3, %%d2) | ||
78 | "move.l %%d2, (%[v1])+ \n" | ||
79 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
80 | "move.l %%d3, (%[v1])+ \n" | ||
81 | #if ORDER > 16 | ||
82 | "lea.l (16, %[v2]), %[v2] \n" | ||
83 | "move.l %%d4, %%d0 \n" | ||
84 | |||
85 | "subq.l #1, %[cnt] \n" | ||
86 | "jne 1b \n" | ||
40 | #endif | 87 | #endif |
88 | "jra 99f \n" | ||
89 | |||
90 | "20: \n" | ||
91 | "1: \n" | ||
41 | "movem.l (%[v2]), %%a0-%%a3 \n" | 92 | "movem.l (%[v2]), %%a0-%%a3 \n" |
42 | "movem.l (%[v1]), %%d0-%%d3 \n" | 93 | "movem.l (%[v1]), %%d0-%%d3 \n" |
43 | ADDHALFREGS(%%a0, %%d0) | 94 | ADDHALFREGS(%%a0, %%d0) |
@@ -48,7 +99,6 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
48 | "move.l %%d2, (%[v1])+ \n" | 99 | "move.l %%d2, (%[v1])+ \n" |
49 | ADDHALFREGS(%%a3, %%d3) | 100 | ADDHALFREGS(%%a3, %%d3) |
50 | "move.l %%d3, (%[v1])+ \n" | 101 | "move.l %%d3, (%[v1])+ \n" |
51 | |||
52 | "lea.l (16, %[v2]), %[v2] \n" | 102 | "lea.l (16, %[v2]), %[v2] \n" |
53 | 103 | ||
54 | "movem.l (%[v2]), %%a0-%%a3 \n" | 104 | "movem.l (%[v2]), %%a0-%%a3 \n" |
@@ -64,34 +114,89 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
64 | #if ORDER > 16 | 114 | #if ORDER > 16 |
65 | "lea.l (16, %[v2]), %[v2] \n" | 115 | "lea.l (16, %[v2]), %[v2] \n" |
66 | 116 | ||
67 | "subq.l #1, %%d5 \n" | 117 | "subq.l #1, %[cnt] \n" |
68 | "bne.w 1b \n" | 118 | "jne 1b \n" |
69 | #endif | 119 | #endif |
120 | "99: \n" | ||
70 | : /* outputs */ | 121 | : /* outputs */ |
71 | [v1]"+a"(v1), | 122 | #if ORDER > 16 |
72 | [v2]"+a"(v2) | 123 | [cnt]"+d"(cnt), |
124 | #endif | ||
125 | [v1] "+a"(v1), | ||
126 | [v2] "+a"(v2) | ||
73 | : /* inputs */ | 127 | : /* inputs */ |
74 | [cnt]"n"(ORDER>>4) | ||
75 | : /* clobbers */ | 128 | : /* clobbers */ |
76 | "d0", "d1", "d2", "d3", "d4", "d5", | 129 | "d0", "d1", "d2", "d3", "d4", |
77 | "a0", "a1", "a2", "a3", "memory" | 130 | "a0", "a1", "a2", "a3", "memory" |
78 | ); | 131 | ); |
79 | } | 132 | } |
80 | 133 | ||
134 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | ||
135 | * 32 bit aligned, otherwise performance will suffer. */ | ||
81 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 136 | static inline void vector_sub(int16_t* v1, int16_t* v2) |
82 | { | 137 | { |
83 | #define SUBHALFREGS(min, sub, dif) /* 'min' can be an A or D reg */ \ | 138 | #if ORDER > 16 |
84 | "move.l " #min ", " #dif "\n" /* 'sub' and 'dif' must be D regs */ \ | 139 | int cnt = ORDER>>4; |
85 | "sub.l " #sub ", " #min "\n" /* 'min' and 'sub' are clobbered! */ \ | 140 | #endif |
86 | "clr.w " #sub "\n" \ | 141 | |
87 | "sub.l " #sub ", " #dif "\n" \ | 142 | #define SUBHALFREGS(min, sub, dif) /* Subtract register halves straight. */ \ |
143 | "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */ \ | ||
144 | "sub.l " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */ \ | ||
145 | "clr.w " #sub "\n" /* 'min' and 'sub' are clobbered! */ \ | ||
146 | "sub.l " #sub ", " #dif "\n" \ | ||
88 | "move.w " #min ", " #dif "\n" | 147 | "move.w " #min ", " #dif "\n" |
148 | |||
149 | #define SUBHALFXREGS(min, s2, s1d) /* Subtract register halves across. */ \ | ||
150 | "clr.w " #s1d "\n" /* Needs 's1d' pre-swapped, swaps */ \ | ||
151 | "sub.l " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */ \ | ||
152 | "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */ \ | ||
153 | "swap " #s2 "\n" /* 's2' and 's1d' must be D regs. */ \ | ||
154 | "sub.l " #s2 ", " #min "\n" \ | ||
155 | "move.w " #min ", " #s1d "\n" | ||
89 | 156 | ||
90 | asm volatile ( | 157 | asm volatile ( |
91 | #if ORDER > 16 | 158 | "move.l %[v2], %%d0 \n" |
92 | "moveq.l %[cnt], %%d5 \n" | 159 | "and.l #2, %%d0 \n" |
160 | "jeq 20f \n" | ||
161 | |||
162 | "10: \n" | ||
163 | "move.w (%[v2])+, %%d0 \n" | ||
164 | "swap %%d0 \n" | ||
93 | "1: \n" | 165 | "1: \n" |
166 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
167 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
168 | SUBHALFXREGS(%%a0, %%d1, %%d0) | ||
169 | "move.l %%d0, (%[v1])+ \n" | ||
170 | SUBHALFXREGS(%%a1, %%d2, %%d1) | ||
171 | "move.l %%d1, (%[v1])+ \n" | ||
172 | SUBHALFXREGS(%%a2, %%d3, %%d2) | ||
173 | "move.l %%d2, (%[v1])+ \n" | ||
174 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
175 | "move.l %%d3, (%[v1])+ \n" | ||
176 | "lea.l (16, %[v2]), %[v2] \n" | ||
177 | "move.l %%d4, %%d0 \n" | ||
178 | |||
179 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
180 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
181 | SUBHALFXREGS(%%a0, %%d1, %%d0) | ||
182 | "move.l %%d0, (%[v1])+ \n" | ||
183 | SUBHALFXREGS(%%a1, %%d2, %%d1) | ||
184 | "move.l %%d1, (%[v1])+ \n" | ||
185 | SUBHALFXREGS(%%a2, %%d3, %%d2) | ||
186 | "move.l %%d2, (%[v1])+ \n" | ||
187 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
188 | "move.l %%d3, (%[v1])+ \n" | ||
189 | #if ORDER > 16 | ||
190 | "lea.l (16, %[v2]), %[v2] \n" | ||
191 | "move.l %%d4, %%d0 \n" | ||
192 | |||
193 | "subq.l #1, %[cnt] \n" | ||
194 | "bne.w 1b \n" | ||
94 | #endif | 195 | #endif |
196 | "jra 99f \n" | ||
197 | |||
198 | "20: \n" | ||
199 | "1: \n" | ||
95 | "movem.l (%[v2]), %%d1-%%d4 \n" | 200 | "movem.l (%[v2]), %%d1-%%d4 \n" |
96 | "movem.l (%[v1]), %%a0-%%a3 \n" | 201 | "movem.l (%[v1]), %%a0-%%a3 \n" |
97 | SUBHALFREGS(%%a0, %%d1, %%d0) | 202 | SUBHALFREGS(%%a0, %%d1, %%d0) |
@@ -118,37 +223,79 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
118 | #if ORDER > 16 | 223 | #if ORDER > 16 |
119 | "lea.l (16, %[v2]), %[v2] \n" | 224 | "lea.l (16, %[v2]), %[v2] \n" |
120 | 225 | ||
121 | "subq.l #1, %%d5 \n" | 226 | "subq.l #1, %[cnt] \n" |
122 | "bne.w 1b \n" | 227 | "bne.w 1b \n" |
123 | #endif | 228 | #endif |
229 | |||
230 | "99: \n" | ||
124 | : /* outputs */ | 231 | : /* outputs */ |
125 | [v1]"+a"(v1), | 232 | #if ORDER > 16 |
126 | [v2]"+a"(v2) | 233 | [cnt]"+d"(cnt), |
234 | #endif | ||
235 | [v1] "+a"(v1), | ||
236 | [v2] "+a"(v2) | ||
127 | : /* inputs */ | 237 | : /* inputs */ |
128 | [cnt]"n"(ORDER>>4) | ||
129 | : /* clobbers */ | 238 | : /* clobbers */ |
130 | "d0", "d1", "d2", "d3", "d4", "d5", | 239 | "d0", "d1", "d2", "d3", "d4", |
131 | "a0", "a1", "a2", "a3", "memory" | 240 | "a0", "a1", "a2", "a3", "memory" |
132 | ); | 241 | ); |
133 | } | 242 | } |
134 | 243 | ||
135 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ | 244 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ |
136 | 245 | ||
137 | /* Needs EMAC in signed integer mode! */ | 246 | /* This version fetches data as 32 bit words, and *recommends* v1 to be |
247 | * 32 bit aligned, otherwise performance will suffer. It also needs EMAC | ||
248 | * in signed integer mode - call above macro before use. */ | ||
138 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 249 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
139 | { | 250 | { |
140 | int res = 0; | 251 | int res = 0; |
252 | #if ORDER > 32 | ||
253 | int cnt = ORDER>>5; | ||
254 | #endif | ||
141 | 255 | ||
142 | #define MACBLOCK4 \ | 256 | #define MACBLOCK4 \ |
143 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ | 257 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ |
144 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \ | 258 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ |
145 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \ | 259 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ |
146 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | 260 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
261 | |||
262 | #define MACBLOCK4_U2 \ | ||
263 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
264 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ | ||
265 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
266 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
147 | 267 | ||
148 | asm volatile ( | 268 | asm volatile ( |
269 | "move.l %[v2], %%d0 \n" | ||
270 | "and.l #2, %%d0 \n" | ||
271 | "jeq 20f \n" | ||
272 | |||
273 | "10: \n" | ||
274 | "move.l (%[v1])+, %%d0 \n" | ||
275 | "move.w (%[v2])+, %%d1 \n" | ||
276 | "1: \n" | ||
277 | #if ORDER > 16 | ||
278 | MACBLOCK4_U2 | ||
279 | MACBLOCK4_U2 | ||
280 | MACBLOCK4_U2 | ||
281 | MACBLOCK4_U2 | ||
282 | #endif | ||
283 | MACBLOCK4_U2 | ||
284 | MACBLOCK4_U2 | ||
285 | MACBLOCK4_U2 | ||
286 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
287 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
288 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
149 | #if ORDER > 32 | 289 | #if ORDER > 32 |
150 | "moveq.l %[cnt], %[res] \n" | 290 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
291 | "subq.l #1, %[res] \n" | ||
292 | "bne.w 1b \n" | ||
293 | #else | ||
294 | "mac.w %%d0l, %%d1u, %%acc0 \n" | ||
151 | #endif | 295 | #endif |
296 | "jra 99f \n" | ||
297 | |||
298 | "20: \n" | ||
152 | "move.l (%[v1])+, %%d0 \n" | 299 | "move.l (%[v1])+, %%d0 \n" |
153 | "move.l (%[v2])+, %%d1 \n" | 300 | "move.l (%[v2])+, %%d1 \n" |
154 | "1: \n" | 301 | "1: \n" |
@@ -162,26 +309,29 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
162 | MACBLOCK4 | 309 | MACBLOCK4 |
163 | MACBLOCK4 | 310 | MACBLOCK4 |
164 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | 311 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
165 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" | 312 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
166 | #if ORDER > 32 | 313 | #if ORDER > 32 |
167 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" | 314 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
168 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | 315 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
169 | |||
170 | "subq.l #1, %[res] \n" | 316 | "subq.l #1, %[res] \n" |
171 | "bne.w 1b \n" | 317 | "bne.w 1b \n" |
172 | #else | 318 | #else |
173 | "mac.w %%d2u, %%d3u, %%acc0 \n" | 319 | "mac.w %%d2u, %%d1u, %%acc0 \n" |
174 | "mac.w %%d2l, %%d3l, %%acc0 \n" | 320 | "mac.w %%d2l, %%d1l, %%acc0 \n" |
175 | #endif | 321 | #endif |
322 | |||
323 | "99: \n" | ||
176 | "movclr.l %%acc0, %[res] \n" | 324 | "movclr.l %%acc0, %[res] \n" |
177 | : /* outputs */ | 325 | : /* outputs */ |
178 | [v1]"+a"(v1), | 326 | [v1]"+a"(v1), |
179 | [v2]"+a"(v2), | 327 | [v2]"+a"(v2), |
180 | [res]"=&d"(res) | 328 | [res]"=d"(res) |
181 | : /* inputs */ | 329 | : /* inputs */ |
182 | [cnt]"n"(ORDER>>5) | 330 | #if ORDER > 32 |
331 | [cnt]"[res]"(cnt) | ||
332 | #endif | ||
183 | : /* clobbers */ | 333 | : /* clobbers */ |
184 | "d0", "d1", "d2", "d3" | 334 | "d0", "d1", "d2" |
185 | ); | 335 | ); |
186 | return res; | 336 | return res; |
187 | } | 337 | } |