summary | refs | log | tree | commit | diff
path: root/apps
diff options
context:
space:
mode:
author: Jens Arnold <amiconn@rockbox.org> 2010-02-10 23:23:17 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2010-02-10 23:23:17 +0000
commit: 0a291fff12c27ba6b46521ecaf126bdb4726c24e (patch)
tree: 4c7d6df8d4d43b07a8cf17b6eedaf950fbd38d1b /apps
parent: 3d7983e5c7a496bb7c3a8578051fd9da61e243d1 (diff)
download: rockbox-0a291fff12c27ba6b46521ecaf126bdb4726c24e.tar.gz
rockbox-0a291fff12c27ba6b46521ecaf126bdb4726c24e.zip
APE: Fused vector math for the filters on ARMv5te. Speedup on Cowon D2 is ~4% for -c2000..-c4000 (less for -c5000). Thanks to Frank Gevaerts for testing.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24590 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps')
-rw-r--r-- apps/codecs/demac/libdemac/vector_math16_armv5te.h | 443
1 file changed, 267 insertions, 176 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
index 4f2c203f5e..2940585a42 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
@@ -24,180 +24,288 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
24 24
25*/ 25*/
26 26
27/* This version fetches data as 32 bit words, and *requires* v1 to be 27#define FUSED_VECTOR_MATH
28 * 32 bit aligned, otherwise it will result either in a data abort, or 28
29 * incorrect results (if ARM aligncheck is disabled). */ 29/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
30static inline void vector_add(int16_t* v1, int16_t* v2) 30 * This version fetches data as 32 bit words, and *requires* v1 to be
31 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
32 * aligned or both unaligned. If either condition isn't met, it will either
33 * result in a data abort or incorrect results. */
34static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
31{ 35{
36 int res;
32#if ORDER > 16 37#if ORDER > 16
33 int cnt = ORDER>>4; 38 int cnt = ORDER>>4;
34#endif 39#endif
35 40
36#define ADDHALFREGS(sum, s1) /* Adds register */ \ 41#define ADDHALFREGS(sum, s1, s2) /* Adds register */ \
37 "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \ 42 "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight */ \
38 "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \ 43 "add " #sum ", " #s1 ", " #s2 ", lsl #16 \n" /* Clobbers 's1' */ \
39 "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \ 44 "add " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \
40 "mov " #sum ", " #sum ", lsl #16 \n" \ 45 "mov " #s1 ", " #s1 ", lsl #16 \n" \
41 "orr " #sum ", " #sum ", r8 , lsr #16 \n" 46 "orr " #sum ", " #s1 ", " #sum ", lsr #16 \n"
42 47
43#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ 48#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \
44 "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ 49 "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \
45 "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ 50 "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
46 "mov " #sum ", " #sum ", lsl #16 \n" \ 51 "mov " #sum ", " #sum ", lsl #16 \n" \
47 "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" 52 "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"
48 53
49 asm volatile ( 54 asm volatile (
50 "tst %[v2], #2 \n"
51 "beq 20f \n"
52
53 "10: \n"
54 "ldrh r4, [%[v2]], #2 \n"
55 "mov r4, r4, lsl #16 \n"
56 "1: \n"
57 "ldmia %[v1], {r0-r3} \n"
58 "ldmia %[v2]!, {r5-r8} \n"
59 ADDHALFXREGS(r0, r4, r5)
60 ADDHALFXREGS(r1, r5, r6)
61 ADDHALFXREGS(r2, r6, r7)
62 ADDHALFXREGS(r3, r7, r8)
63 "stmia %[v1]!, {r0-r3} \n"
64 "mov r4, r8 \n"
65 "ldmia %[v1], {r0-r3} \n"
66 "ldmia %[v2]!, {r5-r8} \n"
67 ADDHALFXREGS(r0, r4, r5)
68 ADDHALFXREGS(r1, r5, r6)
69 ADDHALFXREGS(r2, r6, r7)
70 ADDHALFXREGS(r3, r7, r8)
71 "stmia %[v1]!, {r0-r3} \n"
72#if ORDER > 16 55#if ORDER > 16
73 "mov r4, r8 \n" 56 "mov %[res], #0 \n"
74 "subs %[cnt], %[cnt], #1 \n"
75 "bne 1b \n"
76#endif 57#endif
77 "b 99f \n" 58 "tst %[f2], #2 \n"
59 "beq 20f \n"
78 60
79 "20: \n" 61 "10: \n"
80 "1: \n" 62 "ldrh r4, [%[s2]], #2 \n"
81 "ldmia %[v1], {r0-r3} \n" 63 "mov r4, r4, lsl #16 \n"
82 "ldmia %[v2]!, {r4-r7} \n" 64 "ldrh r3, [%[f2]], #2 \n"
83 ADDHALFREGS(r0, r4)
84 ADDHALFREGS(r1, r5)
85 ADDHALFREGS(r2, r6)
86 ADDHALFREGS(r3, r7)
87 "stmia %[v1]!, {r0-r3} \n"
88 "ldmia %[v1], {r0-r3} \n"
89 "ldmia %[v2]!, {r4-r7} \n"
90 ADDHALFREGS(r0, r4)
91 ADDHALFREGS(r1, r5)
92 ADDHALFREGS(r2, r6)
93 ADDHALFREGS(r3, r7)
94 "stmia %[v1]!, {r0-r3} \n"
95#if ORDER > 16 65#if ORDER > 16
96 "subs %[cnt], %[cnt], #1 \n" 66 "mov r3, r3, lsl #16 \n"
97 "bne 1b \n" 67 "1: \n"
68 "ldmia %[v1], {r0,r1} \n"
69 "smlabt %[res], r0, r3, %[res] \n"
70#else
71 "ldmia %[v1], {r0,r1} \n"
72 "smulbb %[res], r0, r3 \n"
73#endif
74 "ldmia %[f2]!, {r2,r3} \n"
75 "smlatb %[res], r0, r2, %[res] \n"
76 "smlabt %[res], r1, r2, %[res] \n"
77 "smlatb %[res], r1, r3, %[res] \n"
78 "ldmia %[s2]!, {r2,r5} \n"
79 ADDHALFXREGS(r0, r4, r2)
80 ADDHALFXREGS(r1, r2, r5)
81 "stmia %[v1]!, {r0,r1} \n"
82 "ldmia %[v1], {r0,r1} \n"
83 "smlabt %[res], r0, r3, %[res] \n"
84 "ldmia %[f2]!, {r2,r3} \n"
85 "smlatb %[res], r0, r2, %[res] \n"
86 "smlabt %[res], r1, r2, %[res] \n"
87 "smlatb %[res], r1, r3, %[res] \n"
88 "ldmia %[s2]!, {r2,r4} \n"
89 ADDHALFXREGS(r0, r5, r2)
90 ADDHALFXREGS(r1, r2, r4)
91 "stmia %[v1]!, {r0,r1} \n"
92
93 "ldmia %[v1], {r0,r1} \n"
94 "smlabt %[res], r0, r3, %[res] \n"
95 "ldmia %[f2]!, {r2,r3} \n"
96 "smlatb %[res], r0, r2, %[res] \n"
97 "smlabt %[res], r1, r2, %[res] \n"
98 "smlatb %[res], r1, r3, %[res] \n"
99 "ldmia %[s2]!, {r2,r5} \n"
100 ADDHALFXREGS(r0, r4, r2)
101 ADDHALFXREGS(r1, r2, r5)
102 "stmia %[v1]!, {r0,r1} \n"
103 "ldmia %[v1], {r0,r1} \n"
104 "smlabt %[res], r0, r3, %[res] \n"
105 "ldmia %[f2]!, {r2,r3} \n"
106 "smlatb %[res], r0, r2, %[res] \n"
107 "smlabt %[res], r1, r2, %[res] \n"
108 "smlatb %[res], r1, r3, %[res] \n"
109 "ldmia %[s2]!, {r2,r4} \n"
110 ADDHALFXREGS(r0, r5, r2)
111 ADDHALFXREGS(r1, r2, r4)
112 "stmia %[v1]!, {r0,r1} \n"
113#if ORDER > 16
114 "subs %[cnt], %[cnt], #1 \n"
115 "bne 1b \n"
116#endif
117 "b 99f \n"
118
119 "20: \n"
120 "1: \n"
121 "ldmia %[v1], {r1,r2} \n"
122 "ldmia %[f2]!, {r3,r4} \n"
123#if ORDER > 16
124 "smlabb %[res], r1, r3, %[res] \n"
125#else
126 "smulbb %[res], r1, r3 \n"
127#endif
128 "smlatt %[res], r1, r3, %[res] \n"
129 "smlabb %[res], r2, r4, %[res] \n"
130 "smlatt %[res], r2, r4, %[res] \n"
131 "ldmia %[s2]!, {r3,r4} \n"
132 ADDHALFREGS(r0, r1, r3)
133 ADDHALFREGS(r1, r2, r4)
134 "stmia %[v1]!, {r0,r1} \n"
135
136 ".rept 3 \n"
137 "ldmia %[v1], {r1,r2} \n"
138 "ldmia %[f2]!, {r3,r4} \n"
139 "smlabb %[res], r1, r3, %[res] \n"
140 "smlatt %[res], r1, r3, %[res] \n"
141 "smlabb %[res], r2, r4, %[res] \n"
142 "smlatt %[res], r2, r4, %[res] \n"
143 "ldmia %[s2]!, {r3,r4} \n"
144 ADDHALFREGS(r0, r1, r3)
145 ADDHALFREGS(r1, r2, r4)
146 "stmia %[v1]!, {r0,r1} \n"
147 ".endr \n"
148#if ORDER > 16
149 "subs %[cnt], %[cnt], #1 \n"
150 "bne 1b \n"
98#endif 151#endif
99 152
100 "99: \n" 153 "99: \n"
101 : /* outputs */ 154 : /* outputs */
102#if ORDER > 16 155#if ORDER > 16
103 [cnt]"+r"(cnt), 156 [cnt]"+r"(cnt),
104#endif 157#endif
105 [v1] "+r"(v1), 158 [v1] "+r"(v1),
106 [v2] "+r"(v2) 159 [f2] "+r"(f2),
160 [s2] "+r"(s2),
161 [res]"=r"(res)
107 : /* inputs */ 162 : /* inputs */
108 : /* clobbers */ 163 : /* clobbers */
109 "r0", "r1", "r2", "r3", "r4", 164 "r0", "r1", "r2", "r3", "r4", "r5", "memory"
110 "r5", "r6", "r7", "r8", "memory"
111 ); 165 );
166 return res;
112} 167}
113 168
114/* This version fetches data as 32 bit words, and *requires* v1 to be 169/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
115 * 32 bit aligned, otherwise it will result either in a data abort, or 170 * This version fetches data as 32 bit words, and *requires* v1 to be
116 * incorrect results (if ARM aligncheck is disabled). */ 171 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
117static inline void vector_sub(int16_t* v1, int16_t* v2) 172 * aligned or both unaligned. If either condition isn't met, it will either
173 * result in a data abort or incorrect results. */
174static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
118{ 175{
176 int res;
119#if ORDER > 16 177#if ORDER > 16
120 int cnt = ORDER>>4; 178 int cnt = ORDER>>4;
121#endif 179#endif
122 180
123#define SUBHALFREGS(dif, s1) /* Subtracts register */ \ 181#define SUBHALFREGS(dif, s1, s2) /* Subtracts reg. */ \
124 "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \ 182 "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight */ \
125 "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \ 183 "sub " #dif ", " #s1 ", " #s2 ", lsl #16 \n" /* Clobbers 's1' */ \
126 "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \ 184 "sub " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \
127 "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \ 185 "mov " #s1 ", " #s1 ", lsl #16 \n" \
128 "orr " #dif ", r8 , " #dif ", lsl #16 \n" 186 "orr " #dif ", " #s1 ", " #dif ", lsr #16 \n"
129 187
130#define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \ 188#define SUBHALFXREGS(dif, s1, s2, msk) /* Subtracts reg. */ \
131 "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ 189 "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \
132 "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \ 190 "and " #s1 ", " #s1 ", " #msk " \n" /* Needs msk = */ \
133 "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \ 191 "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* 0x0000ffff, */ \
134 "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" 192 "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" /* clobbers 's1'. */
135 193
136 asm volatile ( 194 asm volatile (
137 "mov r9, #0xff \n"
138 "orr r9, r9, #0xff00 \n"
139 "tst %[v2], #2 \n"
140 "beq 20f \n"
141
142 "10: \n"
143 "ldrh r4, [%[v2]], #2 \n"
144 "mov r4, r4, lsl #16 \n"
145 "1: \n"
146 "ldmia %[v1], {r0-r3} \n"
147 "ldmia %[v2]!, {r5-r8} \n"
148 SUBHALFXREGS(r0, r4, r5)
149 SUBHALFXREGS(r1, r5, r6)
150 SUBHALFXREGS(r2, r6, r7)
151 SUBHALFXREGS(r3, r7, r8)
152 "stmia %[v1]!, {r0-r3} \n"
153 "mov r4, r8 \n"
154 "ldmia %[v1], {r0-r3} \n"
155 "ldmia %[v2]!, {r5-r8} \n"
156 SUBHALFXREGS(r0, r4, r5)
157 SUBHALFXREGS(r1, r5, r6)
158 SUBHALFXREGS(r2, r6, r7)
159 SUBHALFXREGS(r3, r7, r8)
160 "stmia %[v1]!, {r0-r3} \n"
161#if ORDER > 16 195#if ORDER > 16
162 "mov r4, r8 \n" 196 "mov %[res], #0 \n"
163 "subs %[cnt], %[cnt], #1 \n"
164 "bne 1b \n"
165#endif 197#endif
166 "b 99f \n" 198 "tst %[f2], #2 \n"
199 "beq 20f \n"
167 200
168 "20: \n" 201 "10: \n"
169 "1: \n" 202 "mov r6, #0xff \n"
170 "ldmia %[v1], {r0-r3} \n" 203 "orr r6, r6, #0xff00 \n"
171 "ldmia %[v2]!, {r4-r7} \n" 204 "ldrh r4, [%[s2]], #2 \n"
172 SUBHALFREGS(r0, r4) 205 "mov r4, r4, lsl #16 \n"
173 SUBHALFREGS(r1, r5) 206 "ldrh r3, [%[f2]], #2 \n"
174 SUBHALFREGS(r2, r6)
175 SUBHALFREGS(r3, r7)
176 "stmia %[v1]!, {r0-r3} \n"
177 "ldmia %[v1], {r0-r3} \n"
178 "ldmia %[v2]!, {r4-r7} \n"
179 SUBHALFREGS(r0, r4)
180 SUBHALFREGS(r1, r5)
181 SUBHALFREGS(r2, r6)
182 SUBHALFREGS(r3, r7)
183 "stmia %[v1]!, {r0-r3} \n"
184#if ORDER > 16 207#if ORDER > 16
185 "subs %[cnt], %[cnt], #1 \n" 208 "mov r3, r3, lsl #16 \n"
186 "bne 1b \n" 209 "1: \n"
210 "ldmia %[v1], {r0,r1} \n"
211 "smlabt %[res], r0, r3, %[res] \n"
212#else
213 "ldmia %[v1], {r0,r1} \n"
214 "smulbb %[res], r0, r3 \n"
215#endif
216 "ldmia %[f2]!, {r2,r3} \n"
217 "smlatb %[res], r0, r2, %[res] \n"
218 "smlabt %[res], r1, r2, %[res] \n"
219 "smlatb %[res], r1, r3, %[res] \n"
220 "ldmia %[s2]!, {r2,r5} \n"
221 SUBHALFXREGS(r0, r4, r2, r6)
222 SUBHALFXREGS(r1, r2, r5, r6)
223 "stmia %[v1]!, {r0,r1} \n"
224 "ldmia %[v1], {r0,r1} \n"
225 "smlabt %[res], r0, r3, %[res] \n"
226 "ldmia %[f2]!, {r2,r3} \n"
227 "smlatb %[res], r0, r2, %[res] \n"
228 "smlabt %[res], r1, r2, %[res] \n"
229 "smlatb %[res], r1, r3, %[res] \n"
230 "ldmia %[s2]!, {r2,r4} \n"
231 SUBHALFXREGS(r0, r5, r2, r6)
232 SUBHALFXREGS(r1, r2, r4, r6)
233 "stmia %[v1]!, {r0,r1} \n"
234
235 "ldmia %[v1], {r0,r1} \n"
236 "smlabt %[res], r0, r3, %[res] \n"
237 "ldmia %[f2]!, {r2,r3} \n"
238 "smlatb %[res], r0, r2, %[res] \n"
239 "smlabt %[res], r1, r2, %[res] \n"
240 "smlatb %[res], r1, r3, %[res] \n"
241 "ldmia %[s2]!, {r2,r5} \n"
242 SUBHALFXREGS(r0, r4, r2, r6)
243 SUBHALFXREGS(r1, r2, r5, r6)
244 "stmia %[v1]!, {r0,r1} \n"
245 "ldmia %[v1], {r0,r1} \n"
246 "smlabt %[res], r0, r3, %[res] \n"
247 "ldmia %[f2]!, {r2,r3} \n"
248 "smlatb %[res], r0, r2, %[res] \n"
249 "smlabt %[res], r1, r2, %[res] \n"
250 "smlatb %[res], r1, r3, %[res] \n"
251 "ldmia %[s2]!, {r2,r4} \n"
252 SUBHALFXREGS(r0, r5, r2, r6)
253 SUBHALFXREGS(r1, r2, r4, r6)
254 "stmia %[v1]!, {r0,r1} \n"
255#if ORDER > 16
256 "subs %[cnt], %[cnt], #1 \n"
257 "bne 1b \n"
187#endif 258#endif
259 "b 99f \n"
188 260
189 "99: \n" 261 "20: \n"
262 "1: \n"
263 "ldmia %[v1], {r1,r2} \n"
264 "ldmia %[f2]!, {r3,r4} \n"
265#if ORDER > 16
266 "smlabb %[res], r1, r3, %[res] \n"
267#else
268 "smulbb %[res], r1, r3 \n"
269#endif
270 "smlatt %[res], r1, r3, %[res] \n"
271 "smlabb %[res], r2, r4, %[res] \n"
272 "smlatt %[res], r2, r4, %[res] \n"
273 "ldmia %[s2]!, {r3,r4} \n"
274 SUBHALFREGS(r0, r1, r3)
275 SUBHALFREGS(r1, r2, r4)
276 "stmia %[v1]!, {r0,r1} \n"
277
278 ".rept 3 \n"
279 "ldmia %[v1], {r1,r2} \n"
280 "ldmia %[f2]!, {r3,r4} \n"
281 "smlabb %[res], r1, r3, %[res] \n"
282 "smlatt %[res], r1, r3, %[res] \n"
283 "smlabb %[res], r2, r4, %[res] \n"
284 "smlatt %[res], r2, r4, %[res] \n"
285 "ldmia %[s2]!, {r3,r4} \n"
286 SUBHALFREGS(r0, r1, r3)
287 SUBHALFREGS(r1, r2, r4)
288 "stmia %[v1]!, {r0,r1} \n"
289 ".endr \n"
290#if ORDER > 16
291 "subs %[cnt], %[cnt], #1 \n"
292 "bne 1b \n"
293#endif
294
295 "99: \n"
190 : /* outputs */ 296 : /* outputs */
191#if ORDER > 16 297#if ORDER > 16
192 [cnt]"+r"(cnt), 298 [cnt]"+r"(cnt),
193#endif 299#endif
194 [v1] "+r"(v1), 300 [v1] "+r"(v1),
195 [v2] "+r"(v2) 301 [f2] "+r"(f2),
302 [s2] "+r"(s2),
303 [res]"=r"(res)
196 : /* inputs */ 304 : /* inputs */
197 : /* clobbers */ 305 : /* clobbers */
198 "r0", "r1", "r2", "r3", "r4", "r5", 306 "r0", "r1", "r2", "r3", "r4", "r5", "r6", "memory"
199 "r6", "r7", "r8", "r9", "memory"
200 ); 307 );
308 return res;
201} 309}
202 310
203/* This version fetches data as 32 bit words, and *requires* v1 to be 311/* This version fetches data as 32 bit words, and *requires* v1 to be
@@ -211,9 +319,9 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
211#endif 319#endif
212 320
213#if ORDER > 16 321#if ORDER > 16
214#define MLA_BLOCKS "3" 322#define MLA_BLOCKS "7"
215#else 323#else
216#define MLA_BLOCKS "1" 324#define MLA_BLOCKS "3"
217#endif 325#endif
218 326
219 asm volatile ( 327 asm volatile (
@@ -224,36 +332,28 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
224 "beq 20f \n" 332 "beq 20f \n"
225 333
226 "10: \n" 334 "10: \n"
227 "ldrh r7, [%[v2]], #2 \n" 335 "ldrh r3, [%[v2]], #2 \n"
228#if ORDER > 32 336#if ORDER > 32
229 "mov r7, r7, lsl #16 \n" 337 "mov r3, r3, lsl #16 \n"
230 "1: \n" 338 "1: \n"
231 "ldmia %[v1]!, {r0-r3} \n" 339 "ldmia %[v1]!, {r0,r1} \n"
232 "smlabt %[res], r0, r7, %[res] \n" 340 "smlabt %[res], r0, r3, %[res] \n"
233#else 341#else
234 "ldmia %[v1]!, {r0-r3} \n" 342 "ldmia %[v1]!, {r0,r1} \n"
235 "smulbb %[res], r0, r7 \n" 343 "smulbb %[res], r0, r3 \n"
236#endif 344#endif
237 "ldmia %[v2]!, {r4-r7} \n" 345 "ldmia %[v2]!, {r2,r3} \n"
238 "smlatb %[res], r0, r4, %[res] \n" 346 "smlatb %[res], r0, r2, %[res] \n"
239 "smlabt %[res], r1, r4, %[res] \n" 347 "smlabt %[res], r1, r2, %[res] \n"
240 "smlatb %[res], r1, r5, %[res] \n" 348 "smlatb %[res], r1, r3, %[res] \n"
241 "smlabt %[res], r2, r5, %[res] \n" 349
242 "smlatb %[res], r2, r6, %[res] \n"
243 "smlabt %[res], r3, r6, %[res] \n"
244 "smlatb %[res], r3, r7, %[res] \n"
245
246 ".rept " MLA_BLOCKS "\n" 350 ".rept " MLA_BLOCKS "\n"
247 "ldmia %[v1]!, {r0-r3} \n" 351 "ldmia %[v1]!, {r0,r1} \n"
248 "smlabt %[res], r0, r7, %[res] \n" 352 "smlabt %[res], r0, r3, %[res] \n"
249 "ldmia %[v2]!, {r4-r7} \n" 353 "ldmia %[v2]!, {r2,r3} \n"
250 "smlatb %[res], r0, r4, %[res] \n" 354 "smlatb %[res], r0, r2, %[res] \n"
251 "smlabt %[res], r1, r4, %[res] \n" 355 "smlabt %[res], r1, r2, %[res] \n"
252 "smlatb %[res], r1, r5, %[res] \n" 356 "smlatb %[res], r1, r3, %[res] \n"
253 "smlabt %[res], r2, r5, %[res] \n"
254 "smlatb %[res], r2, r6, %[res] \n"
255 "smlabt %[res], r3, r6, %[res] \n"
256 "smlatb %[res], r3, r7, %[res] \n"
257 ".endr \n" 357 ".endr \n"
258#if ORDER > 32 358#if ORDER > 32
259 "subs %[cnt], %[cnt], #1 \n" 359 "subs %[cnt], %[cnt], #1 \n"
@@ -263,32 +363,24 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
263 363
264 "20: \n" 364 "20: \n"
265 "1: \n" 365 "1: \n"
266 "ldmia %[v1]!, {r0-r3} \n" 366 "ldmia %[v1]!, {r0,r1} \n"
267 "ldmia %[v2]!, {r4-r7} \n" 367 "ldmia %[v2]!, {r2,r3} \n"
268#if ORDER > 32 368#if ORDER > 32
269 "smlabb %[res], r0, r4, %[res] \n" 369 "smlabb %[res], r0, r2, %[res] \n"
270#else 370#else
271 "smulbb %[res], r0, r4 \n" 371 "smulbb %[res], r0, r2 \n"
272#endif 372#endif
273 "smlatt %[res], r0, r4, %[res] \n" 373 "smlatt %[res], r0, r2, %[res] \n"
274 "smlabb %[res], r1, r5, %[res] \n" 374 "smlabb %[res], r1, r3, %[res] \n"
275 "smlatt %[res], r1, r5, %[res] \n" 375 "smlatt %[res], r1, r3, %[res] \n"
276 "smlabb %[res], r2, r6, %[res] \n"
277 "smlatt %[res], r2, r6, %[res] \n"
278 "smlabb %[res], r3, r7, %[res] \n"
279 "smlatt %[res], r3, r7, %[res] \n"
280 376
281 ".rept " MLA_BLOCKS "\n" 377 ".rept " MLA_BLOCKS "\n"
282 "ldmia %[v1]!, {r0-r3} \n" 378 "ldmia %[v1]!, {r0,r1} \n"
283 "ldmia %[v2]!, {r4-r7} \n" 379 "ldmia %[v2]!, {r2,r3} \n"
284 "smlabb %[res], r0, r4, %[res] \n" 380 "smlabb %[res], r0, r2, %[res] \n"
285 "smlatt %[res], r0, r4, %[res] \n" 381 "smlatt %[res], r0, r2, %[res] \n"
286 "smlabb %[res], r1, r5, %[res] \n" 382 "smlabb %[res], r1, r3, %[res] \n"
287 "smlatt %[res], r1, r5, %[res] \n" 383 "smlatt %[res], r1, r3, %[res] \n"
288 "smlabb %[res], r2, r6, %[res] \n"
289 "smlatt %[res], r2, r6, %[res] \n"
290 "smlabb %[res], r3, r7, %[res] \n"
291 "smlatt %[res], r3, r7, %[res] \n"
292 ".endr \n" 384 ".endr \n"
293#if ORDER > 32 385#if ORDER > 32
294 "subs %[cnt], %[cnt], #1 \n" 386 "subs %[cnt], %[cnt], #1 \n"
@@ -305,8 +397,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
305 [res]"=r"(res) 397 [res]"=r"(res)
306 : /* inputs */ 398 : /* inputs */
307 : /* clobbers */ 399 : /* clobbers */
308 "r0", "r1", "r2", "r3", 400 "r0", "r1", "r2", "r3"
309 "r4", "r5", "r6", "r7"
310 ); 401 );
311 return res; 402 return res;
312} 403}