diff options
author | Jens Arnold <amiconn@rockbox.org> | 2010-02-10 23:23:17 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2010-02-10 23:23:17 +0000 |
commit | 0a291fff12c27ba6b46521ecaf126bdb4726c24e (patch) | |
tree | 4c7d6df8d4d43b07a8cf17b6eedaf950fbd38d1b | |
parent | 3d7983e5c7a496bb7c3a8578051fd9da61e243d1 (diff) | |
download | rockbox-0a291fff12c27ba6b46521ecaf126bdb4726c24e.tar.gz rockbox-0a291fff12c27ba6b46521ecaf126bdb4726c24e.zip |
APE: Fused vector math for the filters on ARMv5te. Speedup on Cowon D2 is ~4% for -c2000..-c4000 (less for -c5000). Thanks to Frank Gevaerts for testing.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24590 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv5te.h | 443 |
1 file changed, 267 insertions(+), 176 deletions(-)
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h index 4f2c203f5e..2940585a42 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h | |||
@@ -24,180 +24,288 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 27 | #define FUSED_VECTOR_MATH |
28 | * 32 bit aligned, otherwise it will result either in a data abort, or | 28 | |
29 | * incorrect results (if ARM aligncheck is disabled). */ | 29 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) |
30 | static inline void vector_add(int16_t* v1, int16_t* v2) | 30 | * This version fetches data as 32 bit words, and *requires* v1 to be |
31 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit | ||
32 | * aligned or both unaligned. If either condition isn't met, it will either | ||
33 | * result in a data abort or incorrect results. */ | ||
34 | static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) | ||
31 | { | 35 | { |
36 | int res; | ||
32 | #if ORDER > 16 | 37 | #if ORDER > 16 |
33 | int cnt = ORDER>>4; | 38 | int cnt = ORDER>>4; |
34 | #endif | 39 | #endif |
35 | 40 | ||
36 | #define ADDHALFREGS(sum, s1) /* Adds register */ \ | 41 | #define ADDHALFREGS(sum, s1, s2) /* Adds register */ \ |
37 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \ | 42 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight */ \ |
38 | "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \ | 43 | "add " #sum ", " #s1 ", " #s2 ", lsl #16 \n" /* Clobbers 's1' */ \ |
39 | "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \ | 44 | "add " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \ |
40 | "mov " #sum ", " #sum ", lsl #16 \n" \ | 45 | "mov " #s1 ", " #s1 ", lsl #16 \n" \ |
41 | "orr " #sum ", " #sum ", r8 , lsr #16 \n" | 46 | "orr " #sum ", " #s1 ", " #sum ", lsr #16 \n" |
42 | 47 | ||
43 | #define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ | 48 | #define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ |
44 | "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ | 49 | "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ |
45 | "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ | 50 | "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ |
46 | "mov " #sum ", " #sum ", lsl #16 \n" \ | 51 | "mov " #sum ", " #sum ", lsl #16 \n" \ |
47 | "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" | 52 | "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" |
48 | 53 | ||
49 | asm volatile ( | 54 | asm volatile ( |
50 | "tst %[v2], #2 \n" | ||
51 | "beq 20f \n" | ||
52 | |||
53 | "10: \n" | ||
54 | "ldrh r4, [%[v2]], #2 \n" | ||
55 | "mov r4, r4, lsl #16 \n" | ||
56 | "1: \n" | ||
57 | "ldmia %[v1], {r0-r3} \n" | ||
58 | "ldmia %[v2]!, {r5-r8} \n" | ||
59 | ADDHALFXREGS(r0, r4, r5) | ||
60 | ADDHALFXREGS(r1, r5, r6) | ||
61 | ADDHALFXREGS(r2, r6, r7) | ||
62 | ADDHALFXREGS(r3, r7, r8) | ||
63 | "stmia %[v1]!, {r0-r3} \n" | ||
64 | "mov r4, r8 \n" | ||
65 | "ldmia %[v1], {r0-r3} \n" | ||
66 | "ldmia %[v2]!, {r5-r8} \n" | ||
67 | ADDHALFXREGS(r0, r4, r5) | ||
68 | ADDHALFXREGS(r1, r5, r6) | ||
69 | ADDHALFXREGS(r2, r6, r7) | ||
70 | ADDHALFXREGS(r3, r7, r8) | ||
71 | "stmia %[v1]!, {r0-r3} \n" | ||
72 | #if ORDER > 16 | 55 | #if ORDER > 16 |
73 | "mov r4, r8 \n" | 56 | "mov %[res], #0 \n" |
74 | "subs %[cnt], %[cnt], #1 \n" | ||
75 | "bne 1b \n" | ||
76 | #endif | 57 | #endif |
77 | "b 99f \n" | 58 | "tst %[f2], #2 \n" |
59 | "beq 20f \n" | ||
78 | 60 | ||
79 | "20: \n" | 61 | "10: \n" |
80 | "1: \n" | 62 | "ldrh r4, [%[s2]], #2 \n" |
81 | "ldmia %[v1], {r0-r3} \n" | 63 | "mov r4, r4, lsl #16 \n" |
82 | "ldmia %[v2]!, {r4-r7} \n" | 64 | "ldrh r3, [%[f2]], #2 \n" |
83 | ADDHALFREGS(r0, r4) | ||
84 | ADDHALFREGS(r1, r5) | ||
85 | ADDHALFREGS(r2, r6) | ||
86 | ADDHALFREGS(r3, r7) | ||
87 | "stmia %[v1]!, {r0-r3} \n" | ||
88 | "ldmia %[v1], {r0-r3} \n" | ||
89 | "ldmia %[v2]!, {r4-r7} \n" | ||
90 | ADDHALFREGS(r0, r4) | ||
91 | ADDHALFREGS(r1, r5) | ||
92 | ADDHALFREGS(r2, r6) | ||
93 | ADDHALFREGS(r3, r7) | ||
94 | "stmia %[v1]!, {r0-r3} \n" | ||
95 | #if ORDER > 16 | 65 | #if ORDER > 16 |
96 | "subs %[cnt], %[cnt], #1 \n" | 66 | "mov r3, r3, lsl #16 \n" |
97 | "bne 1b \n" | 67 | "1: \n" |
68 | "ldmia %[v1], {r0,r1} \n" | ||
69 | "smlabt %[res], r0, r3, %[res] \n" | ||
70 | #else | ||
71 | "ldmia %[v1], {r0,r1} \n" | ||
72 | "smulbb %[res], r0, r3 \n" | ||
73 | #endif | ||
74 | "ldmia %[f2]!, {r2,r3} \n" | ||
75 | "smlatb %[res], r0, r2, %[res] \n" | ||
76 | "smlabt %[res], r1, r2, %[res] \n" | ||
77 | "smlatb %[res], r1, r3, %[res] \n" | ||
78 | "ldmia %[s2]!, {r2,r5} \n" | ||
79 | ADDHALFXREGS(r0, r4, r2) | ||
80 | ADDHALFXREGS(r1, r2, r5) | ||
81 | "stmia %[v1]!, {r0,r1} \n" | ||
82 | "ldmia %[v1], {r0,r1} \n" | ||
83 | "smlabt %[res], r0, r3, %[res] \n" | ||
84 | "ldmia %[f2]!, {r2,r3} \n" | ||
85 | "smlatb %[res], r0, r2, %[res] \n" | ||
86 | "smlabt %[res], r1, r2, %[res] \n" | ||
87 | "smlatb %[res], r1, r3, %[res] \n" | ||
88 | "ldmia %[s2]!, {r2,r4} \n" | ||
89 | ADDHALFXREGS(r0, r5, r2) | ||
90 | ADDHALFXREGS(r1, r2, r4) | ||
91 | "stmia %[v1]!, {r0,r1} \n" | ||
92 | |||
93 | "ldmia %[v1], {r0,r1} \n" | ||
94 | "smlabt %[res], r0, r3, %[res] \n" | ||
95 | "ldmia %[f2]!, {r2,r3} \n" | ||
96 | "smlatb %[res], r0, r2, %[res] \n" | ||
97 | "smlabt %[res], r1, r2, %[res] \n" | ||
98 | "smlatb %[res], r1, r3, %[res] \n" | ||
99 | "ldmia %[s2]!, {r2,r5} \n" | ||
100 | ADDHALFXREGS(r0, r4, r2) | ||
101 | ADDHALFXREGS(r1, r2, r5) | ||
102 | "stmia %[v1]!, {r0,r1} \n" | ||
103 | "ldmia %[v1], {r0,r1} \n" | ||
104 | "smlabt %[res], r0, r3, %[res] \n" | ||
105 | "ldmia %[f2]!, {r2,r3} \n" | ||
106 | "smlatb %[res], r0, r2, %[res] \n" | ||
107 | "smlabt %[res], r1, r2, %[res] \n" | ||
108 | "smlatb %[res], r1, r3, %[res] \n" | ||
109 | "ldmia %[s2]!, {r2,r4} \n" | ||
110 | ADDHALFXREGS(r0, r5, r2) | ||
111 | ADDHALFXREGS(r1, r2, r4) | ||
112 | "stmia %[v1]!, {r0,r1} \n" | ||
113 | #if ORDER > 16 | ||
114 | "subs %[cnt], %[cnt], #1 \n" | ||
115 | "bne 1b \n" | ||
116 | #endif | ||
117 | "b 99f \n" | ||
118 | |||
119 | "20: \n" | ||
120 | "1: \n" | ||
121 | "ldmia %[v1], {r1,r2} \n" | ||
122 | "ldmia %[f2]!, {r3,r4} \n" | ||
123 | #if ORDER > 16 | ||
124 | "smlabb %[res], r1, r3, %[res] \n" | ||
125 | #else | ||
126 | "smulbb %[res], r1, r3 \n" | ||
127 | #endif | ||
128 | "smlatt %[res], r1, r3, %[res] \n" | ||
129 | "smlabb %[res], r2, r4, %[res] \n" | ||
130 | "smlatt %[res], r2, r4, %[res] \n" | ||
131 | "ldmia %[s2]!, {r3,r4} \n" | ||
132 | ADDHALFREGS(r0, r1, r3) | ||
133 | ADDHALFREGS(r1, r2, r4) | ||
134 | "stmia %[v1]!, {r0,r1} \n" | ||
135 | |||
136 | ".rept 3 \n" | ||
137 | "ldmia %[v1], {r1,r2} \n" | ||
138 | "ldmia %[f2]!, {r3,r4} \n" | ||
139 | "smlabb %[res], r1, r3, %[res] \n" | ||
140 | "smlatt %[res], r1, r3, %[res] \n" | ||
141 | "smlabb %[res], r2, r4, %[res] \n" | ||
142 | "smlatt %[res], r2, r4, %[res] \n" | ||
143 | "ldmia %[s2]!, {r3,r4} \n" | ||
144 | ADDHALFREGS(r0, r1, r3) | ||
145 | ADDHALFREGS(r1, r2, r4) | ||
146 | "stmia %[v1]!, {r0,r1} \n" | ||
147 | ".endr \n" | ||
148 | #if ORDER > 16 | ||
149 | "subs %[cnt], %[cnt], #1 \n" | ||
150 | "bne 1b \n" | ||
98 | #endif | 151 | #endif |
99 | 152 | ||
100 | "99: \n" | 153 | "99: \n" |
101 | : /* outputs */ | 154 | : /* outputs */ |
102 | #if ORDER > 16 | 155 | #if ORDER > 16 |
103 | [cnt]"+r"(cnt), | 156 | [cnt]"+r"(cnt), |
104 | #endif | 157 | #endif |
105 | [v1] "+r"(v1), | 158 | [v1] "+r"(v1), |
106 | [v2] "+r"(v2) | 159 | [f2] "+r"(f2), |
160 | [s2] "+r"(s2), | ||
161 | [res]"=r"(res) | ||
107 | : /* inputs */ | 162 | : /* inputs */ |
108 | : /* clobbers */ | 163 | : /* clobbers */ |
109 | "r0", "r1", "r2", "r3", "r4", | 164 | "r0", "r1", "r2", "r3", "r4", "r5", "memory" |
110 | "r5", "r6", "r7", "r8", "memory" | ||
111 | ); | 165 | ); |
166 | return res; | ||
112 | } | 167 | } |
113 | 168 | ||
114 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 169 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) |
115 | * 32 bit aligned, otherwise it will result either in a data abort, or | 170 | * This version fetches data as 32 bit words, and *requires* v1 to be |
116 | * incorrect results (if ARM aligncheck is disabled). */ | 171 | * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit |
117 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 172 | * aligned or both unaligned. If either condition isn't met, it will either |
173 | * result in a data abort or incorrect results. */ | ||
174 | static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) | ||
118 | { | 175 | { |
176 | int res; | ||
119 | #if ORDER > 16 | 177 | #if ORDER > 16 |
120 | int cnt = ORDER>>4; | 178 | int cnt = ORDER>>4; |
121 | #endif | 179 | #endif |
122 | 180 | ||
123 | #define SUBHALFREGS(dif, s1) /* Subtracts register */ \ | 181 | #define SUBHALFREGS(dif, s1, s2) /* Subtracts reg. */ \ |
124 | "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \ | 182 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight */ \ |
125 | "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \ | 183 | "sub " #dif ", " #s1 ", " #s2 ", lsl #16 \n" /* Clobbers 's1' */ \ |
126 | "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \ | 184 | "sub " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \ |
127 | "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \ | 185 | "mov " #s1 ", " #s1 ", lsl #16 \n" \ |
128 | "orr " #dif ", r8 , " #dif ", lsl #16 \n" | 186 | "orr " #dif ", " #s1 ", " #dif ", lsr #16 \n" |
129 | 187 | ||
130 | #define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \ | 188 | #define SUBHALFXREGS(dif, s1, s2, msk) /* Subtracts reg. */ \ |
131 | "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ | 189 | "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ |
132 | "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \ | 190 | "and " #s1 ", " #s1 ", " #msk " \n" /* Needs msk = */ \ |
133 | "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \ | 191 | "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* 0x0000ffff, */ \ |
134 | "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" | 192 | "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" /* clobbers 's1'. */ |
135 | 193 | ||
136 | asm volatile ( | 194 | asm volatile ( |
137 | "mov r9, #0xff \n" | ||
138 | "orr r9, r9, #0xff00 \n" | ||
139 | "tst %[v2], #2 \n" | ||
140 | "beq 20f \n" | ||
141 | |||
142 | "10: \n" | ||
143 | "ldrh r4, [%[v2]], #2 \n" | ||
144 | "mov r4, r4, lsl #16 \n" | ||
145 | "1: \n" | ||
146 | "ldmia %[v1], {r0-r3} \n" | ||
147 | "ldmia %[v2]!, {r5-r8} \n" | ||
148 | SUBHALFXREGS(r0, r4, r5) | ||
149 | SUBHALFXREGS(r1, r5, r6) | ||
150 | SUBHALFXREGS(r2, r6, r7) | ||
151 | SUBHALFXREGS(r3, r7, r8) | ||
152 | "stmia %[v1]!, {r0-r3} \n" | ||
153 | "mov r4, r8 \n" | ||
154 | "ldmia %[v1], {r0-r3} \n" | ||
155 | "ldmia %[v2]!, {r5-r8} \n" | ||
156 | SUBHALFXREGS(r0, r4, r5) | ||
157 | SUBHALFXREGS(r1, r5, r6) | ||
158 | SUBHALFXREGS(r2, r6, r7) | ||
159 | SUBHALFXREGS(r3, r7, r8) | ||
160 | "stmia %[v1]!, {r0-r3} \n" | ||
161 | #if ORDER > 16 | 195 | #if ORDER > 16 |
162 | "mov r4, r8 \n" | 196 | "mov %[res], #0 \n" |
163 | "subs %[cnt], %[cnt], #1 \n" | ||
164 | "bne 1b \n" | ||
165 | #endif | 197 | #endif |
166 | "b 99f \n" | 198 | "tst %[f2], #2 \n" |
199 | "beq 20f \n" | ||
167 | 200 | ||
168 | "20: \n" | 201 | "10: \n" |
169 | "1: \n" | 202 | "mov r6, #0xff \n" |
170 | "ldmia %[v1], {r0-r3} \n" | 203 | "orr r6, r6, #0xff00 \n" |
171 | "ldmia %[v2]!, {r4-r7} \n" | 204 | "ldrh r4, [%[s2]], #2 \n" |
172 | SUBHALFREGS(r0, r4) | 205 | "mov r4, r4, lsl #16 \n" |
173 | SUBHALFREGS(r1, r5) | 206 | "ldrh r3, [%[f2]], #2 \n" |
174 | SUBHALFREGS(r2, r6) | ||
175 | SUBHALFREGS(r3, r7) | ||
176 | "stmia %[v1]!, {r0-r3} \n" | ||
177 | "ldmia %[v1], {r0-r3} \n" | ||
178 | "ldmia %[v2]!, {r4-r7} \n" | ||
179 | SUBHALFREGS(r0, r4) | ||
180 | SUBHALFREGS(r1, r5) | ||
181 | SUBHALFREGS(r2, r6) | ||
182 | SUBHALFREGS(r3, r7) | ||
183 | "stmia %[v1]!, {r0-r3} \n" | ||
184 | #if ORDER > 16 | 207 | #if ORDER > 16 |
185 | "subs %[cnt], %[cnt], #1 \n" | 208 | "mov r3, r3, lsl #16 \n" |
186 | "bne 1b \n" | 209 | "1: \n" |
210 | "ldmia %[v1], {r0,r1} \n" | ||
211 | "smlabt %[res], r0, r3, %[res] \n" | ||
212 | #else | ||
213 | "ldmia %[v1], {r0,r1} \n" | ||
214 | "smulbb %[res], r0, r3 \n" | ||
215 | #endif | ||
216 | "ldmia %[f2]!, {r2,r3} \n" | ||
217 | "smlatb %[res], r0, r2, %[res] \n" | ||
218 | "smlabt %[res], r1, r2, %[res] \n" | ||
219 | "smlatb %[res], r1, r3, %[res] \n" | ||
220 | "ldmia %[s2]!, {r2,r5} \n" | ||
221 | SUBHALFXREGS(r0, r4, r2, r6) | ||
222 | SUBHALFXREGS(r1, r2, r5, r6) | ||
223 | "stmia %[v1]!, {r0,r1} \n" | ||
224 | "ldmia %[v1], {r0,r1} \n" | ||
225 | "smlabt %[res], r0, r3, %[res] \n" | ||
226 | "ldmia %[f2]!, {r2,r3} \n" | ||
227 | "smlatb %[res], r0, r2, %[res] \n" | ||
228 | "smlabt %[res], r1, r2, %[res] \n" | ||
229 | "smlatb %[res], r1, r3, %[res] \n" | ||
230 | "ldmia %[s2]!, {r2,r4} \n" | ||
231 | SUBHALFXREGS(r0, r5, r2, r6) | ||
232 | SUBHALFXREGS(r1, r2, r4, r6) | ||
233 | "stmia %[v1]!, {r0,r1} \n" | ||
234 | |||
235 | "ldmia %[v1], {r0,r1} \n" | ||
236 | "smlabt %[res], r0, r3, %[res] \n" | ||
237 | "ldmia %[f2]!, {r2,r3} \n" | ||
238 | "smlatb %[res], r0, r2, %[res] \n" | ||
239 | "smlabt %[res], r1, r2, %[res] \n" | ||
240 | "smlatb %[res], r1, r3, %[res] \n" | ||
241 | "ldmia %[s2]!, {r2,r5} \n" | ||
242 | SUBHALFXREGS(r0, r4, r2, r6) | ||
243 | SUBHALFXREGS(r1, r2, r5, r6) | ||
244 | "stmia %[v1]!, {r0,r1} \n" | ||
245 | "ldmia %[v1], {r0,r1} \n" | ||
246 | "smlabt %[res], r0, r3, %[res] \n" | ||
247 | "ldmia %[f2]!, {r2,r3} \n" | ||
248 | "smlatb %[res], r0, r2, %[res] \n" | ||
249 | "smlabt %[res], r1, r2, %[res] \n" | ||
250 | "smlatb %[res], r1, r3, %[res] \n" | ||
251 | "ldmia %[s2]!, {r2,r4} \n" | ||
252 | SUBHALFXREGS(r0, r5, r2, r6) | ||
253 | SUBHALFXREGS(r1, r2, r4, r6) | ||
254 | "stmia %[v1]!, {r0,r1} \n" | ||
255 | #if ORDER > 16 | ||
256 | "subs %[cnt], %[cnt], #1 \n" | ||
257 | "bne 1b \n" | ||
187 | #endif | 258 | #endif |
259 | "b 99f \n" | ||
188 | 260 | ||
189 | "99: \n" | 261 | "20: \n" |
262 | "1: \n" | ||
263 | "ldmia %[v1], {r1,r2} \n" | ||
264 | "ldmia %[f2]!, {r3,r4} \n" | ||
265 | #if ORDER > 16 | ||
266 | "smlabb %[res], r1, r3, %[res] \n" | ||
267 | #else | ||
268 | "smulbb %[res], r1, r3 \n" | ||
269 | #endif | ||
270 | "smlatt %[res], r1, r3, %[res] \n" | ||
271 | "smlabb %[res], r2, r4, %[res] \n" | ||
272 | "smlatt %[res], r2, r4, %[res] \n" | ||
273 | "ldmia %[s2]!, {r3,r4} \n" | ||
274 | SUBHALFREGS(r0, r1, r3) | ||
275 | SUBHALFREGS(r1, r2, r4) | ||
276 | "stmia %[v1]!, {r0,r1} \n" | ||
277 | |||
278 | ".rept 3 \n" | ||
279 | "ldmia %[v1], {r1,r2} \n" | ||
280 | "ldmia %[f2]!, {r3,r4} \n" | ||
281 | "smlabb %[res], r1, r3, %[res] \n" | ||
282 | "smlatt %[res], r1, r3, %[res] \n" | ||
283 | "smlabb %[res], r2, r4, %[res] \n" | ||
284 | "smlatt %[res], r2, r4, %[res] \n" | ||
285 | "ldmia %[s2]!, {r3,r4} \n" | ||
286 | SUBHALFREGS(r0, r1, r3) | ||
287 | SUBHALFREGS(r1, r2, r4) | ||
288 | "stmia %[v1]!, {r0,r1} \n" | ||
289 | ".endr \n" | ||
290 | #if ORDER > 16 | ||
291 | "subs %[cnt], %[cnt], #1 \n" | ||
292 | "bne 1b \n" | ||
293 | #endif | ||
294 | |||
295 | "99: \n" | ||
190 | : /* outputs */ | 296 | : /* outputs */ |
191 | #if ORDER > 16 | 297 | #if ORDER > 16 |
192 | [cnt]"+r"(cnt), | 298 | [cnt]"+r"(cnt), |
193 | #endif | 299 | #endif |
194 | [v1] "+r"(v1), | 300 | [v1] "+r"(v1), |
195 | [v2] "+r"(v2) | 301 | [f2] "+r"(f2), |
302 | [s2] "+r"(s2), | ||
303 | [res]"=r"(res) | ||
196 | : /* inputs */ | 304 | : /* inputs */ |
197 | : /* clobbers */ | 305 | : /* clobbers */ |
198 | "r0", "r1", "r2", "r3", "r4", "r5", | 306 | "r0", "r1", "r2", "r3", "r4", "r5", "r6", "memory" |
199 | "r6", "r7", "r8", "r9", "memory" | ||
200 | ); | 307 | ); |
308 | return res; | ||
201 | } | 309 | } |
202 | 310 | ||
203 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 311 | /* This version fetches data as 32 bit words, and *requires* v1 to be |
@@ -211,9 +319,9 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
211 | #endif | 319 | #endif |
212 | 320 | ||
213 | #if ORDER > 16 | 321 | #if ORDER > 16 |
214 | #define MLA_BLOCKS "3" | 322 | #define MLA_BLOCKS "7" |
215 | #else | 323 | #else |
216 | #define MLA_BLOCKS "1" | 324 | #define MLA_BLOCKS "3" |
217 | #endif | 325 | #endif |
218 | 326 | ||
219 | asm volatile ( | 327 | asm volatile ( |
@@ -224,36 +332,28 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
224 | "beq 20f \n" | 332 | "beq 20f \n" |
225 | 333 | ||
226 | "10: \n" | 334 | "10: \n" |
227 | "ldrh r7, [%[v2]], #2 \n" | 335 | "ldrh r3, [%[v2]], #2 \n" |
228 | #if ORDER > 32 | 336 | #if ORDER > 32 |
229 | "mov r7, r7, lsl #16 \n" | 337 | "mov r3, r3, lsl #16 \n" |
230 | "1: \n" | 338 | "1: \n" |
231 | "ldmia %[v1]!, {r0-r3} \n" | 339 | "ldmia %[v1]!, {r0,r1} \n" |
232 | "smlabt %[res], r0, r7, %[res] \n" | 340 | "smlabt %[res], r0, r3, %[res] \n" |
233 | #else | 341 | #else |
234 | "ldmia %[v1]!, {r0-r3} \n" | 342 | "ldmia %[v1]!, {r0,r1} \n" |
235 | "smulbb %[res], r0, r7 \n" | 343 | "smulbb %[res], r0, r3 \n" |
236 | #endif | 344 | #endif |
237 | "ldmia %[v2]!, {r4-r7} \n" | 345 | "ldmia %[v2]!, {r2,r3} \n" |
238 | "smlatb %[res], r0, r4, %[res] \n" | 346 | "smlatb %[res], r0, r2, %[res] \n" |
239 | "smlabt %[res], r1, r4, %[res] \n" | 347 | "smlabt %[res], r1, r2, %[res] \n" |
240 | "smlatb %[res], r1, r5, %[res] \n" | 348 | "smlatb %[res], r1, r3, %[res] \n" |
241 | "smlabt %[res], r2, r5, %[res] \n" | 349 | |
242 | "smlatb %[res], r2, r6, %[res] \n" | ||
243 | "smlabt %[res], r3, r6, %[res] \n" | ||
244 | "smlatb %[res], r3, r7, %[res] \n" | ||
245 | |||
246 | ".rept " MLA_BLOCKS "\n" | 350 | ".rept " MLA_BLOCKS "\n" |
247 | "ldmia %[v1]!, {r0-r3} \n" | 351 | "ldmia %[v1]!, {r0,r1} \n" |
248 | "smlabt %[res], r0, r7, %[res] \n" | 352 | "smlabt %[res], r0, r3, %[res] \n" |
249 | "ldmia %[v2]!, {r4-r7} \n" | 353 | "ldmia %[v2]!, {r2,r3} \n" |
250 | "smlatb %[res], r0, r4, %[res] \n" | 354 | "smlatb %[res], r0, r2, %[res] \n" |
251 | "smlabt %[res], r1, r4, %[res] \n" | 355 | "smlabt %[res], r1, r2, %[res] \n" |
252 | "smlatb %[res], r1, r5, %[res] \n" | 356 | "smlatb %[res], r1, r3, %[res] \n" |
253 | "smlabt %[res], r2, r5, %[res] \n" | ||
254 | "smlatb %[res], r2, r6, %[res] \n" | ||
255 | "smlabt %[res], r3, r6, %[res] \n" | ||
256 | "smlatb %[res], r3, r7, %[res] \n" | ||
257 | ".endr \n" | 357 | ".endr \n" |
258 | #if ORDER > 32 | 358 | #if ORDER > 32 |
259 | "subs %[cnt], %[cnt], #1 \n" | 359 | "subs %[cnt], %[cnt], #1 \n" |
@@ -263,32 +363,24 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
263 | 363 | ||
264 | "20: \n" | 364 | "20: \n" |
265 | "1: \n" | 365 | "1: \n" |
266 | "ldmia %[v1]!, {r0-r3} \n" | 366 | "ldmia %[v1]!, {r0,r1} \n" |
267 | "ldmia %[v2]!, {r4-r7} \n" | 367 | "ldmia %[v2]!, {r2,r3} \n" |
268 | #if ORDER > 32 | 368 | #if ORDER > 32 |
269 | "smlabb %[res], r0, r4, %[res] \n" | 369 | "smlabb %[res], r0, r2, %[res] \n" |
270 | #else | 370 | #else |
271 | "smulbb %[res], r0, r4 \n" | 371 | "smulbb %[res], r0, r2 \n" |
272 | #endif | 372 | #endif |
273 | "smlatt %[res], r0, r4, %[res] \n" | 373 | "smlatt %[res], r0, r2, %[res] \n" |
274 | "smlabb %[res], r1, r5, %[res] \n" | 374 | "smlabb %[res], r1, r3, %[res] \n" |
275 | "smlatt %[res], r1, r5, %[res] \n" | 375 | "smlatt %[res], r1, r3, %[res] \n" |
276 | "smlabb %[res], r2, r6, %[res] \n" | ||
277 | "smlatt %[res], r2, r6, %[res] \n" | ||
278 | "smlabb %[res], r3, r7, %[res] \n" | ||
279 | "smlatt %[res], r3, r7, %[res] \n" | ||
280 | 376 | ||
281 | ".rept " MLA_BLOCKS "\n" | 377 | ".rept " MLA_BLOCKS "\n" |
282 | "ldmia %[v1]!, {r0-r3} \n" | 378 | "ldmia %[v1]!, {r0,r1} \n" |
283 | "ldmia %[v2]!, {r4-r7} \n" | 379 | "ldmia %[v2]!, {r2,r3} \n" |
284 | "smlabb %[res], r0, r4, %[res] \n" | 380 | "smlabb %[res], r0, r2, %[res] \n" |
285 | "smlatt %[res], r0, r4, %[res] \n" | 381 | "smlatt %[res], r0, r2, %[res] \n" |
286 | "smlabb %[res], r1, r5, %[res] \n" | 382 | "smlabb %[res], r1, r3, %[res] \n" |
287 | "smlatt %[res], r1, r5, %[res] \n" | 383 | "smlatt %[res], r1, r3, %[res] \n" |
288 | "smlabb %[res], r2, r6, %[res] \n" | ||
289 | "smlatt %[res], r2, r6, %[res] \n" | ||
290 | "smlabb %[res], r3, r7, %[res] \n" | ||
291 | "smlatt %[res], r3, r7, %[res] \n" | ||
292 | ".endr \n" | 384 | ".endr \n" |
293 | #if ORDER > 32 | 385 | #if ORDER > 32 |
294 | "subs %[cnt], %[cnt], #1 \n" | 386 | "subs %[cnt], %[cnt], #1 \n" |
@@ -305,8 +397,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
305 | [res]"=r"(res) | 397 | [res]"=r"(res) |
306 | : /* inputs */ | 398 | : /* inputs */ |
307 | : /* clobbers */ | 399 | : /* clobbers */ |
308 | "r0", "r1", "r2", "r3", | 400 | "r0", "r1", "r2", "r3" |
309 | "r4", "r5", "r6", "r7" | ||
310 | ); | 401 | ); |
311 | return res; | 402 | return res; |
312 | } | 403 | } |