diff options
author | Jens Arnold <amiconn@rockbox.org> | 2008-10-03 08:54:34 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2008-10-03 08:54:34 +0000 |
commit | 6fcf2765dd01703e940dce5d891a16e28180a508 (patch) | |
tree | abbf5ac2661ee140de7a4ddd4e64c58f6b6a4911 /apps/codecs/demac/libdemac/vector_math16_armv6.h | |
parent | c42f22cb8773c78e99bf5eb79ed4a784ead3cbc8 (diff) | |
download | rockbox-6fcf2765dd01703e940dce5d891a16e28180a508.tar.gz rockbox-6fcf2765dd01703e940dce5d891a16e28180a508.zip |
Add armv6 specific asm code for the APE filters, speeding up -c2000..-c5000 a bit on Gigabeat S.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18692 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_armv6.h')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv6.h | 287 |
1 files changed, 287 insertions, 0 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h new file mode 100644 index 0000000000..7ecf372462 --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h | |||
@@ -0,0 +1,287 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | ARMv6 vector math copyright (C) 2008 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
27 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
28 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
29 | * incorrect results (if ARM aligncheck is disabled). */ | ||
30 | static inline void vector_add(int16_t* v1, int16_t* v2) | ||
31 | { | ||
32 | #if ORDER > 16 | ||
33 | int cnt = ORDER>>4; | ||
34 | #endif | ||
35 | |||
36 | asm volatile ( | ||
37 | "tst %[v2], #2 \n" | ||
38 | "beq 20f \n" | ||
39 | |||
40 | "10: \n" | ||
41 | "ldrh r4, [%[v2]], #2 \n" | ||
42 | "1: \n" | ||
43 | "ldmia %[v2]!, {r5-r8} \n" | ||
44 | "ldmia %[v1], {r0-r3} \n" | ||
45 | "mov r5, r5, ror #16 \n" | ||
46 | "pkhbt r4, r4, r5 \n" | ||
47 | "sadd16 r0, r0, r4 \n" | ||
48 | "mov r6, r6, ror #16 \n" | ||
49 | "pkhbt r5, r5, r6 \n" | ||
50 | "sadd16 r1, r1, r5 \n" | ||
51 | "mov r7, r7, ror #16 \n" | ||
52 | "pkhbt r6, r6, r7 \n" | ||
53 | "sadd16 r2, r2, r6 \n" | ||
54 | "mov r8, r8, ror #16 \n" | ||
55 | "pkhbt r7, r7, r8 \n" | ||
56 | "sadd16 r3, r3, r7 \n" | ||
57 | "stmia %[v1]!, {r0-r3} \n" | ||
58 | "mov r4, r8 \n" | ||
59 | "ldmia %[v2]!, {r5-r8} \n" | ||
60 | "ldmia %[v1], {r0-r3} \n" | ||
61 | "mov r5, r5, ror #16 \n" | ||
62 | "pkhbt r4, r4, r5 \n" | ||
63 | "sadd16 r0, r0, r4 \n" | ||
64 | "mov r6, r6, ror #16 \n" | ||
65 | "pkhbt r5, r5, r6 \n" | ||
66 | "sadd16 r1, r1, r5 \n" | ||
67 | "mov r7, r7, ror #16 \n" | ||
68 | "pkhbt r6, r6, r7 \n" | ||
69 | "sadd16 r2, r2, r6 \n" | ||
70 | "mov r8, r8, ror #16 \n" | ||
71 | "pkhbt r7, r7, r8 \n" | ||
72 | "sadd16 r3, r3, r7 \n" | ||
73 | "stmia %[v1]!, {r0-r3} \n" | ||
74 | #if ORDER > 16 | ||
75 | "mov r4, r8 \n" | ||
76 | "subs %[cnt], %[cnt], #1 \n" | ||
77 | "bne 1b \n" | ||
78 | #endif | ||
79 | "b 99f \n" | ||
80 | |||
81 | "20: \n" | ||
82 | "1: \n" | ||
83 | "ldmia %[v2]!, {r4-r7} \n" | ||
84 | "ldmia %[v1], {r0-r3} \n" | ||
85 | "sadd16 r0, r0, r4 \n" | ||
86 | "sadd16 r1, r1, r5 \n" | ||
87 | "sadd16 r2, r2, r6 \n" | ||
88 | "sadd16 r3, r3, r7 \n" | ||
89 | "stmia %[v1]!, {r0-r3} \n" | ||
90 | "ldmia %[v2]!, {r4-r7} \n" | ||
91 | "ldmia %[v1], {r0-r3} \n" | ||
92 | "sadd16 r0, r0, r4 \n" | ||
93 | "sadd16 r1, r1, r5 \n" | ||
94 | "sadd16 r2, r2, r6 \n" | ||
95 | "sadd16 r3, r3, r7 \n" | ||
96 | "stmia %[v1]!, {r0-r3} \n" | ||
97 | #if ORDER > 16 | ||
98 | "subs %[cnt], %[cnt], #1 \n" | ||
99 | "bne 1b \n" | ||
100 | #endif | ||
101 | |||
102 | "99: \n" | ||
103 | : /* outputs */ | ||
104 | #if ORDER > 16 | ||
105 | [cnt]"+r"(cnt), | ||
106 | #endif | ||
107 | [v1] "+r"(v1), | ||
108 | [v2] "+r"(v2) | ||
109 | : /* inputs */ | ||
110 | : /* clobbers */ | ||
111 | "r0", "r1", "r2", "r3", "r4", | ||
112 | "r5", "r6", "r7", "r8", "memory" | ||
113 | ); | ||
114 | } | ||
115 | |||
116 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
117 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
118 | * incorrect results (if ARM aligncheck is disabled). */ | ||
119 | static inline void vector_sub(int16_t* v1, int16_t* v2) | ||
120 | { | ||
121 | #if ORDER > 16 | ||
122 | int cnt = ORDER>>4; | ||
123 | #endif | ||
124 | |||
125 | asm volatile ( | ||
126 | "tst %[v2], #2 \n" | ||
127 | "beq 20f \n" | ||
128 | |||
129 | "10: \n" | ||
130 | "ldrh r4, [%[v2]], #2 \n" | ||
131 | "mov r4, r4, lsl #16 \n" | ||
132 | "1: \n" | ||
133 | "ldmia %[v2]!, {r5-r8} \n" | ||
134 | "ldmia %[v1], {r0-r3} \n" | ||
135 | "mov r5, r5, ror #16 \n" | ||
136 | "pkhbt r4, r4, r5 \n" | ||
137 | "ssub16 r0, r0, r4 \n" | ||
138 | "mov r6, r6, ror #16 \n" | ||
139 | "pkhbt r5, r5, r6 \n" | ||
140 | "ssub16 r1, r1, r5 \n" | ||
141 | "mov r7, r7, ror #16 \n" | ||
142 | "pkhbt r6, r6, r7 \n" | ||
143 | "ssub16 r2, r2, r6 \n" | ||
144 | "mov r8, r8, ror #16 \n" | ||
145 | "pkhbt r7, r7, r8 \n" | ||
146 | "ssub16 r3, r3, r7 \n" | ||
147 | "stmia %[v1]!, {r0-r3} \n" | ||
148 | "mov r4, r8 \n" | ||
149 | "ldmia %[v2]!, {r5-r8} \n" | ||
150 | "ldmia %[v1], {r0-r3} \n" | ||
151 | "mov r5, r5, ror #16 \n" | ||
152 | "pkhbt r4, r4, r5 \n" | ||
153 | "ssub16 r0, r0, r4 \n" | ||
154 | "mov r6, r6, ror #16 \n" | ||
155 | "pkhbt r5, r5, r6 \n" | ||
156 | "ssub16 r1, r1, r5 \n" | ||
157 | "mov r7, r7, ror #16 \n" | ||
158 | "pkhbt r6, r6, r7 \n" | ||
159 | "ssub16 r2, r2, r6 \n" | ||
160 | "mov r8, r8, ror #16 \n" | ||
161 | "pkhbt r7, r7, r8 \n" | ||
162 | "ssub16 r3, r3, r7 \n" | ||
163 | "stmia %[v1]!, {r0-r3} \n" | ||
164 | #if ORDER > 16 | ||
165 | "mov r4, r8 \n" | ||
166 | "subs %[cnt], %[cnt], #1 \n" | ||
167 | "bne 1b \n" | ||
168 | #endif | ||
169 | "b 99f \n" | ||
170 | |||
171 | "20: \n" | ||
172 | "1: \n" | ||
173 | "ldmia %[v2]!, {r4-r7} \n" | ||
174 | "ldmia %[v1], {r0-r3} \n" | ||
175 | "ssub16 r0, r0, r4 \n" | ||
176 | "ssub16 r1, r1, r5 \n" | ||
177 | "ssub16 r2, r2, r6 \n" | ||
178 | "ssub16 r3, r3, r7 \n" | ||
179 | "stmia %[v1]!, {r0-r3} \n" | ||
180 | "ldmia %[v2]!, {r4-r7} \n" | ||
181 | "ldmia %[v1], {r0-r3} \n" | ||
182 | "ssub16 r0, r0, r4 \n" | ||
183 | "ssub16 r1, r1, r5 \n" | ||
184 | "ssub16 r2, r2, r6 \n" | ||
185 | "ssub16 r3, r3, r7 \n" | ||
186 | "stmia %[v1]!, {r0-r3} \n" | ||
187 | #if ORDER > 16 | ||
188 | "subs %[cnt], %[cnt], #1 \n" | ||
189 | "bne 1b \n" | ||
190 | #endif | ||
191 | |||
192 | "99: \n" | ||
193 | : /* outputs */ | ||
194 | #if ORDER > 16 | ||
195 | [cnt]"+r"(cnt), | ||
196 | #endif | ||
197 | [v1] "+r"(v1), | ||
198 | [v2] "+r"(v2) | ||
199 | : /* inputs */ | ||
200 | : /* clobbers */ | ||
201 | "r0", "r1", "r2", "r3", "r4", | ||
202 | "r5", "r6", "r7", "r8", "memory" | ||
203 | ); | ||
204 | } | ||
205 | |||
206 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
207 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
208 | * incorrect results (if ARM aligncheck is disabled). It is optimised | ||
209 | * for ARM7TDMI. Using it for ARM9 or higher results in worse performance | ||
210 | * than the C version. */ | ||
211 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | ||
212 | { | ||
213 | int res = 0; | ||
214 | #if ORDER > 16 | ||
215 | int cnt = ORDER>>4; | ||
216 | #endif | ||
217 | |||
218 | asm volatile ( | ||
219 | "tst %[v2], #2 \n" | ||
220 | "beq 20f \n" | ||
221 | |||
222 | "10: \n" | ||
223 | "ldrh r4, [%[v2]], #2 \n" | ||
224 | "mov r4, r4, lsl #16 \n" | ||
225 | "1: \n" | ||
226 | "ldmia %[v1]!, {r0-r3} \n" | ||
227 | "ldmia %[v2]!, {r5-r8} \n" | ||
228 | "smlabt %[res], r0, r4, %[res] \n" | ||
229 | "smlatb %[res], r0, r5, %[res] \n" | ||
230 | "smlabt %[res], r1, r5, %[res] \n" | ||
231 | "smlatb %[res], r1, r6, %[res] \n" | ||
232 | "smlabt %[res], r2, r6, %[res] \n" | ||
233 | "smlatb %[res], r2, r7, %[res] \n" | ||
234 | "smlabt %[res], r3, r7, %[res] \n" | ||
235 | "smlatb %[res], r3, r8, %[res] \n" | ||
236 | "mov r4, r8 \n" | ||
237 | "ldmia %[v1]!, {r0-r3} \n" | ||
238 | "ldmia %[v2]!, {r5-r8} \n" | ||
239 | "smlabt %[res], r0, r4, %[res] \n" | ||
240 | "smlatb %[res], r0, r5, %[res] \n" | ||
241 | "smlabt %[res], r1, r5, %[res] \n" | ||
242 | "smlatb %[res], r1, r6, %[res] \n" | ||
243 | "smlabt %[res], r2, r6, %[res] \n" | ||
244 | "smlatb %[res], r2, r7, %[res] \n" | ||
245 | "smlabt %[res], r3, r7, %[res] \n" | ||
246 | "smlatb %[res], r3, r8, %[res] \n" | ||
247 | #if ORDER > 16 | ||
248 | "mov r4, r8 \n" | ||
249 | "subs %[cnt], %[cnt], #1 \n" | ||
250 | "bne 1b \n" | ||
251 | #endif | ||
252 | "b 99f \n" | ||
253 | |||
254 | "20: \n" | ||
255 | "1: \n" | ||
256 | "ldmia %[v1]!, {r0-r3} \n" | ||
257 | "ldmia %[v2]!, {r4-r7} \n" | ||
258 | "smlad %[res], r0, r4, %[res] \n" | ||
259 | "smlad %[res], r1, r5, %[res] \n" | ||
260 | "smlad %[res], r2, r6, %[res] \n" | ||
261 | "smlad %[res], r3, r7, %[res] \n" | ||
262 | "ldmia %[v1]!, {r0-r3} \n" | ||
263 | "ldmia %[v2]!, {r4-r7} \n" | ||
264 | "smlad %[res], r0, r4, %[res] \n" | ||
265 | "smlad %[res], r1, r5, %[res] \n" | ||
266 | "smlad %[res], r2, r6, %[res] \n" | ||
267 | "smlad %[res], r3, r7, %[res] \n" | ||
268 | #if ORDER > 16 | ||
269 | "subs %[cnt], %[cnt], #1 \n" | ||
270 | "bne 1b \n" | ||
271 | #endif | ||
272 | |||
273 | "99: \n" | ||
274 | : /* outputs */ | ||
275 | #if ORDER > 16 | ||
276 | [cnt]"+r"(cnt), | ||
277 | #endif | ||
278 | [v1] "+r"(v1), | ||
279 | [v2] "+r"(v2), | ||
280 | [res]"+r"(res) | ||
281 | : /* inputs */ | ||
282 | : /* clobbers */ | ||
283 | "r0", "r1", "r2", "r3", "r4", | ||
284 | "r5", "r6", "r7", "r8" | ||
285 | ); | ||
286 | return res; | ||
287 | } | ||