summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2008-10-03 08:54:34 +0000
committerJens Arnold <amiconn@rockbox.org>2008-10-03 08:54:34 +0000
commit6fcf2765dd01703e940dce5d891a16e28180a508 (patch)
treeabbf5ac2661ee140de7a4ddd4e64c58f6b6a4911
parentc42f22cb8773c78e99bf5eb79ed4a784ead3cbc8 (diff)
downloadrockbox-6fcf2765dd01703e940dce5d891a16e28180a508.tar.gz
rockbox-6fcf2765dd01703e940dce5d891a16e28180a508.zip
Add armv6 specific asm code for the APE filters, speeding up -c2000..-c5000 a bit on Gigabeat S.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18692 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/demac/libdemac/filter.c2
-rw-r--r--apps/codecs/demac/libdemac/vector_math16_armv6.h287
2 files changed, 289 insertions, 0 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 8bc34b1e01..0ff7148bf2 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -30,6 +30,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
30 30
31#ifdef CPU_COLDFIRE 31#ifdef CPU_COLDFIRE
32#include "vector_math16_cf.h" 32#include "vector_math16_cf.h"
33#elif ARM_ARCH >= 6
34#include "vector_math16_armv6.h"
33#elif defined CPU_ARM7TDMI 35#elif defined CPU_ARM7TDMI
34#include "vector_math16_arm7.h" 36#include "vector_math16_arm7.h"
35#else 37#else
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h
new file mode 100644
index 0000000000..7ecf372462
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h
@@ -0,0 +1,287 @@
1/*
2
3libdemac - A Monkey's Audio decoder
4
5$Id$
6
7Copyright (C) Dave Chapman 2007
8
9ARMv6 vector math copyright (C) 2008 Jens Arnold
10
11This program is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2 of the License, or
14(at your option) any later version.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with this program; if not, write to the Free Software
23Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
24
25*/
26
/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned, otherwise it will result either in a data abort, or
 * incorrect results (if ARM aligncheck is disabled). */
/* In-place saturating-free vector addition: v1[i] += v2[i] for
 * i in [0, ORDER), two int16_t elements per SADD16. v2 may be only
 * halfword (2 byte) aligned; a dedicated path handles that case by
 * repacking the misaligned stream with ror/pkhbt. */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;   /* 16 elements processed per loop pass */
#endif

    asm volatile (
        /* bit 1 of v2 set -> v2 is halfword- but not word-aligned */
        "tst %[v2], #2 \n"
        "beq 20f \n"

        /* -- halfword-aligned v2 -- */
        "10: \n"
        /* Leading element lands in r4[15:0]; pkhbt below consumes it
         * from the bottom halfword, so no shift is needed here. */
        "ldrh r4, [%[v2]], #2 \n"
        "1: \n"
        "ldmia %[v2]!, {r5-r8} \n"   /* 8 v2 elements (v2 now word-aligned) */
        "ldmia %[v1], {r0-r3} \n"    /* 8 v1 elements */
        /* ror #16 swaps halfwords so that pkhbt can combine the carry
         * (bottom of r4) with the next element (top of rotated r5)
         * into a pair matching the v1 word layout. */
        "mov r5, r5, ror #16 \n"
        "pkhbt r4, r4, r5 \n"
        "sadd16 r0, r0, r4 \n"       /* two parallel 16-bit adds */
        "mov r6, r6, ror #16 \n"
        "pkhbt r5, r5, r6 \n"
        "sadd16 r1, r1, r5 \n"
        "mov r7, r7, ror #16 \n"
        "pkhbt r6, r6, r7 \n"
        "sadd16 r2, r2, r6 \n"
        "mov r8, r8, ror #16 \n"
        "pkhbt r7, r7, r8 \n"
        "sadd16 r3, r3, r7 \n"
        "stmia %[v1]!, {r0-r3} \n"
        "mov r4, r8 \n"              /* carry the unconsumed halfword */
        "ldmia %[v2]!, {r5-r8} \n"   /* second batch of 8 elements */
        "ldmia %[v1], {r0-r3} \n"
        "mov r5, r5, ror #16 \n"
        "pkhbt r4, r4, r5 \n"
        "sadd16 r0, r0, r4 \n"
        "mov r6, r6, ror #16 \n"
        "pkhbt r5, r5, r6 \n"
        "sadd16 r1, r1, r5 \n"
        "mov r7, r7, ror #16 \n"
        "pkhbt r6, r6, r7 \n"
        "sadd16 r2, r2, r6 \n"
        "mov r8, r8, ror #16 \n"
        "pkhbt r7, r7, r8 \n"
        "sadd16 r3, r3, r7 \n"
        "stmia %[v1]!, {r0-r3} \n"
#if ORDER > 16
        "mov r4, r8 \n"              /* carry into the next iteration */
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif
        "b 99f \n"

        /* -- word-aligned v2: straight load/add/store -- */
        "20: \n"
        "1: \n"
        "ldmia %[v2]!, {r4-r7} \n"
        "ldmia %[v1], {r0-r3} \n"
        "sadd16 r0, r0, r4 \n"
        "sadd16 r1, r1, r5 \n"
        "sadd16 r2, r2, r6 \n"
        "sadd16 r3, r3, r7 \n"
        "stmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        "ldmia %[v1], {r0-r3} \n"
        "sadd16 r0, r0, r4 \n"
        "sadd16 r1, r1, r5 \n"
        "sadd16 r2, r2, r6 \n"
        "sadd16 r3, r3, r7 \n"
        "stmia %[v1]!, {r0-r3} \n"
#if ORDER > 16
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif

        "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4",
        "r5", "r6", "r7", "r8", "memory"
    );
}
115
116/* This version fetches data as 32 bit words, and *requires* v1 to be
117 * 32 bit aligned, otherwise it will result either in a data abort, or
118 * incorrect results (if ARM aligncheck is disabled). */
119static inline void vector_sub(int16_t* v1, int16_t* v2)
120{
121#if ORDER > 16
122 int cnt = ORDER>>4;
123#endif
124
125 asm volatile (
126 "tst %[v2], #2 \n"
127 "beq 20f \n"
128
129 "10: \n"
130 "ldrh r4, [%[v2]], #2 \n"
131 "mov r4, r4, lsl #16 \n"
132 "1: \n"
133 "ldmia %[v2]!, {r5-r8} \n"
134 "ldmia %[v1], {r0-r3} \n"
135 "mov r5, r5, ror #16 \n"
136 "pkhbt r4, r4, r5 \n"
137 "ssub16 r0, r0, r4 \n"
138 "mov r6, r6, ror #16 \n"
139 "pkhbt r5, r5, r6 \n"
140 "ssub16 r1, r1, r5 \n"
141 "mov r7, r7, ror #16 \n"
142 "pkhbt r6, r6, r7 \n"
143 "ssub16 r2, r2, r6 \n"
144 "mov r8, r8, ror #16 \n"
145 "pkhbt r7, r7, r8 \n"
146 "ssub16 r3, r3, r7 \n"
147 "stmia %[v1]!, {r0-r3} \n"
148 "mov r4, r8 \n"
149 "ldmia %[v2]!, {r5-r8} \n"
150 "ldmia %[v1], {r0-r3} \n"
151 "mov r5, r5, ror #16 \n"
152 "pkhbt r4, r4, r5 \n"
153 "ssub16 r0, r0, r4 \n"
154 "mov r6, r6, ror #16 \n"
155 "pkhbt r5, r5, r6 \n"
156 "ssub16 r1, r1, r5 \n"
157 "mov r7, r7, ror #16 \n"
158 "pkhbt r6, r6, r7 \n"
159 "ssub16 r2, r2, r6 \n"
160 "mov r8, r8, ror #16 \n"
161 "pkhbt r7, r7, r8 \n"
162 "ssub16 r3, r3, r7 \n"
163 "stmia %[v1]!, {r0-r3} \n"
164#if ORDER > 16
165 "mov r4, r8 \n"
166 "subs %[cnt], %[cnt], #1 \n"
167 "bne 1b \n"
168#endif
169 "b 99f \n"
170
171 "20: \n"
172 "1: \n"
173 "ldmia %[v2]!, {r4-r7} \n"
174 "ldmia %[v1], {r0-r3} \n"
175 "ssub16 r0, r0, r4 \n"
176 "ssub16 r1, r1, r5 \n"
177 "ssub16 r2, r2, r6 \n"
178 "ssub16 r3, r3, r7 \n"
179 "stmia %[v1]!, {r0-r3} \n"
180 "ldmia %[v2]!, {r4-r7} \n"
181 "ldmia %[v1], {r0-r3} \n"
182 "ssub16 r0, r0, r4 \n"
183 "ssub16 r1, r1, r5 \n"
184 "ssub16 r2, r2, r6 \n"
185 "ssub16 r3, r3, r7 \n"
186 "stmia %[v1]!, {r0-r3} \n"
187#if ORDER > 16
188 "subs %[cnt], %[cnt], #1 \n"
189 "bne 1b \n"
190#endif
191
192 "99: \n"
193 : /* outputs */
194#if ORDER > 16
195 [cnt]"+r"(cnt),
196#endif
197 [v1] "+r"(v1),
198 [v2] "+r"(v2)
199 : /* inputs */
200 : /* clobbers */
201 "r0", "r1", "r2", "r3", "r4",
202 "r5", "r6", "r7", "r8", "memory"
203 );
204}
205
/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned, otherwise it will result either in a data abort, or
 * incorrect results (if ARM aligncheck is disabled). It requires
 * ARMv6 (smlad / smlabt / smlatb); the note about being "optimised
 * for ARM7TDMI" in the arm7 variant does not apply to this file. */
/* Returns sum(v1[i] * v2[i]) for i in [0, ORDER), accumulating two
 * 16x16 products per instruction. v2 may be only halfword aligned. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res = 0;
#if ORDER > 16
    int cnt = ORDER>>4;   /* 16 elements processed per loop pass */
#endif

    asm volatile (
        /* bit 1 of v2 set -> v2 is halfword- but not word-aligned */
        "tst %[v2], #2 \n"
        "beq 20f \n"

        /* -- halfword-aligned v2: keep the leading element in the top
         * half of r4, because smlabt multiplies by the *top* halfword
         * of its second operand. -- */
        "10: \n"
        "ldrh r4, [%[v2]], #2 \n"
        "mov r4, r4, lsl #16 \n"
        "1: \n"
        "ldmia %[v1]!, {r0-r3} \n"   /* 8 v1 elements */
        "ldmia %[v2]!, {r5-r8} \n"   /* 8 v2 elements (v2 now aligned) */
        /* Alternate smlabt/smlatb to pair v1 halves with the v2
         * stream, which is offset by one halfword. */
        "smlabt %[res], r0, r4, %[res] \n"
        "smlatb %[res], r0, r5, %[res] \n"
        "smlabt %[res], r1, r5, %[res] \n"
        "smlatb %[res], r1, r6, %[res] \n"
        "smlabt %[res], r2, r6, %[res] \n"
        "smlatb %[res], r2, r7, %[res] \n"
        "smlabt %[res], r3, r7, %[res] \n"
        "smlatb %[res], r3, r8, %[res] \n"
        "mov r4, r8 \n"              /* top half of r8 carries over */
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r5-r8} \n"
        "smlabt %[res], r0, r4, %[res] \n"
        "smlatb %[res], r0, r5, %[res] \n"
        "smlabt %[res], r1, r5, %[res] \n"
        "smlatb %[res], r1, r6, %[res] \n"
        "smlabt %[res], r2, r6, %[res] \n"
        "smlatb %[res], r2, r7, %[res] \n"
        "smlabt %[res], r3, r7, %[res] \n"
        "smlatb %[res], r3, r8, %[res] \n"
#if ORDER > 16
        "mov r4, r8 \n"              /* carry into the next iteration */
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif
        "b 99f \n"

        /* -- word-aligned v2: smlad does two MACs per instruction -- */
        "20: \n"
        "1: \n"
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        "smlad %[res], r0, r4, %[res] \n"
        "smlad %[res], r1, r5, %[res] \n"
        "smlad %[res], r2, r6, %[res] \n"
        "smlad %[res], r3, r7, %[res] \n"
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        "smlad %[res], r0, r4, %[res] \n"
        "smlad %[res], r1, r5, %[res] \n"
        "smlad %[res], r2, r6, %[res] \n"
        "smlad %[res], r3, r7, %[res] \n"
#if ORDER > 16
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif

        "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"+r"(res)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4",
        "r5", "r6", "r7", "r8"
    );
    return res;
}