path: root/apps/codecs/demac/libdemac/vector_math16_arm7.h
author    Jens Arnold <amiconn@rockbox.org>  2008-11-19 00:34:48 +0000
committer Jens Arnold <amiconn@rockbox.org>  2008-11-19 00:34:48 +0000
commit    77934cbc961a69e7d18588276f0e64a692854125 (patch)
tree      ffad34c6e3ae65466bdce8fc0f998404bbadee57 /apps/codecs/demac/libdemac/vector_math16_arm7.h
parent    73b3f5417fb53579600b2645cfc227f614793f4f (diff)
download  rockbox-77934cbc961a69e7d18588276f0e64a692854125.tar.gz
          rockbox-77934cbc961a69e7d18588276f0e64a692854125.zip
Compile-time choice between 16 bit and 32 bit integers for the filters. 32 bit filters are faster on ARMv4 (with assembler code), so use them there. Nice speedup on PP and Gigabeat F/X.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19140 a1c6a512-1295-4272-9138-f99709370657
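
The commit message describes a compile-time switch between 16-bit and 32-bit filter integers. A minimal sketch of how such a switch can be expressed, assuming a configuration macro named FILTER_BITS and a filter_int typedef (both identifiers are illustrative here, not taken from this diff):

    #include <stdint.h>

    /* Hypothetical per-target configuration: 32 where ARMv4 assembler
     * filters are faster (per the commit message), 16 elsewhere. */
    #ifndef FILTER_BITS
    #define FILTER_BITS 16
    #endif

    #if FILTER_BITS == 32
    typedef int32_t filter_int;
    #else
    typedef int16_t filter_int;
    #endif

With such a typedef, the filter and vector-math code can be written once against filter_int and compiled for either width.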
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_arm7.h')
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_arm7.h  293
1 file changed, 0 insertions(+), 293 deletions(-)
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
deleted file mode 100644
index 653bb1f53f..0000000000
--- a/apps/codecs/demac/libdemac/vector_math16_arm7.h
+++ /dev/null
@@ -1,293 +0,0 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

ARM7 vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned, otherwise it will result either in a data abort, or
 * incorrect results (if ARM alignment checking is disabled). */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define ADDHALFREGS(sum, s1) /* Adds register */ \
    "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \
    "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \
    "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \
    "mov " #sum ", " #sum ", lsl #16 \n" \
    "orr " #sum ", " #sum ", r8 , lsr #16 \n"

#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \
    "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \
    "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
    "mov " #sum ", " #sum ", lsl #16 \n" \
    "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"

    asm volatile (
        "tst %[v2], #2 \n" /* is v2 word-aligned? */
        "beq 20f \n" /* yes: use the aligned loop */

    "10: \n" /* v2 is halfword-offset: combine halves across words */
        "ldrh r4, [%[v2]], #2 \n" /* prime r4 with the first v2 element */
        "mov r4, r4, lsl #16 \n"
    "1: \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r5-r8} \n"
        ADDHALFXREGS(r0, r4, r5)
        ADDHALFXREGS(r1, r5, r6)
        ADDHALFXREGS(r2, r6, r7)
        ADDHALFXREGS(r3, r7, r8)
        "stmia %[v1]!, {r0-r3} \n"
        "mov r4, r8 \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r5-r8} \n"
        ADDHALFXREGS(r0, r4, r5)
        ADDHALFXREGS(r1, r5, r6)
        ADDHALFXREGS(r2, r6, r7)
        ADDHALFXREGS(r3, r7, r8)
        "stmia %[v1]!, {r0-r3} \n"
#if ORDER > 16
        "mov r4, r8 \n"
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif
        "b 99f \n"

    "20: \n" /* v1 and v2 are both word-aligned */
    "1: \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        ADDHALFREGS(r0, r4)
        ADDHALFREGS(r1, r5)
        ADDHALFREGS(r2, r6)
        ADDHALFREGS(r3, r7)
        "stmia %[v1]!, {r0-r3} \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        ADDHALFREGS(r0, r4)
        ADDHALFREGS(r1, r5)
        ADDHALFREGS(r2, r6)
        ADDHALFREGS(r3, r7)
        "stmia %[v1]!, {r0-r3} \n"
#if ORDER > 16
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif

    "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4",
        "r5", "r6", "r7", "r8", "memory"
    );
}
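
For reference, the semantics of the assembler routine above in plain C, plus the ADDHALFREGS idea expressed as a C function. This is an illustrative sketch (vector_add_ref and add_halves are names invented here; it assumes <stdint.h> and the same ORDER macro used throughout libdemac):

    /* Element-wise, wrapping 16-bit addition: v1[i] += v2[i]. */
    static inline void vector_add_ref(int16_t* v1, int16_t* v2)
    {
        int i;
        for (i = 0; i < ORDER; i++)
            v1[i] += v2[i];
    }

    /* ADDHALFREGS in C: add the two int16_t lanes packed into each
     * 32-bit word independently, so a carry out of the low lane never
     * reaches the high lane. */
    static inline uint32_t add_halves(uint32_t sum, uint32_t s1)
    {
        uint32_t lo = (sum + s1) & 0xffffu;              /* low lane, mod 2^16 */
        uint32_t hi = ((sum >> 16) + (s1 >> 16)) << 16;  /* high lane */
        return hi | lo;
    }

The assembler wins over C of this shape on ARM7TDMI mainly because ldmia/stmia move four words per instruction and the lane splitting is folded into the barrel-shifted operands.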

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned, otherwise it will result either in a data abort, or
 * incorrect results (if ARM alignment checking is disabled). */
static inline void vector_sub(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define SUBHALFREGS(dif, s1) /* Subtracts register */ \
    "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \
    "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \
    "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \
    "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \
    "orr " #dif ", r8 , " #dif ", lsl #16 \n"

#define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \
    "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \
    "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \
    "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \
    "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"

    asm volatile (
        "mov r9, #0xff \n" /* build the 0x0000ffff lane mask */
        "orr r9, r9, #0xff00 \n"
        "tst %[v2], #2 \n" /* is v2 word-aligned? */
        "beq 20f \n" /* yes: use the aligned loop */

    "10: \n" /* v2 is halfword-offset: combine halves across words */
        "ldrh r4, [%[v2]], #2 \n" /* prime r4 with the first v2 element */
        "mov r4, r4, lsl #16 \n"
    "1: \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r5-r8} \n"
        SUBHALFXREGS(r0, r4, r5)
        SUBHALFXREGS(r1, r5, r6)
        SUBHALFXREGS(r2, r6, r7)
        SUBHALFXREGS(r3, r7, r8)
        "stmia %[v1]!, {r0-r3} \n"
        "mov r4, r8 \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r5-r8} \n"
        SUBHALFXREGS(r0, r4, r5)
        SUBHALFXREGS(r1, r5, r6)
        SUBHALFXREGS(r2, r6, r7)
        SUBHALFXREGS(r3, r7, r8)
        "stmia %[v1]!, {r0-r3} \n"
#if ORDER > 16
        "mov r4, r8 \n"
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif
        "b 99f \n"

    "20: \n" /* v1 and v2 are both word-aligned */
    "1: \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        SUBHALFREGS(r0, r4)
        SUBHALFREGS(r1, r5)
        SUBHALFREGS(r2, r6)
        SUBHALFREGS(r3, r7)
        "stmia %[v1]!, {r0-r3} \n"
        "ldmia %[v1], {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        SUBHALFREGS(r0, r4)
        SUBHALFREGS(r1, r5)
        SUBHALFREGS(r2, r6)
        SUBHALFREGS(r3, r7)
        "stmia %[v1]!, {r0-r3} \n"
#if ORDER > 16
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif

    "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4", "r5",
        "r6", "r7", "r8", "r9", "memory"
    );
}
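
The same kind of C reference for the subtraction (again a sketch with invented names; the r9 mask in SUBHALFREGS plays the role of the 0xffffu mask below):

    /* Element-wise, wrapping 16-bit subtraction: v1[i] -= v2[i]. */
    static inline void vector_sub_ref(int16_t* v1, int16_t* v2)
    {
        int i;
        for (i = 0; i < ORDER; i++)
            v1[i] -= v2[i];
    }

    /* SUBHALFREGS in C: subtract the packed 16-bit lanes without
     * letting a borrow cross from the low lane into the high lane. */
    static inline uint32_t sub_halves(uint32_t dif, uint32_t s1)
    {
        uint32_t lo = (dif - s1) & 0xffffu;              /* low lane, mod 2^16 */
        uint32_t hi = ((dif >> 16) - (s1 >> 16)) << 16;  /* high lane */
        return hi | lo;
    }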

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned, otherwise it will result either in a data abort, or
 * incorrect results (if ARM alignment checking is disabled). It is
 * optimised for ARM7TDMI. Using it for ARM9 or higher results in worse
 * performance than the C version. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res = 0;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif

#define MLABLOCK2(f1, f2) \
    "mov r8, " #f1 ", lsl #16 \n" /* sign-extend the low halves... */ \
    "mov r8, r8 , asr #16 \n" \
    "mov r9, " #f2 ", lsl #16 \n" \
    "mov r9, r9 , asr #16 \n" \
    "mla %[res], r9, r8, %[res] \n" /* ...and accumulate their product, */ \
    "mov r8, " #f1 ", asr #16 \n" /* then the same for the high halves. */ \
    "mov r9, " #f2 ", asr #16 \n" \
    "mla %[res], r9, r8, %[res] \n"

#define MLABLOCK2_U2(f1, f2) /* r9 carries the straddling v2 element */ \
    "mov r8, " #f1 ", lsl #16 \n" \
    "mov r8, r8 , asr #16 \n" \
    "mla %[res], r9, r8, %[res] \n" \
    "mov r8, " #f1 ", asr #16 \n" \
    "mov r9, " #f2 ", lsl #16 \n" \
    "mov r9, r9 , asr #16 \n" \
    "mla %[res], r9, r8, %[res] \n" \
    "mov r9, " #f2 ", asr #16 \n"

    asm volatile (
        "tst %[v2], #2 \n" /* is v2 word-aligned? */
        "beq 20f \n" /* yes: use the aligned loop */

    "10: \n" /* v2 is halfword-offset */
        "ldrsh r9, [%[v2]], #2 \n" /* prime r9 with the first v2 element */
    "1: \n"
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        MLABLOCK2_U2(r0, r4)
        MLABLOCK2_U2(r1, r5)
        MLABLOCK2_U2(r2, r6)
        MLABLOCK2_U2(r3, r7)
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        MLABLOCK2_U2(r0, r4)
        MLABLOCK2_U2(r1, r5)
        MLABLOCK2_U2(r2, r6)
        MLABLOCK2_U2(r3, r7)
#if ORDER > 16
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif
        "b 99f \n"

    "20: \n" /* v1 and v2 are both word-aligned */
    "1: \n"
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        MLABLOCK2(r0, r4)
        MLABLOCK2(r1, r5)
        MLABLOCK2(r2, r6)
        MLABLOCK2(r3, r7)
        "ldmia %[v1]!, {r0-r3} \n"
        "ldmia %[v2]!, {r4-r7} \n"
        MLABLOCK2(r0, r4)
        MLABLOCK2(r1, r5)
        MLABLOCK2(r2, r6)
        MLABLOCK2(r3, r7)
#if ORDER > 16
        "subs %[cnt], %[cnt], #1 \n"
        "bne 1b \n"
#endif

    "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"+r"(res)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4",
        "r5", "r6", "r7", "r8", "r9"
    );
    return res;
}
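
And the scalar product in plain C, the shape of the generic version the comment above compares against (sketch; scalarproduct_ref is a name invented here):

    /* Sum of products of corresponding elements; each int16_t product
     * is promoted to int before accumulation. */
    static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
    {
        int32_t res = 0;
        int i;
        for (i = 0; i < ORDER; i++)
            res += v1[i] * v2[i];
        return res;
    }

On ARM7TDMI the mla-based assembler keeps the whole accumulation in registers; on ARM9 and higher the comment notes that the compiler's output from code of this shape is actually faster.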