summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2010-08-30 06:31:47 +0000
committerJens Arnold <amiconn@rockbox.org>2010-08-30 06:31:47 +0000
commit811877e5b3ae95b70e285b786bb7cc9d73d333e0 (patch)
treec4e7865faaaad715566f7b1ebb559eeba25d7221
parentdd5e3eb5424a66a5399f99386b59a8ee86d6cde0 (diff)
downloadrockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.tar.gz
rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.zip
libdemac: ARMv7 assembler optimisation for the filters, tested on Nokia N900. Speedup is 2.1x for -c5000 compared to the ARMv6 asm. Note that actually compiling it on device requires hand-assembling the 'vadd' and 'vsub' instructions due to a bug in binutils 2.18.50, and making the standalone decoder use it requires Makefile and demac_config.h hacks.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27944 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/demac/libdemac/filter.c2
-rw-r--r--apps/codecs/demac/libdemac/vector_math16_armv7.h214
2 files changed, 216 insertions, 0 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 8055098301..903885cf00 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
41 41
42#ifdef CPU_COLDFIRE 42#ifdef CPU_COLDFIRE
43#include "vector_math16_cf.h" 43#include "vector_math16_cf.h"
44#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
45#include "vector_math16_armv7.h"
44#elif defined(CPU_ARM) && (ARM_ARCH >= 6) 46#elif defined(CPU_ARM) && (ARM_ARCH >= 6)
45#include "vector_math16_armv6.h" 47#include "vector_math16_armv6.h"
46#elif defined(CPU_ARM) && (ARM_ARCH >= 5) 48#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv7.h b/apps/codecs/demac/libdemac/vector_math16_armv7.h
new file mode 100644
index 0000000000..84afda3e5d
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
1/*
2
3libdemac - A Monkey's Audio decoder
4
5$Id$
6
7Copyright (C) Dave Chapman 2007
8
9ARMv7 neon vector math copyright (C) 2010 Jens Arnold
10
11This program is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2 of the License, or
14(at your option) any later version.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with this program; if not, write to the Free Software
23Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
24
25*/
26
/* Tells filter.c that this header provides the fused scalarproduct +
 * vector add/sub primitives (vector_sp_add / vector_sp_sub) in addition
 * to plain scalarproduct(). */
#define FUSED_VECTOR_MATH

/* One inner asm block processes 16 int16_t elements.  REPEAT_BLOCK
 * repeats it so that a single pass covers min(ORDER, 64) elements:
 * 4 blocks for ORDER > 32, 2 for ORDER > 16, 1 otherwise.  For
 * ORDER > 64 the functions additionally loop (cnt = ORDER>>6). */
#if ORDER > 32
#define REPEAT_BLOCK(x) x x x
#elif ORDER > 16
#define REPEAT_BLOCK(x) x
#else
#define REPEAT_BLOCK(x)
#endif
36
37/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 *
 * Computes the dot product of v1 and f2 (ORDER int16_t elements each) and,
 * in the same pass, adds s2 to v1 element-wise, storing the result back to
 * v1.  All three pointers are advanced past the processed data ("+r"
 * operands).  Returns the dot product as int32_t.
 *
 * NEON register use: q0 = 4 partial 32-bit sums; d2-d5 = v1 data;
 * d6-d9 = f2 data; d10-d13 = s2 data.  The final vpadd/vpaddl pair
 * reduces the 4 partial sums to one value in d0[0].
 *
 * NOTE(review): the 16-bit vadd wraps on overflow — presumably intended
 * for this codec's data ranges; confirm against libdemac's filter math. */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;  /* 64 elements are consumed per loop pass */
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0              \n"  /* clear accumulator */
        "1:                           \n"
        "subs     %[cnt], %[cnt], #1  \n"  /* sets flags for the bne below */
#endif
        "vld1.16 {d6-d9}, [%[f2]]!    \n"  /* load 16 elements of f2 */
        "vld1.16 {d2-d5}, [%[v1]]     \n"  /* load v1; no '!': stored back below */
        "vld1.16 {d10-d13}, [%[s2]]!  \n"  /* load 16 elements of s2 */
#if ORDER > 64
        "vmlal.s16 q0, d2, d6         \n"  /* accumulate across loop passes */
#else
        "vmull.s16 q0, d2, d6         \n"  /* first multiply initialises q0 */
#endif
        "vmlal.s16 q0, d3, d7         \n"
        "vmlal.s16 q0, d4, d8         \n"
        "vmlal.s16 q0, d5, d9         \n"
        "vadd.i16 q1, q1, q5          \n"  /* v1 += s2, elements 0-7 */
        "vadd.i16 q2, q2, q6          \n"  /* v1 += s2, elements 8-15 */
        "vst1.16 {d2-d5}, [%[v1]]!    \n"  /* store updated v1, advance */

        REPEAT_BLOCK(
        "vld1.16 {d6-d9}, [%[f2]]!    \n"
        "vld1.16 {d2-d5}, [%[v1]]     \n"
        "vld1.16 {d10-d13}, [%[s2]]!  \n"
        "vmlal.s16 q0, d2, d6         \n"
        "vmlal.s16 q0, d3, d7         \n"
        "vmlal.s16 q0, d4, d8         \n"
        "vmlal.s16 q0, d5, d9         \n"
        "vadd.i16 q1, q1, q5          \n"
        "vadd.i16 q2, q2, q6          \n"
        "vst1.16 {d2-d5}, [%[v1]]!    \n"
        )
#if ORDER > 64
        "bne      1b                  \n"
#endif
        "vpadd.i32 d0, d0, d1         \n"  /* 4 partial sums -> 2 */
        "vpaddl.s32 d0, d0            \n"  /* 2 -> 1, result in d0[0] */
        "vmov.32  %[res], d0[0]       \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
          /* "memory" is required: the asm both reads and writes *v1 */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
    );
    return res;
}
99
100/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 *
 * Identical to vector_sp_add except that s2 is subtracted from v1
 * (vsub.i16 instead of vadd.i16).  Computes the dot product of v1 and f2
 * (ORDER int16_t elements each), updates v1 -= s2 in place, advances all
 * three pointers past the processed data, and returns the dot product.
 *
 * NEON register use: q0 = 4 partial 32-bit sums; d2-d5 = v1 data;
 * d6-d9 = f2 data; d10-d13 = s2 data. */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;  /* 64 elements are consumed per loop pass */
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0              \n"  /* clear accumulator */
        "1:                           \n"
        "subs     %[cnt], %[cnt], #1  \n"  /* sets flags for the bne below */
#endif
        "vld1.16 {d6-d9}, [%[f2]]!    \n"  /* load 16 elements of f2 */
        "vld1.16 {d2-d5}, [%[v1]]     \n"  /* load v1; no '!': stored back below */
        "vld1.16 {d10-d13}, [%[s2]]!  \n"  /* load 16 elements of s2 */
#if ORDER > 64
        "vmlal.s16 q0, d2, d6         \n"  /* accumulate across loop passes */
#else
        "vmull.s16 q0, d2, d6         \n"  /* first multiply initialises q0 */
#endif
        "vmlal.s16 q0, d3, d7         \n"
        "vmlal.s16 q0, d4, d8         \n"
        "vmlal.s16 q0, d5, d9         \n"
        "vsub.i16 q1, q1, q5          \n"  /* v1 -= s2, elements 0-7 */
        "vsub.i16 q2, q2, q6          \n"  /* v1 -= s2, elements 8-15 */
        "vst1.16 {d2-d5}, [%[v1]]!    \n"  /* store updated v1, advance */

        REPEAT_BLOCK(
        "vld1.16 {d6-d9}, [%[f2]]!    \n"
        "vld1.16 {d2-d5}, [%[v1]]     \n"
        "vld1.16 {d10-d13}, [%[s2]]!  \n"
        "vmlal.s16 q0, d2, d6         \n"
        "vmlal.s16 q0, d3, d7         \n"
        "vmlal.s16 q0, d4, d8         \n"
        "vmlal.s16 q0, d5, d9         \n"
        "vsub.i16 q1, q1, q5          \n"
        "vsub.i16 q2, q2, q6          \n"
        "vst1.16 {d2-d5}, [%[v1]]!    \n"
        )
#if ORDER > 64
        "bne      1b                  \n"
#endif
        "vpadd.i32 d0, d0, d1         \n"  /* 4 partial sums -> 2 */
        "vpaddl.s32 d0, d0            \n"  /* 2 -> 1, result in d0[0] */
        "vmov.32  %[res], d0[0]       \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
          /* "memory" is required: the asm both reads and writes *v1 */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
    );
    return res;
}
162
163static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
164{
165 int res;
166#if ORDER > 64
167 int cnt = ORDER>>6;
168#endif
169
170 asm volatile (
171#if ORDER > 64
172 "vmov.i16 q0, #0 \n"
173 "1: \n"
174 "subs %[cnt], %[cnt], #1 \n"
175#endif
176 "vld1.16 {d2-d5}, [%[v1]]! \n"
177 "vld1.16 {d6-d9}, [%[v2]]! \n"
178#if ORDER > 64
179 "vmlal.s16 q0, d2, d6 \n"
180#else
181 "vmull.s16 q0, d2, d6 \n"
182#endif
183 "vmlal.s16 q0, d3, d7 \n"
184 "vmlal.s16 q0, d4, d8 \n"
185 "vmlal.s16 q0, d5, d9 \n"
186
187 REPEAT_BLOCK(
188 "vld1.16 {d2-d5}, [%[v1]]! \n"
189 "vld1.16 {d6-d9}, [%[v2]]! \n"
190 "vmlal.s16 q0, d2, d6 \n"
191 "vmlal.s16 q0, d3, d7 \n"
192 "vmlal.s16 q0, d4, d8 \n"
193 "vmlal.s16 q0, d5, d9 \n"
194 )
195#if ORDER > 64
196 "bne 1b \n"
197#endif
198 "vpadd.i32 d0, d0, d1 \n"
199 "vpaddl.s32 d0, d0 \n"
200 "vmov.32 %[res], d0[0] \n"
201 : /* outputs */
202#if ORDER > 64
203 [cnt]"+r"(cnt),
204#endif
205 [v1] "+r"(v1),
206 [v2] "+r"(v2),
207 [res]"=r"(res)
208 : /* inputs */
209 : /* clobbers */
210 "d0", "d1", "d2", "d3", "d4",
211 "d5", "d6", "d7", "d8", "d9"
212 );
213 return res;
214}