diff options
author | Jens Arnold <amiconn@rockbox.org> | 2010-08-30 06:31:47 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2010-08-30 06:31:47 +0000 |
commit | 811877e5b3ae95b70e285b786bb7cc9d73d333e0 (patch) | |
tree | c4e7865faaaad715566f7b1ebb559eeba25d7221 /apps/codecs/demac/libdemac | |
parent | dd5e3eb5424a66a5399f99386b59a8ee86d6cde0 (diff) | |
download | rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.tar.gz rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.zip |
libdemac: ARMv7 assembler optimisation for the filters, tested on Nokia N900. Speedup is 2.1x for -c5000 compared to the ARMv6 asm. Note that actually compiling it on device requires hand-assembling the 'vadd' and 'vsub' instructions due to a bug in binutils 2.18.50, and making the standalone decoder use it requires Makefile and demac_config.h hacks.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27944 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/demac/libdemac')
-rw-r--r-- | apps/codecs/demac/libdemac/filter.c | 2 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv7.h | 214 |
2 files changed, 216 insertions, 0 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index 8055098301..903885cf00 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c | |||
@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
41 | 41 | ||
42 | #ifdef CPU_COLDFIRE | 42 | #ifdef CPU_COLDFIRE |
43 | #include "vector_math16_cf.h" | 43 | #include "vector_math16_cf.h" |
44 | #elif defined(CPU_ARM) && (ARM_ARCH >= 7) | ||
45 | #include "vector_math16_armv7.h" | ||
44 | #elif defined(CPU_ARM) && (ARM_ARCH >= 6) | 46 | #elif defined(CPU_ARM) && (ARM_ARCH >= 6) |
45 | #include "vector_math16_armv6.h" | 47 | #include "vector_math16_armv6.h" |
46 | #elif defined(CPU_ARM) && (ARM_ARCH >= 5) | 48 | #elif defined(CPU_ARM) && (ARM_ARCH >= 5) |
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv7.h b/apps/codecs/demac/libdemac/vector_math16_armv7.h new file mode 100644 index 0000000000..84afda3e5d --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math16_armv7.h | |||
@@ -0,0 +1,214 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | ARMv7 neon vector math copyright (C) 2010 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
27 | #define FUSED_VECTOR_MATH /* presumably tested by filter.c to select the fused vector_sp_add()/vector_sp_sub() routines - TODO confirm against filter.c */ | ||
28 | |||
29 | #if ORDER > 32 /* one unrolled asm block handles 16 int16_t elements; REPEAT_BLOCK emits extra copies after the first block */ | ||
30 | #define REPEAT_BLOCK(x) x x x /* 3 extra copies: 64 elements per pass */ | ||
31 | #elif ORDER > 16 | ||
32 | #define REPEAT_BLOCK(x) x /* 1 extra copy: 32 elements per pass */ | ||
33 | #else | ||
34 | #define REPEAT_BLOCK(x) /* no extra copy: 16 elements per pass */ | ||
35 | #endif | ||
36 | |||
37 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) */ | ||
38 | static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) | ||
39 | { | ||
40 | int res; | ||
41 | #if ORDER > 64 | ||
42 | int cnt = ORDER>>6; | ||
43 | #endif | ||
44 | |||
45 | asm volatile ( | ||
46 | #if ORDER > 64 | ||
47 | "vmov.i16 q0, #0 \n" | ||
48 | "1: \n" | ||
49 | "subs %[cnt], %[cnt], #1 \n" | ||
50 | #endif | ||
51 | "vld1.16 {d6-d9}, [%[f2]]! \n" | ||
52 | "vld1.16 {d2-d5}, [%[v1]] \n" | ||
53 | "vld1.16 {d10-d13}, [%[s2]]! \n" | ||
54 | #if ORDER > 64 | ||
55 | "vmlal.s16 q0, d2, d6 \n" | ||
56 | #else | ||
57 | "vmull.s16 q0, d2, d6 \n" | ||
58 | #endif | ||
59 | "vmlal.s16 q0, d3, d7 \n" | ||
60 | "vmlal.s16 q0, d4, d8 \n" | ||
61 | "vmlal.s16 q0, d5, d9 \n" | ||
62 | "vadd.i16 q1, q1, q5 \n" | ||
63 | "vadd.i16 q2, q2, q6 \n" | ||
64 | "vst1.16 {d2-d5}, [%[v1]]! \n" | ||
65 | |||
66 | REPEAT_BLOCK( | ||
67 | "vld1.16 {d6-d9}, [%[f2]]! \n" | ||
68 | "vld1.16 {d2-d5}, [%[v1]] \n" | ||
69 | "vld1.16 {d10-d13}, [%[s2]]! \n" | ||
70 | "vmlal.s16 q0, d2, d6 \n" | ||
71 | "vmlal.s16 q0, d3, d7 \n" | ||
72 | "vmlal.s16 q0, d4, d8 \n" | ||
73 | "vmlal.s16 q0, d5, d9 \n" | ||
74 | "vadd.i16 q1, q1, q5 \n" | ||
75 | "vadd.i16 q2, q2, q6 \n" | ||
76 | "vst1.16 {d2-d5}, [%[v1]]! \n" | ||
77 | ) | ||
78 | #if ORDER > 64 | ||
79 | "bne 1b \n" | ||
80 | #endif | ||
81 | "vpadd.i32 d0, d0, d1 \n" | ||
82 | "vpaddl.s32 d0, d0 \n" | ||
83 | "vmov.32 %[res], d0[0] \n" | ||
84 | : /* outputs */ | ||
85 | #if ORDER > 64 | ||
86 | [cnt]"+r"(cnt), | ||
87 | #endif | ||
88 | [v1] "+r"(v1), | ||
89 | [f2] "+r"(f2), | ||
90 | [s2] "+r"(s2), | ||
91 | [res]"=r"(res) | ||
92 | : /* inputs */ | ||
93 | : /* clobbers */ | ||
94 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | ||
95 | "d8", "d9", "d10", "d11", "d12", "d13", "memory" | ||
96 | ); | ||
97 | return res; | ||
98 | } | ||
99 | |||
100 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */ | ||
101 | static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) | ||
102 | { | ||
103 | int res; | ||
104 | #if ORDER > 64 | ||
105 | int cnt = ORDER>>6; | ||
106 | #endif | ||
107 | |||
108 | asm volatile ( | ||
109 | #if ORDER > 64 | ||
110 | "vmov.i16 q0, #0 \n" | ||
111 | "1: \n" | ||
112 | "subs %[cnt], %[cnt], #1 \n" | ||
113 | #endif | ||
114 | "vld1.16 {d6-d9}, [%[f2]]! \n" | ||
115 | "vld1.16 {d2-d5}, [%[v1]] \n" | ||
116 | "vld1.16 {d10-d13}, [%[s2]]! \n" | ||
117 | #if ORDER > 64 | ||
118 | "vmlal.s16 q0, d2, d6 \n" | ||
119 | #else | ||
120 | "vmull.s16 q0, d2, d6 \n" | ||
121 | #endif | ||
122 | "vmlal.s16 q0, d3, d7 \n" | ||
123 | "vmlal.s16 q0, d4, d8 \n" | ||
124 | "vmlal.s16 q0, d5, d9 \n" | ||
125 | "vsub.i16 q1, q1, q5 \n" | ||
126 | "vsub.i16 q2, q2, q6 \n" | ||
127 | "vst1.16 {d2-d5}, [%[v1]]! \n" | ||
128 | |||
129 | REPEAT_BLOCK( | ||
130 | "vld1.16 {d6-d9}, [%[f2]]! \n" | ||
131 | "vld1.16 {d2-d5}, [%[v1]] \n" | ||
132 | "vld1.16 {d10-d13}, [%[s2]]! \n" | ||
133 | "vmlal.s16 q0, d2, d6 \n" | ||
134 | "vmlal.s16 q0, d3, d7 \n" | ||
135 | "vmlal.s16 q0, d4, d8 \n" | ||
136 | "vmlal.s16 q0, d5, d9 \n" | ||
137 | "vsub.i16 q1, q1, q5 \n" | ||
138 | "vsub.i16 q2, q2, q6 \n" | ||
139 | "vst1.16 {d2-d5}, [%[v1]]! \n" | ||
140 | ) | ||
141 | #if ORDER > 64 | ||
142 | "bne 1b \n" | ||
143 | #endif | ||
144 | "vpadd.i32 d0, d0, d1 \n" | ||
145 | "vpaddl.s32 d0, d0 \n" | ||
146 | "vmov.32 %[res], d0[0] \n" | ||
147 | : /* outputs */ | ||
148 | #if ORDER > 64 | ||
149 | [cnt]"+r"(cnt), | ||
150 | #endif | ||
151 | [v1] "+r"(v1), | ||
152 | [f2] "+r"(f2), | ||
153 | [s2] "+r"(s2), | ||
154 | [res]"=r"(res) | ||
155 | : /* inputs */ | ||
156 | : /* clobbers */ | ||
157 | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | ||
158 | "d8", "d9", "d10", "d11", "d12", "d13", "memory" | ||
159 | ); | ||
160 | return res; | ||
161 | } | ||
162 | |||
163 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | ||
164 | { | ||
165 | int res; | ||
166 | #if ORDER > 64 | ||
167 | int cnt = ORDER>>6; | ||
168 | #endif | ||
169 | |||
170 | asm volatile ( | ||
171 | #if ORDER > 64 | ||
172 | "vmov.i16 q0, #0 \n" | ||
173 | "1: \n" | ||
174 | "subs %[cnt], %[cnt], #1 \n" | ||
175 | #endif | ||
176 | "vld1.16 {d2-d5}, [%[v1]]! \n" | ||
177 | "vld1.16 {d6-d9}, [%[v2]]! \n" | ||
178 | #if ORDER > 64 | ||
179 | "vmlal.s16 q0, d2, d6 \n" | ||
180 | #else | ||
181 | "vmull.s16 q0, d2, d6 \n" | ||
182 | #endif | ||
183 | "vmlal.s16 q0, d3, d7 \n" | ||
184 | "vmlal.s16 q0, d4, d8 \n" | ||
185 | "vmlal.s16 q0, d5, d9 \n" | ||
186 | |||
187 | REPEAT_BLOCK( | ||
188 | "vld1.16 {d2-d5}, [%[v1]]! \n" | ||
189 | "vld1.16 {d6-d9}, [%[v2]]! \n" | ||
190 | "vmlal.s16 q0, d2, d6 \n" | ||
191 | "vmlal.s16 q0, d3, d7 \n" | ||
192 | "vmlal.s16 q0, d4, d8 \n" | ||
193 | "vmlal.s16 q0, d5, d9 \n" | ||
194 | ) | ||
195 | #if ORDER > 64 | ||
196 | "bne 1b \n" | ||
197 | #endif | ||
198 | "vpadd.i32 d0, d0, d1 \n" | ||
199 | "vpaddl.s32 d0, d0 \n" | ||
200 | "vmov.32 %[res], d0[0] \n" | ||
201 | : /* outputs */ | ||
202 | #if ORDER > 64 | ||
203 | [cnt]"+r"(cnt), | ||
204 | #endif | ||
205 | [v1] "+r"(v1), | ||
206 | [v2] "+r"(v2), | ||
207 | [res]"=r"(res) | ||
208 | : /* inputs */ | ||
209 | : /* clobbers */ | ||
210 | "d0", "d1", "d2", "d3", "d4", | ||
211 | "d5", "d6", "d7", "d8", "d9" | ||
212 | ); | ||
213 | return res; | ||
214 | } | ||