Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h')
-rw-r--r--  lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h  214
1 file changed, 214 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h
new file mode 100644
index 0000000000..84afda3e5d
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

ARMv7 NEON vector math copyright (C) 2010 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

#define FUSED_VECTOR_MATH
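/* The define above tells the including filter code (which presumably
   checks it with #ifdef) that fused scalar-product-plus-add/sub variants
   are provided here, so it need not run a separate vector add or subtract
   pass after each scalarproduct() call. */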

#if ORDER > 32
#define REPEAT_BLOCK(x) x x x
#elif ORDER > 16
#define REPEAT_BLOCK(x) x
#else
#define REPEAT_BLOCK(x)
#endif
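
/* Note on unrolling: each asm block below consumes 16 int16_t elements
   (four d registers per operand), so REPEAT_BLOCK expands one block into
   four for ORDER > 32 and into two for 16 < ORDER <= 32, leaving a single
   block otherwise.  For ORDER > 64 the resulting 64-element pass is also
   wrapped in a loop executing ORDER/64 times, i.e. ORDER is assumed to be
   16, 32, 64 or a multiple of 64. */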

/* Calculate the scalar product of v1 and f2, then add the second vector
 * s2 to v1 in place (fused for performance) */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0                 \n"
    "1:                                  \n"
        "subs     %[cnt], %[cnt], #1     \n"
#endif
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
#if ORDER > 64
        "vmlal.s16 q0, d2, d6            \n"
#else
        "vmull.s16 q0, d2, d6            \n"
#endif
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vadd.i16 q1, q1, q5             \n"
        "vadd.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"

        REPEAT_BLOCK(
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
        "vmlal.s16 q0, d2, d6            \n"
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vadd.i16 q1, q1, q5             \n"
        "vadd.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"
        )
#if ORDER > 64
        "bne      1b                     \n"
#endif
        "vpadd.i32 d0, d0, d1            \n"
        "vpaddl.s32 d0, d0               \n"
        "vmov.32  %[res], d0[0]          \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
    );
    return res;
}
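
/* For illustration, a minimal plain-C sketch of what the asm above
   computes.  The name vector_sp_add_ref is hypothetical (not part of
   libdemac); ORDER comes from the including code, and unlike the NEON
   version the sketch does not advance the three pointers. */
static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* widening MAC, as vmull/vmlal.s16 */
        v1[i] += s2[i];                /* in effect a wrapping 16-bit add,
                                          as vadd.i16 */
    }
    return res;
}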

/* Calculate the scalar product of v1 and f2, then subtract the second
 * vector s2 from v1 in place (fused for performance) */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0                 \n"
    "1:                                  \n"
        "subs     %[cnt], %[cnt], #1     \n"
#endif
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
#if ORDER > 64
        "vmlal.s16 q0, d2, d6            \n"
#else
        "vmull.s16 q0, d2, d6            \n"
#endif
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vsub.i16 q1, q1, q5             \n"
        "vsub.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"

        REPEAT_BLOCK(
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
        "vmlal.s16 q0, d2, d6            \n"
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vsub.i16 q1, q1, q5             \n"
        "vsub.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"
        )
#if ORDER > 64
        "bne      1b                     \n"
#endif
        "vpadd.i32 d0, d0, d1            \n"
        "vpaddl.s32 d0, d0               \n"
        "vmov.32  %[res], d0[0]          \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
    );
    return res;
}
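
/* The matching plain-C sketch for the subtracting variant; again,
   vector_sp_sub_ref is a hypothetical name used only for illustration. */
static inline int32_t vector_sp_sub_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* product uses the old v1 value */
        v1[i] -= s2[i];                /* wrapping 16-bit subtract, as vsub.i16 */
    }
    return res;
}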

static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0                 \n"
    "1:                                  \n"
        "subs     %[cnt], %[cnt], #1     \n"
#endif
        "vld1.16  {d2-d5}, [%[v1]]!      \n"
        "vld1.16  {d6-d9}, [%[v2]]!      \n"
#if ORDER > 64
        "vmlal.s16 q0, d2, d6            \n"
#else
        "vmull.s16 q0, d2, d6            \n"
#endif
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"

        REPEAT_BLOCK(
        "vld1.16  {d2-d5}, [%[v1]]!      \n"
        "vld1.16  {d6-d9}, [%[v2]]!      \n"
        "vmlal.s16 q0, d2, d6            \n"
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        )
#if ORDER > 64
        "bne      1b                     \n"
#endif
        "vpadd.i32 d0, d0, d1            \n"
        "vpaddl.s32 d0, d0               \n"
        "vmov.32  %[res], d0[0]          \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4",
        "d5", "d6", "d7", "d8", "d9"
    );
    return res;
}
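
/* And a plain-C sketch of scalarproduct() itself, under the same caveats
   (hypothetical name, pointers not advanced).  The NEON code accumulates
   into the four 32-bit lanes of q0 and folds them with vpadd/vpaddl at
   the end, which matches this sequential sum modulo 2^32. */
static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i];
    return res;
}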