diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c | 290 |
1 files changed, 290 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c new file mode 100644 index 0000000000..1ac38c433a --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c | |||
@@ -0,0 +1,290 @@ | |||
1 | /*********************************************************************** | ||
2 | Copyright (c) 2017 Google Inc. | ||
3 | Redistribution and use in source and binary forms, with or without | ||
4 | modification, are permitted provided that the following conditions | ||
5 | are met: | ||
6 | - Redistributions of source code must retain the above copyright notice, | ||
7 | this list of conditions and the following disclaimer. | ||
8 | - Redistributions in binary form must reproduce the above copyright | ||
9 | notice, this list of conditions and the following disclaimer in the | ||
10 | documentation and/or other materials provided with the distribution. | ||
11 | - Neither the name of Internet Society, IETF or IETF Trust, nor the | ||
12 | names of specific contributors, may be used to endorse or promote | ||
13 | products derived from this software without specific prior written | ||
14 | permission. | ||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
18 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
19 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
20 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
21 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
22 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
23 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
24 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
25 | POSSIBILITY OF SUCH DAMAGE. | ||
26 | ***********************************************************************/ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <arm_neon.h> | ||
33 | #include "pitch.h" | ||
34 | |||
35 | #ifdef FIXED_POINT | ||
36 | |||
37 | opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N) | ||
38 | { | ||
39 | int i; | ||
40 | opus_val32 xy; | ||
41 | int16x8_t x_s16x8, y_s16x8; | ||
42 | int32x4_t xy_s32x4 = vdupq_n_s32(0); | ||
43 | int64x2_t xy_s64x2; | ||
44 | int64x1_t xy_s64x1; | ||
45 | |||
46 | for (i = 0; i < N - 7; i += 8) { | ||
47 | x_s16x8 = vld1q_s16(&x[i]); | ||
48 | y_s16x8 = vld1q_s16(&y[i]); | ||
49 | xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8)); | ||
50 | xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8)); | ||
51 | } | ||
52 | |||
53 | if (N - i >= 4) { | ||
54 | const int16x4_t x_s16x4 = vld1_s16(&x[i]); | ||
55 | const int16x4_t y_s16x4 = vld1_s16(&y[i]); | ||
56 | xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4); | ||
57 | i += 4; | ||
58 | } | ||
59 | |||
60 | xy_s64x2 = vpaddlq_s32(xy_s32x4); | ||
61 | xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2)); | ||
62 | xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0); | ||
63 | |||
64 | for (; i < N; i++) { | ||
65 | xy = MAC16_16(xy, x[i], y[i]); | ||
66 | } | ||
67 | |||
68 | #ifdef OPUS_CHECK_ASM | ||
69 | celt_assert(celt_inner_prod_c(x, y, N) == xy); | ||
70 | #endif | ||
71 | |||
72 | return xy; | ||
73 | } | ||
74 | |||
75 | void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, | ||
76 | int N, opus_val32 *xy1, opus_val32 *xy2) | ||
77 | { | ||
78 | int i; | ||
79 | opus_val32 xy01, xy02; | ||
80 | int16x8_t x_s16x8, y01_s16x8, y02_s16x8; | ||
81 | int32x4_t xy01_s32x4 = vdupq_n_s32(0); | ||
82 | int32x4_t xy02_s32x4 = vdupq_n_s32(0); | ||
83 | int64x2_t xy01_s64x2, xy02_s64x2; | ||
84 | int64x1_t xy01_s64x1, xy02_s64x1; | ||
85 | |||
86 | for (i = 0; i < N - 7; i += 8) { | ||
87 | x_s16x8 = vld1q_s16(&x[i]); | ||
88 | y01_s16x8 = vld1q_s16(&y01[i]); | ||
89 | y02_s16x8 = vld1q_s16(&y02[i]); | ||
90 | xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8)); | ||
91 | xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8)); | ||
92 | xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8)); | ||
93 | xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8)); | ||
94 | } | ||
95 | |||
96 | if (N - i >= 4) { | ||
97 | const int16x4_t x_s16x4 = vld1_s16(&x[i]); | ||
98 | const int16x4_t y01_s16x4 = vld1_s16(&y01[i]); | ||
99 | const int16x4_t y02_s16x4 = vld1_s16(&y02[i]); | ||
100 | xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4); | ||
101 | xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4); | ||
102 | i += 4; | ||
103 | } | ||
104 | |||
105 | xy01_s64x2 = vpaddlq_s32(xy01_s32x4); | ||
106 | xy02_s64x2 = vpaddlq_s32(xy02_s32x4); | ||
107 | xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2)); | ||
108 | xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2)); | ||
109 | xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0); | ||
110 | xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0); | ||
111 | |||
112 | for (; i < N; i++) { | ||
113 | xy01 = MAC16_16(xy01, x[i], y01[i]); | ||
114 | xy02 = MAC16_16(xy02, x[i], y02[i]); | ||
115 | } | ||
116 | *xy1 = xy01; | ||
117 | *xy2 = xy02; | ||
118 | |||
119 | #ifdef OPUS_CHECK_ASM | ||
120 | { | ||
121 | opus_val32 xy1_c, xy2_c; | ||
122 | dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c); | ||
123 | celt_assert(xy1_c == *xy1); | ||
124 | celt_assert(xy2_c == *xy2); | ||
125 | } | ||
126 | #endif | ||
127 | } | ||
128 | |||
129 | #else /* !FIXED_POINT */ | ||
130 | |||
131 | /* ========================================================================== */ | ||
132 | |||
133 | #ifdef OPUS_CHECK_ASM | ||
134 | |||
135 | /* This part of code simulates floating-point NEON operations. */ | ||
136 | |||
137 | /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */ | ||
138 | /* operations of celt_inner_prod_neon(), and both functions should have bit */ | ||
139 | /* exact output. */ | ||
140 | static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N) | ||
141 | { | ||
142 | int i; | ||
143 | opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0; | ||
144 | for (i = 0; i < N - 3; i += 4) { | ||
145 | xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]); | ||
146 | xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]); | ||
147 | xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]); | ||
148 | xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]); | ||
149 | } | ||
150 | xy0 += xy2; | ||
151 | xy1 += xy3; | ||
152 | xy = xy0 + xy1; | ||
153 | for (; i < N; i++) { | ||
154 | xy = MAC16_16(xy, x[i], y[i]); | ||
155 | } | ||
156 | return xy; | ||
157 | } | ||
158 | |||
159 | /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */ | ||
160 | /* operations of dual_inner_prod_neon(), and both functions should have bit */ | ||
161 | /* exact output. */ | ||
162 | static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, | ||
163 | int N, opus_val32 *xy1, opus_val32 *xy2) | ||
164 | { | ||
165 | int i; | ||
166 | opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0; | ||
167 | for (i = 0; i < N - 3; i += 4) { | ||
168 | xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]); | ||
169 | xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]); | ||
170 | xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]); | ||
171 | xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]); | ||
172 | xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]); | ||
173 | xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]); | ||
174 | xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]); | ||
175 | xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]); | ||
176 | } | ||
177 | xy01_0 += xy01_2; | ||
178 | xy02_0 += xy02_2; | ||
179 | xy01_1 += xy01_3; | ||
180 | xy02_1 += xy02_3; | ||
181 | xy01 = xy01_0 + xy01_1; | ||
182 | xy02 = xy02_0 + xy02_1; | ||
183 | for (; i < N; i++) { | ||
184 | xy01 = MAC16_16(xy01, x[i], y01[i]); | ||
185 | xy02 = MAC16_16(xy02, x[i], y02[i]); | ||
186 | } | ||
187 | *xy1 = xy01; | ||
188 | *xy2 = xy02; | ||
189 | } | ||
190 | |||
191 | #endif /* OPUS_CHECK_ASM */ | ||
192 | |||
193 | /* ========================================================================== */ | ||
194 | |||
195 | opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N) | ||
196 | { | ||
197 | int i; | ||
198 | opus_val32 xy; | ||
199 | float32x4_t xy_f32x4 = vdupq_n_f32(0); | ||
200 | float32x2_t xy_f32x2; | ||
201 | |||
202 | for (i = 0; i < N - 7; i += 8) { | ||
203 | float32x4_t x_f32x4, y_f32x4; | ||
204 | x_f32x4 = vld1q_f32(&x[i]); | ||
205 | y_f32x4 = vld1q_f32(&y[i]); | ||
206 | xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4); | ||
207 | x_f32x4 = vld1q_f32(&x[i + 4]); | ||
208 | y_f32x4 = vld1q_f32(&y[i + 4]); | ||
209 | xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4); | ||
210 | } | ||
211 | |||
212 | if (N - i >= 4) { | ||
213 | const float32x4_t x_f32x4 = vld1q_f32(&x[i]); | ||
214 | const float32x4_t y_f32x4 = vld1q_f32(&y[i]); | ||
215 | xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4); | ||
216 | i += 4; | ||
217 | } | ||
218 | |||
219 | xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4)); | ||
220 | xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2); | ||
221 | xy = vget_lane_f32(xy_f32x2, 0); | ||
222 | |||
223 | for (; i < N; i++) { | ||
224 | xy = MAC16_16(xy, x[i], y[i]); | ||
225 | } | ||
226 | |||
227 | #ifdef OPUS_CHECK_ASM | ||
228 | celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL); | ||
229 | #endif | ||
230 | |||
231 | return xy; | ||
232 | } | ||
233 | |||
234 | void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, | ||
235 | int N, opus_val32 *xy1, opus_val32 *xy2) | ||
236 | { | ||
237 | int i; | ||
238 | opus_val32 xy01, xy02; | ||
239 | float32x4_t xy01_f32x4 = vdupq_n_f32(0); | ||
240 | float32x4_t xy02_f32x4 = vdupq_n_f32(0); | ||
241 | float32x2_t xy01_f32x2, xy02_f32x2; | ||
242 | |||
243 | for (i = 0; i < N - 7; i += 8) { | ||
244 | float32x4_t x_f32x4, y01_f32x4, y02_f32x4; | ||
245 | x_f32x4 = vld1q_f32(&x[i]); | ||
246 | y01_f32x4 = vld1q_f32(&y01[i]); | ||
247 | y02_f32x4 = vld1q_f32(&y02[i]); | ||
248 | xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4); | ||
249 | xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4); | ||
250 | x_f32x4 = vld1q_f32(&x[i + 4]); | ||
251 | y01_f32x4 = vld1q_f32(&y01[i + 4]); | ||
252 | y02_f32x4 = vld1q_f32(&y02[i + 4]); | ||
253 | xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4); | ||
254 | xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4); | ||
255 | } | ||
256 | |||
257 | if (N - i >= 4) { | ||
258 | const float32x4_t x_f32x4 = vld1q_f32(&x[i]); | ||
259 | const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]); | ||
260 | const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]); | ||
261 | xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4); | ||
262 | xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4); | ||
263 | i += 4; | ||
264 | } | ||
265 | |||
266 | xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4)); | ||
267 | xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4)); | ||
268 | xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2); | ||
269 | xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2); | ||
270 | xy01 = vget_lane_f32(xy01_f32x2, 0); | ||
271 | xy02 = vget_lane_f32(xy02_f32x2, 0); | ||
272 | |||
273 | for (; i < N; i++) { | ||
274 | xy01 = MAC16_16(xy01, x[i], y01[i]); | ||
275 | xy02 = MAC16_16(xy02, x[i], y02[i]); | ||
276 | } | ||
277 | *xy1 = xy01; | ||
278 | *xy2 = xy02; | ||
279 | |||
280 | #ifdef OPUS_CHECK_ASM | ||
281 | { | ||
282 | opus_val32 xy1_c, xy2_c; | ||
283 | dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c); | ||
284 | celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL); | ||
285 | celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL); | ||
286 | } | ||
287 | #endif | ||
288 | } | ||
289 | |||
290 | #endif /* FIXED_POINT */ | ||