path: root/lib/rbcodec/codecs/libopus/silk/arm
Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/arm')
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_arm.h        57
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_neon_intr.c  280
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_arm.h              100
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_neon_intr.c        1124
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.c                     112
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.h                     114
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/arm_silk_map.c                 123
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_arm.h               68
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c         156
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/macros_arm64.h                 39
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/macros_armv4.h                 13
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/arm/macros_armv5e.h                9
12 files changed, 2191 insertions, 4 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_arm.h b/lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_arm.h
new file mode 100644
index 0000000000..9895b555c8
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_arm.h
@@ -0,0 +1,57 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifndef SILK_LPC_INV_PRED_GAIN_ARM_H
29# define SILK_LPC_INV_PRED_GAIN_ARM_H
30
31# include "celt/arm/armcpu.h"
32
33# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
34opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
35 const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
36 const opus_int order /* I Prediction order */
37);
38
39# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
40# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
41# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order))
42# endif
43# endif
44
45# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
46/*Is run-time CPU detection enabled on this platform?*/
47# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
48extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order);
49# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
50# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order))
51# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
52# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
53# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), silk_LPC_inverse_pred_gain_neon(A_Q12, order))
54# endif
55# endif
56
57#endif /* end SILK_LPC_INV_PRED_GAIN_ARM_H */
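For reference, a minimal caller-side sketch of how the overrides above get exercised; this assumes the usual Opus RTCD plumbing (opus_select_arch() from celt/cpu_support.h) and a hypothetical wrapper name, neither of which is part of this diff:

#include "SigProc_FIX.h"    /* provides the silk_LPC_inverse_pred_gain() macro/dispatch */
#include "cpu_support.h"    /* opus_select_arch() */

static opus_int32 lpc_gain_checked( const opus_int16 coefs_Q12[], opus_int order )
{
    const int arch = opus_select_arch();
    /* With OPUS_HAVE_RTCD this indexes the SILK_LPC_INVERSE_PRED_GAIN_IMPL[] table;     */
    /* with OPUS_ARM_PRESUME_NEON_INTR it resolves to silk_LPC_inverse_pred_gain_neon(); */
    /* otherwise it falls back to the plain C implementation.                            */
    return silk_LPC_inverse_pred_gain( coefs_Q12, order, arch );
}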
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_neon_intr.c b/lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_neon_intr.c
new file mode 100644
index 0000000000..ab426bcd66
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/LPC_inv_pred_gain_neon_intr.c
@@ -0,0 +1,280 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <arm_neon.h>
33#include "SigProc_FIX.h"
34#include "define.h"
35
36#define QA 24
37#define A_LIMIT SILK_FIX_CONST( 0.99975, QA )
38
39#define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))
40
41/* The difficulty is how to judge whether a 64-bit signed integer tmp64 has overflowed 32 bits,
42 * since NEON has no 64-bit min, max or comparison instructions.
43 * A failed idea is to compare the results of vmovn(tmp64) and vqmovn(tmp64) to see whether they are equal.
44 * However, that idea fails when tmp64 is something like 0xFFFFFFF980000000.
45 * Here we know that mult2Q >= 1, so the highest bit of tmp64 (bit 63, the sign bit) must equal bit 62.
46 * Shifting tmp64 left by 1 gives tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1,
47 * then bits 31 to 63 of tmp64 cannot all equal the sign bit, and therefore tmp64 has overflowed 32 bits.
48 * That is, we judge if tmp64' > 0x00000000FFFFFFFF, or tmp64' < 0xFFFFFFFF00000000.
49 * We use a narrowing shift right by 31 bits to obtain tmp32', saving data bandwidth and instructions.
50 * That is, we judge if tmp32' > 0x00000000, or tmp32' < 0xFFFFFFFF.
51 */
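A scalar model of the narrowing-shift overflow test described above may help; this is an illustrative sketch only (the helper name is hypothetical and not part of the diff), and it relies on the same assumption that bit 63 of tmp64 equals bit 62:

static OPUS_INLINE int fits_in_32_bits( opus_int64 tmp64 )  /* hypothetical helper, for illustration only */
{
    /* Scalar equivalent of vshrn_n_s64( tmp64, 31 ): arithmetic shift right by 31, keep the low 32 bits. */
    opus_int32 tmp32 = (opus_int32)silk_RSHIFT64( tmp64, 31 );
    /* No 32-bit overflow exactly when bits 31..63 of tmp64 are all copies of the sign bit. */
    return ( tmp32 == 0 ) || ( tmp32 == -1 );
}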
52
53/* Compute inverse of LPC prediction gain, and */
54/* test if LPC coefficients are stable (all poles within unit circle) */
55static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
56 opus_int32 A_QA[ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */
57 const opus_int order /* I Prediction order */
58)
59{
60 opus_int k, n, mult2Q;
61 opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2;
62 opus_int32 max, min;
63 int32x4_t max_s32x4, min_s32x4;
64 int32x2_t max_s32x2, min_s32x2;
65
66 max_s32x4 = vdupq_n_s32( silk_int32_MIN );
67 min_s32x4 = vdupq_n_s32( silk_int32_MAX );
68 invGain_Q30 = SILK_FIX_CONST( 1, 30 );
69 for( k = order - 1; k > 0; k-- ) {
70 int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
71 int64x2_t mult2Q_s64x2;
72
73 /* Check for stability */
74 if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
75 return 0;
76 }
77
78 /* Set RC equal to negated AR coef */
79 rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA );
80
81 /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
82 rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
83 silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */
84 silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );
85
86 /* Update inverse gain */
87 /* invGain_Q30 range: [ 0 : 2^30 ] */
88 invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
89 silk_assert( invGain_Q30 >= 0 );
90 silk_assert( invGain_Q30 <= ( 1 << 30 ) );
91 if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
92 return 0;
93 }
94
95 /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
96 mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
97 rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );
98
99 /* Update AR coefficient */
100 rc_Q31_s32x2 = vdup_n_s32( rc_Q31 );
101 mult2Q_s64x2 = vdupq_n_s64( -mult2Q );
102 rc_mult2_s32x2 = vdup_n_s32( rc_mult2 );
103
104 for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) {
105 /* We always calculate extra elements of the A_QA buffer when ( k % 4 ) != 0, to take advantage of SIMD parallelization. */
106 int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4;
107 int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
108 tmp1_s32x4 = vld1q_s32( A_QA + n );
109 tmp2_s32x4 = vld1q_s32( A_QA + k - n - 4 );
110 tmp2_s32x4 = vrev64q_s32( tmp2_s32x4 );
111 tmp2_s32x4 = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) );
112 t0_s32x4 = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 );
113 t1_s32x4 = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 );
114 t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 );
115 t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 );
116 t0_s64x2 = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 );
117 t1_s64x2 = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 );
118 t2_s64x2 = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 );
119 t3_s64x2 = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 );
120 t0_s64x2 = vrshlq_s64( t0_s64x2, mult2Q_s64x2 );
121 t1_s64x2 = vrshlq_s64( t1_s64x2, mult2Q_s64x2 );
122 t2_s64x2 = vrshlq_s64( t2_s64x2, mult2Q_s64x2 );
123 t3_s64x2 = vrshlq_s64( t3_s64x2, mult2Q_s64x2 );
124 t0_s32x4 = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) );
125 t1_s32x4 = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) );
126 s0_s32x4 = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) );
127 s1_s32x4 = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) );
128 max_s32x4 = vmaxq_s32( max_s32x4, s0_s32x4 );
129 min_s32x4 = vminq_s32( min_s32x4, s0_s32x4 );
130 max_s32x4 = vmaxq_s32( max_s32x4, s1_s32x4 );
131 min_s32x4 = vminq_s32( min_s32x4, s1_s32x4 );
132 t1_s32x4 = vrev64q_s32( t1_s32x4 );
133 t1_s32x4 = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) );
134 vst1q_s32( A_QA + n, t0_s32x4 );
135 vst1q_s32( A_QA + k - n - 4, t1_s32x4 );
136 }
137 for( ; n < (k + 1) >> 1; n++ ) {
138 opus_int64 tmp64;
139 tmp1 = A_QA[ n ];
140 tmp2 = A_QA[ k - n - 1 ];
141 tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp1,
142 MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q);
143 if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
144 return 0;
145 }
146 A_QA[ n ] = ( opus_int32 )tmp64;
147 tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp2,
148 MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2), mult2Q);
149 if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
150 return 0;
151 }
152 A_QA[ k - n - 1 ] = ( opus_int32 )tmp64;
153 }
154 }
155
156 /* Check for stability */
157 if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
158 return 0;
159 }
160
161 max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) );
162 min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) );
163 max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) );
164 min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) );
165 max = vget_lane_s32( max_s32x2, 0 );
166 min = vget_lane_s32( min_s32x2, 0 );
167 if( ( max > 0 ) || ( min < -1 ) ) {
168 return 0;
169 }
170
171 /* Set RC equal to negated AR coef */
172 rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA );
173
174 /* Range: [ 1 : 2^30 ] */
175 rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
176
177 /* Update inverse gain */
178 /* Range: [ 0 : 2^30 ] */
179 invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
180 silk_assert( invGain_Q30 >= 0 );
181 silk_assert( invGain_Q30 <= ( 1 << 30 ) );
182 if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
183 return 0;
184 }
185
186 return invGain_Q30;
187}
188
189/* For input in Q12 domain */
190opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
191 const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
192 const opus_int order /* I Prediction order */
193)
194{
195#ifdef OPUS_CHECK_ASM
196 const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order );
197#endif
198
199 opus_int32 invGain_Q30;
200 if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 )) {
201 invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order );
202 }
203 else {
204 opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ];
205 opus_int32 DC_resp;
206 int16x8_t t0_s16x8, t1_s16x8, t2_s16x8;
207 int32x4_t t0_s32x4;
208 const opus_int leftover = order & 7;
209
210 /* Increase Q domain of the AR coefficients */
211 t0_s16x8 = vld1q_s16( A_Q12 + 0 );
212 t1_s16x8 = vld1q_s16( A_Q12 + 8 );
213 t2_s16x8 = vld1q_s16( A_Q12 + 16 );
214 t0_s32x4 = vpaddlq_s16( t0_s16x8 );
215
216 switch( order - leftover )
217 {
218 case 24:
219 t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 );
220 /* FALLTHROUGH */
221
222 case 16:
223 t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 );
224 vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
225 vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
226 /* FALLTHROUGH */
227
228 case 8:
229 {
230 const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) );
231 const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 );
232 DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 );
233 vst1q_s32( Atmp_QA + 8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
234 vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
235 }
236 break;
237
238 default:
239 DC_resp = 0;
240 break;
241 }
242 A_Q12 += order - leftover;
243
244 switch( leftover )
245 {
246 case 6:
247 DC_resp += (opus_int32)A_Q12[ 5 ];
248 DC_resp += (opus_int32)A_Q12[ 4 ];
249 /* FALLTHROUGH */
250
251 case 4:
252 DC_resp += (opus_int32)A_Q12[ 3 ];
253 DC_resp += (opus_int32)A_Q12[ 2 ];
254 /* FALLTHROUGH */
255
256 case 2:
257 DC_resp += (opus_int32)A_Q12[ 1 ];
258 DC_resp += (opus_int32)A_Q12[ 0 ];
259 /* FALLTHROUGH */
260
261 default:
262 break;
263 }
264
265 /* If the DC is unstable, we don't even need to do the full calculations */
266 if( DC_resp >= 4096 ) {
267 invGain_Q30 = 0;
268 } else {
269 vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) );
270 vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
271 invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
272 }
273 }
274
275#ifdef OPUS_CHECK_ASM
276 silk_assert( invGain_Q30_c == invGain_Q30 );
277#endif
278
279 return invGain_Q30;
280}
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_arm.h b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_arm.h
new file mode 100644
index 0000000000..9e76e16927
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_arm.h
@@ -0,0 +1,100 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifndef SILK_NSQ_DEL_DEC_ARM_H
29#define SILK_NSQ_DEL_DEC_ARM_H
30
31#include "celt/arm/armcpu.h"
32
33#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
34void silk_NSQ_del_dec_neon(
35 const silk_encoder_state *psEncC, silk_nsq_state *NSQ,
36 SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[],
37 const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],
38 const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],
39 const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],
40 const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],
41 const opus_int Tilt_Q14[MAX_NB_SUBFR],
42 const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],
43 const opus_int32 Gains_Q16[MAX_NB_SUBFR],
44 const opus_int pitchL[MAX_NB_SUBFR], const opus_int Lambda_Q10,
45 const opus_int LTP_scale_Q14);
46
47#if !defined(OPUS_HAVE_RTCD)
48#define OVERRIDE_silk_NSQ_del_dec (1)
49#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, \
50 LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, \
51 LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, \
52 LTP_scale_Q14, arch) \
53 ((void)(arch), \
54 PRESUME_NEON(silk_NSQ_del_dec)( \
55 psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, \
56 AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, \
57 Lambda_Q10, LTP_scale_Q14))
58#endif
59#endif
60
61#if !defined(OVERRIDE_silk_NSQ_del_dec)
62/*Is run-time CPU detection enabled on this platform?*/
63#if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
64 !defined(OPUS_ARM_PRESUME_NEON_INTR))
65extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
66 const silk_encoder_state *psEncC, silk_nsq_state *NSQ,
67 SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[],
68 const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],
69 const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],
70 const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],
71 const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],
72 const opus_int Tilt_Q14[MAX_NB_SUBFR],
73 const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],
74 const opus_int32 Gains_Q16[MAX_NB_SUBFR],
75 const opus_int pitchL[MAX_NB_SUBFR], const opus_int Lambda_Q10,
76 const opus_int LTP_scale_Q14);
77#define OVERRIDE_silk_NSQ_del_dec (1)
78#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, \
79 LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, \
80 LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, \
81 LTP_scale_Q14, arch) \
82 ((*SILK_NSQ_DEL_DEC_IMPL[(arch)&OPUS_ARCHMASK])( \
83 psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, \
84 AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, \
85 Lambda_Q10, LTP_scale_Q14))
86#elif defined(OPUS_ARM_PRESUME_NEON_INTR)
87#define OVERRIDE_silk_NSQ_del_dec (1)
88#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, \
89 LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, \
90 LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, \
91 LTP_scale_Q14, arch) \
92 ((void)(arch), \
93 silk_NSQ_del_dec_neon(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, \
94 LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, \
95 LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, \
96 LTP_scale_Q14))
97#endif
98#endif
99
100#endif /* end SILK_NSQ_DEL_DEC_ARM_H */
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_neon_intr.c b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_neon_intr.c
new file mode 100644
index 0000000000..212410f362
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_del_dec_neon_intr.c
@@ -0,0 +1,1124 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <arm_neon.h>
33#ifdef OPUS_CHECK_ASM
34# include <string.h>
35#endif
36#include "main.h"
37#include "stack_alloc.h"
38
39/* The NEON intrinsics optimization can currently only parallelize up to 4 delay decision states. */
40/* If there are more states, the C function is called, and this optimization must be expanded. */
41#define NEON_MAX_DEL_DEC_STATES 4
42
43typedef struct {
44 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ];
45 opus_int32 RandState[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
46 opus_int32 Q_Q10[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
47 opus_int32 Xq_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
48 opus_int32 Pred_Q15[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
49 opus_int32 Shape_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
50 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ];
51 opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
52 opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
53 opus_int32 Seed[ NEON_MAX_DEL_DEC_STATES ];
54 opus_int32 SeedInit[ NEON_MAX_DEL_DEC_STATES ];
55 opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
56} NSQ_del_decs_struct;
57
58typedef struct {
59 opus_int32 Q_Q10[ NEON_MAX_DEL_DEC_STATES ];
60 opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
61 opus_int32 xq_Q14[ NEON_MAX_DEL_DEC_STATES ];
62 opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
63 opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
64 opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ];
65 opus_int32 LPC_exc_Q14[ NEON_MAX_DEL_DEC_STATES ];
66} NSQ_samples_struct;
67
68static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
69 const silk_encoder_state *psEncC, /* I Encoder State */
70 silk_nsq_state *NSQ, /* I/O NSQ state */
71 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
72 const opus_int16 x16[], /* I Input */
73 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
74 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
75 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
76 opus_int subfr, /* I Subframe number */
77 const opus_int LTP_scale_Q14, /* I LTP state scaling */
78 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
79 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
80 const opus_int signal_type, /* I Signal type */
81 const opus_int decisionDelay /* I Decision delay */
82);
83
84/******************************************/
85/* Noise shape quantizer for one subframe */
86/******************************************/
87static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
88 silk_nsq_state *NSQ, /* I/O NSQ state */
89 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
90 opus_int signalType, /* I Signal type */
91 const opus_int32 x_Q10[], /* I */
92 opus_int8 pulses[], /* O */
93 opus_int16 xq[], /* O */
94 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
95 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
96 const opus_int16 a_Q12[], /* I Short term prediction coefs */
97 const opus_int16 b_Q14[], /* I Long term prediction coefs */
98 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
99 opus_int lag, /* I Pitch lag */
100 opus_int32 HarmShapeFIRPacked_Q14, /* I */
101 opus_int Tilt_Q14, /* I Spectral tilt */
102 opus_int32 LF_shp_Q14, /* I */
103 opus_int32 Gain_Q16, /* I */
104 opus_int Lambda_Q10, /* I */
105 opus_int offset_Q10, /* I */
106 opus_int length, /* I Input length */
107 opus_int subfr, /* I Subframe number */
108 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
109 opus_int predictLPCOrder, /* I Prediction filter order */
110 opus_int warping_Q16, /* I */
111 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
112 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
113 opus_int decisionDelay /* I */
114);
115
116static OPUS_INLINE void copy_winner_state_kernel(
117 const NSQ_del_decs_struct *psDelDec,
118 const opus_int offset,
119 const opus_int last_smple_idx,
120 const opus_int Winner_ind,
121 const int32x2_t gain_lo_s32x2,
122 const int32x2_t gain_hi_s32x2,
123 const int32x4_t shift_s32x4,
124 int32x4_t t0_s32x4,
125 int32x4_t t1_s32x4,
126 opus_int8 *const pulses,
127 opus_int16 *pxq,
128 silk_nsq_state *NSQ
129)
130{
131 int16x8_t t_s16x8;
132 int32x4_t o0_s32x4, o1_s32x4;
133
134 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
135 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
136 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
137 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
138 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
139 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
140 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
141 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
142 t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
143 vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );
144
145 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
146 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
147 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
148 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
149 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
150 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
151 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
152 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
153 o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
154 o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
155 o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
156 o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
157 o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
158 o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
159 vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
160 vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );
161
162 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
163 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
164 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
165 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
166 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
167 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
168 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
169 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
170 vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
171 vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
172}
173
174static OPUS_INLINE void copy_winner_state(
175 const NSQ_del_decs_struct *psDelDec,
176 const opus_int decisionDelay,
177 const opus_int smpl_buf_idx,
178 const opus_int Winner_ind,
179 const opus_int32 gain,
180 const opus_int32 shift,
181 opus_int8 *const pulses,
182 opus_int16 *pxq,
183 silk_nsq_state *NSQ
184)
185{
186 opus_int i, last_smple_idx;
187 const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
188 const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
189 const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
190 int32x4_t t0_s32x4, t1_s32x4;
191
192 t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
193 last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
194 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
195 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
196
197 for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
198 copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
199 }
200 for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
201 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
202 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
203 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
204 }
205
206 last_smple_idx += DECISION_DELAY;
207 for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) {
208 copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
209 }
210 for( ; i < decisionDelay; i++, last_smple_idx-- ) {
211 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
212 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
213 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
214 }
215}
216
217void silk_NSQ_del_dec_neon(
218 const silk_encoder_state *psEncC, /* I Encoder State */
219 silk_nsq_state *NSQ, /* I/O NSQ state */
220 SideInfoIndices *psIndices, /* I/O Quantization Indices */
221 const opus_int16 x16[], /* I Input */
222 opus_int8 pulses[], /* O Quantized pulse signal */
223 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
224 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
225 const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
226 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
227 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
228 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
229 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
230 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
231 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
232 const opus_int LTP_scale_Q14 /* I LTP state scaling */
233)
234{
235#ifdef OPUS_CHECK_ASM
236 silk_nsq_state NSQ_c;
237 SideInfoIndices psIndices_c;
238 opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
239 const opus_int8 *const pulses_a = pulses;
240
241 ( void )pulses_a;
242 silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
243 silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
244 silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
245 silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
246 pitchL, Lambda_Q10, LTP_scale_Q14 );
247#endif
248
249 /* The optimization parallelizes the different delay decision states. */
250 if(( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 )) {
251 /* The NEON intrinsics optimization can currently only parallelize up to 4 delay decision states. */
252 /* If there are more states, the C function is called, and this optimization must be expanded. */
253 /* When the number of delay decision states is less than 3, there are penalties to using this */
254 /* optimization, and the C function is called. */
255 /* When the number of delay decision states is 2, it's better to specialize another */
256 /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority) */
257 silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
258 Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
259 } else {
260 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
261 opus_int smpl_buf_idx, decisionDelay;
262 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
263 opus_int16 *pxq;
264 VARDECL( opus_int32, sLTP_Q15 );
265 VARDECL( opus_int16, sLTP );
266 opus_int32 HarmShapeFIRPacked_Q14;
267 opus_int offset_Q10;
268 opus_int32 RDmin_Q10, Gain_Q10;
269 VARDECL( opus_int32, x_sc_Q10 );
270 VARDECL( opus_int32, delayedGain_Q10 );
271 VARDECL( NSQ_del_decs_struct, psDelDec );
272 int32x4_t t_s32x4;
273 SAVE_STACK;
274
275 /* Set unvoiced lag to the previous one, overwrite later for voiced */
276 lag = NSQ->lagPrev;
277
278 silk_assert( NSQ->prev_gain_Q16 != 0 );
279
280 /* Initialize delayed decision states */
281 ALLOC( psDelDec, 1, NSQ_del_decs_struct );
282 /* Only RandState and RD_Q10 need to be initialized to 0. */
283 silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
284 vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
285
286 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
287 psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
288 }
289 vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
290 vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
291 vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
292 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
293 vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
294 }
295 for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
296 vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
297 }
298
299 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
300 smpl_buf_idx = 0; /* index of oldest samples */
301
302 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
303
304 /* For voiced frames limit the decision delay to lower than the pitch lag */
305 if( psIndices->signalType == TYPE_VOICED ) {
306 opus_int pitch_min = pitchL[ 0 ];
307 for( k = 1; k < psEncC->nb_subfr; k++ ) {
308 pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
309 }
310 decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
311 } else {
312 if( lag > 0 ) {
313 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
314 }
315 }
316
317 if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
318 LSF_interpolation_flag = 0;
319 } else {
320 LSF_interpolation_flag = 1;
321 }
322
323 ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
324 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
325 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
326 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
327 /* Set up pointers to start of sub frame */
328 pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
329 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
330 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
331 subfr = 0;
332 for( k = 0; k < psEncC->nb_subfr; k++ ) {
333 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
334 B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
335 AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
336
337 /* Noise shape parameters */
338 silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
339 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
340 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
341
342 NSQ->rewhite_flag = 0;
343 if( psIndices->signalType == TYPE_VOICED ) {
344 /* Voiced */
345 lag = pitchL[ k ];
346
347 /* Re-whitening */
348 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
349 if( k == 2 ) {
350 /* RESET DELAYED DECISIONS */
351 /* Find winner */
352 int32x4_t RD_Q10_s32x4;
353 RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
354 Winner_ind = 0;
355 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
356 if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
357 RDmin_Q10 = psDelDec->RD_Q10[ i ];
358 Winner_ind = i;
359 }
360 }
361 psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
362 RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
363 RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
364 vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );
365
366 /* Copy final part of signals from winner state to output and long-term filter states */
367 copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );
368
369 subfr = 0;
370 }
371
372 /* Rewhiten with new A coefs */
373 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
374 silk_assert( start_idx > 0 );
375
376 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
377 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
378
379 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
380 NSQ->rewhite_flag = 1;
381 }
382 }
383
384 silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
385 LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
386
387 silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
388 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
389 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
390 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
391
392 x16 += psEncC->subfr_length;
393 pulses += psEncC->subfr_length;
394 pxq += psEncC->subfr_length;
395 }
396
397 /* Find winner */
398 RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
399 Winner_ind = 0;
400 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
401 if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
402 RDmin_Q10 = psDelDec->RD_Q10[ k ];
403 Winner_ind = k;
404 }
405 }
406
407 /* Copy final part of signals from winner state to output and long-term filter states */
408 psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
409 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
410 copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );
411
412 t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
413 for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
414 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
415 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
416 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
417 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
418 vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
419 }
420
421 for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
422 NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
423 }
424
425 for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
426 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
427 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
428 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
429 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
430 vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
431 }
432
433 for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
434 NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
435 }
436
437 /* Update states */
438 NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
439 NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ];
440 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
441
442 /* Save quantized speech signal */
443 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
444 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
445 RESTORE_STACK;
446 }
447
448#ifdef OPUS_CHECK_ASM
449 silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
450 silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
451 silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
452#endif
453}
454
455/******************************************/
456/* Noise shape quantizer for one subframe */
457/******************************************/
458/* Note: The function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a hack written in plain C. */
459/* Therefore we append "_local" to the NEON function name here to avoid confusion. */
460static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
461{
462 int16x8_t t_s16x8;
463 int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
464 silk_assert( order == 10 || order == 16 );
465
466 t_s16x8 = vld1q_s16( in + 0 ); /* 7 6 5 4 3 2 1 0 */
467 t_s16x8 = vrev64q_s16( t_s16x8 ); /* 4 5 6 7 0 1 2 3 */
468 t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* 4 5 6 7 */
469 t3_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 0 1 2 3 */
470
471 if( order == 16 ) {
472 t_s16x8 = vld1q_s16( in + 8 ); /* F E D C B A 9 8 */
473 t_s16x8 = vrev64q_s16( t_s16x8 ); /* C D E F 8 9 A B */
474 t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* C D E F */
475 t1_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 8 9 A B */
476 } else {
477 int16x4_t t_s16x4;
478
479 t0_s32x4 = vdupq_n_s32( 0 ); /* zero zero zero zero */
480 t_s16x4 = vld1_s16( in + 6 ); /* 9 8 7 6 */
481 t_s16x4 = vrev64_s16( t_s16x4 ); /* 6 7 8 9 */
482 t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
483 t1_s32x4 = vcombine_s32( vget_low_s32(t0_s32x4), vget_low_s32( t1_s32x4 ) ); /* 8 9 zero zero */
484 }
485 vst1q_s32( out + 0, t0_s32x4 );
486 vst1q_s32( out + 4, t1_s32x4 );
487 vst1q_s32( out + 8, t2_s32x4 );
488 vst1q_s32( out + 12, t3_s32x4 );
489}
490
491static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
492 const int32x4_t out_s32x4,
493 const int32x4_t in_s32x4,
494 const int32x2_t coef_s32x2
495)
496{
497 return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) );
498}
499
500static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
501 const int32x4_t out_s32x4,
502 const int32x4_t in_s32x4,
503 const int32x2_t coef_s32x2
504)
505{
506 return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
507}
508
509/* Note: This function has a different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
510/* Therefore we append "_local" to the function name here to avoid confusion. */
511static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
512{
513 const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
514 const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
515 const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
516 const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
517 int32x4_t LPC_pred_Q14_s32x4;
518
519 silk_assert( order == 10 || order == 16 );
520 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
521 LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
522 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
523 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
524 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
525 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
526 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
527 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
528 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
529 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
530 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
531 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
532 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
533 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
534 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
535 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
536 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
537 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
538
539 return LPC_pred_Q14_s32x4;
540}
541
542static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
543 silk_nsq_state *NSQ, /* I/O NSQ state */
544 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
545 opus_int signalType, /* I Signal type */
546 const opus_int32 x_Q10[], /* I */
547 opus_int8 pulses[], /* O */
548 opus_int16 xq[], /* O */
549 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
550 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
551 const opus_int16 a_Q12[], /* I Short term prediction coefs */
552 const opus_int16 b_Q14[], /* I Long term prediction coefs */
553 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
554 opus_int lag, /* I Pitch lag */
555 opus_int32 HarmShapeFIRPacked_Q14, /* I */
556 opus_int Tilt_Q14, /* I Spectral tilt */
557 opus_int32 LF_shp_Q14, /* I */
558 opus_int32 Gain_Q16, /* I */
559 opus_int Lambda_Q10, /* I */
560 opus_int offset_Q10, /* I */
561 opus_int length, /* I Input length */
562 opus_int subfr, /* I Subframe number */
563 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
564 opus_int predictLPCOrder, /* I Prediction filter order */
565 opus_int warping_Q16, /* I */
566 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
567 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
568 opus_int decisionDelay /* I */
569)
570{
571 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
572 opus_int32 Winner_rand_state;
573 opus_int32 LTP_pred_Q14, n_LTP_Q14;
574 opus_int32 RDmin_Q10, RDmax_Q10;
575 opus_int32 Gain_Q10;
576 opus_int32 *pred_lag_ptr, *shp_lag_ptr;
577 opus_int32 a_Q12_arch[MAX_LPC_ORDER];
578 const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
579 const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
580 opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
581 const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
582 const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );
583
584 VARDECL( NSQ_samples_struct, psSampleState );
585 SAVE_STACK;
586
587 silk_assert( nStatesDelayedDecision > 0 );
588 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
589 ALLOC( psSampleState, 2, NSQ_samples_struct );
590
591 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
592 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
593 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
594
595 for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
596 const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i );
597 vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) );
598 vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
599 }
600
601 for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
602 AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
603 }
604
605 silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );
606
607 for( i = 0; i < length; i++ ) {
608 int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
609 int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
610 int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
611 int32x2_t AR_shp_Q28_s32x2;
612 int16x4_t r_Q10_s16x4, rr_Q10_s16x4;
613
614 /* Perform common calculations used in all states */
615
616 /* Long-term prediction */
617 if( signalType == TYPE_VOICED ) {
618 /* Unrolled loop */
619 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
620 LTP_pred_Q14 = 2;
621 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] );
622 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
623 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
624 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
625 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
626 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
627 pred_lag_ptr++;
628 } else {
629 LTP_pred_Q14 = 0;
630 }
631
632 /* Long-term shaping */
633 if( lag > 0 ) {
634 /* Symmetric, packed FIR coefficients */
635 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
636 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
637 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
638 shp_lag_ptr++;
639 } else {
640 n_LTP_Q14 = 0;
641 }
642
643 /* Generate dither */
644 Seed_s32x4 = vld1q_s32( psDelDec->Seed );
645 Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
646 vst1q_s32( psDelDec->Seed, Seed_s32x4 );
647
648 /* Short-term prediction */
649 LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local(psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder);
650 LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */
651
652 /* Noise shape feedback */
653 /* Output of lowpass section */
654 tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
655 /* Output of allpass section */
656 tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
657 tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
658 vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
659 AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
660 n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
661
662 /* Loop over allpass sections */
663 for( j = 2; j < shapingLPCOrder; j += 2 ) {
664 /* Output of allpass section */
665 tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
666 tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
667 vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
668 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
669 /* Output of allpass section */
670 tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
671 tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
672 vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
673 AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
674 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
675 }
676 vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
677 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
678 n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */
679 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */
680 n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */
681 n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */
682 n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16 , 15 ) ) ); /* Q12 */
683 n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 */
684
685 /* Input minus prediction plus noise feedback */
686 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
687 tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 ); /* Q14 */
688 tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
689 tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 ); /* Q13 */
690 tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 ); /* Q10 */
691 tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 ); /* residual error Q10 */
692
693 /* Flip sign depending on dither */
694 sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
695 tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
696 tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
697 tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
698 tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
699 r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );
700
701 /* Find two quantization level candidates and measure their rate-distortion */
702 {
703 int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
704 int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
705 int16x4_t q2_Q10_s16x4;
706 int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
707 uint32x4_t t_u32x4;
708
709 if( Lambda_Q10 > 2048 ) {
710 /* For aggressive RDO, the bias becomes more than one pulse. */
711 const int rdo_offset = Lambda_Q10/2 - 512;
712 const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
713 const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
714 /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
715 silk_assert( Lambda_Q10 <= 32767 );
716
717 q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
718 q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
719 q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
720 q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
721 }
722 {
723 const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
724 const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
725 const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
726 int16x4_t tmp1_s16x4, tmp2_s16x4;
727
728 q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
729 tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
730 q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
731 q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
732 q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
733 q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
734 q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
735 q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
736 q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
737 tmp1_s16x4 = q1_Q10_s16x4;
738 tmp2_s16x4 = q2_Q10_s16x4;
739 tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
740 tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
741 rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
742 rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
743 }
744
745 rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
746 rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
747 rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );
748
749 rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
750 rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
751 rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );
752
753 tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
754 tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
755 tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
756 vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
757 vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
758 t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
759 tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
760 tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
761 vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
762 vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
763 }
764
765 {
766 /* Update states for best quantization */
767 int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;
768
769 /* Quantized excitation */
770 exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
771 exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
772 exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );
773
774 /* Add predictions */
775 LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
776 xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );
777
778 /* Update states */
779 tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
780 vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
781 sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
782 vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
783 vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
784 vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
785 vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );
786
787 /* Quantized excitation */
788 exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
789 exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
790 exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );
791
792 /* Add predictions */
793 LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
794 xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );
795
796 /* Update states */
797 tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
798 vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
799 sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
800 vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
801 vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
802 vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
803 vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
804 }
805
806 *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1);
807 last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
808 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
809 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
810
811 /* Find winner */
812 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
813 Winner_ind = 0;
814 for( k = 1; k < nStatesDelayedDecision; k++ ) {
815 if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
816 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
817 Winner_ind = k;
818 }
819 }
820
821 /* Increase RD values of expired states */
822 {
823 uint32x4_t t_u32x4;
824 Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
825 t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
826 t_u32x4 = vmvnq_u32( t_u32x4 );
827 t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
828 tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
829 tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
830 tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
831 tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
832 vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
833 vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
834
835 /* Find worst in first set and best in second set */
836 RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
837 RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
838 RDmax_ind = 0;
839 RDmin_ind = 0;
840 for( k = 1; k < nStatesDelayedDecision; k++ ) {
841 /* find worst in first set */
842 if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
843 RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
844 RDmax_ind = k;
845 }
846 /* find best in second set */
847 if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
848 RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
849 RDmin_ind = k;
850 }
851 }
852 }
853
854 /* Replace a state if best from second set outperforms worst in first set */
855 if( RDmin_Q10 < RDmax_Q10 ) {
856 opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
857 const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
858 / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
859            /* Only ( predictLPCOrder - 1 ) entries of the sLPC_Q14 buffer need to be updated, though the first few */
860            /* unused sLPC_Q14[] entries will differ from the C version when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
861            /* For simplicity, ( NSQ_LPC_BUF_LENGTH - 1 ) entries are always updated here. */
862 for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
863 psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
864 }
865 for( j = 0; j < numOthers; j++ ) {
866 ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
867 }
868
869 psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
870 psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
871 psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
872 psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
873 psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
874 psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
875 psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
876 }
877
878 /* Write samples from winner to output and long-term filter states */
879 if( subfr > 0 || i >= decisionDelay ) {
880 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
881 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
882 silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
883 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
884 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ];
885 }
886 NSQ->sLTP_shp_buf_idx++;
887 NSQ->sLTP_buf_idx++;
888
889 /* Update states */
890 vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
891 vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
892 vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
893 vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
894 tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
895 vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
896 vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
897 vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
898 tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
899 tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
900 vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
901 vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
902 vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
903 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
904 }
905 /* Update LPC states */
906 silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
907
908 RESTORE_STACK;
909}
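
The "Increase RD values of expired states" step above turns a vector comparison into an arithmetic penalty: vceqq_s32()/vmvnq_u32()/vshrq_n_u32() yield 0 for states whose random-seed history still matches the winner's and 0x07FFFFFF otherwise, and that value is added to both RD sets so mismatching states lose the next winner search. A hedged scalar sketch of the same idea, written out per state (illustrative only, not code from the patch):

    for( k = 0; k < nStatesDelayedDecision; k++ ) {
        /* all-ones mask when this state's seed history matches the winner's */
        opus_uint32 match   = ( psDelDec->RandState[ last_smple_idx ][ k ] == Winner_rand_state ) ? 0xFFFFFFFFu : 0u;
        opus_int32  penalty = (opus_int32)( ( ~match ) >> 5 );   /* 0 if matching, 0x07FFFFFF otherwise */
        psSampleState[ 0 ].RD_Q10[ k ] += penalty;
        psSampleState[ 1 ].RD_Q10[ k ] += penalty;
    }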
910
911static OPUS_INLINE void silk_SMULWB_8_neon(
912 const opus_int16 *a,
913 const int32x2_t b,
914 opus_int32 *o
915)
916{
917 const int16x8_t a_s16x8 = vld1q_s16( a );
918 int32x4_t o0_s32x4, o1_s32x4;
919
920 o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 );
921 o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 );
922 o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 );
923 o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 );
924 vst1q_s32( o, o0_s32x4 );
925 vst1q_s32( o + 4, o1_s32x4 );
926}
927
928/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
929static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
930 opus_int32 *a,
931 const int32x2_t b_s32x2)
932{
933 int32x4_t o_s32x4;
934
935 o_s32x4 = vld1q_s32( a );
936 o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 );
937 vst1q_s32( a, o_s32x4 );
938}
939
940/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
941static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
942 opus_int32 *a,
943 const int32x2_t b_s32x2
944)
945{
946 int32x4_t o0_s32x4, o1_s32x4;
947
948 o0_s32x4 = vld1q_s32( a );
949 o1_s32x4 = vld1q_s32( a + 4 );
950 o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
951 o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
952 vst1q_s32( a, o0_s32x4 );
953 vst1q_s32( a + 4, o1_s32x4 );
954}
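
The two small-b helpers above get silk_SMULWW() out of a single vqdmulhq_lane_s32(): the caller pre-shifts the multiplier left by 15, and the doubling high-half multiply then returns ( a * b ) >> 16. The pre-shift only fits in 32 bits when -65536 <= b < 65536, which is the restriction stated in the comments. A hedged scalar model of the identity (the helper name is illustrative, not part of the patch):

    static OPUS_INLINE opus_int32 smulww_small_b_model( opus_int32 a, opus_int32 b )
    {
        opus_int32 b_q15 = silk_LSHIFT32( b, 15 );                       /* fits only for -65536 <= b < 65536 */
        /* vqdmulh keeps the high half of the doubled product: ( 2 * a * b_q15 ) >> 32 == ( a * b ) >> 16 */
        return (opus_int32)( ( (opus_int64)a * b_q15 ) >> 31 );
    }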
955
956static OPUS_INLINE void silk_SMULWW_4_neon(
957 opus_int32 *a,
958 const int32x2_t b_s32x2)
959{
960 int32x4_t a_s32x4, o_s32x4;
961
962 a_s32x4 = vld1q_s32( a );
963 o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 );
964 o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 );
965 vst1q_s32( a, o_s32x4 );
966}
967
968static OPUS_INLINE void silk_SMULWW_8_neon(
969 opus_int32 *a,
970 const int32x2_t b_s32x2
971)
972{
973 int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4;
974
975 a0_s32x4 = vld1q_s32( a );
976 a1_s32x4 = vld1q_s32( a + 4 );
977 o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 );
978 o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 );
979 o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 );
980 o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 );
981 vst1q_s32( a, o0_s32x4 );
982 vst1q_s32( a + 4, o1_s32x4 );
983}
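
For an arbitrary 32-bit multiplier, silk_SMULWW_4_neon() and silk_SMULWW_8_neon() expect b split across the two lanes of b_s32x2; silk_nsq_del_dec_scale_states_neon() below fills lane 0 with ( b & 0xFFFF ) << 15 and lane 1 with b >> 16. The vqdmulhq step then contributes the fractional part ( a * ( b & 0xFFFF ) ) >> 16 and the vmlaq step adds the integer part a * ( b >> 16 ). A hedged scalar model of that decomposition (illustrative name only):

    static OPUS_INLINE opus_int32 smulww_split_model( opus_int32 a, opus_int32 b )
    {
        opus_int32 lo   = silk_LSHIFT32( b & 0x0000FFFF, 15 );              /* lane 0 */
        opus_int32 hi   = silk_RSHIFT( b, 16 );                             /* lane 1 */
        opus_int32 frac = (opus_int32)( ( (opus_int64)a * lo ) >> 31 );     /* vqdmulhq part: ( a * lo16 ) >> 16 */
        return frac + a * hi;                                               /* vmlaq part */
    }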
984
985static OPUS_INLINE void silk_SMULWW_loop_neon(
986 const opus_int16 *a,
987 const opus_int32 b,
988 opus_int32 *o,
989 const opus_int loop_num
990)
991{
992 opus_int i;
993 int32x2_t b_s32x2;
994
995 b_s32x2 = vdup_n_s32( b );
996 for( i = 0; i < loop_num - 7; i += 8 ) {
997 silk_SMULWB_8_neon( a + i, b_s32x2, o + i );
998 }
999 for( ; i < loop_num; i++ ) {
1000 o[ i ] = silk_SMULWW( a[ i ], b );
1001 }
1002}
1003
1004static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
1005 const silk_encoder_state *psEncC, /* I Encoder State */
1006 silk_nsq_state *NSQ, /* I/O NSQ state */
1007 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
1008 const opus_int16 x16[], /* I Input */
1009 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
1010 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
1011 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
1012 opus_int subfr, /* I Subframe number */
1013 const opus_int LTP_scale_Q14, /* I LTP state scaling */
1014 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
1015 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
1016 const opus_int signal_type, /* I Signal type */
1017 const opus_int decisionDelay /* I Decision delay */
1018)
1019{
1020 opus_int i, lag;
1021 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
1022
1023 lag = pitchL[ subfr ];
1024 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
1025 silk_assert( inv_gain_Q31 != 0 );
1026
1027 /* Scale input */
1028 inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
1029 silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );
1030
1031 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
1032 if( NSQ->rewhite_flag ) {
1033 if( subfr == 0 ) {
1034 /* Do LTP downscaling */
1035 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
1036 }
1037 silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
1038 }
1039
1040 /* Adjust for changing gain */
1041 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
1042 int32x2_t gain_adj_Q16_s32x2;
1043 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
1044
1045 /* Scale long-term shaping state */
1046 if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
1047 gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
1048 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
1049 silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
1050 }
1051 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
1052 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
1053 }
1054
1055 /* Scale long-term prediction state */
1056 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
1057 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
1058 silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
1059 }
1060 for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
1061 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
1062 }
1063 }
1064
1065 /* Scale scalar states */
1066 silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
1067 silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );
1068
1069 /* Scale short-term prediction and shaping states */
1070 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
1071 silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
1072 }
1073
1074 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
1075 silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
1076 }
1077
1078 for( i = 0; i < DECISION_DELAY; i++ ) {
1079 silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
1080 silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
1081 }
1082 } else {
1083 gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
1084 gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
1085 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
1086 silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
1087 }
1088 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
1089 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
1090 }
1091
1092 /* Scale long-term prediction state */
1093 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
1094 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
1095 silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
1096 }
1097 for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
1098 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
1099 }
1100 }
1101
1102 /* Scale scalar states */
1103 silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
1104 silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );
1105
1106 /* Scale short-term prediction and shaping states */
1107 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
1108 silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
1109 }
1110
1111 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
1112 silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
1113 }
1114
1115 for( i = 0; i < DECISION_DELAY; i++ ) {
1116 silk_SMULWW_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
1117 silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
1118 }
1119 }
1120
1121 /* Save inverse gain */
1122 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
1123 }
1124}
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.c b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.c
new file mode 100644
index 0000000000..9642529973
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.c
@@ -0,0 +1,112 @@
1/***********************************************************************
2Copyright (C) 2014 Vidyo
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27#ifdef HAVE_CONFIG_H
28#include "config.h"
29#endif
30
31#include <arm_neon.h>
32#include "main.h"
33#include "stack_alloc.h"
34#include "NSQ.h"
35#include "celt/cpu_support.h"
36#include "celt/arm/armcpu.h"
37
38opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32 *buf32, const opus_int32 *coef32, opus_int order)
39{
40 int32x4_t coef0 = vld1q_s32(coef32);
41 int32x4_t coef1 = vld1q_s32(coef32 + 4);
42 int32x4_t coef2 = vld1q_s32(coef32 + 8);
43 int32x4_t coef3 = vld1q_s32(coef32 + 12);
44
45 int32x4_t a0 = vld1q_s32(buf32 - 15);
46 int32x4_t a1 = vld1q_s32(buf32 - 11);
47 int32x4_t a2 = vld1q_s32(buf32 - 7);
48 int32x4_t a3 = vld1q_s32(buf32 - 3);
49
50 int32x4_t b0 = vqdmulhq_s32(coef0, a0);
51 int32x4_t b1 = vqdmulhq_s32(coef1, a1);
52 int32x4_t b2 = vqdmulhq_s32(coef2, a2);
53 int32x4_t b3 = vqdmulhq_s32(coef3, a3);
54
55 int32x4_t c0 = vaddq_s32(b0, b1);
56 int32x4_t c1 = vaddq_s32(b2, b3);
57
58 int32x4_t d = vaddq_s32(c0, c1);
59
60 int64x2_t e = vpaddlq_s32(d);
61
62 int64x1_t f = vadd_s64(vget_low_s64(e), vget_high_s64(e));
63
64 opus_int32 out = vget_lane_s32(vreinterpret_s32_s64(f), 0);
65
66 out += silk_RSHIFT( order, 1 );
67
68 return out;
69}
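
The coef32 vector is prepared by silk_short_prediction_create_arch_coef_neon() (declared in NSQ_neon.h in this patch), which reverses the Q12 coefficients and pre-shifts each one left by 15. With that layout, each vqdmulhq_s32() above produces the same ( sample * coef_Q12 ) >> 16 product that silk_SMLAWB() accumulates in the scalar code. A hedged per-tap model (illustrative name only):

    static OPUS_INLINE opus_int32 short_prediction_tap_model( opus_int32 buf_sample, opus_int16 coef_Q12 )
    {
        opus_int32 coef32 = silk_LSHIFT32( (opus_int32)coef_Q12, 15 );      /* as built in NSQ_neon.h */
        /* vqdmulh: ( 2 * buf * coef32 ) >> 32 == ( buf * coef_Q12 ) >> 16 */
        return (opus_int32)( ( (opus_int64)buf_sample * coef32 ) >> 31 );
    }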
70
71
72opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef, opus_int order)
73{
74 opus_int32 out;
75 if (order == 8)
76 {
77 int32x4_t a00 = vdupq_n_s32(data0[0]);
78 int32x4_t a01 = vld1q_s32(data1); /* data1[0] ... [3] */
79
80 int32x4_t a0 = vextq_s32 (a00, a01, 3); /* data0[0] data1[0] ...[2] */
81 int32x4_t a1 = vld1q_s32(data1 + 3); /* data1[3] ... [6] */
82
83 /*TODO: Convert these once in advance instead of once per sample, like
84 silk_noise_shape_quantizer_short_prediction_neon() does.*/
85 int16x8_t coef16 = vld1q_s16(coef);
86 int32x4_t coef0 = vmovl_s16(vget_low_s16(coef16));
87 int32x4_t coef1 = vmovl_s16(vget_high_s16(coef16));
88
89 /*This is not bit-exact with the C version, since we do not drop the
90 lower 16 bits of each multiply, but wait until the end to truncate
91 precision. This is an encoder-specific calculation (and unlike
92 silk_noise_shape_quantizer_short_prediction_neon(), is not meant to
93 simulate what the decoder will do). We still could use vqdmulhq_s32()
94 like silk_noise_shape_quantizer_short_prediction_neon() and save
95 half the multiplies, but the speed difference is not large, since we
96 then need two extra adds.*/
97 int64x2_t b0 = vmull_s32(vget_low_s32(a0), vget_low_s32(coef0));
98 int64x2_t b1 = vmlal_s32(b0, vget_high_s32(a0), vget_high_s32(coef0));
99 int64x2_t b2 = vmlal_s32(b1, vget_low_s32(a1), vget_low_s32(coef1));
100 int64x2_t b3 = vmlal_s32(b2, vget_high_s32(a1), vget_high_s32(coef1));
101
102 int64x1_t c = vadd_s64(vget_low_s64(b3), vget_high_s64(b3));
103 int64x1_t cS = vrshr_n_s64(c, 15);
104 int32x2_t d = vreinterpret_s32_s64(cS);
105
106 out = vget_lane_s32(d, 0);
107 vst1q_s32(data1, a0);
108 vst1q_s32(data1 + 4, a1);
109 return out;
110 }
111 return silk_NSQ_noise_shape_feedback_loop_c(data0, data1, coef, order);
112}
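
The comment above the 64-bit accumulation explains the trade-off: the scalar path truncates each product before summing, while this path keeps full 64-bit products and rounds once at the end, so the two are not bit-exact. A toy illustration of why the order matters (generic values, not the actual Q formats):

    /* truncating each term:  ( 3 * 5 >> 4 ) + ( 3 * 5 >> 4 )       == 0 + 0  == 0 */
    /* rounding the sum once: ( 3 * 5 + 3 * 5 + ( 1 << 3 ) ) >> 4   == 38 >> 4 == 2 */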
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.h b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.h
new file mode 100644
index 0000000000..b31d9442d6
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/NSQ_neon.h
@@ -0,0 +1,114 @@
1/***********************************************************************
2Copyright (C) 2014 Vidyo
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27#ifndef SILK_NSQ_NEON_H
28#define SILK_NSQ_NEON_H
29
30#include "cpu_support.h"
31#include "SigProc_FIX.h"
32
33#undef silk_short_prediction_create_arch_coef
34/* For vectorized calc, reverse a_Q12 coefs, convert to 32-bit, and shift for vqdmulhq_s32. */
35static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon(opus_int32 *out, const opus_int16 *in, opus_int order)
36{
37 out[15] = silk_LSHIFT32(in[0], 15);
38 out[14] = silk_LSHIFT32(in[1], 15);
39 out[13] = silk_LSHIFT32(in[2], 15);
40 out[12] = silk_LSHIFT32(in[3], 15);
41 out[11] = silk_LSHIFT32(in[4], 15);
42 out[10] = silk_LSHIFT32(in[5], 15);
43 out[9] = silk_LSHIFT32(in[6], 15);
44 out[8] = silk_LSHIFT32(in[7], 15);
45 out[7] = silk_LSHIFT32(in[8], 15);
46 out[6] = silk_LSHIFT32(in[9], 15);
47
48 if (order == 16)
49 {
50 out[5] = silk_LSHIFT32(in[10], 15);
51 out[4] = silk_LSHIFT32(in[11], 15);
52 out[3] = silk_LSHIFT32(in[12], 15);
53 out[2] = silk_LSHIFT32(in[13], 15);
54 out[1] = silk_LSHIFT32(in[14], 15);
55 out[0] = silk_LSHIFT32(in[15], 15);
56 }
57 else
58 {
59 out[5] = 0;
60 out[4] = 0;
61 out[3] = 0;
62 out[2] = 0;
63 out[1] = 0;
64 out[0] = 0;
65 }
66}
67
68#if defined(OPUS_ARM_PRESUME_NEON_INTR)
69
70#define silk_short_prediction_create_arch_coef(out, in, order) \
71 (silk_short_prediction_create_arch_coef_neon(out, in, order))
72
73#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
74
75#define silk_short_prediction_create_arch_coef(out, in, order) \
76 do { if (arch == OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0)
77
78#endif
79
80opus_int32 silk_noise_shape_quantizer_short_prediction_neon(const opus_int32 *buf32, const opus_int32 *coef32, opus_int order);
81
82opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef, opus_int order);
83
84#if defined(OPUS_ARM_PRESUME_NEON_INTR)
85#undef silk_noise_shape_quantizer_short_prediction
86#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch) \
87 ((void)arch,silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order))
88
89#undef silk_NSQ_noise_shape_feedback_loop
90#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) ((void)arch,silk_NSQ_noise_shape_feedback_loop_neon(data0, data1, coef, order))
91
92#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
93
94/* The silk_noise_shape_quantizer_short_prediction implementations take different parameters depending on the arch
95   (coef vs. coefRev), so they cannot use the usual IMPL table dispatch. */
96#undef silk_noise_shape_quantizer_short_prediction
97#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch) \
98 (arch == OPUS_ARCH_ARM_NEON ? \
99 silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) : \
100 silk_noise_shape_quantizer_short_prediction_c(in, coef, order))
101
102extern opus_int32
103 (*const SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_IMPL[OPUS_ARCHMASK+1])(
104 const opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef,
105 opus_int order);
106
107#undef silk_NSQ_noise_shape_feedback_loop
108#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) \
109 (SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_IMPL[(arch)&OPUS_ARCHMASK](data0, data1, \
110 coef, order))
111
112#endif
113
114#endif /* SILK_NSQ_NEON_H */
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/arm_silk_map.c b/lib/rbcodec/codecs/libopus/silk/arm/arm_silk_map.c
new file mode 100644
index 0000000000..0b9bfec2ca
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/arm_silk_map.c
@@ -0,0 +1,123 @@
1/***********************************************************************
2Copyright (C) 2014 Vidyo
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27#ifdef HAVE_CONFIG_H
28# include "config.h"
29#endif
30
31#include "main_FIX.h"
32#include "NSQ.h"
33#include "SigProc_FIX.h"
34
35#if defined(OPUS_HAVE_RTCD)
36
37# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
38 !defined(OPUS_ARM_PRESUME_NEON_INTR))
39
40void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])(
41 const opus_int16 *in, /* I input signal */
42 const opus_int32 *B_Q28, /* I MA coefficients [3] */
43 const opus_int32 *A_Q28, /* I AR coefficients [2] */
44 opus_int32 *S, /* I/O State vector [4] */
45 opus_int16 *out, /* O output signal */
46 const opus_int32 len /* I signal length (must be even) */
47) = {
48 silk_biquad_alt_stride2_c, /* ARMv4 */
49 silk_biquad_alt_stride2_c, /* EDSP */
50 silk_biquad_alt_stride2_c, /* Media */
51 silk_biquad_alt_stride2_neon, /* Neon */
52};
53
54opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */
55 const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
56 const opus_int order /* I Prediction order */
57) = {
58 silk_LPC_inverse_pred_gain_c, /* ARMv4 */
59 silk_LPC_inverse_pred_gain_c, /* EDSP */
60 silk_LPC_inverse_pred_gain_c, /* Media */
61 silk_LPC_inverse_pred_gain_neon, /* Neon */
62};
63
64void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
65 const silk_encoder_state *psEncC, /* I Encoder State */
66 silk_nsq_state *NSQ, /* I/O NSQ state */
67 SideInfoIndices *psIndices, /* I/O Quantization Indices */
68 const opus_int16 x16[], /* I Input */
69 opus_int8 pulses[], /* O Quantized pulse signal */
70 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
71 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
72 const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
73 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
74 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
75 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
76 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
77 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
78 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
79 const opus_int LTP_scale_Q14 /* I LTP state scaling */
80) = {
81 silk_NSQ_del_dec_c, /* ARMv4 */
82 silk_NSQ_del_dec_c, /* EDSP */
83 silk_NSQ_del_dec_c, /* Media */
84 silk_NSQ_del_dec_neon, /* Neon */
85};
86
87/*There is no table for silk_noise_shape_quantizer_short_prediction because the
88 NEON version takes different parameters than the C version.
89 Instead RTCD is done via if statements at the call sites.
90 See NSQ_neon.h for details.*/
91
92opus_int32
93 (*const SILK_NSQ_NOISE_SHAPE_FEEDBACK_LOOP_IMPL[OPUS_ARCHMASK+1])(
94 const opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef,
95 opus_int order) = {
96 silk_NSQ_noise_shape_feedback_loop_c, /* ARMv4 */
97 silk_NSQ_noise_shape_feedback_loop_c, /* EDSP */
98 silk_NSQ_noise_shape_feedback_loop_c, /* Media */
99 silk_NSQ_noise_shape_feedback_loop_neon, /* NEON */
100};
101
102# endif
103
104# if defined(FIXED_POINT) && \
105 defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
106
107void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
108 opus_int32 *corr, /* O Result [order + 1] */
109 opus_int *scale, /* O Scaling of the correlation vector */
110 const opus_int16 *input, /* I Input data to correlate */
111 const opus_int warping_Q16, /* I Warping coefficient */
112 const opus_int length, /* I Length of input */
113 const opus_int order /* I Correlation order (even) */
114) = {
115 silk_warped_autocorrelation_FIX_c, /* ARMv4 */
116 silk_warped_autocorrelation_FIX_c, /* EDSP */
117 silk_warped_autocorrelation_FIX_c, /* Media */
118 silk_warped_autocorrelation_FIX_neon, /* Neon */
119};
120
121# endif
122
123#endif /* OPUS_HAVE_RTCD */
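
Each table above has one entry per architecture tier and is indexed with the arch value detected at run time; the headers added in this patch (NSQ_neon.h, biquad_alt_arm.h) wrap the lookup in macros. A hedged call-site sketch of the dispatch pattern (variable names are illustrative):

    /* arch is the value produced by the run-time CPU detection at initialization */
    opus_int32 inv_gain_Q30 = SILK_LPC_INVERSE_PRED_GAIN_IMPL[ arch & OPUS_ARCHMASK ]( A_Q12, order );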
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_arm.h b/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_arm.h
new file mode 100644
index 0000000000..66ea9f43dd
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_arm.h
@@ -0,0 +1,68 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifndef SILK_BIQUAD_ALT_ARM_H
29# define SILK_BIQUAD_ALT_ARM_H
30
31# include "celt/arm/armcpu.h"
32
33# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
34void silk_biquad_alt_stride2_neon(
35 const opus_int16 *in, /* I input signal */
36 const opus_int32 *B_Q28, /* I MA coefficients [3] */
37 const opus_int32 *A_Q28, /* I AR coefficients [2] */
38 opus_int32 *S, /* I/O State vector [4] */
39 opus_int16 *out, /* O output signal */
40 const opus_int32 len /* I signal length (must be even) */
41);
42
43# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
44# define OVERRIDE_silk_biquad_alt_stride2 (1)
45# define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((void)(arch), PRESUME_NEON(silk_biquad_alt_stride2)(in, B_Q28, A_Q28, S, out, len))
46# endif
47# endif
48
49# if !defined(OVERRIDE_silk_biquad_alt_stride2)
50/*Is run-time CPU detection enabled on this platform?*/
51# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
52extern void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK+1])(
53 const opus_int16 *in, /* I input signal */
54 const opus_int32 *B_Q28, /* I MA coefficients [3] */
55 const opus_int32 *A_Q28, /* I AR coefficients [2] */
56 opus_int32 *S, /* I/O State vector [4] */
57 opus_int16 *out, /* O output signal */
58 const opus_int32 len /* I signal length (must be even) */
59 );
60# define OVERRIDE_silk_biquad_alt_stride2 (1)
61# define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((*SILK_BIQUAD_ALT_STRIDE2_IMPL[(arch)&OPUS_ARCHMASK])(in, B_Q28, A_Q28, S, out, len))
62# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
63# define OVERRIDE_silk_biquad_alt_stride2 (1)
64# define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((void)(arch), silk_biquad_alt_stride2_neon(in, B_Q28, A_Q28, S, out, len))
65# endif
66# endif
67
68#endif /* end SILK_BIQUAD_ALT_ARM_H */
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c b/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c
new file mode 100644
index 0000000000..9715733185
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c
@@ -0,0 +1,156 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <arm_neon.h>
33#ifdef OPUS_CHECK_ASM
34# include <string.h>
35# include "stack_alloc.h"
36#endif
37#include "SigProc_FIX.h"
38
39static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
40{
41 int32x4_t t_s32x4, out32_Q14_s32x4;
42
43 *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 ); /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ) */
44 *S_s32x4 = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0; */
45 *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 ); /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
46 out32_Q14_s32x4 = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} */
47 t_s32x4 = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 ); /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 ) */
48 *S_s32x4 = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 ); /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND(); S{2,3} = silk_RSHIFT_ROUND(); */
49 t_s32x4 = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 ); /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ) */
50 *S_s32x4 = vaddq_s32( *S_s32x4, t_s32x4 ); /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ); */
51 t_s32x4 = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 ); /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} ) */
52 *S_s32x4 = vaddq_s32( *S_s32x4, t_s32x4 ); /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} ); */
53}
54
55void silk_biquad_alt_stride2_neon(
56 const opus_int16 *in, /* I input signal */
57 const opus_int32 *B_Q28, /* I MA coefficients [3] */
58 const opus_int32 *A_Q28, /* I AR coefficients [2] */
59 opus_int32 *S, /* I/O State vector [4] */
60 opus_int16 *out, /* O output signal */
61 const opus_int32 len /* I signal length (must be even) */
62)
63{
64 /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
65 opus_int k = 0;
66 const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
67 const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
68 int16x4_t in_s16x4 = vdup_n_s16( 0 );
69 int16x4_t out_s16x4;
70 int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
71 int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
72 int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;
73
74#ifdef OPUS_CHECK_ASM
75 opus_int32 S_c[ 4 ];
76 VARDECL( opus_int16, out_c );
77 SAVE_STACK;
78 ALLOC( out_c, 2 * len, opus_int16 );
79
80 silk_memcpy( &S_c, S, sizeof( S_c ) );
81 silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
82#endif
83
84 /* Negate A_Q28 values and split in two parts */
85 A_Q28_s32x2 = vld1_s32( A_Q28 );
86 A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
87 A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */
88 A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */
89 A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */
90 A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
91 A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */
92
93 B_Q28_s32x2 = vld1_s32( B_Q28 );
94 t_s32x2 = vld1_s32( B_Q28 + 1 );
95 t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 );
96 t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 );
97 t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 );
98 A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */
99 A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */
100 B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */
101 S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
102 S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
103 S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );
104
105 for( ; k < len - 1; k += 2 ) {
106 int32x4_t in_s32x4[ 2 ], t_s32x4;
107 int32x2_t out32_Q14_s32x2[ 2 ];
108
109 /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
110 in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
111 in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */
112 t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
113 in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */
114 in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */
115 silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
116 silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );
117
118 /* Scale back to Q0 and saturate */
119 out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */
120 out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */
121 out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */
122 vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
123 }
124
125 /* Process leftover. */
126 if( k < len ) {
127 int32x4_t in_s32x4;
128 int32x2_t out32_Q14_s32x2;
129
130 /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
131 in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */
132 in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */
133 in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */
134 t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
135 in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */
136 silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );
137
138 /* Scale back to Q0 and saturate */
139 out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */
140 out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */
141 out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */
142 vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
143 vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
144 }
145
146 vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */
147 vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */
148 vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */
149 vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */
150
151#ifdef OPUS_CHECK_ASM
152 silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
153 silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
154 RESTORE_STACK;
155#endif
156}
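
The inline comments in silk_biquad_alt_stride2_kernel() spell out the scalar recurrence being vectorized: a transposed direct-form II biquad whose negated A_Q28 coefficients are split into 14-bit low and high halves at the top of the function. Reconstructed from those comments, the per-sample update for one channel looks roughly as follows (a hedged sketch with illustrative names, not a quotation of silk_biquad_alt_stride2_c()):

    out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in_val ), 2 );
    S[ 0 ]    = S[ 1 ] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A0_L ), 14 );
    S[ 0 ]    = silk_SMLAWB( S[ 0 ], out32_Q14, A0_U );
    S[ 0 ]    = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in_val );
    S[ 1 ]    = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A1_L ), 14 );
    S[ 1 ]    = silk_SMLAWB( S[ 1 ], out32_Q14, A1_U );
    S[ 1 ]    = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in_val );
    out_val   = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + ( 1 << 14 ) - 1, 14 ) );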
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/macros_arm64.h b/lib/rbcodec/codecs/libopus/silk/arm/macros_arm64.h
new file mode 100644
index 0000000000..ed030413c5
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/macros_arm64.h
@@ -0,0 +1,39 @@
1/***********************************************************************
2Copyright (C) 2015 Vidyo
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifndef SILK_MACROS_ARM64_H
29#define SILK_MACROS_ARM64_H
30
31#include <arm_neon.h>
32
33#undef silk_ADD_SAT32
34#define silk_ADD_SAT32(a, b) (vqadds_s32((a), (b)))
35
36#undef silk_SUB_SAT32
37#define silk_SUB_SAT32(a, b) (vqsubs_s32((a), (b)))
38
39#endif /* SILK_MACROS_ARM64_H */
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/macros_armv4.h b/lib/rbcodec/codecs/libopus/silk/arm/macros_armv4.h
index 3f30e97288..877eb18dd5 100644
--- a/lib/rbcodec/codecs/libopus/silk/arm/macros_armv4.h
+++ b/lib/rbcodec/codecs/libopus/silk/arm/macros_armv4.h
@@ -28,6 +28,11 @@ POSSIBILITY OF SUCH DAMAGE.
28#ifndef SILK_MACROS_ARMv4_H 28#ifndef SILK_MACROS_ARMv4_H
29#define SILK_MACROS_ARMv4_H 29#define SILK_MACROS_ARMv4_H
30 30
31/* This macro only avoids the undefined behaviour from a left shift of
32 a negative value. It should only be used in macros that can't include
33 SigProc_FIX.h. In other cases, use silk_LSHIFT32(). */
34#define SAFE_SHL(a,b) ((opus_int32)((opus_uint32)(a) << (b)))
35
31/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ 36/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
32#undef silk_SMULWB 37#undef silk_SMULWB
33static OPUS_INLINE opus_int32 silk_SMULWB_armv4(opus_int32 a, opus_int16 b) 38static OPUS_INLINE opus_int32 silk_SMULWB_armv4(opus_int32 a, opus_int16 b)
@@ -38,7 +43,7 @@ static OPUS_INLINE opus_int32 silk_SMULWB_armv4(opus_int32 a, opus_int16 b)
38 "#silk_SMULWB\n\t" 43 "#silk_SMULWB\n\t"
39 "smull %0, %1, %2, %3\n\t" 44 "smull %0, %1, %2, %3\n\t"
40 : "=&r"(rd_lo), "=&r"(rd_hi) 45 : "=&r"(rd_lo), "=&r"(rd_hi)
41 : "%r"(a), "r"(b<<16) 46 : "%r"(a), "r"(SAFE_SHL(b,16))
42 ); 47 );
43 return rd_hi; 48 return rd_hi;
44} 49}
@@ -80,7 +85,7 @@ static OPUS_INLINE opus_int32 silk_SMULWW_armv4(opus_int32 a, opus_int32 b)
80 : "=&r"(rd_lo), "=&r"(rd_hi) 85 : "=&r"(rd_lo), "=&r"(rd_hi)
81 : "%r"(a), "r"(b) 86 : "%r"(a), "r"(b)
82 ); 87 );
83 return (rd_hi<<16)+(rd_lo>>16); 88 return SAFE_SHL(rd_hi,16)+(rd_lo>>16);
84} 89}
85#define silk_SMULWW(a, b) (silk_SMULWW_armv4(a, b)) 90#define silk_SMULWW(a, b) (silk_SMULWW_armv4(a, b))
86 91
@@ -96,8 +101,10 @@ static OPUS_INLINE opus_int32 silk_SMLAWW_armv4(opus_int32 a, opus_int32 b,
96 : "=&r"(rd_lo), "=&r"(rd_hi) 101 : "=&r"(rd_lo), "=&r"(rd_hi)
97 : "%r"(b), "r"(c) 102 : "%r"(b), "r"(c)
98 ); 103 );
99 return a+(rd_hi<<16)+(rd_lo>>16); 104 return a+SAFE_SHL(rd_hi,16)+(rd_lo>>16);
100} 105}
101#define silk_SMLAWW(a, b, c) (silk_SMLAWW_armv4(a, b, c)) 106#define silk_SMLAWW(a, b, c) (silk_SMLAWW_armv4(a, b, c))
102 107
108#undef SAFE_SHL
109
103#endif /* SILK_MACROS_ARMv4_H */ 110#endif /* SILK_MACROS_ARMv4_H */
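
SAFE_SHL exists because shifting a negative signed value left is undefined behaviour in ISO C; doing the shift on an unsigned copy is well defined, and only the conversion back to opus_int32 remains implementation-defined. The same macro and substitutions are applied to macros_armv5e.h below. A minimal illustration (hypothetical value):

    opus_int32 v    = -3;
    /* v << 16 would be undefined behaviour here, so the macro expands to: */
    opus_int32 safe = (opus_int32)( (opus_uint32)v << 16 );    /* SAFE_SHL( v, 16 ) */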
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/macros_armv5e.h b/lib/rbcodec/codecs/libopus/silk/arm/macros_armv5e.h
index aad4117e46..b14ec65ddb 100644
--- a/lib/rbcodec/codecs/libopus/silk/arm/macros_armv5e.h
+++ b/lib/rbcodec/codecs/libopus/silk/arm/macros_armv5e.h
@@ -29,6 +29,11 @@ POSSIBILITY OF SUCH DAMAGE.
29#ifndef SILK_MACROS_ARMv5E_H 29#ifndef SILK_MACROS_ARMv5E_H
30#define SILK_MACROS_ARMv5E_H 30#define SILK_MACROS_ARMv5E_H
31 31
32/* This macro only avoids the undefined behaviour from a left shift of
33 a negative value. It should only be used in macros that can't include
34 SigProc_FIX.h. In other cases, use silk_LSHIFT32(). */
35#define SAFE_SHL(a,b) ((opus_int32)((opus_uint32)(a) << (b)))
36
32/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ 37/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
33#undef silk_SMULWB 38#undef silk_SMULWB
34static OPUS_INLINE opus_int32 silk_SMULWB_armv5e(opus_int32 a, opus_int16 b) 39static OPUS_INLINE opus_int32 silk_SMULWB_armv5e(opus_int32 a, opus_int16 b)
@@ -190,7 +195,7 @@ static OPUS_INLINE opus_int32 silk_CLZ16_armv5(opus_int16 in16)
190 "#silk_CLZ16\n\t" 195 "#silk_CLZ16\n\t"
191 "clz %0, %1;\n" 196 "clz %0, %1;\n"
192 : "=r"(res) 197 : "=r"(res)
193 : "r"(in16<<16|0x8000) 198 : "r"(SAFE_SHL(in16,16)|0x8000)
194 ); 199 );
195 return res; 200 return res;
196} 201}
@@ -210,4 +215,6 @@ static OPUS_INLINE opus_int32 silk_CLZ32_armv5(opus_int32 in32)
210} 215}
211#define silk_CLZ32(in32) (silk_CLZ32_armv5(in32)) 216#define silk_CLZ32(in32) (silk_CLZ32_armv5(in32))
212 217
218#undef SAFE_SHL
219
213#endif /* SILK_MACROS_ARMv5E_H */ 220#endif /* SILK_MACROS_ARMv5E_H */