summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libopus/silk/x86
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/x86')
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/NSQ_del_dec_sse4_1.c859
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c719
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/SigProc_FIX_sse.h94
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/VAD_sse4_1.c277
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/VQ_WMat_EC_sse4_1.c142
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/main_sse.h248
-rw-r--r--lib/rbcodec/codecs/libopus/silk/x86/x86_silk_map.c164
7 files changed, 2503 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/NSQ_del_dec_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/x86/NSQ_del_dec_sse4_1.c
new file mode 100644
index 0000000000..2c75ede2dd
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/NSQ_del_dec_sse4_1.c
@@ -0,0 +1,859 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35#include "main.h"
36#include "celt/x86/x86cpu.h"
37
38#include "stack_alloc.h"
39
/* Per-path state for one delayed-decision candidate of the noise shape
   quantizer.  One instance is kept per survivor path.
   NOTE(review): the field layout appears to be load-bearing — the state
   replacement in silk_noise_shape_quantizer_del_dec_sse4_1() copies the
   tail of this struct starting at an opus_int32 offset into sLPC_Q14,
   so do not reorder or insert fields without auditing that memcpy. */
typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; /* Short-term LPC filter state, Q14      */
    opus_int32 RandState[ DECISION_DELAY ];                           /* Dither PRNG state per delayed sample   */
    opus_int32 Q_Q10[ DECISION_DELAY ];                               /* Quantized levels awaiting commit, Q10  */
    opus_int32 Xq_Q14[ DECISION_DELAY ];                              /* Quantized output samples, Q14          */
    opus_int32 Pred_Q15[ DECISION_DELAY ];                            /* LPC excitation history, Q15            */
    opus_int32 Shape_Q14[ DECISION_DELAY ];                           /* Noise-shaping filter output, Q14       */
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];                       /* Warped AR shaping filter state, Q14    */
    opus_int32 LF_AR_Q14;                                             /* Low-frequency AR state, Q14            */
    opus_int32 Seed;                                                  /* Current dither seed                    */
    opus_int32 SeedInit;                                              /* Seed at subframe start (for indices)   */
    opus_int32 RD_Q10;                                                /* Accumulated rate-distortion cost, Q10  */
} NSQ_del_dec_struct;
53
/* Candidate result for a single sample on one delayed-decision path:
   the chosen quantization level plus the filter-state values it implies. */
typedef struct {
    opus_int32 Q_Q10;        /* Quantization level, Q10                    */
    opus_int32 RD_Q10;       /* Accumulated rate-distortion cost, Q10      */
    opus_int32 xq_Q14;       /* Reconstructed output sample, Q14           */
    opus_int32 LF_AR_Q14;    /* Resulting low-frequency AR state, Q14      */
    opus_int32 sLTP_shp_Q14; /* Resulting LTP shaping state, Q14           */
    opus_int32 LPC_exc_Q14;  /* Resulting LPC excitation, Q14              */
} NSQ_sample_struct;

/* Each path keeps the best and second-best candidate for the current sample. */
typedef NSQ_sample_struct NSQ_sample_pair[ 2 ];
64
/* Forward declaration; defined at the bottom of this file.
   Rescales the NSQ and per-path states when the quantization gain changes
   between subframes, and scales the input with 1/gain. */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
    const silk_encoder_state *psEncC,               /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                       */
    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states         */
    const opus_int32    x_Q3[],                     /* I    Input in Q3                     */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10 */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input */
    opus_int            subfr,                      /* I    Subframe number                 */
    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states        */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling               */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                    */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                       */
    const opus_int      signal_type,                /* I    Signal type                     */
    const opus_int      decisionDelay               /* I    Decision delay                  */
);
81
82/******************************************/
83/* Noise shape quantizer for one subframe */
84/******************************************/
/* Forward declaration; definition below. */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
    opus_int            signalType,             /* I    Signal type                         */
    const opus_int32    x_Q10[],                /* I                                        */
    opus_int8           pulses[],               /* O                                        */
    opus_int16          xq[],                   /* O                                        */
    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    opus_int            lag,                    /* I    Pitch lag                           */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    opus_int32          LF_shp_Q14,             /* I                                        */
    opus_int32          Gain_Q16,               /* I                                        */
    opus_int            Lambda_Q10,             /* I                                        */
    opus_int            offset_Q10,             /* I                                        */
    opus_int            length,                 /* I    Input length                        */
    opus_int            subfr,                  /* I    Subframe number                     */
    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    opus_int            warping_Q16,            /* I                                        */
    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    opus_int            decisionDelay           /* I                                        */
);
113
/* Frame-level entry point of the delayed-decision noise shape quantizer
   (SSE4.1 build).  Quantizes the prefiltered input x_Q3 into pulses[] while
   tracking nStatesDelayedDecision candidate quantization paths in parallel;
   each output sample is committed only `decisionDelay` samples later, from
   whichever path has the lowest accumulated rate-distortion cost.  Also
   updates the persistent NSQ filter/history state for the next frame. */
void silk_NSQ_del_dec_sse4_1(
    const silk_encoder_state *psEncC,                             /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                                     /* I/O  NSQ state                       */
    SideInfoIndices     *psIndices,                               /* I/O  Quantization Indices            */
    const opus_int32    x_Q3[],                                   /* I    Prefiltered input signal        */
    opus_int8           pulses[],                                 /* O    Quantized pulse signal          */
    const opus_int16    PredCoef_Q12[ 2 * MAX_LPC_ORDER ],        /* I    Short term prediction coefs     */
    const opus_int16    LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],  /* I    Long term prediction coefs      */
    const opus_int16    AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs           */
    const opus_int      HarmShapeGain_Q14[ MAX_NB_SUBFR ],        /* I    Long term shaping coefs         */
    const opus_int      Tilt_Q14[ MAX_NB_SUBFR ],                 /* I    Spectral tilt                   */
    const opus_int32    LF_shp_Q14[ MAX_NB_SUBFR ],               /* I    Low frequency shaping coefs     */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],                /* I    Quantization step sizes         */
    const opus_int      pitchL[ MAX_NB_SUBFR ],                   /* I    Pitch lags                      */
    const opus_int      Lambda_Q10,                               /* I    Rate/distortion tradeoff        */
    const opus_int      LTP_scale_Q14                             /* I    LTP state scaling               */
)
{
    opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
    opus_int            last_smple_idx, smpl_buf_idx, decisionDelay;
    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16          *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32          HarmShapeFIRPacked_Q14;
    opus_int            offset_Q10;
    opus_int32          RDmin_Q10, Gain_Q10;
    VARDECL( opus_int32, x_sc_Q10 );
    VARDECL( opus_int32, delayedGain_Q10 );
    VARDECL( NSQ_del_dec_struct, psDelDec );
    NSQ_del_dec_struct  *psDD;
    SAVE_STACK;

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    /* Initialize delayed decision states: all paths start from the same
       persistent NSQ filter state, differing only in their dither seed. */
    ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
    silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
    for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
        psDD                 = &psDelDec[ k ];
        psDD->Seed           = ( k + psIndices->Seed ) & 3;
        psDD->SeedInit       = psDD->Seed;
        psDD->RD_Q10         = 0;
        psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
        psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
        silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
        silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
    }

    offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
    smpl_buf_idx = 0; /* index of oldest samples */

    decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

    /* For voiced frames limit the decision delay to lower than the pitch lag */
    if( psIndices->signalType == TYPE_VOICED ) {
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
        }
    } else {
        if( lag > 0 ) {
            decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
        }
    }

    /* NLSFInterpCoef_Q2 == 4 means no interpolation: the second coef set is used throughout. */
    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
        LSF_interpolation_flag = 0;
    } else {
        LSF_interpolation_flag = 1;
    }

    ALLOC( sLTP_Q15,
           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
    /* Set up pointers to start of sub frame */
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    subfr                 = 0;
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
        A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
        AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];

        /* Noise shape parameters */
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
        /* Pack the two harmonic-shaping FIR taps into one 32-bit word:
           low 16 bits = gain/4 (outer taps), high 16 bits = gain/2 (center tap). */
        HarmShapeFIRPacked_Q14  = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced */
            lag = pitchL[ k ];

            /* Re-whitening: only on subframes where the LPC coef set changes
               (every subframe without interpolation, subframes 0 and 2 with it). */
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                if( k == 2 ) {
                    /* RESET DELAYED DECISIONS */
                    /* Find winner */
                    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
                    Winner_ind = 0;
                    for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                        if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
                            RDmin_Q10 = psDelDec[ i ].RD_Q10;
                            Winner_ind = i;
                        }
                    }
                    /* Penalize all non-winner paths so the winner's history is kept. */
                    for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
                        if( i != Winner_ind ) {
                            psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
                            silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
                        }
                    }

                    /* Copy final part of signals from winner state to output and long-term filter states */
                    psDD = &psDelDec[ Winner_ind ];
                    last_smple_idx = smpl_buf_idx + decisionDelay;
                    for( i = 0; i < decisionDelay; i++ ) {
                        last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
                        if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
                        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
                        /* Flushed samples belong to subframe 1, hence Gains_Q16[ 1 ]. */
                        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
                        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
                    }

                    subfr = 0;
                }

                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                celt_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                NSQ->rewhite_flag = 1;
            }
        }

        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
            psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

        silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
            delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
            Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
            psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

        x_Q3   += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq    += psEncC->subfr_length;
    }

    /* Find winner */
    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
    Winner_ind = 0;
    for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
        if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
            RDmin_Q10 = psDelDec[ k ].RD_Q10;
            Winner_ind = k;
        }
    }

    /* Copy final part of signals from winner state to output and long-term filter states */
    psDD = &psDelDec[ Winner_ind ];
    psIndices->Seed = psDD->SeedInit;
    last_smple_idx = smpl_buf_idx + decisionDelay;
    Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
    for( i = 0; i < decisionDelay; i++ ) {
        last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
        if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
    }
    silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );

    /* Update states */
    NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
    NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];

    /* Save quantized speech signal: slide the history window forward by one frame. */
    silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    RESTORE_STACK;
}
308
309/******************************************/
310/* Noise shape quantizer for one subframe */
311/******************************************/
/* Quantizes one subframe with delayed decision.  For every sample and every
   surviving path: form the long-term (LTP) and short-term (LPC) predictions
   (LPC inner products vectorized with SSE4.1 intrinsics), apply noise-shaping
   feedback, try the two nearest quantization levels, and keep best/second-best
   candidates per path.  A sample is written to the output only once it is
   `decisionDelay` samples old, taken from the current lowest-RD path. */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
    opus_int            signalType,             /* I    Signal type                         */
    const opus_int32    x_Q10[],                /* I                                        */
    opus_int8           pulses[],               /* O                                        */
    opus_int16          xq[],                   /* O                                        */
    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    opus_int            lag,                    /* I    Pitch lag                           */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    opus_int32          LF_shp_Q14,             /* I                                        */
    opus_int32          Gain_Q16,               /* I                                        */
    opus_int            Lambda_Q10,             /* I                                        */
    opus_int            offset_Q10,             /* I                                        */
    opus_int            length,                 /* I    Input length                        */
    opus_int            subfr,                  /* I    Subframe number                     */
    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    opus_int            warping_Q16,            /* I                                        */
    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    opus_int            decisionDelay           /* I                                        */
)
{
    opus_int   i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32 Winner_rand_state;
    opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
    opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
    opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
    opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
    opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
    VARDECL( NSQ_sample_pair, psSampleState );
    NSQ_del_dec_struct *psDD;
    NSQ_sample_struct  *psSS;

    /* Prediction coefficients widened to 32 bits and kept in registers
       across the whole sample loop. */
    __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
    __m128i b_Q12_0123, b_sr_Q12_0123;
    SAVE_STACK;

    celt_assert( nStatesDelayedDecision > 0 );
    ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
    a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );

    if( opus_likely( predictLPCOrder == 16 ) ) {
        a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
        a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
    }

    if( signalType == TYPE_VOICED ){
        b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
        b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    }
    for( i = 0; i < length; i++ ) {
        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            {
                /* Vectorized 5-tap LTP dot product: 4 taps via _mm_mul_epi32
                   (even/odd lanes done separately), the 5th tap added scalar. */
                __m128i tmpa, tmpb, pred_lag_ptr_tmp;
                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
                pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
                tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
                tmpa                = _mm_srli_si128( tmpa, 2 );

                pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
                pred_lag_ptr_tmp    = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
                pred_lag_ptr_tmp    = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );

                tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
                pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
                LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp );

                LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
                LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
                pred_lag_ptr++;
            }
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }
        {
            __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;

            for( k = 0; k < nStatesDelayedDecision; k++ ) {
                /* Delayed decision state */
                psDD = &psDelDec[ k ];

                /* Sample state */
                psSS = psSampleState[ k ];

                /* Generate dither */
                psDD->Seed = silk_RAND( psDD->Seed );

                /* Pointer used in short term prediction and shaping */
                psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
                /* Short-term prediction */
                silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
                /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
                LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );

                tmpb = _mm_setzero_si128();

                /* Vectorized LPC inner product, 4 coefficients per step.
                   Each step multiplies even lanes then (after a rotate)
                   odd lanes with _mm_mul_epi32, shifting the 64-bit
                   products right by 16 to emulate silk_SMLAWB. */
                /* step 1 */
                psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */
                tmpa          = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */

                tmpa          = _mm_srli_epi64( tmpa, 16 );
                tmpb          = _mm_add_epi32( tmpb, tmpa );

                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                a_Q12_tmp     = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
                psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                tmpb          = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

                /* step 2 */
                psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                tmpa          = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
                tmpa          = _mm_srli_epi64( tmpa, 16 );
                tmpb          = _mm_add_epi32( tmpb, tmpa );

                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                a_Q12_tmp     = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
                psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                tmpb          = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

                if ( opus_likely( predictLPCOrder == 16 ) )
                {
                    /* step 3 */
                    psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                    tmpa          = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
                    tmpa          = _mm_srli_epi64( tmpa, 16 );
                    tmpb          = _mm_add_epi32( tmpb, tmpa );

                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                    a_Q12_tmp     = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
                    psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
                    psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                    tmpb          = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

                    /* step 4 */
                    psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                    tmpa          = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
                    tmpa          = _mm_srli_epi64( tmpa, 16 );
                    tmpb          = _mm_add_epi32( tmpb, tmpa );

                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                    a_Q12_tmp     = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
                    psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
                    psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                    tmpb          = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

                    /* add at last */
                    /* equal shift right 8 bytes*/
                    tmpa          = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
                    tmpb          = _mm_add_epi32( tmpb, tmpa );
                    LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
                }
                else
                {
                    /* add at last */
                    tmpa          = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
                    tmpb          = _mm_add_epi32( tmpb, tmpa );
                    LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );

                    /* Remaining 2 taps of the order-10 filter, done scalar. */
                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
                }

                LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */

                /* Noise shape feedback: warped AR filter as a chain of allpass sections. */
                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
                /* Output of lowpass section */
                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
                /* Output of allpass section */
                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
                psDD->sAR2_Q14[ 0 ] = tmp2;
                n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
                /* Loop over allpass sections */
                for( j = 2; j < shapingLPCOrder; j += 2 ) {
                    /* Output of allpass section */
                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
                    psDD->sAR2_Q14[ j - 1 ] = tmp1;
                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
                    /* Output of allpass section */
                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
                    psDD->sAR2_Q14[ j + 0 ] = tmp2;
                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
                }
                psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );

                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                 /* Q11 -> Q12 */
                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );         /* Q12 */
                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                 /* Q12 -> Q14 */

                n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );/* Q12 */
                n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );       /* Q12 */
                n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                 /* Q12 -> Q14 */

                /* Input minus prediction plus noise feedback                       */
                /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                               /* Q14 */
                tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                          /* Q13 */
                tmp1 = silk_SUB32( tmp2, tmp1 );                                       /* Q13 */
                tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                   /* Q10 */

                r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                /* residual error Q10 */

                /* Flip sign depending on dither */
                if ( psDD->Seed < 0 ) {
                    r_Q10 = -r_Q10;
                }
                r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );

                /* Find two quantization level candidates and measure their rate-distortion */
                q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
                q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
                if( q1_Q0 > 0 ) {
                    q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
                } else if( q1_Q0 == 0 ) {
                    q1_Q10  = offset_Q10;
                    q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
                } else if( q1_Q0 == -1 ) {
                    q2_Q10  = offset_Q10;
                    q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
                    rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
                } else {            /* q1_Q0 < -1 */
                    q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
                    rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
                }
                /* RD cost = rate (Lambda * |q|) + distortion (squared quantization error). */
                rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
                rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
                rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
                rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );

                if( rd1_Q10 < rd2_Q10 ) {
                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
                    psSS[ 0 ].Q_Q10  = q1_Q10;
                    psSS[ 1 ].Q_Q10  = q2_Q10;
                } else {
                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
                    psSS[ 0 ].Q_Q10  = q2_Q10;
                    psSS[ 1 ].Q_Q10  = q1_Q10;
                }

                /* Update states for best quantization */

                /* Quantized excitation */
                exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
                if ( psDD->Seed < 0 ) {
                    exc_Q14 = -exc_Q14;
                }

                /* Add predictions */
                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );

                /* Update states */
                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
                psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
                psSS[ 0 ].xq_Q14       = xq_Q14;

                /* Update states for second best quantization */

                /* Quantized excitation */
                exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
                if ( psDD->Seed < 0 ) {
                    exc_Q14 = -exc_Q14;
                }


                /* Add predictions */
                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );

                /* Update states */
                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
                psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
                psSS[ 1 ].xq_Q14       = xq_Q14;
            }
        }
        /* Advance the circular delayed-sample index (buffers run backwards). */
        *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) % DECISION_DELAY;
        if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY;
        last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
                Winner_ind = k;
            }
        }

        /* Increase RD values of expired states: paths that disagree with the
           winner on the sample being committed can no longer be chosen. */
        Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
        for( k = 0; k < nStatesDelayedDecision; k++ ) {
            if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
                psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
                psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
                silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
            }
        }

        /* Find worst in first set and best in second set */
        RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
        RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
        RDmax_ind = 0;
        RDmin_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            /* find worst in first set */
            if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
                RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
                RDmax_ind = k;
            }
            /* find best in second set */
            if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
                RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
                RDmin_ind = k;
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            /* Copies only the struct tail from offset i onward: the first i
               entries of sLPC_Q14 (samples already produced this subframe up
               to index i) are skipped.  Relies on NSQ_del_dec_struct layout
               starting with sLPC_Q14 — see the struct definition. */
            silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
                         ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
            silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
        }

        /* Write samples from winner to output and long-term filter states */
        psDD = &psDelDec[ Winner_ind ];
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states: commit each path's best candidate for this sample. */
        for( k = 0; k < nStatesDelayedDecision; k++ ) {
            psDD                                     = &psDelDec[ k ];
            psSS                                     = &psSampleState[ k ][ 0 ];
            psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
            psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
            psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
            psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
            psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
            psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
            psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
            psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
            psDD->RD_Q10                             = psSS->RD_Q10;
        }
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states: move the last NSQ_LPC_BUF_LENGTH samples to the front. */
    for( k = 0; k < nStatesDelayedDecision; k++ ) {
        psDD = &psDelDec[ k ];
        silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    }
    RESTORE_STACK;
}
726
/* Scale the subframe input by 1/gain and, when the quantization gain changed
 * since the previous subframe, rescale the persistent NSQ state and every
 * delayed-decision state so they match the new gain level.  Delayed-decision
 * counterpart of silk_nsq_scale_states_sse4_1(). */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
)
{
    opus_int            i, k, lag;
    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
    NSQ_del_dec_struct  *psDD;
    __m128i             xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

    lag          = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );

    silk_assert( inv_gain_Q31 != 0 );

    /* Calculate gain adjustment factor */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    } else {
        /* unity in Q16: states keep their current scaling */
        gain_adj_Q16 = (opus_int32)1 << 16;
    }

    /* Scale input */
    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

    /* prepare inv_gain_Q23 in packed 4 32-bits */
    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

    /* 4-wide silk_SMULWW( x_Q3[i], inv_gain_Q23 ): _mm_mul_epi32 only multiplies
     * the even (0 and 2) 32-bit lanes, so the odd lanes are first rotated into
     * even position; each 64-bit product is then shifted to keep bits 16..47
     * (the SMULWW result), and the two halves are recombined with a word blend. */
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
        /* equal shift right 4 bytes*/
        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
    }

    /* Scalar tail when subfr_length is not a multiple of 4 */
    for( ; i < psEncC->subfr_length; i++ ) {
        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
    }

    /* Save inverse gain */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
            silk_assert( i < MAX_FRAME_LENGTH );
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
        }
    }

    /* Adjust for changing gain */
    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
        /* Scale long-term shaping state */
        {
            __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;

            /* prepare gain_adj_Q16 in packed 4 32-bits */
            xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );

            /* Same 4-wide SMULWW scheme as the input-scaling loop above */
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
            {
                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
                /* equal shift right 4 bytes*/
                xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

                xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
                xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );

                xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
                xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );

                xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

                _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
            }

            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state.  Stops decisionDelay samples before
             * the write index (the delayed-decision loop has not committed those
             * samples yet), unlike the non-delayed-decision variant. */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Rescale every delayed-decision state to the new gain */
            for( k = 0; k < nStatesDelayedDecision; k++ ) {
                psDD = &psDelDec[ k ];

                /* Scale scalar states */
                psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );

                /* Scale short-term prediction and shaping states */
                for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                    psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
                }
                for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                    psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
                }
                for( i = 0; i < DECISION_DELAY; i++ ) {
                    psDD->Pred_Q15[ i ]  = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[ i ] );
                    psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
                }
            }
        }
    }
}
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c
new file mode 100644
index 0000000000..b0315e35fc
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c
@@ -0,0 +1,719 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35#include "main.h"
36#include "celt/x86/x86cpu.h"
37#include "stack_alloc.h"
38
39static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
40 const silk_encoder_state *psEncC, /* I Encoder State */
41 silk_nsq_state *NSQ, /* I/O NSQ state */
42 const opus_int32 x_Q3[], /* I input in Q3 */
43 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
44 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
45 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
46 opus_int subfr, /* I subframe number */
47 const opus_int LTP_scale_Q14, /* I */
48 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
49 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
50 const opus_int signal_type /* I Signal type */
51);
52
53static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
54 silk_nsq_state *NSQ, /* I/O NSQ state */
55 opus_int signalType, /* I Signal type */
56 const opus_int32 x_sc_Q10[], /* I */
57 opus_int8 pulses[], /* O */
58 opus_int16 xq[], /* O */
59 opus_int32 sLTP_Q15[], /* I/O LTP state */
60 const opus_int16 a_Q12[], /* I Short term prediction coefs */
61 const opus_int16 b_Q14[], /* I Long term prediction coefs */
62 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
63 opus_int lag, /* I Pitch lag */
64 opus_int32 HarmShapeFIRPacked_Q14, /* I */
65 opus_int Tilt_Q14, /* I Spectral tilt */
66 opus_int32 LF_shp_Q14, /* I */
67 opus_int32 Gain_Q16, /* I */
68 opus_int offset_Q10, /* I */
69 opus_int length, /* I Input length */
70 opus_int32 table[][4] /* I */
71);
72
/* SSE4.1 entry point for the SILK noise-shaping quantizer (single decision
 * state).  Precomputes a per-frame rate/distortion decision table, then
 * processes each subframe: optional re-whitening for voiced frames, gain
 * scaling of inputs/states, and the per-sample quantization loop (fast
 * specialized path when shapingLPCOrder == 10 and predictLPCOrder == 16,
 * generic C fallback otherwise). */
void silk_NSQ_sse4_1(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
)
{
    opus_int            k, lag, start_idx, LSF_interpolation_flag;
    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16          *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32          HarmShapeFIRPacked_Q14;
    opus_int            offset_Q10;
    VARDECL( opus_int32, x_sc_Q10 );

    /* Decision table, one row per pulse value q in [-32, 31] at row 32 + q:
     * [0] first candidate level q1_Q10, [1] second candidate q2_Q10,
     * [2] 2*(q1-q2), [3] rate/distortion boundary, so the inner quantizer can
     * pick between the two candidates with one multiply and compare
     * (r * row[2] - row[3] < 0  ==>  take q2). */
    opus_int32 table[ 64 ][ 4 ];
    opus_int32 tmp1;
    opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;

    SAVE_STACK;

    NSQ->rand_seed = psIndices->Seed;

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

    /* Row for pulse value 0 */
    q1_Q10  = offset_Q10;
    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    rd1_Q20 = q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 32 ][ 0 ] = q1_Q10;
    table[ 32 ][ 1 ] = q2_Q10;
    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* Row for pulse value -1 (rate term uses -q1 since q1 is negative) */
    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    q2_Q10  = offset_Q10;
    rd1_Q20 = - q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 31 ][ 0 ] = q1_Q10;
    table[ 31 ][ 1 ] = q2_Q10;
    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* Rows for pulse values > 0 */
    for (k = 1; k <= 31; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = q1_Q10 * Lambda_Q10;
        rd2_Q20 = q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* Rows for pulse values < -1 (rate terms negated for negative levels) */
    for (k = -32; k <= -2; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = - q1_Q10 * Lambda_Q10;
        rd2_Q20 = - q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
        LSF_interpolation_flag = 0;
    } else {
        LSF_interpolation_flag = 1;
    }

    ALLOC( sLTP_Q15,
           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    /* Set up pointers to start of sub frame */
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
        /* With interpolation, first half of frame uses interpolated coefs (set 0),
         * otherwise all subframes use set 1 */
        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
        AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];

        /* Noise shape parameters */
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
        HarmShapeFIRPacked_Q14  = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced */
            lag = pitchL[ k ];

            /* Re-whitening: every other subframe with LSF interpolation,
             * only at k == 0 without (mask is 1 or 3 respectively) */
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                celt_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->rewhite_flag = 1;
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
            }
        }

        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
        {
            /* Fast path; pass &table[32] so the quantizer can index the
             * decision table directly with a (possibly negative) pulse value */
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
                offset_Q10, psEncC->subfr_length, &(table[32]) );
        }
        else
        {
            /* Generic C quantizer for all other filter orders */
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
        }

        x_Q3   += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq    += psEncC->subfr_length;
    }

    /* Update lagPrev for next frame */
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

    /* Save quantized speech and noise shaping signals */
    silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    RESTORE_STACK;
}
240
/***********************************/
/* silk_noise_shape_quantizer_10_16 */
/***********************************/
/* Per-sample noise-shaping quantizer specialized for shapingLPCOrder == 10
 * and predictLPCOrder == 16 (the caller selects this path only under that
 * condition).  The 32-bit Q14 filter states are kept split into hi/lo 16-bit
 * halves across SSE registers so the order-16 LPC and order-10 AR filters can
 * be evaluated with 16-bit multiply-add instructions while reproducing the
 * scalar silk_SMLAWB() results. */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    opus_int            signalType,             /* I    Signal type                     */
    const opus_int32    x_sc_Q10[],             /* I                                    */
    opus_int8           pulses[],               /* O                                    */
    opus_int16          xq[],                   /* O                                    */
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    opus_int            lag,                    /* I    Pitch lag                       */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
    opus_int32          LF_shp_Q14,             /* I                                    */
    opus_int32          Gain_Q16,               /* I                                    */
    opus_int            offset_Q10,             /* I                                    */
    opus_int            length,                 /* I    Input length                    */
    opus_int32          table[][4]              /* I    Decision table (base at caller's row 32, so negative indices are valid) */
)
{
    opus_int     i;
    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;

    __m128i xmm_tempa, xmm_tempb;

    /* Reused: first as pshufb masks during setup, then as the constant 1 vector */
    __m128i xmm_one;

    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
    __m128i a_Q12_01234567, a_Q12_89ABCDEF;

    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
    __m128i AR_shp_Q13_76543210;

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    /* Set up short term AR state */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];

    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
    xq_Q14         = psLPC_Q14[ 0 ];
    LTP_pred_Q13   = 0;

    /* load a_Q12: mask reverses the order of the eight 16-bit words so the
     * coefficients line up with the state layout built below */
    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );

    /* load a_Q12[0] - a_Q12[7] */
    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );

    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );

    /* load AR_shp_Q13 */
    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );

    /* load psLPC_Q14: mask gathers the low 16-bit halves of the four 32-bit
     * elements into the lower 64 bits and the high halves into the upper 64,
     * splitting each Q14 value into separate hi/lo word registers */
    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );

    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* load sAR2_Q14, split hi/lo the same way */
    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* prepare 1 in 8 * 16bit (used with pmaddwd to pairwise-add 16-bit lanes) */
    xmm_one = _mm_set1_epi16(1);

    for( i = 0; i < length; i++ )
    {
        /* Short-term prediction */
        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;

        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */

        /* shift psLPC_Q14: slide the state one word and insert the newest
         * sample (hi/lo halves of xq_Q14) at the top position */
        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );

        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );

        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 );

        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );

        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
        /* (pmulhw treats the lo half as signed; when its sign bit is set the
         * signed value is lo - 2^16, so add the coefficient back via the mask) */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );

        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        /* horizontal sum of the four 32-bit lanes into lane 0 */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );

        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );

        /* Long-term prediction */
        if ( opus_likely( signalType == TYPE_VOICED ) ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q13 = 2;
            {
                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;

                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );

                /* loaded: [0] [-1] [-2] [-3] */
                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
                /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
                /* right shift 2 bytes (16 bits), zero extended */
                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );

                /* a[1] * b[-1], a[3] * b[-3] */
                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );

                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
                /* equal shift right 8 bytes*/
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );

                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );

                /* fifth LTP tap done in scalar */
                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
                pred_lag_ptr++;
            }
        }

        /* Noise shape feedback: taps 8 and 9 of the order-10 AR filter are
         * kept in memory/scalar form, taps 0..7 in the SIMD registers */
        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );

        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );

        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );

        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );

        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );

        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        /* horizontal sum of the four 32-bit lanes into lane 0 */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );

        /* 5 is the bias-avoidance offset for this order-10 path (cf. LPC_pred_Q10 = 8 above) */
        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );

        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );

        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );

        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );

        silk_assert( lag > 0 || signalType != TYPE_VOICED );

        /* Combine prediction and noise shaping signals */
        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
            shp_lag_ptr++;

            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
        } else {
            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
        }

        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */

        /* Generate dither */
        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );

        /* Flip sign depending on dither */
        tmp2 = -r_Q10;
        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;

        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );

        /* Find two quantization level candidates and measure their rate-distortion */
        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
        q1_Q0  = silk_RSHIFT( q1_Q10, 10 );

        /* Table lookup replaces the generic candidate/RD computation; the
         * boundary test below picks the better of the two candidates */
        q1_Q10 = table[q1_Q0][0];
        q2_Q10 = table[q1_Q0][1];

        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
        {
            q1_Q10 = q2_Q10;
        }

        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );

        /* Excitation */
        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );

        tmp2 = -exc_Q14;
        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;

        /* Add predictions */
        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );

        /* Update states */
        psLPC_Q14++;
        *psLPC_Q14 = xq_Q14;
        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );

        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Make dither dependent on quantized signal */
        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
    }

    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;

    /* Scale XQ back to normal level before saving */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];

    /* write back sAR2_Q14: re-interleave the hi/lo halves into 32-bit values */
    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );

    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
    {
        __m128i xmm_Gain_Q10;
        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;

        /* prepare (1 << 7) in packed 4 32-bits */
        xmm_tempa = _mm_set1_epi32( (1 << 7) );

        /* prepare Gain_Q10 in packed 4 32-bits */
        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );

        /* process xq: 8 samples per iteration, same even/odd-lane SMULWW
         * emulation as the scaling loops, then round, shift and saturate */
        for (i = 0; i < length - 7; i += 8)
        {
            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );

            /* equal shift right 4 bytes*/
            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
            /* equal shift right 4 bytes*/
            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );

            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );

            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );

            /* silk_RSHIFT_ROUND(xq, 8) */
            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );

            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );

            /* silk_SAT16 */
            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );

            /* save to xq */
            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
        }
    }
    /* Scalar tail when length is not a multiple of 8 */
    for ( ; i < length; i++)
    {
        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
    }

    /* Update LPC synth buffer */
    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
}
601
/* Scale the subframe input by 1/gain and, when the quantization gain changed
 * since the previous subframe, rescale the persistent NSQ filter states so
 * they match the new gain level.  Single-state counterpart of
 * silk_nsq_del_dec_scale_states_sse4_1(). */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
    opus_int            subfr,                  /* I    subframe number                 */
    const opus_int      LTP_scale_Q14,          /* I                                    */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
    const opus_int      signal_type             /* I    Signal type                     */
)
{
    opus_int   i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
    __m128i    xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

    lag          = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Calculate gain adjustment factor */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    } else {
        /* unity in Q16: states keep their current scaling */
        gain_adj_Q16 = (opus_int32)1 << 16;
    }

    /* Scale input */
    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

    /* prepare inv_gain_Q23 in packed 4 32-bits */
    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

    /* 4-wide silk_SMULWW( x_Q3[i], inv_gain_Q23 ): _mm_mul_epi32 only multiplies
     * the even (0 and 2) 32-bit lanes, so the odd lanes are rotated into even
     * position first; each 64-bit product keeps bits 16..47 and the halves are
     * recombined with a word blend. */
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );

        /* equal shift right 4 bytes*/
        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
    }

    /* Scalar tail when subfr_length is not a multiple of 4 */
    for( ; i < psEncC->subfr_length; i++ ) {
        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
    }

    /* Save inverse gain */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
            silk_assert( i < MAX_FRAME_LENGTH );
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
        }
    }

    /* Adjust for changing gain */
    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
        /* Scale long-term shaping state */
        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;

        /* prepare gain_adj_Q16 in packed 4 32-bits */
        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);

        /* Same 4-wide SMULWW scheme as the input-scaling loop above */
        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
        {
            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
            /* equal shift right 4 bytes*/
            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
        }

        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
        }

        /* Scale long-term prediction state */
        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
            }
        }

        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );

        /* Scale short-term prediction and shaping states */
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
        }
        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
        }
    }
}
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/SigProc_FIX_sse.h b/lib/rbcodec/codecs/libopus/silk/x86/SigProc_FIX_sse.h
new file mode 100644
index 0000000000..61efa8da41
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/SigProc_FIX_sse.h
@@ -0,0 +1,94 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
#ifndef SIGPROC_FIX_SSE_H
#define SIGPROC_FIX_SSE_H

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* SSE4.1 overrides for the fixed-point signal-processing primitives.
   Each primitive has a direct binding when SSE4.1 is presumed at build
   time, and an arch-indexed function-pointer table otherwise. */

#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
void silk_burg_modified_sse4_1(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                       */
    int                         arch                /* I    Run-time architecture                                       */
);

#if defined(OPUS_X86_PRESUME_SSE4_1)
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))

#else

extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                      */
    int                         arch                /* I    Run-time architecture                                      */);

# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
    ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))

#endif

opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16            *inVec1,
    const opus_int16            *inVec2,
    const opus_int              len
);


#if defined(OPUS_X86_PRESUME_SSE4_1)

#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))

#else

extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
    const opus_int16            *inVec1,
    const opus_int16            *inVec2,
    const opus_int              len);

# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
    ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))

#endif
#endif
#endif
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/VAD_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/x86/VAD_sse4_1.c
new file mode 100644
index 0000000000..d02ddf4ad0
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/VAD_sse4_1.c
@@ -0,0 +1,277 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35
36#include "main.h"
37#include "stack_alloc.h"
38
39/* Weighting factors for tilt measure */
40static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
41
42/***************************************/
43/* Get the speech activity level in Q8 */
44/***************************************/
45opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if success */
46 silk_encoder_state *psEncC, /* I/O Encoder state */
47 const opus_int16 pIn[] /* I PCM input */
48)
49{
50 opus_int SA_Q15, pSNR_dB_Q7, input_tilt;
51 opus_int decimated_framelength1, decimated_framelength2;
52 opus_int decimated_framelength;
53 opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54 opus_int32 sumSquared, smooth_coef_Q16;
55 opus_int16 HPstateTmp;
56 VARDECL( opus_int16, X );
57 opus_int32 Xnrg[ VAD_N_BANDS ];
58 opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59 opus_int32 speech_nrg, x_tmp;
60 opus_int X_offset[ VAD_N_BANDS ];
61 opus_int ret = 0;
62 silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63
64 SAVE_STACK;
65
66 /* Safety checks */
67 silk_assert( VAD_N_BANDS == 4 );
68 celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
69 celt_assert( psEncC->frame_length <= 512 );
70 celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
71
72 /***********************/
73 /* Filter and Decimate */
74 /***********************/
75 decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
76 decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
77 decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
78 /* Decimate into 4 bands:
79 0 L 3L L 3L 5L
80 - -- - -- --
81 8 8 2 4 4
82
83 [0-1 kHz| temp. |1-2 kHz| 2-4 kHz | 4-8 kHz |
84
85 They're arranged to allow the minimal ( frame_length / 4 ) extra
86 scratch space during the downsampling process */
87 X_offset[ 0 ] = 0;
88 X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
89 X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
90 X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
91 ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
92
93 /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
94 silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ],
95 X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
96
97 /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
98 silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
99 X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
100
101 /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
102 silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
103 X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
104
105 /*********************************************/
106 /* HP filter on lowest band (differentiator) */
107 /*********************************************/
108 X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
109 HPstateTmp = X[ decimated_framelength - 1 ];
110 for( i = decimated_framelength - 1; i > 0; i-- ) {
111 X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 );
112 X[ i ] -= X[ i - 1 ];
113 }
114 X[ 0 ] -= psSilk_VAD->HPstate;
115 psSilk_VAD->HPstate = HPstateTmp;
116
117 /*************************************/
118 /* Calculate the energy in each band */
119 /*************************************/
120 for( b = 0; b < VAD_N_BANDS; b++ ) {
121 /* Find the decimated framelength in the non-uniformly divided bands */
122 decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
123
124 /* Split length into subframe lengths */
125 dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
126 dec_subframe_offset = 0;
127
128 /* Compute energy per sub-frame */
129 /* initialize with summed energy of last subframe */
130 Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
131 for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
132 __m128i xmm_X, xmm_acc;
133 sumSquared = 0;
134
135 xmm_acc = _mm_setzero_si128();
136
137 for( i = 0; i < dec_subframe_length - 7; i += 8 )
138 {
139 xmm_X = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
140 xmm_X = _mm_srai_epi16( xmm_X, 3 );
141 xmm_X = _mm_madd_epi16( xmm_X, xmm_X );
142 xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
143 }
144
145 xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
146 xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
147
148 sumSquared += _mm_cvtsi128_si32( xmm_acc );
149
150 for( ; i < dec_subframe_length; i++ ) {
151 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
152 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
153 x_tmp = silk_RSHIFT(
154 X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
155 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
156
157 /* Safety check */
158 silk_assert( sumSquared >= 0 );
159 }
160
161 /* Add/saturate summed energy of current subframe */
162 if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
163 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
164 } else {
165 /* Look-ahead subframe */
166 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
167 }
168
169 dec_subframe_offset += dec_subframe_length;
170 }
171 psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
172 }
173
174 /********************/
175 /* Noise estimation */
176 /********************/
177 silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
178
179 /***********************************************/
180 /* Signal-plus-noise to noise ratio estimation */
181 /***********************************************/
182 sumSquared = 0;
183 input_tilt = 0;
184 for( b = 0; b < VAD_N_BANDS; b++ ) {
185 speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
186 if( speech_nrg > 0 ) {
187 /* Divide, with sufficient resolution */
188 if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
189 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
190 } else {
191 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
192 }
193
194 /* Convert to log domain */
195 SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
196
197 /* Sum-of-squares */
198 sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */
199
200 /* Tilt measure */
201 if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
202 /* Scale down SNR value for small subband speech energies */
203 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
204 }
205 input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
206 } else {
207 NrgToNoiseRatio_Q8[ b ] = 256;
208 }
209 }
210
211 /* Mean-of-squares */
212 sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
213
214 /* Root-mean-square approximation, scale to dBs, and write to output pointer */
215 pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
216
217 /*********************************/
218 /* Speech Probability Estimation */
219 /*********************************/
220 SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
221
222 /**************************/
223 /* Frequency Tilt Measure */
224 /**************************/
225 psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
226
227 /**************************************************/
228 /* Scale the sigmoid output based on power levels */
229 /**************************************************/
230 speech_nrg = 0;
231 for( b = 0; b < VAD_N_BANDS; b++ ) {
232 /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
233 speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
234 }
235
236 /* Power scaling */
237 if( speech_nrg <= 0 ) {
238 SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
239 } else if( speech_nrg < 32768 ) {
240 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
241 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
242 } else {
243 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
244 }
245
246 /* square-root */
247 speech_nrg = silk_SQRT_APPROX( speech_nrg );
248 SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
249 }
250
251 /* Copy the resulting speech activity in Q8 */
252 psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
253
254 /***********************************/
255 /* Energy Level and SNR estimation */
256 /***********************************/
257 /* Smoothing coefficient */
258 smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
259
260 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
261 smooth_coef_Q16 >>= 1;
262 }
263
264 for( b = 0; b < VAD_N_BANDS; b++ ) {
265 /* compute smoothed energy-to-noise ratio per band */
266 psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
267 NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
268
269 /* signal to noise ratio in dB per band */
270 SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
271 /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
272 psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
273 }
274
275 RESTORE_STACK;
276 return( ret );
277}
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/VQ_WMat_EC_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
new file mode 100644
index 0000000000..74d6c6d0ec
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -0,0 +1,142 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35#include "main.h"
36#include "celt/x86/x86cpu.h"
37
38/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
39void silk_VQ_WMat_EC_sse4_1(
40 opus_int8 *ind, /* O index of best codebook vector */
41 opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
42 opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
43 const opus_int16 *in_Q14, /* I input vector to be quantized */
44 const opus_int32 *W_Q18, /* I weighting matrix */
45 const opus_int8 *cb_Q7, /* I codebook */
46 const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
47 const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
48 const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
49 const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
50 opus_int L /* I number of vectors in codebook */
51)
52{
53 opus_int k, gain_tmp_Q7;
54 const opus_int8 *cb_row_Q7;
55 opus_int16 diff_Q14[ 5 ];
56 opus_int32 sum1_Q14, sum2_Q16;
57
58 __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
59 /* Loop over codebook */
60 *rate_dist_Q14 = silk_int32_MAX;
61 cb_row_Q7 = cb_Q7;
62 for( k = 0; k < L; k++ ) {
63 gain_tmp_Q7 = cb_gain_Q7[k];
64
65 diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
66
67 C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
68 C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
69 C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
70 C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
71
72 diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
73 diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
74 diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
75 diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
76
77 /* Weighted rate */
78 sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
79
80 /* Penalty for too large gain */
81 sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
82
83 silk_assert( sum1_Q14 >= 0 );
84
85 /* first row of W_Q18 */
86 C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
87 C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
88 C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
89
90 C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
91 C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
92
93 C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
94 C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
95
96 C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
97 C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
98
99 C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
100 sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
101
102 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
103 sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
104
105 /* second row of W_Q18 */
106 sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
107 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
108 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
109 sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
110 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
111 sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
112
113 /* third row of W_Q18 */
114 sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
115 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
116 sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
117 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
118 sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
119
120 /* fourth row of W_Q18 */
121 sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
122 sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
123 sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
124 sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
125
126 /* last row of W_Q18 */
127 sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
128 sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
129
130 silk_assert( sum1_Q14 >= 0 );
131
132 /* find best */
133 if( sum1_Q14 < *rate_dist_Q14 ) {
134 *rate_dist_Q14 = sum1_Q14;
135 *ind = (opus_int8)k;
136 *gain_Q7 = gain_tmp_Q7;
137 }
138
139 /* Go to next cbk vector */
140 cb_row_Q7 += LTP_ORDER;
141 }
142}
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/main_sse.h b/lib/rbcodec/codecs/libopus/silk/x86/main_sse.h
new file mode 100644
index 0000000000..2f15d44869
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/main_sse.h
@@ -0,0 +1,248 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
#ifndef MAIN_SSE_H
#define MAIN_SSE_H

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* SSE4.1 overrides for the SILK encoder entry points. Disabled sections
   (#if 0) are kept for reference until their implementations are updated. */

# if defined(OPUS_X86_MAY_HAVE_SSE4_1)

#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
# define OVERRIDE_silk_VQ_WMat_EC

void silk_VQ_WMat_EC_sse4_1(
    opus_int8                   *ind,               /* O    index of best codebook vector               */
    opus_int32                  *rate_dist_Q14,     /* O    best weighted quant error + mu * rate       */
    opus_int                    *gain_Q7,           /* O    sum of absolute LTP coefficients            */
    const opus_int16            *in_Q14,            /* I    input vector to be quantized                */
    const opus_int32            *W_Q18,             /* I    weighting matrix                            */
    const opus_int8             *cb_Q7,             /* I    codebook                                    */
    const opus_uint8            *cb_gain_Q7,        /* I    codebook effective gain                     */
    const opus_uint8            *cl_Q5,             /* I    code length for each codebook vector        */
    const opus_int              mu_Q9,              /* I    tradeoff betw. weighted error and rate      */
    const opus_int32            max_gain_Q7,        /* I    maximum sum of absolute LTP coefficients    */
    opus_int                    L                   /* I    number of vectors in codebook               */
);

#if defined OPUS_X86_PRESUME_SSE4_1

#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
                        mu_Q9, max_gain_Q7, L, arch) \
    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
                          mu_Q9, max_gain_Q7, L))

#else

extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
    opus_int8                   *ind,               /* O    index of best codebook vector               */
    opus_int32                  *rate_dist_Q14,     /* O    best weighted quant error + mu * rate       */
    opus_int                    *gain_Q7,           /* O    sum of absolute LTP coefficients            */
    const opus_int16            *in_Q14,            /* I    input vector to be quantized                */
    const opus_int32            *W_Q18,             /* I    weighting matrix                            */
    const opus_int8             *cb_Q7,             /* I    codebook                                    */
    const opus_uint8            *cb_gain_Q7,        /* I    codebook effective gain                     */
    const opus_uint8            *cl_Q5,             /* I    code length for each codebook vector        */
    const opus_int              mu_Q9,              /* I    tradeoff betw. weighted error and rate      */
    const opus_int32            max_gain_Q7,        /* I    maximum sum of absolute LTP coefficients    */
    opus_int                    L                   /* I    number of vectors in codebook               */
);

# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
                         mu_Q9, max_gain_Q7, L, arch) \
    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
                          mu_Q9, max_gain_Q7, L))

#endif
#endif

#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
# define OVERRIDE_silk_NSQ

void silk_NSQ_sse4_1(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
);

#if defined OPUS_X86_PRESUME_SSE4_1

#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                 HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

#else

extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
);

# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                  HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
    ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

#endif

# define OVERRIDE_silk_NSQ_del_dec

void silk_NSQ_del_dec_sse4_1(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
);

#if defined OPUS_X86_PRESUME_SSE4_1

#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                         HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

#else

extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
);

# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                          HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

#endif
#endif

void silk_noise_shape_quantizer(
    silk_nsq_state              *NSQ,                   /* I/O  NSQ state                       */
    opus_int                    signalType,             /* I    Signal type                     */
    const opus_int32            x_sc_Q10[],             /* I                                    */
    opus_int8                   pulses[],               /* O                                    */
    opus_int16                  xq[],                   /* O                                    */
    opus_int32                  sLTP_Q15[],             /* I/O  LTP state                       */
    const opus_int16            a_Q12[],                /* I    Short term prediction coefs     */
    const opus_int16            b_Q14[],                /* I    Long term prediction coefs      */
    const opus_int16            AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    opus_int                    lag,                    /* I    Pitch lag                       */
    opus_int32                  HarmShapeFIRPacked_Q14, /* I                                    */
    opus_int                    Tilt_Q14,               /* I    Spectral tilt                   */
    opus_int32                  LF_shp_Q14,             /* I                                    */
    opus_int32                  Gain_Q16,               /* I                                    */
    opus_int                    Lambda_Q10,             /* I                                    */
    opus_int                    offset_Q10,             /* I                                    */
    opus_int                    length,                 /* I    Input length                    */
    opus_int                    shapingLPCOrder,        /* I    Noise shaping AR filter order   */
    opus_int                    predictLPCOrder,        /* I    Prediction filter order         */
    int                         arch                    /* I    Architecture                    */
);

/**************************/
/* Noise level estimation */
/**************************/
void silk_VAD_GetNoiseLevels(
    const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
);

# define OVERRIDE_silk_VAD_GetSA_Q8

opus_int silk_VAD_GetSA_Q8_sse4_1(
    silk_encoder_state          *psEnC,
    const opus_int16            pIn[]
);

#if defined(OPUS_X86_PRESUME_SSE4_1)
#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))

#else

# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
    ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))

extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
    silk_encoder_state          *psEnC,
    const opus_int16            pIn[]);

#endif

# endif
#endif
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/x86_silk_map.c b/lib/rbcodec/codecs/libopus/silk/x86/x86_silk_map.c
new file mode 100644
index 0000000000..32dcc3cab7
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/x86/x86_silk_map.c
@@ -0,0 +1,164 @@
1/* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#if defined(HAVE_CONFIG_H)
29#include "config.h"
30#endif
31
32#include "celt/x86/x86cpu.h"
33#include "structs.h"
34#include "SigProc_FIX.h"
35#include "pitch.h"
36#include "main.h"
37
38#if !defined(OPUS_X86_PRESUME_SSE4_1)
39
40#if defined(FIXED_POINT)
41
42#include "fixed/main_FIX.h"
43
/* Runtime-dispatch table for the 16-bit inner product with 64-bit
   accumulator. Indexed by (arch & OPUS_ARCHMASK): the first three x86
   tiers use the portable C kernel; the SSE4.1 and AVX tiers use the
   SSE4.1 kernel via MAY_HAVE_SSE4_1 (which, per celt/x86/x86cpu.h,
   presumably maps back to the C kernel when SSE4.1 support is not
   compiled in — confirm there). Entry order is the contract; do not
   reorder. */
opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
 const opus_int16 *inVec1,
 const opus_int16 *inVec2,
 const opus_int len
) = {
 silk_inner_prod16_aligned_64_c, /* non-sse */
 silk_inner_prod16_aligned_64_c,
 silk_inner_prod16_aligned_64_c,
 MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
 MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */
};
55
56#endif
57
/* Runtime-dispatch table for the speech-activity estimator, referenced by
   the silk_VAD_GetSA_Q8() macro in x86/main_sse.h. Indexed by
   (arch & OPUS_ARCHMASK); non-SSE tiers fall back to the C version,
   SSE4.1/AVX tiers select the SSE4.1 build through MAY_HAVE_SSE4_1.
   Entry order is the contract; do not reorder. */
opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
 silk_encoder_state *psEncC,
 const opus_int16 pIn[]
) = {
 silk_VAD_GetSA_Q8_c, /* non-sse */
 silk_VAD_GetSA_Q8_c,
 silk_VAD_GetSA_Q8_c,
 MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
 MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */
};
68
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
/* Runtime-dispatch table for the noise-shaping quantizer. Currently
   compiled out: the SSE4.1 NSQ kernel has not been brought up to date
   with the current NSQ interface, so all callers use silk_NSQ_c directly.
   Re-enable (and re-verify the entry order) once NSQ_sse4_1.c is updated. */
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
 const silk_encoder_state *psEncC, /* I Encoder State */
 silk_nsq_state *NSQ, /* I/O NSQ state */
 SideInfoIndices *psIndices, /* I/O Quantization Indices */
 const opus_int32 x_Q3[], /* I Prefiltered input signal */
 opus_int8 pulses[], /* O Quantized pulse signal */
 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
 const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
 silk_NSQ_c, /* non-sse */
 silk_NSQ_c,
 silk_NSQ_c,
 MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
 MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */
};
#endif
94
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
/* Runtime-dispatch table for the entropy-constrained weighted VQ search.
   Currently compiled out: silk_VQ_WMat_EC_sse4_1() is out of sync with the
   current interface, so all callers use silk_VQ_WMat_EC_c directly.
   Re-enable once VQ_WMat_EC_sse4_1.c is updated. */
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
 opus_int8 *ind, /* O index of best codebook vector */
 opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
 opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
 const opus_int16 *in_Q14, /* I input vector to be quantized */
 const opus_int32 *W_Q18, /* I weighting matrix */
 const opus_int8 *cb_Q7, /* I codebook */
 const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
 const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
 const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
 const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
 opus_int L /* I number of vectors in codebook */
) = {
 silk_VQ_WMat_EC_c, /* non-sse */
 silk_VQ_WMat_EC_c,
 silk_VQ_WMat_EC_c,
 MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
 MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */
};
#endif
116
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
/* Runtime-dispatch table for the delayed-decision noise-shaping quantizer.
   Currently compiled out for the same reason as SILK_NSQ_IMPL: the SSE4.1
   kernel in NSQ_del_dec_sse4_1.c predates the current NSQ interface.
   Re-enable once that file is updated. */
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
 const silk_encoder_state *psEncC, /* I Encoder State */
 silk_nsq_state *NSQ, /* I/O NSQ state */
 SideInfoIndices *psIndices, /* I/O Quantization Indices */
 const opus_int32 x_Q3[], /* I Prefiltered input signal */
 opus_int8 pulses[], /* O Quantized pulse signal */
 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
 const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
 silk_NSQ_del_dec_c, /* non-sse */
 silk_NSQ_del_dec_c,
 silk_NSQ_del_dec_c,
 MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
 MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
};
#endif
142
#if defined(FIXED_POINT)

/* Runtime-dispatch table for the modified-Burg LPC analysis (fixed-point
   builds only). Indexed by (arch & OPUS_ARCHMASK); non-SSE tiers use the
   portable C version, SSE4.1/AVX tiers select the SSE4.1 build through
   MAY_HAVE_SSE4_1. Entry order is the contract; do not reorder. */
void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
 opus_int32 *res_nrg, /* O Residual energy */
 opus_int *res_nrg_Q, /* O Residual energy Q value */
 opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
 const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
 const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
 const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
 const opus_int nb_subfr, /* I Number of subframes stacked in x */
 const opus_int D, /* I Order */
 int arch /* I Run-time architecture */
) = {
 silk_burg_modified_c, /* non-sse */
 silk_burg_modified_c,
 silk_burg_modified_c,
 MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
 MAY_HAVE_SSE4_1( silk_burg_modified ) /* avx */
};

#endif
164#endif