diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c | 719 |
1 files changed, 719 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c new file mode 100644 index 0000000000..b0315e35fc --- /dev/null +++ b/lib/rbcodec/codecs/libopus/silk/x86/NSQ_sse4_1.c | |||
@@ -0,0 +1,719 @@ | |||
/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <xmmintrin.h> | ||
33 | #include <emmintrin.h> | ||
34 | #include <smmintrin.h> | ||
35 | #include "main.h" | ||
36 | #include "celt/x86/x86cpu.h" | ||
37 | #include "stack_alloc.h" | ||
38 | |||
39 | static OPUS_INLINE void silk_nsq_scale_states_sse4_1( | ||
40 | const silk_encoder_state *psEncC, /* I Encoder State */ | ||
41 | silk_nsq_state *NSQ, /* I/O NSQ state */ | ||
42 | const opus_int32 x_Q3[], /* I input in Q3 */ | ||
43 | opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ | ||
44 | const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ | ||
45 | opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ | ||
46 | opus_int subfr, /* I subframe number */ | ||
47 | const opus_int LTP_scale_Q14, /* I */ | ||
48 | const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ | ||
49 | const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ | ||
50 | const opus_int signal_type /* I Signal type */ | ||
51 | ); | ||
52 | |||
53 | static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( | ||
54 | silk_nsq_state *NSQ, /* I/O NSQ state */ | ||
55 | opus_int signalType, /* I Signal type */ | ||
56 | const opus_int32 x_sc_Q10[], /* I */ | ||
57 | opus_int8 pulses[], /* O */ | ||
58 | opus_int16 xq[], /* O */ | ||
59 | opus_int32 sLTP_Q15[], /* I/O LTP state */ | ||
60 | const opus_int16 a_Q12[], /* I Short term prediction coefs */ | ||
61 | const opus_int16 b_Q14[], /* I Long term prediction coefs */ | ||
62 | const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ | ||
63 | opus_int lag, /* I Pitch lag */ | ||
64 | opus_int32 HarmShapeFIRPacked_Q14, /* I */ | ||
65 | opus_int Tilt_Q14, /* I Spectral tilt */ | ||
66 | opus_int32 LF_shp_Q14, /* I */ | ||
67 | opus_int32 Gain_Q16, /* I */ | ||
68 | opus_int offset_Q10, /* I */ | ||
69 | opus_int length, /* I Input length */ | ||
70 | opus_int32 table[][4] /* I */ | ||
71 | ); | ||
72 | |||
73 | void silk_NSQ_sse4_1( | ||
74 | const silk_encoder_state *psEncC, /* I Encoder State */ | ||
75 | silk_nsq_state *NSQ, /* I/O NSQ state */ | ||
76 | SideInfoIndices *psIndices, /* I/O Quantization Indices */ | ||
77 | const opus_int32 x_Q3[], /* I Prefiltered input signal */ | ||
78 | opus_int8 pulses[], /* O Quantized pulse signal */ | ||
79 | const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ | ||
80 | const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ | ||
81 | const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ | ||
82 | const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ | ||
83 | const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ | ||
84 | const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ | ||
85 | const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ | ||
86 | const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ | ||
87 | const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ | ||
88 | const opus_int LTP_scale_Q14 /* I LTP state scaling */ | ||
89 | ) | ||
90 | { | ||
91 | opus_int k, lag, start_idx, LSF_interpolation_flag; | ||
92 | const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; | ||
93 | opus_int16 *pxq; | ||
94 | VARDECL( opus_int32, sLTP_Q15 ); | ||
95 | VARDECL( opus_int16, sLTP ); | ||
96 | opus_int32 HarmShapeFIRPacked_Q14; | ||
97 | opus_int offset_Q10; | ||
98 | VARDECL( opus_int32, x_sc_Q10 ); | ||
99 | |||
100 | opus_int32 table[ 64 ][ 4 ]; | ||
101 | opus_int32 tmp1; | ||
102 | opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20; | ||
103 | |||
104 | SAVE_STACK; | ||
105 | |||
106 | NSQ->rand_seed = psIndices->Seed; | ||
107 | |||
108 | /* Set unvoiced lag to the previous one, overwrite later for voiced */ | ||
109 | lag = NSQ->lagPrev; | ||
110 | |||
111 | silk_assert( NSQ->prev_gain_Q16 != 0 ); | ||
112 | |||
113 | offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; | ||
114 | |||
115 | /* 0 */ | ||
116 | q1_Q10 = offset_Q10; | ||
117 | q2_Q10 = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 ); | ||
118 | rd1_Q20 = q1_Q10 * Lambda_Q10; | ||
119 | rd2_Q20 = q2_Q10 * Lambda_Q10; | ||
120 | |||
121 | table[ 32 ][ 0 ] = q1_Q10; | ||
122 | table[ 32 ][ 1 ] = q2_Q10; | ||
123 | table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10); | ||
124 | table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); | ||
125 | |||
126 | /* -1 */ | ||
127 | q1_Q10 = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ); | ||
128 | q2_Q10 = offset_Q10; | ||
129 | rd1_Q20 = - q1_Q10 * Lambda_Q10; | ||
130 | rd2_Q20 = q2_Q10 * Lambda_Q10; | ||
131 | |||
132 | table[ 31 ][ 0 ] = q1_Q10; | ||
133 | table[ 31 ][ 1 ] = q2_Q10; | ||
134 | table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10); | ||
135 | table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); | ||
136 | |||
137 | /* > 0 */ | ||
138 | for (k = 1; k <= 31; k++) | ||
139 | { | ||
140 | tmp1 = offset_Q10 + silk_LSHIFT( k, 10 ); | ||
141 | |||
142 | q1_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10; | ||
143 | q2_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024; | ||
144 | rd1_Q20 = q1_Q10 * Lambda_Q10; | ||
145 | rd2_Q20 = q2_Q10 * Lambda_Q10; | ||
146 | |||
147 | table[ 32 + k ][ 0 ] = q1_Q10; | ||
148 | table[ 32 + k ][ 1 ] = q2_Q10; | ||
149 | table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10); | ||
150 | table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); | ||
151 | } | ||
152 | |||
153 | /* < -1 */ | ||
154 | for (k = -32; k <= -2; k++) | ||
155 | { | ||
156 | tmp1 = offset_Q10 + silk_LSHIFT( k, 10 ); | ||
157 | |||
158 | q1_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10; | ||
159 | q2_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024; | ||
160 | rd1_Q20 = - q1_Q10 * Lambda_Q10; | ||
161 | rd2_Q20 = - q2_Q10 * Lambda_Q10; | ||
162 | |||
163 | table[ 32 + k ][ 0 ] = q1_Q10; | ||
164 | table[ 32 + k ][ 1 ] = q2_Q10; | ||
165 | table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10); | ||
166 | table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); | ||
167 | } | ||
168 | |||
169 | if( psIndices->NLSFInterpCoef_Q2 == 4 ) { | ||
170 | LSF_interpolation_flag = 0; | ||
171 | } else { | ||
172 | LSF_interpolation_flag = 1; | ||
173 | } | ||
174 | |||
175 | ALLOC( sLTP_Q15, | ||
176 | psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); | ||
177 | ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); | ||
178 | ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); | ||
179 | /* Set up pointers to start of sub frame */ | ||
180 | NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; | ||
181 | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; | ||
182 | pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; | ||
183 | for( k = 0; k < psEncC->nb_subfr; k++ ) { | ||
184 | A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; | ||
185 | B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; | ||
186 | AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; | ||
187 | |||
188 | /* Noise shape parameters */ | ||
189 | silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); | ||
190 | HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); | ||
191 | HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); | ||
192 | |||
193 | NSQ->rewhite_flag = 0; | ||
194 | if( psIndices->signalType == TYPE_VOICED ) { | ||
195 | /* Voiced */ | ||
196 | lag = pitchL[ k ]; | ||
197 | |||
198 | /* Re-whitening */ | ||
199 | if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { | ||
200 | /* Rewhiten with new A coefs */ | ||
201 | start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; | ||
202 | celt_assert( start_idx > 0 ); | ||
203 | |||
204 | silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], | ||
205 | A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); | ||
206 | |||
207 | NSQ->rewhite_flag = 1; | ||
208 | NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; | ||
209 | } | ||
210 | } | ||
211 | |||
212 | silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); | ||
213 | |||
214 | if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) ) | ||
215 | { | ||
216 | silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, | ||
217 | AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], | ||
218 | offset_Q10, psEncC->subfr_length, &(table[32]) ); | ||
219 | } | ||
220 | else | ||
221 | { | ||
222 | silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, | ||
223 | AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, | ||
224 | offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch ); | ||
225 | } | ||
226 | |||
227 | x_Q3 += psEncC->subfr_length; | ||
228 | pulses += psEncC->subfr_length; | ||
229 | pxq += psEncC->subfr_length; | ||
230 | } | ||
231 | |||
232 | /* Update lagPrev for next frame */ | ||
233 | NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; | ||
234 | |||
235 | /* Save quantized speech and noise shaping signals */ | ||
236 | silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); | ||
237 | silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); | ||
238 | RESTORE_STACK; | ||
239 | } | ||
240 | |||
241 | /***********************************/ | ||
242 | /* silk_noise_shape_quantizer_10_16 */ | ||
243 | /***********************************/ | ||
244 | static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( | ||
245 | silk_nsq_state *NSQ, /* I/O NSQ state */ | ||
246 | opus_int signalType, /* I Signal type */ | ||
247 | const opus_int32 x_sc_Q10[], /* I */ | ||
248 | opus_int8 pulses[], /* O */ | ||
249 | opus_int16 xq[], /* O */ | ||
250 | opus_int32 sLTP_Q15[], /* I/O LTP state */ | ||
251 | const opus_int16 a_Q12[], /* I Short term prediction coefs */ | ||
252 | const opus_int16 b_Q14[], /* I Long term prediction coefs */ | ||
253 | const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ | ||
254 | opus_int lag, /* I Pitch lag */ | ||
255 | opus_int32 HarmShapeFIRPacked_Q14, /* I */ | ||
256 | opus_int Tilt_Q14, /* I Spectral tilt */ | ||
257 | opus_int32 LF_shp_Q14, /* I */ | ||
258 | opus_int32 Gain_Q16, /* I */ | ||
259 | opus_int offset_Q10, /* I */ | ||
260 | opus_int length, /* I Input length */ | ||
261 | opus_int32 table[][4] /* I */ | ||
262 | ) | ||
263 | { | ||
264 | opus_int i; | ||
265 | opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13; | ||
266 | opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10; | ||
267 | opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; | ||
268 | opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; | ||
269 | opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr; | ||
270 | |||
271 | __m128i xmm_tempa, xmm_tempb; | ||
272 | |||
273 | __m128i xmm_one; | ||
274 | |||
275 | __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF; | ||
276 | __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF; | ||
277 | __m128i a_Q12_01234567, a_Q12_89ABCDEF; | ||
278 | |||
279 | __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210; | ||
280 | __m128i AR_shp_Q13_76543210; | ||
281 | |||
282 | shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; | ||
283 | pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; | ||
284 | Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); | ||
285 | |||
286 | /* Set up short term AR state */ | ||
287 | psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ]; | ||
288 | |||
289 | sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14; | ||
290 | xq_Q14 = psLPC_Q14[ 0 ]; | ||
291 | LTP_pred_Q13 = 0; | ||
292 | |||
293 | /* load a_Q12 */ | ||
294 | xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); | ||
295 | |||
296 | /* load a_Q12[0] - a_Q12[7] */ | ||
297 | a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) ); | ||
298 | /* load a_Q12[ 8 ] - a_Q12[ 15 ] */ | ||
299 | a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) ); | ||
300 | |||
301 | a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one ); | ||
302 | a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one ); | ||
303 | |||
304 | /* load AR_shp_Q13 */ | ||
305 | AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) ); | ||
306 | |||
307 | /* load psLPC_Q14 */ | ||
308 | xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); | ||
309 | |||
310 | xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) ); | ||
311 | xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) ); | ||
312 | |||
313 | xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); | ||
314 | xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); | ||
315 | |||
316 | psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); | ||
317 | psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); | ||
318 | |||
319 | xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) ); | ||
320 | xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) ); | ||
321 | |||
322 | xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); | ||
323 | xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); | ||
324 | |||
325 | psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); | ||
326 | psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); | ||
327 | |||
328 | /* load sAR2_Q14 */ | ||
329 | xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) ); | ||
330 | xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) ); | ||
331 | |||
332 | xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); | ||
333 | xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); | ||
334 | |||
335 | sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); | ||
336 | sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); | ||
337 | |||
338 | /* prepare 1 in 8 * 16bit */ | ||
339 | xmm_one = _mm_set1_epi16(1); | ||
340 | |||
341 | for( i = 0; i < length; i++ ) | ||
342 | { | ||
343 | /* Short-term prediction */ | ||
344 | __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F; | ||
345 | |||
346 | /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ | ||
347 | LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */ | ||
348 | |||
349 | /* shift psLPC_Q14 */ | ||
350 | psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 ); | ||
351 | psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 ); | ||
352 | |||
353 | psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 ); | ||
354 | psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 ); | ||
355 | |||
356 | psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 ); | ||
357 | psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 ); | ||
358 | |||
359 | /* high part, use pmaddwd, results in 4 32-bit */ | ||
360 | xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 ); | ||
361 | xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF ); | ||
362 | |||
363 | /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */ | ||
364 | xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 ); | ||
365 | xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF ); | ||
366 | |||
367 | xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 ); | ||
368 | xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF ); | ||
369 | |||
370 | xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 ); | ||
371 | xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF ); | ||
372 | |||
373 | xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa ); | ||
374 | xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb ); | ||
375 | |||
376 | xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one ); | ||
377 | xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one ); | ||
378 | |||
379 | /* accumulate */ | ||
380 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F ); | ||
381 | xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F ); | ||
382 | |||
383 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 ); | ||
384 | |||
385 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) ); | ||
386 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) ); | ||
387 | |||
388 | LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 ); | ||
389 | |||
390 | /* Long-term prediction */ | ||
391 | if ( opus_likely( signalType == TYPE_VOICED ) ) { | ||
392 | /* Unrolled loop */ | ||
393 | /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ | ||
394 | LTP_pred_Q13 = 2; | ||
395 | { | ||
396 | __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123; | ||
397 | |||
398 | b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 ); | ||
399 | b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B ); | ||
400 | |||
401 | /* loaded: [0] [-1] [-2] [-3] */ | ||
402 | pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); | ||
403 | /* shuffle to [-3] [-2] [-1] [0] and to new xmm */ | ||
404 | xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B ); | ||
405 | /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */ | ||
406 | xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 ); | ||
407 | /* right shift 2 bytes (16 bits), zero extended */ | ||
408 | xmm_tempa = _mm_srli_si128( xmm_tempa, 2 ); | ||
409 | |||
410 | /* a[1] * b[-1], a[3] * b[-3] */ | ||
411 | pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 ); | ||
412 | pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 ); | ||
413 | |||
414 | pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa ); | ||
415 | /* equal shift right 8 bytes*/ | ||
416 | xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) ); | ||
417 | xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 ); | ||
418 | |||
419 | LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa ); | ||
420 | |||
421 | LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); | ||
422 | pred_lag_ptr++; | ||
423 | } | ||
424 | } | ||
425 | |||
426 | /* Noise shape feedback */ | ||
427 | NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ]; | ||
428 | NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) ); | ||
429 | |||
430 | sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 ); | ||
431 | sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 ); | ||
432 | |||
433 | sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 ); | ||
434 | sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 ); | ||
435 | |||
436 | /* high part, use pmaddwd, results in 4 32-bit */ | ||
437 | xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 ); | ||
438 | |||
439 | /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */ | ||
440 | xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 ); | ||
441 | xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 ); | ||
442 | |||
443 | xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 ); | ||
444 | xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa ); | ||
445 | |||
446 | xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one ); | ||
447 | |||
448 | /* accumulate */ | ||
449 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 ); | ||
450 | |||
451 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) ); | ||
452 | xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) ); | ||
453 | |||
454 | n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 ); | ||
455 | |||
456 | n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] ); | ||
457 | n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] ); | ||
458 | |||
459 | n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 ); /* Q11 -> Q12 */ | ||
460 | n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 ); | ||
461 | |||
462 | n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 ); | ||
463 | n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 ); | ||
464 | |||
465 | silk_assert( lag > 0 || signalType != TYPE_VOICED ); | ||
466 | |||
467 | /* Combine prediction and noise shaping signals */ | ||
468 | tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ | ||
469 | tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ | ||
470 | if( lag > 0 ) { | ||
471 | /* Symmetric, packed FIR coefficients */ | ||
472 | n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); | ||
473 | n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); | ||
474 | n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); | ||
475 | shp_lag_ptr++; | ||
476 | |||
477 | tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */ | ||
478 | tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */ | ||
479 | tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */ | ||
480 | } else { | ||
481 | tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */ | ||
482 | } | ||
483 | |||
484 | r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */ | ||
485 | |||
486 | /* Generate dither */ | ||
487 | NSQ->rand_seed = silk_RAND( NSQ->rand_seed ); | ||
488 | |||
489 | /* Flip sign depending on dither */ | ||
490 | tmp2 = -r_Q10; | ||
491 | if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2; | ||
492 | |||
493 | r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); | ||
494 | |||
495 | /* Find two quantization level candidates and measure their rate-distortion */ | ||
496 | q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); | ||
497 | q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); | ||
498 | |||
499 | q1_Q10 = table[q1_Q0][0]; | ||
500 | q2_Q10 = table[q1_Q0][1]; | ||
501 | |||
502 | if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0) | ||
503 | { | ||
504 | q1_Q10 = q2_Q10; | ||
505 | } | ||
506 | |||
507 | pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 ); | ||
508 | |||
509 | /* Excitation */ | ||
510 | exc_Q14 = silk_LSHIFT( q1_Q10, 4 ); | ||
511 | |||
512 | tmp2 = -exc_Q14; | ||
513 | if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2; | ||
514 | |||
515 | /* Add predictions */ | ||
516 | LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 ); | ||
517 | xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 ); | ||
518 | |||
519 | /* Update states */ | ||
520 | psLPC_Q14++; | ||
521 | *psLPC_Q14 = xq_Q14; | ||
522 | sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 ); | ||
523 | |||
524 | NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); | ||
525 | sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); | ||
526 | NSQ->sLTP_shp_buf_idx++; | ||
527 | NSQ->sLTP_buf_idx++; | ||
528 | |||
529 | /* Make dither dependent on quantized signal */ | ||
530 | NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] ); | ||
531 | } | ||
532 | |||
533 | NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14; | ||
534 | |||
535 | /* Scale XQ back to normal level before saving */ | ||
536 | psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ]; | ||
537 | |||
538 | /* write back sAR2_Q14 */ | ||
539 | xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); | ||
540 | xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); | ||
541 | _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); | ||
542 | _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); | ||
543 | |||
544 | /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */ | ||
545 | { | ||
546 | __m128i xmm_Gain_Q10; | ||
547 | __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5; | ||
548 | |||
549 | /* prepare (1 << 7) in packed 4 32-bits */ | ||
550 | xmm_tempa = _mm_set1_epi32( (1 << 7) ); | ||
551 | |||
552 | /* prepare Gain_Q10 in packed 4 32-bits */ | ||
553 | xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 ); | ||
554 | |||
555 | /* process xq */ | ||
556 | for (i = 0; i < length - 7; i += 8) | ||
557 | { | ||
558 | xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) ); | ||
559 | xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) ); | ||
560 | |||
561 | /* equal shift right 4 bytes*/ | ||
562 | xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); | ||
563 | /* equal shift right 4 bytes*/ | ||
564 | xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) ); | ||
565 | |||
566 | xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 ); | ||
567 | xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 ); | ||
568 | xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 ); | ||
569 | xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 ); | ||
570 | |||
571 | xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 ); | ||
572 | xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 ); | ||
573 | xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 ); | ||
574 | xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 ); | ||
575 | |||
576 | xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC ); | ||
577 | xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC ); | ||
578 | |||
579 | /* silk_RSHIFT_ROUND(xq, 8) */ | ||
580 | xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa ); | ||
581 | xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa ); | ||
582 | |||
583 | xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 ); | ||
584 | xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 ); | ||
585 | |||
586 | /* silk_SAT16 */ | ||
587 | xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 ); | ||
588 | |||
589 | /* save to xq */ | ||
590 | _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 ); | ||
591 | } | ||
592 | } | ||
593 | for ( ; i < length; i++) | ||
594 | { | ||
595 | xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); | ||
596 | } | ||
597 | |||
598 | /* Update LPC synth buffer */ | ||
599 | silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); | ||
600 | } | ||
601 | |||
602 | static OPUS_INLINE void silk_nsq_scale_states_sse4_1( | ||
603 | const silk_encoder_state *psEncC, /* I Encoder State */ | ||
604 | silk_nsq_state *NSQ, /* I/O NSQ state */ | ||
605 | const opus_int32 x_Q3[], /* I input in Q3 */ | ||
606 | opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ | ||
607 | const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ | ||
608 | opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ | ||
609 | opus_int subfr, /* I subframe number */ | ||
610 | const opus_int LTP_scale_Q14, /* I */ | ||
611 | const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ | ||
612 | const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ | ||
613 | const opus_int signal_type /* I Signal type */ | ||
614 | ) | ||
615 | { | ||
616 | opus_int i, lag; | ||
617 | opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; | ||
618 | __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; | ||
619 | |||
620 | lag = pitchL[ subfr ]; | ||
621 | inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); | ||
622 | silk_assert( inv_gain_Q31 != 0 ); | ||
623 | |||
624 | /* Calculate gain adjustment factor */ | ||
625 | if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { | ||
626 | gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); | ||
627 | } else { | ||
628 | gain_adj_Q16 = (opus_int32)1 << 16; | ||
629 | } | ||
630 | |||
631 | /* Scale input */ | ||
632 | inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); | ||
633 | |||
634 | /* prepare inv_gain_Q23 in packed 4 32-bits */ | ||
635 | xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); | ||
636 | |||
637 | for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { | ||
638 | xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); | ||
639 | |||
640 | /* equal shift right 4 bytes*/ | ||
641 | xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); | ||
642 | |||
643 | xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); | ||
644 | xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); | ||
645 | |||
646 | xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); | ||
647 | xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); | ||
648 | |||
649 | xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); | ||
650 | |||
651 | _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 ); | ||
652 | } | ||
653 | |||
654 | for( ; i < psEncC->subfr_length; i++ ) { | ||
655 | x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); | ||
656 | } | ||
657 | |||
658 | /* Save inverse gain */ | ||
659 | NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; | ||
660 | |||
661 | /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ | ||
662 | if( NSQ->rewhite_flag ) { | ||
663 | if( subfr == 0 ) { | ||
664 | /* Do LTP downscaling */ | ||
665 | inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); | ||
666 | } | ||
667 | for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { | ||
668 | silk_assert( i < MAX_FRAME_LENGTH ); | ||
669 | sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); | ||
670 | } | ||
671 | } | ||
672 | |||
673 | /* Adjust for changing gain */ | ||
674 | if( gain_adj_Q16 != (opus_int32)1 << 16 ) { | ||
675 | /* Scale long-term shaping state */ | ||
676 | __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; | ||
677 | |||
678 | /* prepare gain_adj_Q16 in packed 4 32-bits */ | ||
679 | xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16); | ||
680 | |||
681 | for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) | ||
682 | { | ||
683 | xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); | ||
684 | /* equal shift right 4 bytes*/ | ||
685 | xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); | ||
686 | |||
687 | xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 ); | ||
688 | xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 ); | ||
689 | |||
690 | xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 ); | ||
691 | xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 ); | ||
692 | |||
693 | xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); | ||
694 | |||
695 | _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); | ||
696 | } | ||
697 | |||
698 | for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { | ||
699 | NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); | ||
700 | } | ||
701 | |||
702 | /* Scale long-term prediction state */ | ||
703 | if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { | ||
704 | for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { | ||
705 | sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); | ||
706 | } | ||
707 | } | ||
708 | |||
709 | NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 ); | ||
710 | |||
711 | /* Scale short-term prediction and shaping states */ | ||
712 | for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { | ||
713 | NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] ); | ||
714 | } | ||
715 | for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { | ||
716 | NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] ); | ||
717 | } | ||
718 | } | ||
719 | } | ||