path: root/lib/rbcodec/codecs/libopus/silk/fixed/x86
Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/fixed/x86')
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c  377
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c         160
-rw-r--r--  lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c      88
3 files changed, 625 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
new file mode 100644
index 0000000000..bbb1ce0fcc
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@@ -0,0 +1,377 @@
/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

#include "SigProc_FIX.h"
#include "define.h"
#include "tuning_parameters.h"
#include "pitch.h"
#include "celt/x86/x86cpu.h"

#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */

#define QA 25
#define N_BITS_HEAD_ROOM 2
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)

/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32 *res_nrg, /* O Residual energy */
    opus_int *res_nrg_Q, /* O Residual energy Q value */
    opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
    const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
    const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
    const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
    const opus_int nb_subfr, /* I Number of subframes stacked in x */
    const opus_int D, /* I Order */
    int arch /* I Run-time architecture */
)
{
    opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
    opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ];
    opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ];
    opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
    if( rshifts > MAX_RSHIFTS ) {
        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
        silk_assert( C0 > 0 );
        rshifts = MAX_RSHIFTS;
    } else {
        lz = silk_CLZ32( C0 ) - 1;
        rshifts_extra = N_BITS_HEAD_ROOM - lz;
        if( rshifts_extra > 0 ) {
            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
            C0 = silk_RSHIFT32( C0, rshifts_extra );
        } else {
            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
            C0 = silk_LSHIFT32( C0, -rshifts_extra );
        }
        rshifts += rshifts_extra;
    }
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
    if( rshifts > 0 ) {
        for( s = 0; s < nb_subfr; s++ ) {
            x_ptr = x + s * subfr_length;
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
            }
        }
    } else {
        for( s = 0; s < nb_subfr; s++ ) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
            for ( n = 1; n < D + 1; n++ ) {
                for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
                    d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
                xcorr[ n - 1 ] += d;
            }
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
            }
        }
    }
    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for( n = 0; n < D; n++ ) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if( rshifts > -2 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */
                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */
                for( k = 0; k < n; k++ ) {
                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
                    C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */
                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */
                for( k = 0; k <= n; k++ ) {
                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( -rshift ) */
                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */
                }
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */
                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */

                X1_3210 = _mm_set1_epi32( x1 );
                X2_3210 = _mm_set1_epi32( x2 );
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for( k = 0; k < n - 3; k += 4 ) {
                    PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
                    FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
                    PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
                    LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
                    ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );

                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );

                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );

                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
                    LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );

                    PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
                    SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );

                    _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
                    _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );

                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
                }

                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );

                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );

                for( ; k < n; k++ ) {
                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
                    C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
                }

                tmp1 = -tmp1; /* Q17 */
                tmp2 = -tmp2; /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
                    xmm_tmp2 = _mm_set1_epi32( tmp2 );

                    for( k = 0; k <= n - 3; k += 4 ) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );

                        X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
                        PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );

                        X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );

                        _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
                        _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
                    }

                    for( ; k <= n; k++ ) {
                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */
                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
                    }
                }
            }
        }

        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */
        tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */
        num = 0; /* Q( -rshifts ) */
        nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */
        for( k = 0; k < n; k++ ) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
            lz = silk_min( 32 - QA, lz );
            Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */

            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
                Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */
        }
        CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */
        CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */
        num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */
        num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */

        /* Calculate the next order reflection (parcor) coefficient */
        if( silk_abs( num ) < nrg ) {
            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
        } else {
            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
        if( tmp1 <= minInvGain_Q30 ) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */
            rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */
            if( rc_Q31 > 0 ) {
                /* Newton-Raphson iteration */
                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */
                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */
                if( num < 0 ) {
                    /* Ensure the adjusted reflection coefficient has the original sign */
                    rc_Q31 = -rc_Q31;
                }
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for( k = 0; k < (n + 1) >> 1; k++ ) {
            tmp1 = Af_QA[ k ]; /* QA */
            tmp2 = Af_QA[ n - k - 1 ]; /* QA */
            Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */

        if( reached_max_gain ) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for( k = n + 1; k < D; k++ ) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for( k = 0; k <= n + 1; k++ ) {
            tmp1 = CAf[ k ]; /* Q( -rshifts ) */
            tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */
            CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */
        }
    }

    if( reached_max_gain ) {
        for( k = 0; k < D; k++ ) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
        }
        /* Subtract energy of preceding samples from C0 */
        if( rshifts > 0 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg = CAf[ 0 ]; /* Q( -rshifts ) */
        tmp1 = (opus_int32)1 << 16; /* Q16 */
        for( k = 0; k < D; k++ ) {
            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */
            nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */
            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
        *res_nrg_Q = -rshifts;
    }
}
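Note on the reduction step: after the vectorized k loop in the rshifts <= -2 branch, the four 32-bit partial sums held in TMP1_3210 and TMP2_3210 are collapsed to a scalar with _mm_unpackhi_epi64 followed by _mm_shufflelo_epi16. As a hedged, standalone sketch (not part of the patch; only SSE2 assumed, compile with the matching -msse flags), the same horizontal-add idiom looks like this:

#include <emmintrin.h> /* SSE2 intrinsics */
#include <stdio.h>

/* Fold four packed 32-bit lanes into one scalar, as done for TMP1_3210/TMP2_3210:
   add the high 64 bits onto the low 64 bits, then add lane 1 onto lane 0,
   and read the total out of the low 32 bits. */
static int horizontal_sum_epi32(__m128i v)
{
    v = _mm_add_epi32(v, _mm_unpackhi_epi64(v, v));     /* lanes become {0+2, 1+3, ...} */
    v = _mm_add_epi32(v, _mm_shufflelo_epi16(v, 0x0E)); /* lane 0 += lane 1 */
    return _mm_cvtsi128_si32(v);
}

int main(void)
{
    __m128i v = _mm_set_epi32(40, 30, 20, 10);
    printf("%d\n", horizontal_sum_epi32(v)); /* prints 100 */
    return 0;
}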
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c b/lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c
new file mode 100644
index 0000000000..555432cd96
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c
@@ -0,0 +1,160 @@
/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"

void silk_warped_LPC_analysis_filter_FIX_sse4_1(
    opus_int32 state[], /* I/O State [order + 1] */
    opus_int32 res_Q2[], /* O Residual signal [length] */
    const opus_int16 coef_Q13[], /* I Coefficients [order] */
    const opus_int16 input[], /* I Input signal [length] */
    const opus_int16 lambda_Q16, /* I Warping factor */
    const opus_int length, /* I Length of input signal */
    const opus_int order /* I Filter order (even) */
)
{
    opus_int n, i;
    opus_int32 acc_Q11, tmp1, tmp2;

    /* Order must be even */
    celt_assert( ( order & 1 ) == 0 );

    if (order == 10)
    {
        if (0 == lambda_Q16)
        {
            __m128i coef_Q13_3210, coef_Q13_7654;
            __m128i coef_Q13_0123, coef_Q13_4567;
            __m128i state_0123, state_4567;
            __m128i xmm_product1, xmm_product2;
            __m128i xmm_tempa, xmm_tempb;

            register opus_int32 sum;
            register opus_int32 state_8, state_9, state_a;
            register opus_int64 coef_Q13_8, coef_Q13_9;

            celt_assert( length > 0 );

            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );

            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );

            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];

            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );

            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );

            state_8 = state[ 8 ];
            state_9 = state[ 9 ];
            state_a = 0;

            for( n = 0; n < length; n++ )
            {
                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );

                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );

                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );

                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );

                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );

                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );

                sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
                sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);

                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
                sum += _mm_cvtsi128_si32( xmm_tempa);
                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);

                /* move right */
                state_a = state_9;
                state_9 = state_8;
                state_8 = _mm_cvtsi128_si32( state_4567 );
                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );

                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
            }

            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
            state[ 8 ] = state_8;
            state[ 9 ] = state_9;
            state[ 10 ] = state_a;

            return;
        }
    }

    for( n = 0; n < length; n++ ) {
        /* Output of lowpass section */
        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
        /* Output of allpass section */
        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
        state[ 1 ] = tmp2;
        acc_Q11 = silk_RSHIFT( order, 1 );
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
        /* Loop over allpass sections */
        for( i = 2; i < order; i += 2 ) {
            /* Output of allpass section */
            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
            state[ i ] = tmp1;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
            /* Output of allpass section */
            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
            state[ i + 1 ] = tmp2;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
        }
        state[ order ] = tmp1;
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
    }
}
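Note on the "move right" step: the vectorized loop above shifts the filter state by one 32-bit element per sample using _mm_alignr_epi8, which concatenates two registers and extracts a byte-offset window. A hedged, standalone sketch of just that shift, not part of the patch and assuming SSSE3 is available:

#include <tmmintrin.h> /* SSSE3: _mm_alignr_epi8 */
#include <stdio.h>

int main(void)
{
    /* lo holds lanes {0,1,2,3}, hi holds lanes {4,5,6,7}; lane 0 is the lowest element */
    __m128i lo = _mm_set_epi32(3, 2, 1, 0);
    __m128i hi = _mm_set_epi32(7, 6, 5, 4);

    /* Drop the lowest lane of lo and pull in the lowest lane of hi,
       the same one-element shift applied to state_4567 each iteration. */
    __m128i r = _mm_alignr_epi8(hi, lo, 4);

    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* prints 1 2 3 4 */
    return 0;
}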
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
new file mode 100644
index 0000000000..c1e90564d0
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -0,0 +1,88 @@
/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"

#include "SigProc_FIX.h"
#include "pitch.h"

opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16 *inVec1, /* I input vector 1 */
    const opus_int16 *inVec2, /* I input vector 2 */
    const opus_int len /* I vector lengths */
)
{
    opus_int i, dataSize8;
    opus_int64 sum;

    __m128i xmm_tempa;
    __m128i inVec1_76543210, acc1;
    __m128i inVec2_76543210, acc2;

    sum = 0;
    dataSize8 = len & ~7;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for( i = 0; i < dataSize8; i += 8 ) {
        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );

        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );

        xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
        /* equal shift right 8 bytes */
        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );

        acc1 = _mm_add_epi64( acc1, xmm_tempa );
        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
    }

    acc1 = _mm_add_epi64( acc1, acc2 );

    /* equal shift right 8 bytes */
    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    acc1 = _mm_add_epi64( acc1, acc2 );

    _mm_storel_epi64( (__m128i *)&sum, acc1 );

    for( ; i < len; i++ ) {
        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
    }

    return sum;
}
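For reference, silk_inner_prod16_aligned_64_sse4_1 computes a plain dot product of two opus_int16 vectors accumulated in 64 bits; the _mm_madd_epi16 wrap-around noted in the comment above is the one corner case where the vector path can differ. A hedged scalar sketch, useful for spot-checking the vectorized routine (the helper name here is made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* Scalar reference: sum of a[i]*b[i] with 16x16 -> 32-bit products
   accumulated into a 64-bit total. */
static int64_t inner_prod16_ref(const int16_t *a, const int16_t *b, int len)
{
    int64_t sum = 0;
    for (int i = 0; i < len; i++) {
        sum += (int32_t)a[i] * (int32_t)b[i];
    }
    return sum;
}

int main(void)
{
    int16_t a[9] = { 1, -2, 3, -4, 5, -6, 7, -8, 9 };
    int16_t b[9] = { 9,  8, 7,  6, 5,  4, 3,  2, 1 };
    printf("%lld\n", (long long)inner_prod16_ref(a, b, 9)); /* prints 5 */
    return 0;
}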