diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/fixed/x86')
3 files changed, 625 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c new file mode 100644 index 0000000000..bbb1ce0fcc --- /dev/null +++ b/lib/rbcodec/codecs/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c | |||
@@ -0,0 +1,377 @@ | |||
1 | /* Copyright (c) 2014, Cisco Systems, INC | ||
2 | Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
3 | |||
4 | Redistribution and use in source and binary forms, with or without | ||
5 | modification, are permitted provided that the following conditions | ||
6 | are met: | ||
7 | |||
8 | - Redistributions of source code must retain the above copyright | ||
9 | notice, this list of conditions and the following disclaimer. | ||
10 | |||
11 | - Redistributions in binary form must reproduce the above copyright | ||
12 | notice, this list of conditions and the following disclaimer in the | ||
13 | documentation and/or other materials provided with the distribution. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
26 | */ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <xmmintrin.h> | ||
33 | #include <emmintrin.h> | ||
34 | #include <smmintrin.h> | ||
35 | |||
36 | #include "SigProc_FIX.h" | ||
37 | #include "define.h" | ||
38 | #include "tuning_parameters.h" | ||
39 | #include "pitch.h" | ||
40 | #include "celt/x86/x86cpu.h" | ||
41 | |||
42 | #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ | ||
43 | |||
44 | #define QA 25 | ||
45 | #define N_BITS_HEAD_ROOM 2 | ||
46 | #define MIN_RSHIFTS -16 | ||
47 | #define MAX_RSHIFTS (32 - QA) | ||
48 | |||
49 | /* Compute reflection coefficients from input signal */ | ||
50 | void silk_burg_modified_sse4_1( | ||
51 | opus_int32 *res_nrg, /* O Residual energy */ | ||
52 | opus_int *res_nrg_Q, /* O Residual energy Q value */ | ||
53 | opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ | ||
54 | const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */ | ||
55 | const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ | ||
56 | const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ | ||
57 | const opus_int nb_subfr, /* I Number of subframes stacked in x */ | ||
58 | const opus_int D, /* I Order */ | ||
59 | int arch /* I Run-time architecture */ | ||
60 | ) | ||
61 | { | ||
62 | opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; | ||
63 | opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; | ||
64 | const opus_int16 *x_ptr; | ||
65 | opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; | ||
66 | opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; | ||
67 | opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; | ||
68 | opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; | ||
69 | opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; | ||
70 | opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; | ||
71 | |||
72 | __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; | ||
73 | __m128i CONST1 = _mm_set1_epi32(1); | ||
74 | |||
75 | celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); | ||
76 | |||
77 | /* Compute autocorrelations, added over subframes */ | ||
78 | silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length ); | ||
79 | if( rshifts > MAX_RSHIFTS ) { | ||
80 | C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS ); | ||
81 | silk_assert( C0 > 0 ); | ||
82 | rshifts = MAX_RSHIFTS; | ||
83 | } else { | ||
84 | lz = silk_CLZ32( C0 ) - 1; | ||
85 | rshifts_extra = N_BITS_HEAD_ROOM - lz; | ||
86 | if( rshifts_extra > 0 ) { | ||
87 | rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts ); | ||
88 | C0 = silk_RSHIFT32( C0, rshifts_extra ); | ||
89 | } else { | ||
90 | rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts ); | ||
91 | C0 = silk_LSHIFT32( C0, -rshifts_extra ); | ||
92 | } | ||
93 | rshifts += rshifts_extra; | ||
94 | } | ||
95 | CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ | ||
96 | silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); | ||
97 | if( rshifts > 0 ) { | ||
98 | for( s = 0; s < nb_subfr; s++ ) { | ||
99 | x_ptr = x + s * subfr_length; | ||
100 | for( n = 1; n < D + 1; n++ ) { | ||
101 | C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( | ||
102 | silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); | ||
103 | } | ||
104 | } | ||
105 | } else { | ||
106 | for( s = 0; s < nb_subfr; s++ ) { | ||
107 | int i; | ||
108 | opus_int32 d; | ||
109 | x_ptr = x + s * subfr_length; | ||
110 | celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch ); | ||
111 | for( n = 1; n < D + 1; n++ ) { | ||
112 | for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ ) | ||
113 | d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] ); | ||
114 | xcorr[ n - 1 ] += d; | ||
115 | } | ||
116 | for( n = 1; n < D + 1; n++ ) { | ||
117 | C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts ); | ||
118 | } | ||
119 | } | ||
120 | } | ||
121 | silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); | ||
122 | |||
123 | /* Initialize */ | ||
124 | CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ | ||
125 | |||
126 | invGain_Q30 = (opus_int32)1 << 30; | ||
127 | reached_max_gain = 0; | ||
128 | for( n = 0; n < D; n++ ) { | ||
129 | /* Update first row of correlation matrix (without first element) */ | ||
130 | /* Update last row of correlation matrix (without last element, stored in reversed order) */ | ||
131 | /* Update C * Af */ | ||
132 | /* Update C * flipud(Af) (stored in reversed order) */ | ||
133 | if( rshifts > -2 ) { | ||
134 | for( s = 0; s < nb_subfr; s++ ) { | ||
135 | x_ptr = x + s * subfr_length; | ||
136 | x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */ | ||
137 | x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */ | ||
138 | tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */ | ||
139 | tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */ | ||
140 | for( k = 0; k < n; k++ ) { | ||
141 | C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ | ||
142 | C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ | ||
143 | Atmp_QA = Af_QA[ k ]; | ||
144 | tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */ | ||
145 | tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */ | ||
146 | } | ||
147 | tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */ | ||
148 | tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */ | ||
149 | for( k = 0; k <= n; k++ ) { | ||
150 | CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( -rshift ) */ | ||
151 | CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */ | ||
152 | } | ||
153 | } | ||
154 | } else { | ||
155 | for( s = 0; s < nb_subfr; s++ ) { | ||
156 | x_ptr = x + s * subfr_length; | ||
157 | x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */ | ||
158 | x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */ | ||
159 | tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */ | ||
160 | tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */ | ||
161 | |||
162 | X1_3210 = _mm_set1_epi32( x1 ); | ||
163 | X2_3210 = _mm_set1_epi32( x2 ); | ||
164 | TMP1_3210 = _mm_setzero_si128(); | ||
165 | TMP2_3210 = _mm_setzero_si128(); | ||
166 | for( k = 0; k < n - 3; k += 4 ) { | ||
167 | PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] ); | ||
168 | SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] ); | ||
169 | FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] ); | ||
170 | PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
171 | LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] ); | ||
172 | ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] ); | ||
173 | |||
174 | T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 ); | ||
175 | T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 ); | ||
176 | |||
177 | ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 ); | ||
178 | ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 ); | ||
179 | ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 ); | ||
180 | |||
181 | FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 ); | ||
182 | LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 ); | ||
183 | |||
184 | PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 ); | ||
185 | SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 ); | ||
186 | |||
187 | _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 ); | ||
188 | _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 ); | ||
189 | |||
190 | TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 ); | ||
191 | TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 ); | ||
192 | } | ||
193 | |||
194 | TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) ); | ||
195 | TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) ); | ||
196 | TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) ); | ||
197 | TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) ); | ||
198 | |||
199 | tmp1 += _mm_cvtsi128_si32( TMP1_3210 ); | ||
200 | tmp2 += _mm_cvtsi128_si32( TMP2_3210 ); | ||
201 | |||
202 | for( ; k < n; k++ ) { | ||
203 | C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ | ||
204 | C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ | ||
205 | Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ | ||
206 | tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ | ||
207 | tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ | ||
208 | } | ||
209 | |||
210 | tmp1 = -tmp1; /* Q17 */ | ||
211 | tmp2 = -tmp2; /* Q17 */ | ||
212 | |||
213 | { | ||
214 | __m128i xmm_tmp1, xmm_tmp2; | ||
215 | __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1; | ||
216 | __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1; | ||
217 | |||
218 | xmm_tmp1 = _mm_set1_epi32( tmp1 ); | ||
219 | xmm_tmp2 = _mm_set1_epi32( tmp2 ); | ||
220 | |||
221 | for( k = 0; k <= n - 3; k += 4 ) { | ||
222 | xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] ); | ||
223 | xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] ); | ||
224 | |||
225 | xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
226 | |||
227 | xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 ); | ||
228 | xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 ); | ||
229 | |||
230 | /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/ | ||
231 | xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); | ||
232 | xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); | ||
233 | |||
234 | xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 ); | ||
235 | xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 ); | ||
236 | xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 ); | ||
237 | xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 ); | ||
238 | |||
239 | xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 ); | ||
240 | xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 ); | ||
241 | xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 ); | ||
242 | xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 ); | ||
243 | |||
244 | xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC ); | ||
245 | xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC ); | ||
246 | |||
247 | X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] ); | ||
248 | PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] ); | ||
249 | |||
250 | X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 ); | ||
251 | PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 ); | ||
252 | |||
253 | _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 ); | ||
254 | _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 ); | ||
255 | } | ||
256 | |||
257 | for( ; k <= n; k++ ) { | ||
258 | CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1, | ||
259 | silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */ | ||
260 | CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2, | ||
261 | silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */ | ||
262 | } | ||
263 | } | ||
264 | } | ||
265 | } | ||
266 | |||
267 | /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ | ||
268 | tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */ | ||
269 | tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */ | ||
270 | num = 0; /* Q( -rshifts ) */ | ||
271 | nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */ | ||
272 | for( k = 0; k < n; k++ ) { | ||
273 | Atmp_QA = Af_QA[ k ]; | ||
274 | lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1; | ||
275 | lz = silk_min( 32 - QA, lz ); | ||
276 | Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */ | ||
277 | |||
278 | tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */ | ||
279 | tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */ | ||
280 | num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */ | ||
281 | nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ), | ||
282 | Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */ | ||
283 | } | ||
284 | CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */ | ||
285 | CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */ | ||
286 | num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */ | ||
287 | num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */ | ||
288 | |||
289 | /* Calculate the next order reflection (parcor) coefficient */ | ||
290 | if( silk_abs( num ) < nrg ) { | ||
291 | rc_Q31 = silk_DIV32_varQ( num, nrg, 31 ); | ||
292 | } else { | ||
293 | rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN; | ||
294 | } | ||
295 | |||
296 | /* Update inverse prediction gain */ | ||
297 | tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 ); | ||
298 | tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 ); | ||
299 | if( tmp1 <= minInvGain_Q30 ) { | ||
300 | /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */ | ||
301 | tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */ | ||
302 | rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */ | ||
303 | if( rc_Q31 > 0 ) { | ||
304 | /* Newton-Raphson iteration */ | ||
305 | rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */ | ||
306 | rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */ | ||
307 | if( num < 0 ) { | ||
308 | /* Ensure adjusted reflection coefficients has the original sign */ | ||
309 | rc_Q31 = -rc_Q31; | ||
310 | } | ||
311 | } | ||
312 | invGain_Q30 = minInvGain_Q30; | ||
313 | reached_max_gain = 1; | ||
314 | } else { | ||
315 | invGain_Q30 = tmp1; | ||
316 | } | ||
317 | |||
318 | /* Update the AR coefficients */ | ||
319 | for( k = 0; k < (n + 1) >> 1; k++ ) { | ||
320 | tmp1 = Af_QA[ k ]; /* QA */ | ||
321 | tmp2 = Af_QA[ n - k - 1 ]; /* QA */ | ||
322 | Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */ | ||
323 | Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */ | ||
324 | } | ||
325 | Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */ | ||
326 | |||
327 | if( reached_max_gain ) { | ||
328 | /* Reached max prediction gain; set remaining coefficients to zero and exit loop */ | ||
329 | for( k = n + 1; k < D; k++ ) { | ||
330 | Af_QA[ k ] = 0; | ||
331 | } | ||
332 | break; | ||
333 | } | ||
334 | |||
335 | /* Update C * Af and C * Ab */ | ||
336 | for( k = 0; k <= n + 1; k++ ) { | ||
337 | tmp1 = CAf[ k ]; /* Q( -rshifts ) */ | ||
338 | tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */ | ||
339 | CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */ | ||
340 | CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */ | ||
341 | } | ||
342 | } | ||
343 | |||
344 | if( reached_max_gain ) { | ||
345 | for( k = 0; k < D; k++ ) { | ||
346 | /* Scale coefficients */ | ||
347 | A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); | ||
348 | } | ||
349 | /* Subtract energy of preceding samples from C0 */ | ||
350 | if( rshifts > 0 ) { | ||
351 | for( s = 0; s < nb_subfr; s++ ) { | ||
352 | x_ptr = x + s * subfr_length; | ||
353 | C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts ); | ||
354 | } | ||
355 | } else { | ||
356 | for( s = 0; s < nb_subfr; s++ ) { | ||
357 | x_ptr = x + s * subfr_length; | ||
358 | C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts ); | ||
359 | } | ||
360 | } | ||
361 | /* Approximate residual energy */ | ||
362 | *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 ); | ||
363 | *res_nrg_Q = -rshifts; | ||
364 | } else { | ||
365 | /* Return residual energy */ | ||
366 | nrg = CAf[ 0 ]; /* Q( -rshifts ) */ | ||
367 | tmp1 = (opus_int32)1 << 16; /* Q16 */ | ||
368 | for( k = 0; k < D; k++ ) { | ||
369 | Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */ | ||
370 | nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */ | ||
371 | tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */ | ||
372 | A_Q16[ k ] = -Atmp1; | ||
373 | } | ||
374 | *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */ | ||
375 | *res_nrg_Q = -rshifts; | ||
376 | } | ||
377 | } | ||
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c b/lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c new file mode 100644 index 0000000000..555432cd96 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/silk/fixed/x86/prefilter_FIX_sse.c | |||
@@ -0,0 +1,160 @@ | |||
1 | /* Copyright (c) 2014, Cisco Systems, INC | ||
2 | Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
3 | |||
4 | Redistribution and use in source and binary forms, with or without | ||
5 | modification, are permitted provided that the following conditions | ||
6 | are met: | ||
7 | |||
8 | - Redistributions of source code must retain the above copyright | ||
9 | notice, this list of conditions and the following disclaimer. | ||
10 | |||
11 | - Redistributions in binary form must reproduce the above copyright | ||
12 | notice, this list of conditions and the following disclaimer in the | ||
13 | documentation and/or other materials provided with the distribution. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
26 | */ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <xmmintrin.h> | ||
33 | #include <emmintrin.h> | ||
34 | #include <smmintrin.h> | ||
35 | #include "main.h" | ||
36 | #include "celt/x86/x86cpu.h" | ||
37 | |||
38 | void silk_warped_LPC_analysis_filter_FIX_sse4_1( | ||
39 | opus_int32 state[], /* I/O State [order + 1] */ | ||
40 | opus_int32 res_Q2[], /* O Residual signal [length] */ | ||
41 | const opus_int16 coef_Q13[], /* I Coefficients [order] */ | ||
42 | const opus_int16 input[], /* I Input signal [length] */ | ||
43 | const opus_int16 lambda_Q16, /* I Warping factor */ | ||
44 | const opus_int length, /* I Length of input signal */ | ||
45 | const opus_int order /* I Filter order (even) */ | ||
46 | ) | ||
47 | { | ||
48 | opus_int n, i; | ||
49 | opus_int32 acc_Q11, tmp1, tmp2; | ||
50 | |||
51 | /* Order must be even */ | ||
52 | celt_assert( ( order & 1 ) == 0 ); | ||
53 | |||
54 | if (order == 10) | ||
55 | { | ||
56 | if (0 == lambda_Q16) | ||
57 | { | ||
58 | __m128i coef_Q13_3210, coef_Q13_7654; | ||
59 | __m128i coef_Q13_0123, coef_Q13_4567; | ||
60 | __m128i state_0123, state_4567; | ||
61 | __m128i xmm_product1, xmm_product2; | ||
62 | __m128i xmm_tempa, xmm_tempb; | ||
63 | |||
64 | register opus_int32 sum; | ||
65 | register opus_int32 state_8, state_9, state_a; | ||
66 | register opus_int64 coef_Q13_8, coef_Q13_9; | ||
67 | |||
68 | celt_assert( length > 0 ); | ||
69 | |||
70 | coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] ); | ||
71 | coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] ); | ||
72 | |||
73 | coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
74 | coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
75 | |||
76 | coef_Q13_8 = (opus_int64) coef_Q13[ 8 ]; | ||
77 | coef_Q13_9 = (opus_int64) coef_Q13[ 9 ]; | ||
78 | |||
79 | state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) ); | ||
80 | state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) ); | ||
81 | |||
82 | state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
83 | state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
84 | |||
85 | state_8 = state[ 8 ]; | ||
86 | state_9 = state[ 9 ]; | ||
87 | state_a = 0; | ||
88 | |||
89 | for( n = 0; n < length; n++ ) | ||
90 | { | ||
91 | xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */ | ||
92 | xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 ); | ||
93 | |||
94 | xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
95 | xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ); | ||
96 | |||
97 | xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */ | ||
98 | xmm_product2 = _mm_srli_epi64( xmm_product2, 16 ); | ||
99 | |||
100 | xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa ); | ||
101 | xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb ); | ||
102 | |||
103 | xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 ); | ||
104 | xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 ); | ||
105 | |||
106 | xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 ); | ||
107 | xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 ); | ||
108 | xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb ); | ||
109 | |||
110 | sum = (opus_int32)((coef_Q13_8 * state_8) >> 16); | ||
111 | sum += (opus_int32)((coef_Q13_9 * state_9) >> 16); | ||
112 | |||
113 | xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) ); | ||
114 | sum += _mm_cvtsi128_si32( xmm_tempa); | ||
115 | res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9); | ||
116 | |||
117 | /* move right */ | ||
118 | state_a = state_9; | ||
119 | state_9 = state_8; | ||
120 | state_8 = _mm_cvtsi128_si32( state_4567 ); | ||
121 | state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 ); | ||
122 | |||
123 | state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 ); | ||
124 | } | ||
125 | |||
126 | _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) ); | ||
127 | _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) ); | ||
128 | state[ 8 ] = state_8; | ||
129 | state[ 9 ] = state_9; | ||
130 | state[ 10 ] = state_a; | ||
131 | |||
132 | return; | ||
133 | } | ||
134 | } | ||
135 | |||
136 | for( n = 0; n < length; n++ ) { | ||
137 | /* Output of lowpass section */ | ||
138 | tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 ); | ||
139 | state[ 0 ] = silk_LSHIFT( input[ n ], 14 ); | ||
140 | /* Output of allpass section */ | ||
141 | tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 ); | ||
142 | state[ 1 ] = tmp2; | ||
143 | acc_Q11 = silk_RSHIFT( order, 1 ); | ||
144 | acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] ); | ||
145 | /* Loop over allpass sections */ | ||
146 | for( i = 2; i < order; i += 2 ) { | ||
147 | /* Output of allpass section */ | ||
148 | tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 ); | ||
149 | state[ i ] = tmp1; | ||
150 | acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] ); | ||
151 | /* Output of allpass section */ | ||
152 | tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 ); | ||
153 | state[ i + 1 ] = tmp2; | ||
154 | acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] ); | ||
155 | } | ||
156 | state[ order ] = tmp1; | ||
157 | acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] ); | ||
158 | res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 ); | ||
159 | } | ||
160 | } | ||
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c new file mode 100644 index 0000000000..c1e90564d0 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c | |||
@@ -0,0 +1,88 @@ | |||
1 | /* Copyright (c) 2014, Cisco Systems, INC | ||
2 | Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
3 | |||
4 | Redistribution and use in source and binary forms, with or without | ||
5 | modification, are permitted provided that the following conditions | ||
6 | are met: | ||
7 | |||
8 | - Redistributions of source code must retain the above copyright | ||
9 | notice, this list of conditions and the following disclaimer. | ||
10 | |||
11 | - Redistributions in binary form must reproduce the above copyright | ||
12 | notice, this list of conditions and the following disclaimer in the | ||
13 | documentation and/or other materials provided with the distribution. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
26 | */ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <xmmintrin.h> | ||
33 | #include <emmintrin.h> | ||
34 | #include <smmintrin.h> | ||
35 | #include "main.h" | ||
36 | |||
37 | #include "SigProc_FIX.h" | ||
38 | #include "pitch.h" | ||
39 | |||
40 | opus_int64 silk_inner_prod16_aligned_64_sse4_1( | ||
41 | const opus_int16 *inVec1, /* I input vector 1 */ | ||
42 | const opus_int16 *inVec2, /* I input vector 2 */ | ||
43 | const opus_int len /* I vector lengths */ | ||
44 | ) | ||
45 | { | ||
46 | opus_int i, dataSize8; | ||
47 | opus_int64 sum; | ||
48 | |||
49 | __m128i xmm_tempa; | ||
50 | __m128i inVec1_76543210, acc1; | ||
51 | __m128i inVec2_76543210, acc2; | ||
52 | |||
53 | sum = 0; | ||
54 | dataSize8 = len & ~7; | ||
55 | |||
56 | acc1 = _mm_setzero_si128(); | ||
57 | acc2 = _mm_setzero_si128(); | ||
58 | |||
59 | for( i = 0; i < dataSize8; i += 8 ) { | ||
60 | inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) ); | ||
61 | inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) ); | ||
62 | |||
63 | /* only when all 4 operands are -32768 (0x8000), this results in wrap around */ | ||
64 | inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 ); | ||
65 | |||
66 | xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 ); | ||
67 | /* equal shift right 8 bytes */ | ||
68 | inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) ); | ||
69 | inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 ); | ||
70 | |||
71 | acc1 = _mm_add_epi64( acc1, xmm_tempa ); | ||
72 | acc2 = _mm_add_epi64( acc2, inVec1_76543210 ); | ||
73 | } | ||
74 | |||
75 | acc1 = _mm_add_epi64( acc1, acc2 ); | ||
76 | |||
77 | /* equal shift right 8 bytes */ | ||
78 | acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) ); | ||
79 | acc1 = _mm_add_epi64( acc1, acc2 ); | ||
80 | |||
81 | _mm_storel_epi64( (__m128i *)&sum, acc1 ); | ||
82 | |||
83 | for( ; i < len; i++ ) { | ||
84 | sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] ); | ||
85 | } | ||
86 | |||
87 | return sum; | ||
88 | } | ||