diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c | 260 |
1 files changed, 260 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/lib/rbcodec/codecs/libopus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c new file mode 100644 index 0000000000..00a70cb51f --- /dev/null +++ b/lib/rbcodec/codecs/libopus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c | |||
@@ -0,0 +1,260 @@ | |||
1 | /*********************************************************************** | ||
2 | Copyright (c) 2017 Google Inc., Jean-Marc Valin | ||
3 | Redistribution and use in source and binary forms, with or without | ||
4 | modification, are permitted provided that the following conditions | ||
5 | are met: | ||
6 | - Redistributions of source code must retain the above copyright notice, | ||
7 | this list of conditions and the following disclaimer. | ||
8 | - Redistributions in binary form must reproduce the above copyright | ||
9 | notice, this list of conditions and the following disclaimer in the | ||
10 | documentation and/or other materials provided with the distribution. | ||
11 | - Neither the name of Internet Society, IETF or IETF Trust, nor the | ||
12 | names of specific contributors, may be used to endorse or promote | ||
13 | products derived from this software without specific prior written | ||
14 | permission. | ||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
18 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
19 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
20 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
21 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
22 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
23 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
24 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
25 | POSSIBILITY OF SUCH DAMAGE. | ||
26 | ***********************************************************************/ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <arm_neon.h> | ||
33 | #ifdef OPUS_CHECK_ASM | ||
34 | # include <string.h> | ||
35 | #endif | ||
36 | #include "stack_alloc.h" | ||
37 | #include "main_FIX.h" | ||
38 | |||
/* Accumulate one group of 4 correlation taps:
 *   corr_QC[ offset + k ] += ( state[k] * input_QS[ offset + k ] ) >> ( 2 * QS - QC ),  k = 0..3
 * using widening 32x32->64 multiplies and a shift-right-accumulate.
 */
static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 )
{
    const int32x4_t in_s32x4  = vld1q_s32( input_QS + offset );
    int64x2_t       acc_lo    = vld1q_s64( corr_QC + offset + 0 );
    int64x2_t       acc_hi    = vld1q_s64( corr_QC + offset + 2 );
    /* Widening products of state and input, low then high pair. */
    const int64x2_t prod_lo   = vmull_s32( vget_low_s32(  state_QS_s32x4 ), vget_low_s32(  in_s32x4 ) );
    const int64x2_t prod_hi   = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( in_s32x4 ) );
    /* Rescale from Q(2*QS) down to QC while accumulating. */
    acc_lo = vsraq_n_s64( acc_lo, prod_lo, 2 * QS - QC );
    acc_hi = vsraq_n_s64( acc_hi, prod_hi, 2 * QS - QC );
    vst1q_s64( corr_QC + offset + 0, acc_lo );
    vst1q_s64( corr_QC + offset + 2, acc_hi );
}
52 | |||
/* Advance the warped (allpass) filter state for 4 taps:
 *   state1[i] = state1[i+1] + warping * ( state0[i] - state0[i+1] )
 * The caller pre-shifts warping_Q16 left by 15 so that the saturating
 * doubling-high multiply vqdmulhq_s32 realizes the Q16 scaling.
 */
static OPUS_INLINE int32x4_t calc_state( const int32x4_t state_QS0_s32x4, const int32x4_t state_QS0_1_s32x4, const int32x4_t state_QS1_1_s32x4, const int32x4_t warping_Q16_s32x4 )
{
    const int32x4_t diff_s32x4   = vsubq_s32( state_QS0_s32x4, state_QS0_1_s32x4 );
    const int32x4_t warped_s32x4 = vqdmulhq_s32( diff_s32x4, warping_Q16_s32x4 );
    return vaddq_s32( state_QS1_1_s32x4, warped_s32x4 );
}
59 | |||
/*
 * NEON implementation of the SILK warped autocorrelation used by the
 * fixed-point noise shaping analysis.  Computes corr[ 0..order ] of `input`
 * passed through a cascade of first-order allpass sections with coefficient
 * warping_Q16, together with a common scaling exponent *scale.
 *
 * Falls back to the plain C version when the SIMD kernel's compile-time tap
 * budget (24) is exceeded or the order is too small to benefit from SIMD.
 */
void silk_warped_autocorrelation_FIX_neon(
    opus_int32                *corr,                     /* O    Result [order + 1]                   */
    opus_int                  *scale,                    /* O    Scaling of the correlation vector    */
    const opus_int16          *input,                    /* I    Input data to correlate              */
    const opus_int            warping_Q16,               /* I    Warping coefficient                  */
    const opus_int            length,                    /* I    Length of input                      */
    const opus_int            order                      /* I    Correlation order (even)             */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int n, i, lsh;
        opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64 corr_QC_orderT;
        int64x2_t lsh_s64x2;
        /* Round the order up to a multiple of 4 so every tap group fills a full vector. */
        const opus_int orderT = ( order + 3 ) & ~3;
        opus_int64 *corr_QCT;
        opus_int32 *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        /* MAX_SHAPE_LPC_ORDER extra entries on each side for zero padding. */
        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
        /* 6 stores x 4 lanes = MAX_SHAPE_LPC_ORDER (24) leading zeros. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples: widen opus_int16 input to QS fixed point, 8 at a time. */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x4 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
        }
        /* Scalar tail for the remaining ( length & 7 ) samples. */
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        /* Trailing zero padding, again MAX_SHAPE_LPC_ORDER entries. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        /* Rewind so that input_QS[ orderT ] is the first real sample. */
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */
        /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            /* Pre-shift warping so vqdmulhq_s32 in calc_state performs the Q16 scaling. */
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            const opus_int32 *in = input_QS + orderT;
            opus_int o = orderT;
            /* state_QS_s32x4[ 0 ] = current state, [ 1 ] = previous, [ 2 ] = scratch/shifted. */
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            ALLOC( state, length + orderT, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    /* Accumulate correlations for this 8-tap slice. */
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    /* Save the lowest lane as the allpass-filtered input for the next 8-tap pass. */
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    /* Shift states left by one lane to form the "i + 1" neighbors. */
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                /* Next pass reads the filtered signal produced above. */
                in = state;
                o -= 8;
            } while( o > 4 );

            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

        {
            /* Tap 0 (energy): plain sum of squares of the unwarped input, in QC. */
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            /* Scalar tail. */
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        /* Normalize: choose lsh from the energy term so results fit in 32 bits. */
        corr_QCT = corr_QC + orderT - order;
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
        /* corr_QC is stored in reverse order: narrow, reverse and write 4 at a time. */
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4 = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        /* Scalar tail of the output scaling, split by shift direction. */
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    /* Debug build: verify the NEON result matches the reference C implementation. */
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}