Diffstat (limited to 'lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c')
-rw-r--r--   lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c   156
1 file changed, 156 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c b/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c
new file mode 100644
index 0000000000..9715733185
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/silk/arm/biquad_alt_neon_intr.c
@@ -0,0 +1,156 @@
/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
# include "stack_alloc.h"
#endif
#include "SigProc_FIX.h"

static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} )                                      */
    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                    */
    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1}                                                             */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 )                            */
    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND(); S{2,3} = silk_RSHIFT_ROUND();            */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 )                            */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 );          */
    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} )                                  */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} );                        */
}

void silk_biquad_alt_stride2_neon(
    const opus_int16        *in,        /* I     input signal                 */
    const opus_int32        *B_Q28,     /* I     MA coefficients [3]          */
    const opus_int32        *A_Q28,     /* I     AR coefficients [2]          */
    opus_int32              *S,         /* I/O   State vector [4]             */
    opus_int16              *out,       /* O     output signal                */
    const opus_int32        len         /* I     signal length (must be even) */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k            = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4     = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                              /* ( -A_Q28[] & 0x00003FFF ) << 18                                                      */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15                                                      */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                              /* silk_RSHIFT( -A_Q28[], 14 )                                                          */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                /* silk_RSHIFT( -A_Q28[], 14 ) << 16  (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                 /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                    */
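    /* A_L now holds the low 14 bits of -A_Q28[] and A_U the remaining high
       bits, each pre-scaled by << 15 so that the kernel's vqdmulhq_s32(),
       i.e. ( a * b ) >> 31, matches silk_SMULWB()'s ( a * b ) >> 16 exactly. */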
92 | |||
93 | B_Q28_s32x2 = vld1_s32( B_Q28 ); | ||
94 | t_s32x2 = vld1_s32( B_Q28 + 1 ); | ||
95 | t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); | ||
96 | t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); | ||
97 | t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); | ||
98 | A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ | ||
99 | A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ | ||
100 | B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ | ||
101 | S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ | ||
102 | S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ | ||
103 | S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); | ||
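    /* Lanes of S_s32x4 are now { S[ 0 ], S[ 2 ], S[ 1 ], S[ 3 ] }: the low
       half holds each channel's first state and the high half its second,
       which is the layout the kernel's S{0,1}/S{2,3} shuffle expects and
       which the lane-by-lane stores at the end of this function undo. */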
104 | |||
105 | for( ; k < len - 1; k += 2 ) { | ||
106 | int32x4_t in_s32x4[ 2 ], t_s32x4; | ||
107 | int32x2_t out32_Q14_s32x2[ 2 ]; | ||
108 | |||
109 | /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ | ||
110 | in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ | ||
111 | in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ | ||
112 | t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ | ||
113 | in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ | ||
114 | in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ | ||
115 | silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); | ||
116 | silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); | ||
117 | |||
118 | /* Scale back to Q0 and saturate */ | ||
119 | out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ | ||
120 | out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ | ||
121 | out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ | ||
122 | vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ | ||
123 | } | ||
124 | |||
125 | /* Process leftover. */ | ||
126 | if( k < len ) { | ||
127 | int32x4_t in_s32x4; | ||
128 | int32x2_t out32_Q14_s32x2; | ||
129 | |||
130 | /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ | ||
131 | in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ | ||
132 | in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ | ||
133 | in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ | ||
134 | t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ | ||
135 | in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ | ||
136 | silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); | ||
137 | |||
138 | /* Scale back to Q0 and saturate */ | ||
139 | out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ | ||
140 | out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ | ||
141 | out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ | ||
142 | vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ | ||
143 | vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ | ||
144 | } | ||
145 | |||
146 | vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ | ||
147 | vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ | ||
148 | vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ | ||
149 | vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ | ||
150 | |||
151 | #ifdef OPUS_CHECK_ASM | ||
152 | silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); | ||
153 | silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); | ||
154 | RESTORE_STACK; | ||
155 | #endif | ||
156 | } | ||
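For illustration only, a minimal sketch of a call site (not part of the
change; example_filter and the coefficient values are hypothetical, and the
prototype is assumed visible to the caller — real Q28 coefficients come from
the codec's filter design code):

    /* Filter 2 * frames interleaved samples through an identity biquad:
       B_Q28 = { 1 << 28, 0, 0 } is a unity-gain MA path in Q28 and
       A_Q28 = { 0, 0 } disables the AR feedback. */
    static void example_filter( const opus_int16 *stereo_in, opus_int16 *stereo_out,
                                const opus_int32 frames )
    {
        const opus_int32 B_Q28[ 3 ] = { 1 << 28, 0, 0 };
        const opus_int32 A_Q28[ 2 ] = { 0, 0 };
        opus_int32       S[ 4 ]     = { 0, 0, 0, 0 }; /* zeroed filter state */

        silk_biquad_alt_stride2_neon( stereo_in, B_Q28, A_Q28, S, stereo_out, frames );
    }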