diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c new file mode 100644 index 0000000000..a092c68b24 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c | |||
@@ -0,0 +1,195 @@ | |||
1 | /* Copyright (c) 2014, Cisco Systems, INC | ||
2 | Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
3 | |||
4 | Redistribution and use in source and binary forms, with or without | ||
5 | modification, are permitted provided that the following conditions | ||
6 | are met: | ||
7 | |||
8 | - Redistributions of source code must retain the above copyright | ||
9 | notice, this list of conditions and the following disclaimer. | ||
10 | |||
11 | - Redistributions in binary form must reproduce the above copyright | ||
12 | notice, this list of conditions and the following disclaimer in the | ||
13 | documentation and/or other materials provided with the distribution. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
26 | */ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include <xmmintrin.h> | ||
33 | #include <emmintrin.h> | ||
34 | |||
35 | #include "macros.h" | ||
36 | #include "celt_lpc.h" | ||
37 | #include "stack_alloc.h" | ||
38 | #include "mathops.h" | ||
39 | #include "pitch.h" | ||
40 | |||
41 | #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) | ||
42 | #include <smmintrin.h> | ||
43 | #include "x86cpu.h" | ||
44 | |||
45 | opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, | ||
46 | int N) | ||
47 | { | ||
48 | opus_int i, dataSize16; | ||
49 | opus_int32 sum; | ||
50 | __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; | ||
51 | __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; | ||
52 | __m128i inVec1_3210, inVec2_3210; | ||
53 | |||
54 | sum = 0; | ||
55 | dataSize16 = N & ~15; | ||
56 | |||
57 | acc1 = _mm_setzero_si128(); | ||
58 | acc2 = _mm_setzero_si128(); | ||
59 | |||
60 | for (i=0;i<dataSize16;i+=16) { | ||
61 | inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); | ||
62 | inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); | ||
63 | |||
64 | inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); | ||
65 | inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); | ||
66 | |||
67 | inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); | ||
68 | inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); | ||
69 | |||
70 | acc1 = _mm_add_epi32(acc1, inVec1_76543210); | ||
71 | acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); | ||
72 | } | ||
73 | |||
74 | acc1 = _mm_add_epi32(acc1, acc2); | ||
75 | |||
76 | if (N - i >= 8) | ||
77 | { | ||
78 | inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); | ||
79 | inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); | ||
80 | |||
81 | inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); | ||
82 | |||
83 | acc1 = _mm_add_epi32(acc1, inVec1_76543210); | ||
84 | i += 8; | ||
85 | } | ||
86 | |||
87 | if (N - i >= 4) | ||
88 | { | ||
89 | inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); | ||
90 | inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); | ||
91 | |||
92 | inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); | ||
93 | |||
94 | acc1 = _mm_add_epi32(acc1, inVec1_3210); | ||
95 | i += 4; | ||
96 | } | ||
97 | |||
98 | acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); | ||
99 | acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); | ||
100 | |||
101 | sum += _mm_cvtsi128_si32(acc1); | ||
102 | |||
103 | for (;i<N;i++) | ||
104 | { | ||
105 | sum = silk_SMLABB(sum, x[i], y[i]); | ||
106 | } | ||
107 | |||
108 | return sum; | ||
109 | } | ||
110 | |||
111 | void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) | ||
112 | { | ||
113 | int j; | ||
114 | |||
115 | __m128i vecX, vecX0, vecX1, vecX2, vecX3; | ||
116 | __m128i vecY0, vecY1, vecY2, vecY3; | ||
117 | __m128i sum0, sum1, sum2, sum3, vecSum; | ||
118 | __m128i initSum; | ||
119 | |||
120 | celt_assert(len >= 3); | ||
121 | |||
122 | sum0 = _mm_setzero_si128(); | ||
123 | sum1 = _mm_setzero_si128(); | ||
124 | sum2 = _mm_setzero_si128(); | ||
125 | sum3 = _mm_setzero_si128(); | ||
126 | |||
127 | for (j=0;j<(len-7);j+=8) | ||
128 | { | ||
129 | vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); | ||
130 | vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); | ||
131 | vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); | ||
132 | vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); | ||
133 | vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); | ||
134 | |||
135 | sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); | ||
136 | sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); | ||
137 | sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); | ||
138 | sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); | ||
139 | } | ||
140 | |||
141 | sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); | ||
142 | sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); | ||
143 | |||
144 | sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); | ||
145 | sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); | ||
146 | |||
147 | sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); | ||
148 | sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); | ||
149 | |||
150 | sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); | ||
151 | sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); | ||
152 | |||
153 | vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), | ||
154 | _mm_unpacklo_epi32(sum2, sum3)); | ||
155 | |||
156 | for (;j<(len-3);j+=4) | ||
157 | { | ||
158 | vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); | ||
159 | vecX0 = _mm_shuffle_epi32(vecX, 0x00); | ||
160 | vecX1 = _mm_shuffle_epi32(vecX, 0x55); | ||
161 | vecX2 = _mm_shuffle_epi32(vecX, 0xaa); | ||
162 | vecX3 = _mm_shuffle_epi32(vecX, 0xff); | ||
163 | |||
164 | vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); | ||
165 | vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); | ||
166 | vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); | ||
167 | vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); | ||
168 | |||
169 | sum0 = _mm_mullo_epi32(vecX0, vecY0); | ||
170 | sum1 = _mm_mullo_epi32(vecX1, vecY1); | ||
171 | sum2 = _mm_mullo_epi32(vecX2, vecY2); | ||
172 | sum3 = _mm_mullo_epi32(vecX3, vecY3); | ||
173 | |||
174 | sum0 = _mm_add_epi32(sum0, sum1); | ||
175 | sum2 = _mm_add_epi32(sum2, sum3); | ||
176 | vecSum = _mm_add_epi32(vecSum, sum0); | ||
177 | vecSum = _mm_add_epi32(vecSum, sum2); | ||
178 | } | ||
179 | |||
180 | for (;j<len;j++) | ||
181 | { | ||
182 | vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); | ||
183 | vecX0 = _mm_shuffle_epi32(vecX, 0x00); | ||
184 | |||
185 | vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); | ||
186 | |||
187 | sum0 = _mm_mullo_epi32(vecX0, vecY0); | ||
188 | vecSum = _mm_add_epi32(vecSum, sum0); | ||
189 | } | ||
190 | |||
191 | initSum = _mm_loadu_si128((__m128i *)(&sum[0])); | ||
192 | initSum = _mm_add_epi32(initSum, vecSum); | ||
193 | _mm_storeu_si128((__m128i *)sum, initSum); | ||
194 | } | ||
195 | #endif | ||