diff options
author | William Wilgus <me.theuser@yahoo.com> | 2019-01-04 02:01:18 -0600 |
---|---|---|
committer | Solomon Peachy <pizza@shaftnet.org> | 2021-04-23 14:23:04 +0000 |
commit | 14c6bb798d6bebc80f07e863236adbaf8d156a9c (patch) | |
tree | 551a4b641906c2626af844fa3239c1b2b1ff0ad3 /lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c | |
parent | 75d93937965ec4df70d37df6d4feea04577c996b (diff) | |
download | rockbox-14c6bb798d6bebc80f07e863236adbaf8d156a9c.tar.gz rockbox-14c6bb798d6bebc80f07e863236adbaf8d156a9c.zip |
Sync opus codec to upstream git
Change-Id: I0cfcc0005c4ad7bfbb1aaf454188ce70fb043dc1
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c b/lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c new file mode 100644 index 0000000000..775042860d --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c | |||
@@ -0,0 +1,217 @@ | |||
1 | /* Copyright (c) 2007-2008 CSIRO | ||
2 | Copyright (c) 2007-2009 Xiph.Org Foundation | ||
3 | Copyright (c) 2007-2016 Jean-Marc Valin */ | ||
4 | /* | ||
5 | Redistribution and use in source and binary forms, with or without | ||
6 | modification, are permitted provided that the following conditions | ||
7 | are met: | ||
8 | |||
9 | - Redistributions of source code must retain the above copyright | ||
10 | notice, this list of conditions and the following disclaimer. | ||
11 | |||
12 | - Redistributions in binary form must reproduce the above copyright | ||
13 | notice, this list of conditions and the following disclaimer in the | ||
14 | documentation and/or other materials provided with the distribution. | ||
15 | |||
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
17 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
20 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
21 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
22 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
27 | */ | ||
28 | |||
29 | #ifdef HAVE_CONFIG_H | ||
30 | #include "config.h" | ||
31 | #endif | ||
32 | |||
33 | #include <xmmintrin.h> | ||
34 | #include <emmintrin.h> | ||
35 | #include "celt_lpc.h" | ||
36 | #include "stack_alloc.h" | ||
37 | #include "mathops.h" | ||
38 | #include "vq.h" | ||
39 | #include "x86cpu.h" | ||
40 | |||
41 | |||
42 | #ifndef FIXED_POINT | ||
43 | |||
44 | opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) | ||
45 | { | ||
46 | int i, j; | ||
47 | int pulsesLeft; | ||
48 | float xy, yy; | ||
49 | VARDECL(celt_norm, y); | ||
50 | VARDECL(celt_norm, X); | ||
51 | VARDECL(float, signy); | ||
52 | __m128 signmask; | ||
53 | __m128 sums; | ||
54 | __m128i fours; | ||
55 | SAVE_STACK; | ||
56 | |||
57 | (void)arch; | ||
58 | /* All bits set to zero, except for the sign bit. */ | ||
59 | signmask = _mm_set_ps1(-0.f); | ||
60 | fours = _mm_set_epi32(4, 4, 4, 4); | ||
61 | ALLOC(y, N+3, celt_norm); | ||
62 | ALLOC(X, N+3, celt_norm); | ||
63 | ALLOC(signy, N+3, float); | ||
64 | |||
65 | OPUS_COPY(X, _X, N); | ||
66 | X[N] = X[N+1] = X[N+2] = 0; | ||
67 | sums = _mm_setzero_ps(); | ||
68 | for (j=0;j<N;j+=4) | ||
69 | { | ||
70 | __m128 x4, s4; | ||
71 | x4 = _mm_loadu_ps(&X[j]); | ||
72 | s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); | ||
73 | /* Get rid of the sign */ | ||
74 | x4 = _mm_andnot_ps(signmask, x4); | ||
75 | sums = _mm_add_ps(sums, x4); | ||
76 | /* Clear y and iy in case we don't do the projection. */ | ||
77 | _mm_storeu_ps(&y[j], _mm_setzero_ps()); | ||
78 | _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128()); | ||
79 | _mm_storeu_ps(&X[j], x4); | ||
80 | _mm_storeu_ps(&signy[j], s4); | ||
81 | } | ||
82 | sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2))); | ||
83 | sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1))); | ||
84 | |||
85 | xy = yy = 0; | ||
86 | |||
87 | pulsesLeft = K; | ||
88 | |||
89 | /* Do a pre-search by projecting on the pyramid */ | ||
90 | if (K > (N>>1)) | ||
91 | { | ||
92 | __m128i pulses_sum; | ||
93 | __m128 yy4, xy4; | ||
94 | __m128 rcp4; | ||
95 | opus_val32 sum = _mm_cvtss_f32(sums); | ||
96 | /* If X is too small, just replace it with a pulse at 0 */ | ||
97 | /* Prevents infinities and NaNs from causing too many pulses | ||
98 | to be allocated. 64 is an approximation of infinity here. */ | ||
99 | if (!(sum > EPSILON && sum < 64)) | ||
100 | { | ||
101 | X[0] = QCONST16(1.f,14); | ||
102 | j=1; do | ||
103 | X[j]=0; | ||
104 | while (++j<N); | ||
105 | sums = _mm_set_ps1(1.f); | ||
106 | } | ||
107 | /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */ | ||
108 | rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums)); | ||
109 | xy4 = yy4 = _mm_setzero_ps(); | ||
110 | pulses_sum = _mm_setzero_si128(); | ||
111 | for (j=0;j<N;j+=4) | ||
112 | { | ||
113 | __m128 rx4, x4, y4; | ||
114 | __m128i iy4; | ||
115 | x4 = _mm_loadu_ps(&X[j]); | ||
116 | rx4 = _mm_mul_ps(x4, rcp4); | ||
117 | iy4 = _mm_cvttps_epi32(rx4); | ||
118 | pulses_sum = _mm_add_epi32(pulses_sum, iy4); | ||
119 | _mm_storeu_si128((__m128i*)&iy[j], iy4); | ||
120 | y4 = _mm_cvtepi32_ps(iy4); | ||
121 | xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); | ||
122 | yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); | ||
123 | /* double the y[] vector so we don't have to do it in the search loop. */ | ||
124 | _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); | ||
125 | } | ||
126 | pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2))); | ||
127 | pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1))); | ||
128 | pulsesLeft -= _mm_cvtsi128_si32(pulses_sum); | ||
129 | xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2))); | ||
130 | xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1))); | ||
131 | xy = _mm_cvtss_f32(xy4); | ||
132 | yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2))); | ||
133 | yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1))); | ||
134 | yy = _mm_cvtss_f32(yy4); | ||
135 | } | ||
136 | X[N] = X[N+1] = X[N+2] = -100; | ||
137 | y[N] = y[N+1] = y[N+2] = 100; | ||
138 | celt_sig_assert(pulsesLeft>=0); | ||
139 | |||
140 | /* This should never happen, but just in case it does (e.g. on silence) | ||
141 | we fill the first bin with pulses. */ | ||
142 | if (pulsesLeft > N+3) | ||
143 | { | ||
144 | opus_val16 tmp = (opus_val16)pulsesLeft; | ||
145 | yy = MAC16_16(yy, tmp, tmp); | ||
146 | yy = MAC16_16(yy, tmp, y[0]); | ||
147 | iy[0] += pulsesLeft; | ||
148 | pulsesLeft=0; | ||
149 | } | ||
150 | |||
151 | for (i=0;i<pulsesLeft;i++) | ||
152 | { | ||
153 | int best_id; | ||
154 | __m128 xy4, yy4; | ||
155 | __m128 max, max2; | ||
156 | __m128i count; | ||
157 | __m128i pos; | ||
158 | /* The squared magnitude term gets added anyway, so we might as well | ||
159 | add it outside the loop */ | ||
160 | yy = ADD16(yy, 1); | ||
161 | xy4 = _mm_load1_ps(&xy); | ||
162 | yy4 = _mm_load1_ps(&yy); | ||
163 | max = _mm_setzero_ps(); | ||
164 | pos = _mm_setzero_si128(); | ||
165 | count = _mm_set_epi32(3, 2, 1, 0); | ||
166 | for (j=0;j<N;j+=4) | ||
167 | { | ||
168 | __m128 x4, y4, r4; | ||
169 | x4 = _mm_loadu_ps(&X[j]); | ||
170 | y4 = _mm_loadu_ps(&y[j]); | ||
171 | x4 = _mm_add_ps(x4, xy4); | ||
172 | y4 = _mm_add_ps(y4, yy4); | ||
173 | y4 = _mm_rsqrt_ps(y4); | ||
174 | r4 = _mm_mul_ps(x4, y4); | ||
175 | /* Update the index of the max. */ | ||
176 | pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); | ||
177 | /* Update the max. */ | ||
178 | max = _mm_max_ps(max, r4); | ||
179 | /* Update the indices (+4) */ | ||
180 | count = _mm_add_epi32(count, fours); | ||
181 | } | ||
182 | /* Horizontal max */ | ||
183 | max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); | ||
184 | max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); | ||
185 | /* Now that max2 contains the max at all positions, look at which value(s) of the | ||
186 | partial max is equal to the global max. */ | ||
187 | pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); | ||
188 | pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); | ||
189 | pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); | ||
190 | best_id = _mm_cvtsi128_si32(pos); | ||
191 | |||
192 | /* Updating the sums of the new pulse(s) */ | ||
193 | xy = ADD32(xy, EXTEND32(X[best_id])); | ||
194 | /* We're multiplying y[j] by two so we don't have to do it here */ | ||
195 | yy = ADD16(yy, y[best_id]); | ||
196 | |||
197 | /* Only now that we've made the final choice, update y/iy */ | ||
198 | /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ | ||
199 | y[best_id] += 2; | ||
200 | iy[best_id]++; | ||
201 | } | ||
202 | |||
203 | /* Put the original sign back */ | ||
204 | for (j=0;j<N;j+=4) | ||
205 | { | ||
206 | __m128i y4; | ||
207 | __m128i s4; | ||
208 | y4 = _mm_loadu_si128((__m128i*)&iy[j]); | ||
209 | s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j])); | ||
210 | y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); | ||
211 | _mm_storeu_si128((__m128i*)&iy[j], y4); | ||
212 | } | ||
213 | RESTORE_STACK; | ||
214 | return yy; | ||
215 | } | ||
216 | |||
217 | #endif | ||