diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c new file mode 100644 index 0000000000..20e73126b6 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c | |||
@@ -0,0 +1,185 @@ | |||
1 | /* Copyright (c) 2014, Cisco Systems, INC | ||
2 | Written by XiangMingZhu WeiZhou MinPeng YanWang | ||
3 | |||
4 | Redistribution and use in source and binary forms, with or without | ||
5 | modification, are permitted provided that the following conditions | ||
6 | are met: | ||
7 | |||
8 | - Redistributions of source code must retain the above copyright | ||
9 | notice, this list of conditions and the following disclaimer. | ||
10 | |||
11 | - Redistributions in binary form must reproduce the above copyright | ||
12 | notice, this list of conditions and the following disclaimer in the | ||
13 | documentation and/or other materials provided with the distribution. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
26 | */ | ||
27 | |||
28 | #ifdef HAVE_CONFIG_H | ||
29 | #include "config.h" | ||
30 | #endif | ||
31 | |||
32 | #include "macros.h" | ||
33 | #include "celt_lpc.h" | ||
34 | #include "stack_alloc.h" | ||
35 | #include "mathops.h" | ||
36 | #include "pitch.h" | ||
37 | |||
38 | #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) | ||
39 | |||
40 | #include <xmmintrin.h> | ||
41 | #include "arch.h" | ||
42 | |||
43 | void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) | ||
44 | { | ||
45 | int j; | ||
46 | __m128 xsum1, xsum2; | ||
47 | xsum1 = _mm_loadu_ps(sum); | ||
48 | xsum2 = _mm_setzero_ps(); | ||
49 | |||
50 | for (j = 0; j < len-3; j += 4) | ||
51 | { | ||
52 | __m128 x0 = _mm_loadu_ps(x+j); | ||
53 | __m128 yj = _mm_loadu_ps(y+j); | ||
54 | __m128 y3 = _mm_loadu_ps(y+j+3); | ||
55 | |||
56 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); | ||
57 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), | ||
58 | _mm_shuffle_ps(yj,y3,0x49))); | ||
59 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), | ||
60 | _mm_shuffle_ps(yj,y3,0x9e))); | ||
61 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); | ||
62 | } | ||
63 | if (j < len) | ||
64 | { | ||
65 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); | ||
66 | if (++j < len) | ||
67 | { | ||
68 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); | ||
69 | if (++j < len) | ||
70 | { | ||
71 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); | ||
72 | } | ||
73 | } | ||
74 | } | ||
75 | _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); | ||
76 | } | ||
77 | |||
78 | |||
79 | void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, | ||
80 | int N, opus_val32 *xy1, opus_val32 *xy2) | ||
81 | { | ||
82 | int i; | ||
83 | __m128 xsum1, xsum2; | ||
84 | xsum1 = _mm_setzero_ps(); | ||
85 | xsum2 = _mm_setzero_ps(); | ||
86 | for (i=0;i<N-3;i+=4) | ||
87 | { | ||
88 | __m128 xi = _mm_loadu_ps(x+i); | ||
89 | __m128 y1i = _mm_loadu_ps(y01+i); | ||
90 | __m128 y2i = _mm_loadu_ps(y02+i); | ||
91 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); | ||
92 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); | ||
93 | } | ||
94 | /* Horizontal sum */ | ||
95 | xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); | ||
96 | xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); | ||
97 | _mm_store_ss(xy1, xsum1); | ||
98 | xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); | ||
99 | xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); | ||
100 | _mm_store_ss(xy2, xsum2); | ||
101 | for (;i<N;i++) | ||
102 | { | ||
103 | *xy1 = MAC16_16(*xy1, x[i], y01[i]); | ||
104 | *xy2 = MAC16_16(*xy2, x[i], y02[i]); | ||
105 | } | ||
106 | } | ||
107 | |||
108 | opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, | ||
109 | int N) | ||
110 | { | ||
111 | int i; | ||
112 | float xy; | ||
113 | __m128 sum; | ||
114 | sum = _mm_setzero_ps(); | ||
115 | /* FIXME: We should probably go 8-way and use 2 sums. */ | ||
116 | for (i=0;i<N-3;i+=4) | ||
117 | { | ||
118 | __m128 xi = _mm_loadu_ps(x+i); | ||
119 | __m128 yi = _mm_loadu_ps(y+i); | ||
120 | sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); | ||
121 | } | ||
122 | /* Horizontal sum */ | ||
123 | sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); | ||
124 | sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); | ||
125 | _mm_store_ss(&xy, sum); | ||
126 | for (;i<N;i++) | ||
127 | { | ||
128 | xy = MAC16_16(xy, x[i], y[i]); | ||
129 | } | ||
130 | return xy; | ||
131 | } | ||
132 | |||
133 | void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, | ||
134 | opus_val16 g10, opus_val16 g11, opus_val16 g12) | ||
135 | { | ||
136 | int i; | ||
137 | __m128 x0v; | ||
138 | __m128 g10v, g11v, g12v; | ||
139 | g10v = _mm_load1_ps(&g10); | ||
140 | g11v = _mm_load1_ps(&g11); | ||
141 | g12v = _mm_load1_ps(&g12); | ||
142 | x0v = _mm_loadu_ps(&x[-T-2]); | ||
143 | for (i=0;i<N-3;i+=4) | ||
144 | { | ||
145 | __m128 yi, yi2, x1v, x2v, x3v, x4v; | ||
146 | const opus_val32 *xp = &x[i-T-2]; | ||
147 | yi = _mm_loadu_ps(x+i); | ||
148 | x4v = _mm_loadu_ps(xp+4); | ||
149 | #if 0 | ||
150 | /* Slower version with all loads */ | ||
151 | x1v = _mm_loadu_ps(xp+1); | ||
152 | x2v = _mm_loadu_ps(xp+2); | ||
153 | x3v = _mm_loadu_ps(xp+3); | ||
154 | #else | ||
155 | x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); | ||
156 | x1v = _mm_shuffle_ps(x0v, x2v, 0x99); | ||
157 | x3v = _mm_shuffle_ps(x2v, x4v, 0x99); | ||
158 | #endif | ||
159 | |||
160 | yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); | ||
161 | #if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ | ||
162 | yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); | ||
163 | yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); | ||
164 | #else | ||
165 | /* Use partial sums */ | ||
166 | yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), | ||
167 | _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); | ||
168 | yi = _mm_add_ps(yi, yi2); | ||
169 | #endif | ||
170 | x0v=x4v; | ||
171 | _mm_storeu_ps(y+i, yi); | ||
172 | } | ||
173 | #ifdef CUSTOM_MODES | ||
174 | for (;i<N;i++) | ||
175 | { | ||
176 | y[i] = x[i] | ||
177 | + MULT16_32_Q15(g10,x[i-T]) | ||
178 | + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) | ||
179 | + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); | ||
180 | } | ||
181 | #endif | ||
182 | } | ||
183 | |||
184 | |||
185 | #endif | ||