path: root/lib/rbcodec/codecs/libopus/celt/x86
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/x86')
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse.h    |  66
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse4_1.c |  89
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c       | 185
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.h       | 192
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/pitch_sse2.c      |  95
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c    | 195
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/vq_sse.h          |  50
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c         | 217
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/x86_celt_map.c    | 167
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/x86cpu.c          | 157
-rw-r--r--  lib/rbcodec/codecs/libopus/celt/x86/x86cpu.h          |  95
11 files changed, 1508 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse.h b/lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse.h
new file mode 100644
index 0000000000..7d1ecf7533
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_SSE_H
+#define CELT_LPC_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_FIR
+
+void celt_fir_sse4_1(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         int arch);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define celt_fir(x, num, y, N, ord, arch) \
+    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
+
+#else
+
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         int arch);
+
+# define celt_fir(x, num, y, N, ord, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
+
+#endif
+#endif
+
+#endif
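[Note: the header above is Opus's run-time CPU detection (RTCD) idiom: when a target is merely *allowed* to have SSE4.1 (MAY_HAVE), calls route through a function-pointer table indexed by the detected arch level; when it is *guaranteed* (PRESUME), the macro binds the specialization directly. A minimal sketch of the same pattern, with hypothetical names (my_func*, MY_FUNC_IMPL, PRESUME_SSE4_1) standing in for the Opus ones:]

    /* Sketch of the MAY_HAVE/PRESUME dispatch idiom (hypothetical names). */
    #define MY_ARCHMASK 7                     /* stands in for OPUS_ARCHMASK */

    void my_func_c(int *y, int n);            /* portable fallback */
    void my_func_sse4_1(int *y, int n);       /* SIMD specialization */

    #if defined(PRESUME_SSE4_1)
    /* Compile-time guarantee: bind directly, discard the arch argument. */
    # define my_func(y, n, arch) ((void)(arch), my_func_sse4_1(y, n))
    #else
    /* Run-time dispatch: one table slot per arch level, filled at build time. */
    extern void (*const MY_FUNC_IMPL[MY_ARCHMASK + 1])(int *y, int n);
    # define my_func(y, n, arch) ((*MY_FUNC_IMPL[(arch) & MY_ARCHMASK])(y, n))
    #endif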
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse4_1.c b/lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse4_1.c
new file mode 100644
index 0000000000..5478568849
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_sse4_1(const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         int arch)
+{
+    int i,j;
+    VARDECL(opus_val16, rnum);
+
+    __m128i vecNoA;
+    opus_int32 noA;
+    SAVE_STACK;
+
+    ALLOC(rnum, ord, opus_val16);
+    for(i=0;i<ord;i++)
+       rnum[i] = num[ord-i-1];
+    noA = EXTEND32(1) << SIG_SHIFT >> 1;
+    vecNoA = _mm_set_epi32(noA, noA, noA, noA);
+
+    for (i=0;i<N-3;i+=4)
+    {
+        opus_val32 sums[4] = {0};
+        __m128i vecSum, vecX;
+
+        xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+
+        vecSum = _mm_loadu_si128((__m128i *)sums);
+        vecSum = _mm_add_epi32(vecSum, vecNoA);
+        vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
+        vecX = OP_CVTEPI16_EPI32_M64(x + i);
+        vecSum = _mm_add_epi32(vecSum, vecX);
+        vecSum = _mm_packs_epi32(vecSum, vecSum);
+        _mm_storel_epi64((__m128i *)(y + i), vecSum);
+    }
+    for (;i<N;i++)
+    {
+        opus_val32 sum = 0;
+        for (j=0;j<ord;j++)
+            sum = MAC16_16(sum, rnum[j], x[i+j-ord]);
+        y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
+    }
+
+    RESTORE_STACK;
+}
+
+#endif
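[Note: the vectorized loop above reverses num[] into rnum[] so xcorr_kernel can walk both arrays forward, adds the rounding constant noA = 1 << (SIG_SHIFT-1) before the arithmetic right shift, then adds x[i] back with 16-bit saturation. A scalar sketch of what one output sample computes, not the upstream celt_fir_c, with SIG_SHIFT assumed to be CELT's usual value of 12:]

    #ifndef SIG_SHIFT
    #define SIG_SHIFT 12                       /* CELT fixed-point signal shift */
    #endif

    /* One sample of the FIR above: y[i] = sat16(x[i] + round(conv >> SIG_SHIFT)). */
    static short fir_one_sample(const short *x, const short *rnum, int i, int ord)
    {
        int j;
        int sum = 1 << (SIG_SHIFT - 1);        /* noA: round-to-nearest offset */
        for (j = 0; j < ord; j++)              /* rnum[] is num[] reversed */
            sum += rnum[j] * x[i + j - ord];
        sum = x[i] + (sum >> SIG_SHIFT);
        if (sum >  32767) sum =  32767;        /* SATURATE16 */
        if (sum < -32768) sum = -32768;
        return (short)sum;
    }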
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c
new file mode 100644
index 0000000000..20e73126b6
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.c
@@ -0,0 +1,185 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
+   if (j < len)
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+
+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+   int i;
+   float xy;
+   __m128 sum;
+   sum = _mm_setzero_ps();
+   /* FIXME: We should probably go 8-way and use 2 sums. */
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 yi = _mm_loadu_ps(y+i);
+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
+   }
+   /* Horizontal sum */
+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+   _mm_store_ss(&xy, sum);
+   for (;i<N;i++)
+   {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+           + MULT16_32_Q15(g10,x[i-T])
+           + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+           + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+
+#endif
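[Note: xcorr_kernel_sse above accumulates four correlations at consecutive lags in a single pass; the shuffles with masks 0x49 and 0x9e assemble the shifted vectors y[j+1..j+4] and y[j+2..j+5] from just two loads. A plain-C sketch of the computation it performs, for clarity (not the upstream xcorr_kernel_c); as in the SSE version, y must have len+3 readable values:]

    /* Four inner products of x against y at lags 0..3, accumulated into sum[]. */
    static void xcorr_kernel_ref(const float *x, const float *y,
                                 float sum[4], int len)
    {
        int j;
        for (j = 0; j < len; j++) {
            sum[0] += x[j] * y[j];
            sum[1] += x[j] * y[j + 1];
            sum[2] += x[j] * y[j + 2];
            sum[3] += x[j] * y[j + 3];
        }
    }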
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.h b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.h
new file mode 100644
index 0000000000..e5f87ab51a
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
+   Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
+/**
+   @file pitch_sse.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+void xcorr_kernel_sse4_1(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+void xcorr_kernel_sse(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+#endif
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_sse(x, y, sum, len))
+
+#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse4_1(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    int              N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    int              N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    int              N);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)arch, celt_inner_prod_sse4_1(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)arch, celt_inner_prod_sse2(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)arch, celt_inner_prod_sse(x, y, N))
+
+
+#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    int              N);
+
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
+
+#undef dual_inner_prod
+#undef comb_filter_const
+
+void dual_inner_prod_sse(const opus_val16 *x,
+    const opus_val16 *y01,
+    const opus_val16 *y02,
+    int               N,
+    opus_val32       *xy1,
+    opus_val32       *xy2);
+
+void comb_filter_const_sse(opus_val32 *y,
+    opus_val32 *x,
+    int         T,
+    int         N,
+    opus_val16  g10,
+    opus_val16  g11,
+    opus_val16  g12);
+
+
+#if defined(OPUS_X86_PRESUME_SSE)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
+
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+    ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
+#else
+
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+              const opus_val16 *x,
+              const opus_val16 *y01,
+              const opus_val16 *y02,
+              int               N,
+              opus_val32       *xy1,
+              opus_val32       *xy2);
+
+#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+
+extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+              opus_val32 *y,
+              opus_val32 *x,
+              int         T,
+              int         N,
+              opus_val16  g10,
+              opus_val16  g11,
+              opus_val16  g12);
+
+#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+    ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12))
+
+#define NON_STATIC_COMB_FILTER_CONST_C
+
+#endif
+#endif
+
+#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse2.c b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse2.c
new file mode 100644
index 0000000000..a0e7d1beaf
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse2.c
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int i, dataSize16;
+    opus_int32 sum;
+
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+
+    sum = 0;
+    dataSize16 = N & ~15;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for (i=0;i<dataSize16;i+=16)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32(acc1, acc2);
+
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+    sum += _mm_cvtsi128_si32(acc1);
+
+    for (;i<N;i++) {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+#endif
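[Note: the kernel above leans on _mm_madd_epi16, which multiplies eight 16-bit pairs and sums adjacent products into four 32-bit lanes, so each iteration folds sixteen multiply-accumulates into two instructions; the unpackhi/shufflelo pair at the end then reduces the four lanes to one. A scalar model of the PMADDWD semantics as used here (a sketch, not an exhaustive spec of the instruction):]

    /* Scalar model of _mm_madd_epi16: eight 16-bit products, pairwise-summed
       into four 32-bit lanes. */
    static void madd_epi16_ref(const short a[8], const short b[8], int out[4])
    {
        int k;
        for (k = 0; k < 4; k++)
            out[k] = (int)a[2*k] * b[2*k] + (int)a[2*k+1] * b[2*k+1];
    }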
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c
new file mode 100644
index 0000000000..a092c68b24
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/pitch_sse4_1.c
@@ -0,0 +1,195 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#include <smmintrin.h>
+#include "x86cpu.h"
+
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int i, dataSize16;
+    opus_int32 sum;
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+    __m128i inVec1_3210, inVec2_3210;
+
+    sum = 0;
+    dataSize16 = N & ~15;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for (i=0;i<dataSize16;i+=16) {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32(acc1, acc2);
+
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+
+    if (N - i >= 4)
+    {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
+
+        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_3210);
+        i += 4;
+    }
+
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+
+    sum += _mm_cvtsi128_si32(acc1);
+
+    for (;i<N;i++)
+    {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
+{
+    int j;
+
+    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+    __m128i vecY0, vecY1, vecY2, vecY3;
+    __m128i sum0, sum1, sum2, sum3, vecSum;
+    __m128i initSum;
+
+    celt_assert(len >= 3);
+
+    sum0 = _mm_setzero_si128();
+    sum1 = _mm_setzero_si128();
+    sum2 = _mm_setzero_si128();
+    sum3 = _mm_setzero_si128();
+
+    for (j=0;j<(len-7);j+=8)
+    {
+        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+    }
+
+    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64(sum0, sum0));
+    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16(sum0, 0x0E));
+
+    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16(sum1, 0x0E));
+
+    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64(sum2, sum2));
+    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16(sum2, 0x0E));
+
+    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64(sum3, sum3));
+    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16(sum3, 0x0E));
+
+    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+          _mm_unpacklo_epi32(sum2, sum3));
+
+    for (;j<(len-3);j+=4)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+        sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+        sum0 = _mm_add_epi32(sum0, sum1);
+        sum2 = _mm_add_epi32(sum2, sum3);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+
+    for (;j<len;j++)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+    }
+
+    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+    initSum = _mm_add_epi32(initSum, vecSum);
+    _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/vq_sse.h b/lib/rbcodec/codecs/libopus/celt/x86/vq_sse.h
new file mode 100644
index 0000000000..b4efe8f249
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/vq_sse.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef VQ_SSE_H
+#define VQ_SSE_H
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#define OVERRIDE_OP_PVQ_SEARCH
+
+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
+
+#if defined(OPUS_X86_PRESUME_SSE2)
+#define op_pvq_search(x, iy, K, N, arch) \
+    (op_pvq_search_sse2(x, iy, K, N, arch))
+
+#else
+
+extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
+      celt_norm *_X, int *iy, int K, int N, int arch);
+
+# define op_pvq_search(X, iy, K, N, arch) \
+    ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch))
+
+#endif
+#endif
+
+#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c b/lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c
new file mode 100644
index 0000000000..775042860d
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/vq_sse2.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2007-2016 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "vq.h"
+#include "x86cpu.h"
+
+
+#ifndef FIXED_POINT
+
+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
+{
+   int i, j;
+   int pulsesLeft;
+   float xy, yy;
+   VARDECL(celt_norm, y);
+   VARDECL(celt_norm, X);
+   VARDECL(float, signy);
+   __m128 signmask;
+   __m128 sums;
+   __m128i fours;
+   SAVE_STACK;
+
+   (void)arch;
+   /* All bits set to zero, except for the sign bit. */
+   signmask = _mm_set_ps1(-0.f);
+   fours = _mm_set_epi32(4, 4, 4, 4);
+   ALLOC(y, N+3, celt_norm);
+   ALLOC(X, N+3, celt_norm);
+   ALLOC(signy, N+3, float);
+
+   OPUS_COPY(X, _X, N);
+   X[N] = X[N+1] = X[N+2] = 0;
+   sums = _mm_setzero_ps();
+   for (j=0;j<N;j+=4)
+   {
+      __m128 x4, s4;
+      x4 = _mm_loadu_ps(&X[j]);
+      s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
+      /* Get rid of the sign */
+      x4 = _mm_andnot_ps(signmask, x4);
+      sums = _mm_add_ps(sums, x4);
+      /* Clear y and iy in case we don't do the projection. */
+      _mm_storeu_ps(&y[j], _mm_setzero_ps());
+      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+      _mm_storeu_ps(&X[j], x4);
+      _mm_storeu_ps(&signy[j], s4);
+   }
+   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
+   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
+
+   xy = yy = 0;
+
+   pulsesLeft = K;
+
+   /* Do a pre-search by projecting on the pyramid */
+   if (K > (N>>1))
+   {
+      __m128i pulses_sum;
+      __m128 yy4, xy4;
+      __m128 rcp4;
+      opus_val32 sum = _mm_cvtss_f32(sums);
+      /* If X is too small, just replace it with a pulse at 0 */
+      /* Prevents infinities and NaNs from causing too many pulses
+         to be allocated. 64 is an approximation of infinity here. */
+      if (!(sum > EPSILON && sum < 64))
+      {
+         X[0] = QCONST16(1.f,14);
+         j=1; do
+            X[j]=0;
+         while (++j<N);
+         sums = _mm_set_ps1(1.f);
+      }
+      /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
+      rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
+      xy4 = yy4 = _mm_setzero_ps();
+      pulses_sum = _mm_setzero_si128();
+      for (j=0;j<N;j+=4)
+      {
+         __m128 rx4, x4, y4;
+         __m128i iy4;
+         x4 = _mm_loadu_ps(&X[j]);
+         rx4 = _mm_mul_ps(x4, rcp4);
+         iy4 = _mm_cvttps_epi32(rx4);
+         pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+         _mm_storeu_si128((__m128i*)&iy[j], iy4);
+         y4 = _mm_cvtepi32_ps(iy4);
+         xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+         yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+         /* double the y[] vector so we don't have to do it in the search loop. */
+         _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+      }
+      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
+      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
+      pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
+      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
+      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
+      xy = _mm_cvtss_f32(xy4);
+      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
+      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
+      yy = _mm_cvtss_f32(yy4);
+   }
+   X[N] = X[N+1] = X[N+2] = -100;
+   y[N] = y[N+1] = y[N+2] = 100;
+   celt_sig_assert(pulsesLeft>=0);
+
+   /* This should never happen, but just in case it does (e.g. on silence)
+      we fill the first bin with pulses. */
+   if (pulsesLeft > N+3)
+   {
+      opus_val16 tmp = (opus_val16)pulsesLeft;
+      yy = MAC16_16(yy, tmp, tmp);
+      yy = MAC16_16(yy, tmp, y[0]);
+      iy[0] += pulsesLeft;
+      pulsesLeft=0;
+   }
+
+   for (i=0;i<pulsesLeft;i++)
+   {
+      int best_id;
+      __m128 xy4, yy4;
+      __m128 max, max2;
+      __m128i count;
+      __m128i pos;
+      /* The squared magnitude term gets added anyway, so we might as well
+         add it outside the loop */
+      yy = ADD16(yy, 1);
+      xy4 = _mm_load1_ps(&xy);
+      yy4 = _mm_load1_ps(&yy);
+      max = _mm_setzero_ps();
+      pos = _mm_setzero_si128();
+      count = _mm_set_epi32(3, 2, 1, 0);
+      for (j=0;j<N;j+=4)
+      {
+         __m128 x4, y4, r4;
+         x4 = _mm_loadu_ps(&X[j]);
+         y4 = _mm_loadu_ps(&y[j]);
+         x4 = _mm_add_ps(x4, xy4);
+         y4 = _mm_add_ps(y4, yy4);
+         y4 = _mm_rsqrt_ps(y4);
+         r4 = _mm_mul_ps(x4, y4);
+         /* Update the index of the max. */
+         pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
+         /* Update the max. */
+         max = _mm_max_ps(max, r4);
+         /* Update the indices (+4) */
+         count = _mm_add_epi32(count, fours);
+      }
+      /* Horizontal max */
+      max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
+      max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
+      /* Now that max2 contains the max at all positions, look at which value(s) of the
+         partial max is equal to the global max. */
+      pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
+      pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
+      pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
+      best_id = _mm_cvtsi128_si32(pos);
+
+      /* Updating the sums of the new pulse(s) */
+      xy = ADD32(xy, EXTEND32(X[best_id]));
+      /* We're multiplying y[j] by two so we don't have to do it here */
+      yy = ADD16(yy, y[best_id]);
+
+      /* Only now that we've made the final choice, update y/iy */
+      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
+      y[best_id] += 2;
+      iy[best_id]++;
+   }
+
+   /* Put the original sign back */
+   for (j=0;j<N;j+=4)
+   {
+      __m128i y4;
+      __m128i s4;
+      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+      s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
+      y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
+      _mm_storeu_si128((__m128i*)&iy[j], y4);
+   }
+   RESTORE_STACK;
+   return yy;
+}
+
+#endif
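[Note: the PVQ search above works in two phases: a pre-search that projects |X| onto the pyramid sum(|y|) = K, then a greedy loop that places the remaining pulses where they maximize xy/sqrt(yy) via _mm_rsqrt_ps. A scalar sketch of just the projection phase, using exact division where the SSE code uses the _mm_rcp_ps reciprocal estimate (hypothetical helper, not the upstream code):]

    /* Scale |X| so integer truncation places at most K pulses: the factor
       K + .8 < K + 1 guarantees no overshoot even with rounding error. */
    static int pvq_project_ref(const float *absX, int *iy, float *y,
                               int N, int K, float sumAbsX)
    {
        int j, placed = 0;
        float rcp = (K + 0.8f) / sumAbsX;  /* stand-in for rcp4 above */
        for (j = 0; j < N; j++) {
            iy[j] = (int)(absX[j] * rcp);  /* truncate toward zero */
            y[j]  = 2.0f * iy[j];          /* y[] kept doubled, as above */
            placed += iy[j];
        }
        return K - placed;                 /* pulses left for the greedy loop */
    }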
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/x86_celt_map.c b/lib/rbcodec/codecs/libopus/celt/x86/x86_celt_map.c
new file mode 100644
index 0000000000..d39d88edec
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/x86_celt_map.c
@@ -0,0 +1,167 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "celt_lpc.h"
+#include "pitch.h"
+#include "pitch_sse.h"
+#include "vq.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16       *y,
+         int               N,
+         int               ord,
+         int               arch
+) = {
+  celt_fir_c,                /* non-sse */
+  celt_fir_c,
+  celt_fir_c,
+  MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */
+  MAY_HAVE_SSE4_1(celt_fir)  /* avx */
+};
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32        sum[4],
+         int               len
+) = {
+  xcorr_kernel_c,                /* non-sse */
+  xcorr_kernel_c,
+  xcorr_kernel_c,
+  MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */
+  MAY_HAVE_SSE4_1(xcorr_kernel)  /* avx */
+};
+
+#endif
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+    (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         int               N
+) = {
+  celt_inner_prod_c,                /* non-sse */
+  celt_inner_prod_c,
+  MAY_HAVE_SSE2(celt_inner_prod),
+  MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */
+  MAY_HAVE_SSE4_1(celt_inner_prod)  /* avx */
+};
+
+#endif
+
+# else
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32        sum[4],
+         int               len
+) = {
+  xcorr_kernel_c,             /* non-sse */
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel)
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         int               N
+) = {
+  celt_inner_prod_c,             /* non-sse */
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod)
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y01,
+         const opus_val16 *y02,
+         int               N,
+         opus_val32       *xy1,
+         opus_val32       *xy2
+) = {
+  dual_inner_prod_c,             /* non-sse */
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod)
+};
+
+void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+         opus_val32 *y,
+         opus_val32 *x,
+         int         T,
+         int         N,
+         opus_val16  g10,
+         opus_val16  g11,
+         opus_val16  g12
+) = {
+  comb_filter_const_c,             /* non-sse */
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const)
+};
+
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)
+opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
+      celt_norm *_X, int *iy, int K, int N, int arch
+) = {
+  op_pvq_search_c,              /* non-sse */
+  op_pvq_search_c,
+  MAY_HAVE_SSE2(op_pvq_search),
+  MAY_HAVE_SSE2(op_pvq_search),
+  MAY_HAVE_SSE2(op_pvq_search)
+};
+#endif
+
+#endif
+#endif
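[Note: every five-entry table above uses the same index meaning, which matches the levels returned by opus_select_arch() in the next file: each slot holds the best implementation the build allows at that level, falling back to the _c version otherwise. An illustrative enum of that mapping (the real code just uses raw indices 0..OPUS_ARCHMASK):]

    enum opus_x86_arch {
        ARCH_C      = 0,  /* no usable SIMD */
        ARCH_SSE    = 1,
        ARCH_SSE2   = 2,
        ARCH_SSE4_1 = 3,
        ARCH_AVX    = 4   /* no AVX kernels here, so it reuses the SSE4.1 entries */
    };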
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/x86cpu.c b/lib/rbcodec/codecs/libopus/celt/x86/x86cpu.c
new file mode 100644
index 0000000000..080eb25e41
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/x86cpu.c
@@ -0,0 +1,157 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "cpu_support.h"
+#include "macros.h"
+#include "main.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+    (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+    __cpuid((int*)CPUInfo, InfoType);
+}
+
+#else
+
+#if defined(CPU_INFO_BY_C)
+#include <cpuid.h>
+#endif
+
+static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+#if defined(CPU_INFO_BY_ASM)
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
+    __asm__ __volatile__ (
+        "xchg %%ebx, %1\n"
+        "cpuid\n"
+        "xchg %%ebx, %1\n":
+        "=a" (CPUInfo[0]),
+        "=r" (CPUInfo[1]),
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "0" (InfoType)
+    );
+#else
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (CPUInfo[0]),
+        "=b" (CPUInfo[1]),
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "0" (InfoType)
+    );
+#endif
+#elif defined(CPU_INFO_BY_C)
+    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+#endif
+}
+
+#endif
+
+typedef struct CPU_Feature{
+    /* SIMD: 128-bit */
+    int HW_SSE;
+    int HW_SSE2;
+    int HW_SSE41;
+    /* SIMD: 256-bit */
+    int HW_AVX;
+} CPU_Feature;
+
+static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
+{
+    unsigned int info[4] = {0};
+    unsigned int nIds = 0;
+
+    cpuid(info, 0);
+    nIds = info[0];
+
+    if (nIds >= 1){
+        cpuid(info, 1);
+        cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
+        cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
+        cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
+        cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+    }
+    else {
+        cpu_feature->HW_SSE = 0;
+        cpu_feature->HW_SSE2 = 0;
+        cpu_feature->HW_SSE41 = 0;
+        cpu_feature->HW_AVX = 0;
+    }
+}
+
+int opus_select_arch(void)
+{
+    CPU_Feature cpu_feature;
+    int arch;
+
+    opus_cpu_feature_check(&cpu_feature);
+
+    arch = 0;
+    if (!cpu_feature.HW_SSE)
+    {
+        return arch;
+    }
+    arch++;
+
+    if (!cpu_feature.HW_SSE2)
+    {
+        return arch;
+    }
+    arch++;
+
+    if (!cpu_feature.HW_SSE41)
+    {
+        return arch;
+    }
+    arch++;
+
+    if (!cpu_feature.HW_AVX)
+    {
+        return arch;
+    }
+    arch++;
+
+    return arch;
+}
+
+#endif
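[Note: opus_select_arch() returns the highest consecutive feature level supported (SSE bit 25 and SSE2 bit 26 of EDX, SSE4.1 bit 19 and AVX bit 28 of ECX from CPUID leaf 1). A hedged sketch of how this plugs into the dispatch tables; in the real codec the arch value is computed once at init and stored in the codec state rather than queried per call:]

    #include "x86cpu.h"
    #include "pitch.h"

    /* Illustrative wiring only: detect once, thread arch through DSP calls. */
    static opus_val32 example_inner_prod(const opus_val16 *a,
                                         const opus_val16 *b, int n)
    {
        int arch = opus_select_arch();   /* 0..4, see the mapping above */
        /* celt_inner_prod() resolves to CELT_INNER_PROD_IMPL[arch & OPUS_ARCHMASK]
           or to a direct _sse* call, depending on the build flags. */
        return celt_inner_prod(a, b, n, arch);
    }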
diff --git a/lib/rbcodec/codecs/libopus/celt/x86/x86cpu.h b/lib/rbcodec/codecs/libopus/celt/x86/x86cpu.h
new file mode 100644
index 0000000000..1e2bf17b9b
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/x86/x86cpu.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(X86CPU_H)
+# define X86CPU_H
+
+# if defined(OPUS_X86_MAY_HAVE_SSE)
+#  define MAY_HAVE_SSE(name) name ## _sse
+# else
+#  define MAY_HAVE_SSE(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  define MAY_HAVE_SSE2(name) name ## _sse2
+# else
+#  define MAY_HAVE_SSE2(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#  define MAY_HAVE_SSE4_1(name) name ## _sse4_1
+# else
+#  define MAY_HAVE_SSE4_1(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_AVX)
+#  define MAY_HAVE_AVX(name) name ## _avx
+# else
+#  define MAY_HAVE_AVX(name) name ## _c
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
+int opus_select_arch(void);
+# endif
+
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
+  or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
+  actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
+  reference, these require 16-byte alignment and load a full 16 bytes (instead
+  of 4 or 8), possibly reading out of bounds.
+
+  We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
+  _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
+  reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
+  optimize this out when optimizations ARE enabled.
+
+  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
+  (which is fair, since technically the compiler is always allowed to do the
+  dereference before invoking the function implementing the intrinsic).
+  However, it is smart enough to eliminate the extra MOVD instruction.
+  For _mm_cvtepi16_epi32, it does the right thing, though it does *not*
+  optimize out the extra MOVQ if it's specified explicitly. */
+
+# if defined(__clang__) || !defined(__OPTIMIZE__)
+#  define OP_CVTEPI8_EPI32_M32(x) \
+    (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
+# else
+#  define OP_CVTEPI8_EPI32_M32(x) \
+    (_mm_cvtepi8_epi32(*(__m128i *)(x)))
+# endif
+
+/* Similar reasoning about the instruction sequence as in the 32-bit macro
+   above applies to the 64-bit load. */
+# if defined(__clang__) || !defined(__OPTIMIZE__)
+#  define OP_CVTEPI16_EPI32_M64(x) \
+    (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else
+#  define OP_CVTEPI16_EPI32_M64(x) \
+    (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+# endif
+
+#endif
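[Note: what OP_CVTEPI16_EPI32_M64 guards against is worth a concrete sketch. The macro forces an 8-byte MOVQ load before the sign-extending PMOVSXWD, instead of letting the compiler dereference the pointer with an aligned 16-byte MOVDQA that could fault or read out of bounds. A minimal illustration (hypothetical helper, SSE4.1 required):]

    #include <smmintrin.h>

    /* Widen four possibly-unaligned 16-bit values to 32 bits; same effect as
       OP_CVTEPI16_EPI32_M64(p): load exactly 8 bytes, then sign-extend. */
    static __m128i widen4(const short *p)
    {
        return _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)p));
    }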