diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c b/lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c new file mode 100644 index 0000000000..effda769d0 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c | |||
@@ -0,0 +1,211 @@ | |||
1 | /* Copyright (c) 2014-2015 Xiph.Org Foundation | ||
2 | Written by Viswanath Puttagunta */ | ||
3 | /** | ||
4 | @file celt_neon_intr.c | ||
5 | @brief ARM Neon Intrinsic optimizations for celt | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | Redistribution and use in source and binary forms, with or without | ||
10 | modification, are permitted provided that the following conditions | ||
11 | are met: | ||
12 | |||
13 | - Redistributions of source code must retain the above copyright | ||
14 | notice, this list of conditions and the following disclaimer. | ||
15 | |||
16 | - Redistributions in binary form must reproduce the above copyright | ||
17 | notice, this list of conditions and the following disclaimer in the | ||
18 | documentation and/or other materials provided with the distribution. | ||
19 | |||
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
21 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
24 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
25 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
26 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
31 | */ | ||
32 | |||
33 | #ifdef HAVE_CONFIG_H | ||
34 | #include "config.h" | ||
35 | #endif | ||
36 | |||
37 | #include <arm_neon.h> | ||
38 | #include "../pitch.h" | ||
39 | |||
40 | #if defined(FIXED_POINT) | ||
41 | void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) | ||
42 | { | ||
43 | int j; | ||
44 | int32x4_t a = vld1q_s32(sum); | ||
45 | /* Load y[0...3] */ | ||
46 | /* This requires len>0 to always be valid (which we assert in the C code). */ | ||
47 | int16x4_t y0 = vld1_s16(y); | ||
48 | y += 4; | ||
49 | |||
50 | for (j = 0; j + 8 <= len; j += 8) | ||
51 | { | ||
52 | /* Load x[0...7] */ | ||
53 | int16x8_t xx = vld1q_s16(x); | ||
54 | int16x4_t x0 = vget_low_s16(xx); | ||
55 | int16x4_t x4 = vget_high_s16(xx); | ||
56 | /* Load y[4...11] */ | ||
57 | int16x8_t yy = vld1q_s16(y); | ||
58 | int16x4_t y4 = vget_low_s16(yy); | ||
59 | int16x4_t y8 = vget_high_s16(yy); | ||
60 | int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); | ||
61 | int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0); | ||
62 | |||
63 | int16x4_t y1 = vext_s16(y0, y4, 1); | ||
64 | int16x4_t y5 = vext_s16(y4, y8, 1); | ||
65 | int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1); | ||
66 | int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1); | ||
67 | |||
68 | int16x4_t y2 = vext_s16(y0, y4, 2); | ||
69 | int16x4_t y6 = vext_s16(y4, y8, 2); | ||
70 | int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2); | ||
71 | int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2); | ||
72 | |||
73 | int16x4_t y3 = vext_s16(y0, y4, 3); | ||
74 | int16x4_t y7 = vext_s16(y4, y8, 3); | ||
75 | int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3); | ||
76 | int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3); | ||
77 | |||
78 | y0 = y8; | ||
79 | a = a7; | ||
80 | x += 8; | ||
81 | y += 8; | ||
82 | } | ||
83 | |||
84 | for (; j < len; j++) | ||
85 | { | ||
86 | int16x4_t x0 = vld1_dup_s16(x); /* load next x */ | ||
87 | int32x4_t a0 = vmlal_s16(a, y0, x0); | ||
88 | |||
89 | int16x4_t y4 = vld1_dup_s16(y); /* load next y */ | ||
90 | y0 = vext_s16(y0, y4, 1); | ||
91 | a = a0; | ||
92 | x++; | ||
93 | y++; | ||
94 | } | ||
95 | |||
96 | vst1q_s32(sum, a); | ||
97 | } | ||
98 | |||
99 | #else | ||
100 | /* | ||
101 | * Function: xcorr_kernel_neon_float | ||
102 | * --------------------------------- | ||
103 | * Computes 4 correlation values and stores them in sum[4] | ||
104 | */ | ||
105 | static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y, | ||
106 | float32_t sum[4], int len) { | ||
107 | float32x4_t YY[3]; | ||
108 | float32x4_t YEXT[3]; | ||
109 | float32x4_t XX[2]; | ||
110 | float32x2_t XX_2; | ||
111 | float32x4_t SUMM; | ||
112 | const float32_t *xi = x; | ||
113 | const float32_t *yi = y; | ||
114 | |||
115 | celt_assert(len>0); | ||
116 | |||
117 | YY[0] = vld1q_f32(yi); | ||
118 | SUMM = vdupq_n_f32(0); | ||
119 | |||
120 | /* Consume 8 elements in x vector and 12 elements in y | ||
121 | * vector. However, the 12'th element never really gets | ||
122 | * touched in this loop. So, if len == 8, then we only | ||
123 | * must access y[0] to y[10]. y[11] must not be accessed | ||
124 | * hence make sure len > 8 and not len >= 8 | ||
125 | */ | ||
126 | while (len > 8) { | ||
127 | yi += 4; | ||
128 | YY[1] = vld1q_f32(yi); | ||
129 | yi += 4; | ||
130 | YY[2] = vld1q_f32(yi); | ||
131 | |||
132 | XX[0] = vld1q_f32(xi); | ||
133 | xi += 4; | ||
134 | XX[1] = vld1q_f32(xi); | ||
135 | xi += 4; | ||
136 | |||
137 | SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); | ||
138 | YEXT[0] = vextq_f32(YY[0], YY[1], 1); | ||
139 | SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); | ||
140 | YEXT[1] = vextq_f32(YY[0], YY[1], 2); | ||
141 | SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); | ||
142 | YEXT[2] = vextq_f32(YY[0], YY[1], 3); | ||
143 | SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); | ||
144 | |||
145 | SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0); | ||
146 | YEXT[0] = vextq_f32(YY[1], YY[2], 1); | ||
147 | SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1); | ||
148 | YEXT[1] = vextq_f32(YY[1], YY[2], 2); | ||
149 | SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0); | ||
150 | YEXT[2] = vextq_f32(YY[1], YY[2], 3); | ||
151 | SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1); | ||
152 | |||
153 | YY[0] = YY[2]; | ||
154 | len -= 8; | ||
155 | } | ||
156 | |||
157 | /* Consume 4 elements in x vector and 8 elements in y | ||
158 | * vector. However, the 8'th element in y never really gets | ||
159 | * touched in this loop. So, if len == 4, then we only | ||
160 | * must access y[0] to y[6]. y[7] must not be accessed | ||
161 | * hence make sure len>4 and not len>=4 | ||
162 | */ | ||
163 | if (len > 4) { | ||
164 | yi += 4; | ||
165 | YY[1] = vld1q_f32(yi); | ||
166 | |||
167 | XX[0] = vld1q_f32(xi); | ||
168 | xi += 4; | ||
169 | |||
170 | SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); | ||
171 | YEXT[0] = vextq_f32(YY[0], YY[1], 1); | ||
172 | SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); | ||
173 | YEXT[1] = vextq_f32(YY[0], YY[1], 2); | ||
174 | SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); | ||
175 | YEXT[2] = vextq_f32(YY[0], YY[1], 3); | ||
176 | SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); | ||
177 | |||
178 | YY[0] = YY[1]; | ||
179 | len -= 4; | ||
180 | } | ||
181 | |||
182 | while (--len > 0) { | ||
183 | XX_2 = vld1_dup_f32(xi++); | ||
184 | SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); | ||
185 | YY[0]= vld1q_f32(++yi); | ||
186 | } | ||
187 | |||
188 | XX_2 = vld1_dup_f32(xi); | ||
189 | SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); | ||
190 | |||
191 | vst1q_f32(sum, SUMM); | ||
192 | } | ||
193 | |||
194 | void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, | ||
195 | opus_val32 *xcorr, int len, int max_pitch, int arch) { | ||
196 | int i; | ||
197 | (void)arch; | ||
198 | celt_assert(max_pitch > 0); | ||
199 | celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); | ||
200 | |||
201 | for (i = 0; i < (max_pitch-3); i += 4) { | ||
202 | xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i, | ||
203 | (float32_t *)xcorr+i, len); | ||
204 | } | ||
205 | |||
206 | /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */ | ||
207 | for (; i < max_pitch; i++) { | ||
208 | xcorr[i] = celt_inner_prod_neon(_x, _y+i, len); | ||
209 | } | ||
210 | } | ||
211 | #endif | ||