diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c | 258 |
1 files changed, 258 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c b/lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c new file mode 100644 index 0000000000..3531d02d10 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c | |||
@@ -0,0 +1,258 @@ | |||
1 | /* Copyright (c) 2015 Xiph.Org Foundation | ||
2 | Written by Viswanath Puttagunta */ | ||
3 | /** | ||
4 | @file celt_mdct_ne10.c | ||
5 | @brief ARM Neon optimizations for mdct using NE10 library | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | Redistribution and use in source and binary forms, with or without | ||
10 | modification, are permitted provided that the following conditions | ||
11 | are met: | ||
12 | |||
13 | - Redistributions of source code must retain the above copyright | ||
14 | notice, this list of conditions and the following disclaimer. | ||
15 | |||
16 | - Redistributions in binary form must reproduce the above copyright | ||
17 | notice, this list of conditions and the following disclaimer in the | ||
18 | documentation and/or other materials provided with the distribution. | ||
19 | |||
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
21 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
24 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
25 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
26 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
31 | */ | ||
32 | |||
33 | #ifndef SKIP_CONFIG_H | ||
34 | #ifdef HAVE_CONFIG_H | ||
35 | #include "config.h" | ||
36 | #endif | ||
37 | #endif | ||
38 | |||
39 | #include "kiss_fft.h" | ||
40 | #include "_kiss_fft_guts.h" | ||
41 | #include "mdct.h" | ||
42 | #include "stack_alloc.h" | ||
43 | |||
44 | void clt_mdct_forward_neon(const mdct_lookup *l, | ||
45 | kiss_fft_scalar *in, | ||
46 | kiss_fft_scalar * OPUS_RESTRICT out, | ||
47 | const opus_val16 *window, | ||
48 | int overlap, int shift, int stride, int arch) | ||
49 | { | ||
50 | int i; | ||
51 | int N, N2, N4; | ||
52 | VARDECL(kiss_fft_scalar, f); | ||
53 | VARDECL(kiss_fft_cpx, f2); | ||
54 | const kiss_fft_state *st = l->kfft[shift]; | ||
55 | const kiss_twiddle_scalar *trig; | ||
56 | |||
57 | SAVE_STACK; | ||
58 | |||
59 | N = l->n; | ||
60 | trig = l->trig; | ||
61 | for (i=0;i<shift;i++) | ||
62 | { | ||
63 | N >>= 1; | ||
64 | trig += N; | ||
65 | } | ||
66 | N2 = N>>1; | ||
67 | N4 = N>>2; | ||
68 | |||
69 | ALLOC(f, N2, kiss_fft_scalar); | ||
70 | ALLOC(f2, N4, kiss_fft_cpx); | ||
71 | |||
72 | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | ||
73 | /* Window, shuffle, fold */ | ||
74 | { | ||
75 | /* Temp pointers to make it really clear to the compiler what we're doing */ | ||
76 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); | ||
77 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); | ||
78 | kiss_fft_scalar * OPUS_RESTRICT yp = f; | ||
79 | const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); | ||
80 | const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | ||
81 | for(i=0;i<((overlap+3)>>2);i++) | ||
82 | { | ||
83 | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | ||
84 | *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); | ||
85 | *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]); | ||
86 | xp1+=2; | ||
87 | xp2-=2; | ||
88 | wp1+=2; | ||
89 | wp2-=2; | ||
90 | } | ||
91 | wp1 = window; | ||
92 | wp2 = window+overlap-1; | ||
93 | for(;i<N4-((overlap+3)>>2);i++) | ||
94 | { | ||
95 | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | ||
96 | *yp++ = *xp2; | ||
97 | *yp++ = *xp1; | ||
98 | xp1+=2; | ||
99 | xp2-=2; | ||
100 | } | ||
101 | for(;i<N4;i++) | ||
102 | { | ||
103 | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | ||
104 | *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2); | ||
105 | *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]); | ||
106 | xp1+=2; | ||
107 | xp2-=2; | ||
108 | wp1+=2; | ||
109 | wp2-=2; | ||
110 | } | ||
111 | } | ||
112 | /* Pre-rotation */ | ||
113 | { | ||
114 | kiss_fft_scalar * OPUS_RESTRICT yp = f; | ||
115 | const kiss_twiddle_scalar *t = &trig[0]; | ||
116 | for(i=0;i<N4;i++) | ||
117 | { | ||
118 | kiss_fft_cpx yc; | ||
119 | kiss_twiddle_scalar t0, t1; | ||
120 | kiss_fft_scalar re, im, yr, yi; | ||
121 | t0 = t[i]; | ||
122 | t1 = t[N4+i]; | ||
123 | re = *yp++; | ||
124 | im = *yp++; | ||
125 | yr = S_MUL(re,t0) - S_MUL(im,t1); | ||
126 | yi = S_MUL(im,t0) + S_MUL(re,t1); | ||
127 | yc.r = yr; | ||
128 | yc.i = yi; | ||
129 | f2[i] = yc; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | opus_fft(st, f2, (kiss_fft_cpx *)f, arch); | ||
134 | |||
135 | /* Post-rotate */ | ||
136 | { | ||
137 | /* Temp pointers to make it really clear to the compiler what we're doing */ | ||
138 | const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f; | ||
139 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | ||
140 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | ||
141 | const kiss_twiddle_scalar *t = &trig[0]; | ||
142 | /* Temp pointers to make it really clear to the compiler what we're doing */ | ||
143 | for(i=0;i<N4;i++) | ||
144 | { | ||
145 | kiss_fft_scalar yr, yi; | ||
146 | yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]); | ||
147 | yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]); | ||
148 | *yp1 = yr; | ||
149 | *yp2 = yi; | ||
150 | fp++; | ||
151 | yp1 += 2*stride; | ||
152 | yp2 -= 2*stride; | ||
153 | } | ||
154 | } | ||
155 | RESTORE_STACK; | ||
156 | } | ||
157 | |||
158 | void clt_mdct_backward_neon(const mdct_lookup *l, | ||
159 | kiss_fft_scalar *in, | ||
160 | kiss_fft_scalar * OPUS_RESTRICT out, | ||
161 | const opus_val16 * OPUS_RESTRICT window, | ||
162 | int overlap, int shift, int stride, int arch) | ||
163 | { | ||
164 | int i; | ||
165 | int N, N2, N4; | ||
166 | VARDECL(kiss_fft_scalar, f); | ||
167 | const kiss_twiddle_scalar *trig; | ||
168 | const kiss_fft_state *st = l->kfft[shift]; | ||
169 | |||
170 | N = l->n; | ||
171 | trig = l->trig; | ||
172 | for (i=0;i<shift;i++) | ||
173 | { | ||
174 | N >>= 1; | ||
175 | trig += N; | ||
176 | } | ||
177 | N2 = N>>1; | ||
178 | N4 = N>>2; | ||
179 | |||
180 | ALLOC(f, N2, kiss_fft_scalar); | ||
181 | |||
182 | /* Pre-rotate */ | ||
183 | { | ||
184 | /* Temp pointers to make it really clear to the compiler what we're doing */ | ||
185 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | ||
186 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | ||
187 | kiss_fft_scalar * OPUS_RESTRICT yp = f; | ||
188 | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; | ||
189 | for(i=0;i<N4;i++) | ||
190 | { | ||
191 | kiss_fft_scalar yr, yi; | ||
192 | yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]); | ||
193 | yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]); | ||
194 | yp[2*i] = yr; | ||
195 | yp[2*i+1] = yi; | ||
196 | xp1+=2*stride; | ||
197 | xp2-=2*stride; | ||
198 | } | ||
199 | } | ||
200 | |||
201 | opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch); | ||
202 | |||
203 | /* Post-rotate and de-shuffle from both ends of the buffer at once to make | ||
204 | it in-place. */ | ||
205 | { | ||
206 | kiss_fft_scalar * yp0 = out+(overlap>>1); | ||
207 | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; | ||
208 | const kiss_twiddle_scalar *t = &trig[0]; | ||
209 | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | ||
210 | middle pair will be computed twice. */ | ||
211 | for(i=0;i<(N4+1)>>1;i++) | ||
212 | { | ||
213 | kiss_fft_scalar re, im, yr, yi; | ||
214 | kiss_twiddle_scalar t0, t1; | ||
215 | re = yp0[0]; | ||
216 | im = yp0[1]; | ||
217 | t0 = t[i]; | ||
218 | t1 = t[N4+i]; | ||
219 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | ||
220 | yr = S_MUL(re,t0) + S_MUL(im,t1); | ||
221 | yi = S_MUL(re,t1) - S_MUL(im,t0); | ||
222 | re = yp1[0]; | ||
223 | im = yp1[1]; | ||
224 | yp0[0] = yr; | ||
225 | yp1[1] = yi; | ||
226 | |||
227 | t0 = t[(N4-i-1)]; | ||
228 | t1 = t[(N2-i-1)]; | ||
229 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | ||
230 | yr = S_MUL(re,t0) + S_MUL(im,t1); | ||
231 | yi = S_MUL(re,t1) - S_MUL(im,t0); | ||
232 | yp1[0] = yr; | ||
233 | yp0[1] = yi; | ||
234 | yp0 += 2; | ||
235 | yp1 -= 2; | ||
236 | } | ||
237 | } | ||
238 | |||
239 | /* Mirror on both sides for TDAC */ | ||
240 | { | ||
241 | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; | ||
242 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | ||
243 | const opus_val16 * OPUS_RESTRICT wp1 = window; | ||
244 | const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; | ||
245 | |||
246 | for(i = 0; i < overlap/2; i++) | ||
247 | { | ||
248 | kiss_fft_scalar x1, x2; | ||
249 | x1 = *xp1; | ||
250 | x2 = *yp1; | ||
251 | *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); | ||
252 | *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); | ||
253 | wp1++; | ||
254 | wp2--; | ||
255 | } | ||
256 | } | ||
257 | RESTORE_STACK; | ||
258 | } | ||