summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libopus/celt/mdct.c
diff options
context:
space:
mode:
authorNils Wallménius <nils@rockbox.org>2013-05-20 22:25:57 +0200
committerNils Wallménius <nils@rockbox.org>2013-08-31 08:30:51 +0200
commit580b307fd791c0997a8831bc800bba87797bfb7e (patch)
tree807846056f06fd944a750ce41217a877910ebd59 /lib/rbcodec/codecs/libopus/celt/mdct.c
parent74761b70acd96cecc0d35450dd56a98ad9ee7d3d (diff)
downloadrockbox-580b307fd791c0997a8831bc800bba87797bfb7e.tar.gz
rockbox-580b307fd791c0997a8831bc800bba87797bfb7e.zip
Sync opus codec to upstream git
Sync opus codec to upstream commit 02fed471a4568852d6618e041c4f2af0d7730ee2 (August 30 2013). This brings in a lot of optimizations but also makes the diff between our codec and the upstream much smaller, as most of our optimizations have been upstreamed or superseded. Speedups across the board for CELT mode files (64kbps / 128kbps): H300 9.82MHz / 15.48MHz; c200 4.86MHz / 9.63MHz; fuze v1 10.32MHz / 15.92MHz. For the silk mode test file (16kbps) arm targets get a speedup of about 2MHz while the H300 is 7.8MHz slower, likely because it's now using the pseudostack more rather than the real stack which is in iram. Patches to get around that are upcoming. Change-Id: Ifecf963e461c51ac42e09dac1e91bc4bc3b12fa3
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/mdct.c')
-rw-r--r--lib/rbcodec/codecs/libopus/celt/mdct.c147
1 files changed, 55 insertions, 92 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 0df77fd5ec..72ea180568 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -41,7 +41,7 @@
41 41
42#ifndef SKIP_CONFIG_H 42#ifndef SKIP_CONFIG_H
43#ifdef HAVE_CONFIG_H 43#ifdef HAVE_CONFIG_H
44#include "opus_config.h" 44#include "config.h"
45#endif 45#endif
46#endif 46#endif
47 47
@@ -110,12 +110,14 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
110 int N, N2, N4; 110 int N, N2, N4;
111 kiss_twiddle_scalar sine; 111 kiss_twiddle_scalar sine;
112 VARDECL(kiss_fft_scalar, f); 112 VARDECL(kiss_fft_scalar, f);
113 VARDECL(kiss_fft_scalar, f2);
113 SAVE_STACK; 114 SAVE_STACK;
114 N = l->n; 115 N = l->n;
115 N >>= shift; 116 N >>= shift;
116 N2 = N>>1; 117 N2 = N>>1;
117 N4 = N>>2; 118 N4 = N>>2;
118 ALLOC(f, N2, kiss_fft_scalar); 119 ALLOC(f, N2, kiss_fft_scalar);
120 ALLOC(f2, N2, kiss_fft_scalar);
119 /* sin(x) ~= x here */ 121 /* sin(x) ~= x here */
120#ifdef FIXED_POINT 122#ifdef FIXED_POINT
121 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; 123 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
@@ -132,7 +134,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
132 kiss_fft_scalar * OPUS_RESTRICT yp = f; 134 kiss_fft_scalar * OPUS_RESTRICT yp = f;
133 const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); 135 const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
134 const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; 136 const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
135 for(i=0;i<(overlap>>2);i++) 137 for(i=0;i<((overlap+3)>>2);i++)
136 { 138 {
137 /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ 139 /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
138 *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); 140 *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
@@ -144,7 +146,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
144 } 146 }
145 wp1 = window; 147 wp1 = window;
146 wp2 = window+overlap-1; 148 wp2 = window+overlap-1;
147 for(;i<N4-(overlap>>2);i++) 149 for(;i<N4-((overlap+3)>>2);i++)
148 { 150 {
149 /* Real part arranged as a-bR, Imag part arranged as -c-dR */ 151 /* Real part arranged as a-bR, Imag part arranged as -c-dR */
150 *yp++ = *xp2; 152 *yp++ = *xp2;
@@ -181,12 +183,12 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
181 } 183 }
182 184
183 /* N/4 complex FFT, down-scales by 4/N */ 185 /* N/4 complex FFT, down-scales by 4/N */
184 opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)in); 186 opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
185 187
186 /* Post-rotate */ 188 /* Post-rotate */
187 { 189 {
188 /* Temp pointers to make it really clear to the compiler what we're doing */ 190 /* Temp pointers to make it really clear to the compiler what we're doing */
189 const kiss_fft_scalar * OPUS_RESTRICT fp = in; 191 const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
190 kiss_fft_scalar * OPUS_RESTRICT yp1 = out; 192 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
191 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); 193 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
192 const kiss_twiddle_scalar *t = &l->trig[0]; 194 const kiss_twiddle_scalar *t = &l->trig[0];
@@ -208,35 +210,20 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
208} 210}
209#endif 211#endif
210 212
211#define S_F_BUF_SIZE (1920>>1) /* N = 1920 for static modes */
212static kiss_fft_scalar s_f2[S_F_BUF_SIZE] IBSS_ATTR MEM_ALIGN_ATTR;
213void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 213void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
214 const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) 214 const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
215{ 215{
216 int i; 216 int i;
217 int N, N2, N4; 217 int N, N2, N4;
218 int tstride = 1<<shift;
219 kiss_twiddle_scalar sine; 218 kiss_twiddle_scalar sine;
220 VARDECL(kiss_fft_scalar, f); 219/* VARDECL(kiss_fft_scalar, f2);
221 VARDECL(kiss_fft_scalar, f2); 220 SAVE_STACK; */
222 SAVE_STACK;
223 N = l->n; 221 N = l->n;
224 N >>= shift; 222 N >>= shift;
225 N2 = N>>1; 223 N2 = N>>1;
226 N4 = N>>2; 224 N4 = N>>2;
227 kiss_fft_scalar s_f[S_F_BUF_SIZE]; 225/* ALLOC(f2, N2, kiss_fft_scalar); */
228 226 kiss_fft_scalar f2[N2]; /* worst case 3840b */
229 if (S_F_BUF_SIZE >= N2)
230 {
231 f = s_f;
232 f2 = s_f2;
233 }
234 else
235 {
236 ALLOC(f , N2, kiss_fft_scalar);
237 ALLOC(f2, N2, kiss_fft_scalar);
238 }
239
240 /* sin(x) ~= x here */ 227 /* sin(x) ~= x here */
241#ifdef FIXED_POINT 228#ifdef FIXED_POINT
242 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; 229 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
@@ -250,102 +237,78 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
250 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; 237 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
251 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); 238 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
252 kiss_fft_scalar * OPUS_RESTRICT yp = f2; 239 kiss_fft_scalar * OPUS_RESTRICT yp = f2;
253 const kiss_twiddle_scalar *t0 = &l->trig[0]; 240 const kiss_twiddle_scalar *t = &l->trig[0];
254 const kiss_twiddle_scalar *t1 = &l->trig[N4<<shift];
255 for(i=0;i<N4;i++) 241 for(i=0;i<N4;i++)
256 { 242 {
257 kiss_fft_scalar yr, yi; 243 kiss_fft_scalar yr, yi;
258 yr = -S_MUL(*xp2, *t0) + S_MUL(*xp1, *t1); 244 yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
259 yi = -S_MUL(*xp2, *t1) - S_MUL(*xp1, *t0); 245 yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
260 /* works because the cos is nearly one */ 246 /* works because the cos is nearly one */
261 *yp++ = yr - S_MUL(yi,sine); 247 *yp++ = yr - S_MUL(yi,sine);
262 *yp++ = yi + S_MUL(yr,sine); 248 *yp++ = yi + S_MUL(yr,sine);
263 xp1+=2*stride; 249 xp1+=2*stride;
264 xp2-=2*stride; 250 xp2-=2*stride;
265 t0 += tstride;
266 t1 -= tstride;
267 } 251 }
268 } 252 }
269 253
270 /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ 254 /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
271 opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)f); 255 opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
272 256
273 /* Post-rotate */ 257 /* Post-rotate and de-shuffle from both ends of the buffer at once to make
258 it in-place. */
274 { 259 {
275 kiss_fft_scalar * OPUS_RESTRICT fp = f; 260 kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
276 const kiss_twiddle_scalar *t0 = &l->trig[0]; 261 kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
277 const kiss_twiddle_scalar *t1 = &l->trig[N4<<shift]; 262 const kiss_twiddle_scalar *t = &l->trig[0];
278 for(i=0;i<N4;i++) 263 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
264 middle pair will be computed twice. */
265 for(i=0;i<(N4+1)>>1;i++)
279 { 266 {
280 kiss_fft_scalar re, im, yr, yi; 267 kiss_fft_scalar re, im, yr, yi;
281 re = fp[0]; 268 kiss_twiddle_scalar t0, t1;
282 im = fp[1]; 269 re = yp0[0];
270 im = yp0[1];
271 t0 = t[i<<shift];
272 t1 = t[(N4-i)<<shift];
283 /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 273 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
284 yr = S_MUL(re, *t0) - S_MUL(im, *t1); 274 yr = S_MUL(re,t0) - S_MUL(im,t1);
285 yi = S_MUL(im, *t0) + S_MUL(re, *t1); 275 yi = S_MUL(im,t0) + S_MUL(re,t1);
276 re = yp1[0];
277 im = yp1[1];
286 /* works because the cos is nearly one */ 278 /* works because the cos is nearly one */
287 *fp++ = yr - S_MUL(yi,sine); 279 yp0[0] = -(yr - S_MUL(yi,sine));
288 *fp++ = yi + S_MUL(yr,sine); 280 yp1[1] = yi + S_MUL(yr,sine);
289 t0 += tstride;
290 t1 -= tstride;
291 }
292 }
293 /* De-shuffle the components for the middle of the window only */
294 {
295 const kiss_fft_scalar * OPUS_RESTRICT fp1 = f;
296 const kiss_fft_scalar * OPUS_RESTRICT fp2 = f+N2-1;
297 kiss_fft_scalar * OPUS_RESTRICT yp = f2;
298 for(i = 0; i < N4; i++)
299 {
300 *yp++ =-*fp1;
301 *yp++ = *fp2;
302 fp1 += 2;
303 fp2 -= 2;
304 }
305 }
306 out -= (N2-overlap)>>1;
307 /* Mirror on both sides for TDAC */
308 {
309 kiss_fft_scalar * OPUS_RESTRICT fp1 = f2+N4-1;
310 kiss_fft_scalar * OPUS_RESTRICT xp1 = out+N2-1;
311 kiss_fft_scalar * OPUS_RESTRICT yp1 = out+N4-overlap/2;
312 const opus_val16 * OPUS_RESTRICT wp1 = window;
313 const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
314 281
315 i = N4-overlap/2; 282 t0 = t[(N4-i-1)<<shift];
316 xp1 -= N4-overlap/2; 283 t1 = t[(i+1)<<shift];
317 fp1 -= N4-overlap/2; 284 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
318 OPUS_COPY(xp1+1, fp1+1, N4-overlap/2); 285 yr = S_MUL(re,t0) - S_MUL(im,t1);
319 for(; i < N4; i++) 286 yi = S_MUL(im,t0) + S_MUL(re,t1);
320 { 287 /* works because the cos is nearly one */
321 kiss_fft_scalar x1; 288 yp1[0] = -(yr - S_MUL(yi,sine));
322 x1 = *fp1--; 289 yp0[1] = yi + S_MUL(yr,sine);
323 *yp1++ +=-MULT16_32_Q15(*wp1, x1); 290 yp0 += 2;
324 *xp1-- += MULT16_32_Q15(*wp2, x1); 291 yp1 -= 2;
325 wp1++;
326 wp2--;
327 } 292 }
328 } 293 }
294
295 /* Mirror on both sides for TDAC */
329 { 296 {
330 kiss_fft_scalar * OPUS_RESTRICT fp2 = f2+N4; 297 kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
331 kiss_fft_scalar * OPUS_RESTRICT xp2 = out+N2; 298 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
332 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+N-1-(N4-overlap/2);
333 const opus_val16 * OPUS_RESTRICT wp1 = window; 299 const opus_val16 * OPUS_RESTRICT wp1 = window;
334 const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; 300 const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
335 301
336 i = N4-overlap/2; 302 for(i = 0; i < overlap/2; i++)
337 OPUS_COPY(xp2, fp2, N4-overlap/2);
338 xp2 += N4-overlap/2;
339 fp2 += N4-overlap/2;
340 for(; i < N4; i++)
341 { 303 {
342 kiss_fft_scalar x2; 304 kiss_fft_scalar x1, x2;
343 x2 = *fp2++; 305 x1 = *xp1;
344 *yp2-- = MULT16_32_Q15(*wp1, x2); 306 x2 = *yp1;
345 *xp2++ = MULT16_32_Q15(*wp2, x2); 307 *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
308 *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
346 wp1++; 309 wp1++;
347 wp2--; 310 wp2--;
348 } 311 }
349 } 312 }
350 RESTORE_STACK; 313/* RESTORE_STACK; */
351} 314}