Sync to upstream libopus

Sync to commit bb4b6885a139644cf3ac14e7deda9f633ec2d93c This brings in a bunch of optimizations to decode speed and memory usage. Allocations are switched from using the pseudostack to using the real stack. Enabled hacks to reduce stack usage. This should fix crashes on sansa clip, although some files will not play due to failing allocations in the codec buffer. Speeds up decoding of the following test files: H300 (cf) C200 (arm7tdmi) ipod classic (arm9e) 16 kbps (silk) 14.28 MHz 4.00 MHz 2.61 MHz 64 kbps (celt) 4.09 MHz 8.08 MHz 6.24 MHz 128 kbps (celt) 1.93 MHz 8.83 MHz 6.53 MHz Change-Id: I851733a8a5824b61feb363a173091bc7e6629b58
author: Nils Wallménius <nils@rockbox.org> 2014-01-19 16:31:59 +0100
committer: Nils Wallménius <nils@rockbox.org> 2014-07-13 11:12:40 +0200
commit: 9b7ec42403073ee887efc531c153e6b1b6c15bab (patch)
tree: 07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/mdct.c
parent: e557951c94c1efa769900257e466900f0ffeb53b (diff)
download: rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz
rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip
1 files changed, 110 insertions, 81 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 72ea180568..7fa8eaf6bf 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -53,18 +53,20 @@
 #include "mathops.h"
 #include "stack_alloc.h"
+#if defined(MIPSr1_ASM)
+#include "mips/mdct_mipsr1.h"
+#endif
 #ifdef CUSTOM_MODES
 int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
 {
   int i;
-   int N4;
   kiss_twiddle_scalar *trig;
-#if defined(FIXED_POINT)
+   int shift;
   int N2=N>>1;
-#endif
   l->n = N;
-   N4 = N>>2;
   l->maxshift = maxshift;
   for (i=0;i<=maxshift;i++)
   {
@@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
         return 0;
 #endif
   }
-   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
+   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
   if (l->trig==NULL)
     return 0;
-   /* We have enough points that sine isn't necessary */
+   for (shift=0;shift<=maxshift;shift++)
+   {
+      /* We have enough points that sine isn't necessary */
 #if defined(FIXED_POINT)
-   for (i=0;i<=N4;i++)
+#if 1
-      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
+      for (i=0;i<N2;i++)
+         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
 #else
-   for (i=0;i<=N4;i++)
+      for (i=0;i<N2;i++)
-      trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
+         trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N))));
 #endif
+#else
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
+#endif
+      trig += N2;
+      N2 >>= 1;
+      N >>= 1;
+   }
   return 1;
 }
@@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l)
 #if 0
 /* Forward MDCT trashes the input array */
+#ifndef OVERRIDE_clt_mdct_forward
 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const opus_val16 *window, int overlap, int shift, int stride)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
   VARDECL(kiss_fft_scalar, f);
-   VARDECL(kiss_fft_scalar, f2);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+#endif
   SAVE_STACK;
+   scale = st->scale;
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
   ALLOC(f, N2, kiss_fft_scalar);
-   ALLOC(f2, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold */
@@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
   /* Pre-rotation */
   {
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      for(i=0;i<N4;i++)
      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
-         re = yp[0];
+         t0 = t[i];
-         im = yp[1];
+         t1 = t[N4+i];
-         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
+         re = *yp++;
-         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
+         im = *yp++;
-         /* works because the cos is nearly one */
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
-         *yp++ = yr + S_MUL(yi,sine);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
-         *yp++ = yi - S_MUL(yr,sine);
+         yc.r = yr;
+         yc.i = yi;
+         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
+         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+         f2[st->bitrev[i]] = yc;
      }
   }
-   /* N/4 complex FFT, down-scales by 4/N */
+   /* N/4 complex FFT, does not downscale anymore */
-   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
+   opus_fft_impl(st, f2);
   /* Post-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Temp pointers to make it really clear to the compiler what we're doing */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
-         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
-         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
-         /* works because the cos is nearly one */
+         *yp1 = yr;
-         *yp1 = yr - S_MUL(yi,sine);
+         *yp2 = yi;
-         *yp2 = yi + S_MUL(yr,sine);;
+         fp++;
-         fp += 2;
         yp1 += 2*stride;
         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
 }
+#endif /* OVERRIDE_clt_mdct_forward */
 #endif
+#ifndef OVERRIDE_clt_mdct_backward
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
+   const kiss_twiddle_scalar *trig;
-/*   VARDECL(kiss_fft_scalar, f2);
-   SAVE_STACK; */
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
-/*   ALLOC(f2, N2, kiss_fft_scalar); */
-   kiss_fft_scalar f2[N2]; /* worst case 3840b */
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
   /* Pre-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
      for(i=0;i<N4;i++)
      {
+         int rev;
         kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
+         rev = *bitrev++;
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
-         /* works because the cos is nearly one */
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
-         *yp++ = yr - S_MUL(yi,sine);
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
-         *yp++ = yi + S_MUL(yr,sine);
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
         xp1+=2*stride;
         xp2-=2*stride;
      }
   }
-   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
-   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
      it in-place. */
   {
-      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
         middle pair will be computed twice. */
      for(i=0;i<(N4+1)>>1;i++)
      {
         kiss_fft_scalar re, im, yr, yi;
         kiss_twiddle_scalar t0, t1;
-         re = yp0[0];
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
-         im = yp0[1];
+         re = yp0[1];
-         t0 = t[i<<shift];
+         im = yp0[0];
-         t1 = t[(N4-i)<<shift];
+         t0 = t[i];
+         t1 = t[N4+i];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
-         re = yp1[0];
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
-         im = yp1[1];
+         re = yp1[1];
-         /* works because the cos is nearly one */
+         im = yp1[0];
-         yp0[0] = -(yr - S_MUL(yi,sine));
+         yp0[0] = yr;
-         yp1[1] = yi + S_MUL(yr,sine);
+         yp1[1] = yi;
-         t0 = t[(N4-i-1)<<shift];
+         t0 = t[(N4-i-1)];
-         t1 = t[(i+1)<<shift];
+         t1 = t[(N2-i-1)];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
-         /* works because the cos is nearly one */
+         yp1[0] = yr;
-         yp1[0] = -(yr - S_MUL(yi,sine));
+         yp0[1] = yi;
-         yp0[1] = yi + S_MUL(yr,sine);
         yp0 += 2;
         yp1 -= 2;
      }
@@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
         wp2--;
      }
   }
-/*   RESTORE_STACK; */
 }
+#endif /* OVERRIDE_clt_mdct_backward */
author	Nils Wallménius <nils@rockbox.org>	2014-01-19 16:31:59 +0100
committer	Nils Wallménius <nils@rockbox.org>	2014-07-13 11:12:40 +0200
commit	9b7ec42403073ee887efc531c153e6b1b6c15bab (patch)
tree	07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/mdct.c
parent	e557951c94c1efa769900257e466900f0ffeb53b (diff)
download	rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip

diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c index 72ea180568..7fa8eaf6bf 100644 --- a/lib/rbcodec/codecs/libopus/celt/mdct.c +++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -53,18 +53,20 @@
53	#include "mathops.h"	53	#include "mathops.h"
54	#include "stack_alloc.h"	54	#include "stack_alloc.h"
55		55
		56	#if defined(MIPSr1_ASM)
		57	#include "mips/mdct_mipsr1.h"
		58	#endif
		59
		60
56	#ifdef CUSTOM_MODES	61	#ifdef CUSTOM_MODES
57		62
58	int clt_mdct_init(mdct_lookup *l,int N, int maxshift)	63	int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
59	{	64	{
60	int i;	65	int i;
61	int N4;
62	kiss_twiddle_scalar *trig;	66	kiss_twiddle_scalar *trig;
63	#if defined(FIXED_POINT)	67	int shift;
64	int N2=N>>1;	68	int N2=N>>1;
65	#endif
66	l->n = N;	69	l->n = N;
67	N4 = N>>2;
68	l->maxshift = maxshift;	70	l->maxshift = maxshift;
69	for (i=0;i<=maxshift;i++)	71	for (i=0;i<=maxshift;i++)
70	{	72	{
@@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
77	return 0;	79	return 0;
78	#endif	80	#endif
79	}	81	}
80	l->trig = trig = (kiss_twiddle_scalar)opus_alloc((N4+1)sizeof(kiss_twiddle_scalar));	82	l->trig = trig = (kiss_twiddle_scalar)opus_alloc((N-(N2>>maxshift))sizeof(kiss_twiddle_scalar));
81	if (l->trig==NULL)	83	if (l->trig==NULL)
82	return 0;	84	return 0;
83	/* We have enough points that sine isn't necessary */	85	for (shift=0;shift<=maxshift;shift++)
		86	{
		87	/* We have enough points that sine isn't necessary */
84	#if defined(FIXED_POINT)	88	#if defined(FIXED_POINT)
85	for (i=0;i<=N4;i++)	89	#if 1
86	trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));	90	for (i=0;i<N2;i++)
		91	trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
87	#else	92	#else
88	for (i=0;i<=N4;i++)	93	for (i=0;i<N2;i++)
89	trig[i] = (kiss_twiddle_scalar)cos(2PIi/N);	94	trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768cos(2M_PI*(i+.125)/N))));
90	#endif	95	#endif
		96	#else
		97	for (i=0;i<N2;i++)
		98	trig[i] = (kiss_twiddle_scalar)cos(2PI(i+.125)/N);
		99	#endif
		100	trig += N2;
		101	N2 >>= 1;
		102	N >>= 1;
		103	}
91	return 1;	104	return 1;
92	}	105	}
93		106
@@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l)
103		116
104	#if 0	117	#if 0
105	/* Forward MDCT trashes the input array */	118	/* Forward MDCT trashes the input array */
		119	#ifndef OVERRIDE_clt_mdct_forward
106	void clt_mdct_forward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,	120	void clt_mdct_forward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,
107	const opus_val16 *window, int overlap, int shift, int stride)	121	const opus_val16 *window, int overlap, int shift, int stride)
108	{	122	{
109	int i;	123	int i;
110	int N, N2, N4;	124	int N, N2, N4;
111	kiss_twiddle_scalar sine;
112	VARDECL(kiss_fft_scalar, f);	125	VARDECL(kiss_fft_scalar, f);
113	VARDECL(kiss_fft_scalar, f2);	126	VARDECL(kiss_fft_cpx, f2);
		127	const kiss_fft_state *st = l->kfft[shift];
		128	const kiss_twiddle_scalar *trig;
		129	opus_val16 scale;
		130	#ifdef FIXED_POINT
		131	/* Allows us to scale with MULT16_32_Q16(), which is faster than
		132	MULT16_32_Q15() on ARM. */
		133	int scale_shift = st->scale_shift-1;
		134	#endif
114	SAVE_STACK;	135	SAVE_STACK;
		136	scale = st->scale;
		137
115	N = l->n;	138	N = l->n;
116	N >>= shift;	139	trig = l->trig;
		140	for (i=0;i<shift;i++)
		141	{
		142	N >>= 1;
		143	trig += N;
		144	}
117	N2 = N>>1;	145	N2 = N>>1;
118	N4 = N>>2;	146	N4 = N>>2;
		147
119	ALLOC(f, N2, kiss_fft_scalar);	148	ALLOC(f, N2, kiss_fft_scalar);
120	ALLOC(f2, N2, kiss_fft_scalar);	149	ALLOC(f2, N4, kiss_fft_cpx);
121	/* sin(x) ~= x here */
122	#ifdef FIXED_POINT
123	sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
124	#else
125	sine = (kiss_twiddle_scalar)2PI(.125f)/N;
126	#endif
127		150
128	/* Consider the input to be composed of four blocks: [a, b, c, d] */	151	/* Consider the input to be composed of four blocks: [a, b, c, d] */
129	/* Window, shuffle, fold */	152	/* Window, shuffle, fold */
@@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar
168	/* Pre-rotation */	191	/* Pre-rotation */
169	{	192	{
170	kiss_fft_scalar * OPUS_RESTRICT yp = f;	193	kiss_fft_scalar * OPUS_RESTRICT yp = f;
171	const kiss_twiddle_scalar *t = &l->trig[0];	194	const kiss_twiddle_scalar *t = &trig[0];
172	for(i=0;i<N4;i++)	195	for(i=0;i<N4;i++)
173	{	196	{
		197	kiss_fft_cpx yc;
		198	kiss_twiddle_scalar t0, t1;
174	kiss_fft_scalar re, im, yr, yi;	199	kiss_fft_scalar re, im, yr, yi;
175	re = yp[0];	200	t0 = t[i];
176	im = yp[1];	201	t1 = t[N4+i];
177	yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);	202	re = *yp++;
178	yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);	203	im = *yp++;
179	/* works because the cos is nearly one */	204	yr = S_MUL(re,t0) - S_MUL(im,t1);
180	*yp++ = yr + S_MUL(yi,sine);	205	yi = S_MUL(im,t0) + S_MUL(re,t1);
181	*yp++ = yi - S_MUL(yr,sine);	206	yc.r = yr;
		207	yc.i = yi;
		208	yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
		209	yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
		210	f2[st->bitrev[i]] = yc;
182	}	211	}
183	}	212	}
184		213
185	/* N/4 complex FFT, down-scales by 4/N */	214	/* N/4 complex FFT, does not downscale anymore */
186	opus_fft(l->kfft[shift], (kiss_fft_cpx )f, (kiss_fft_cpx )f2);	215	opus_fft_impl(st, f2);
187		216
188	/* Post-rotate */	217	/* Post-rotate */
189	{	218	{
190	/* Temp pointers to make it really clear to the compiler what we're doing */	219	/* Temp pointers to make it really clear to the compiler what we're doing */
191	const kiss_fft_scalar * OPUS_RESTRICT fp = f2;	220	const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
192	kiss_fft_scalar * OPUS_RESTRICT yp1 = out;	221	kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
193	kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);	222	kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
194	const kiss_twiddle_scalar *t = &l->trig[0];	223	const kiss_twiddle_scalar *t = &trig[0];
195	/* Temp pointers to make it really clear to the compiler what we're doing */	224	/* Temp pointers to make it really clear to the compiler what we're doing */
196	for(i=0;i<N4;i++)	225	for(i=0;i<N4;i++)
197	{	226	{
198	kiss_fft_scalar yr, yi;	227	kiss_fft_scalar yr, yi;
199	yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);	228	yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
200	yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);	229	yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
201	/* works because the cos is nearly one */	230	*yp1 = yr;
202	*yp1 = yr - S_MUL(yi,sine);	231	*yp2 = yi;
203	*yp2 = yi + S_MUL(yr,sine);;	232	fp++;
204	fp += 2;
205	yp1 += 2*stride;	233	yp1 += 2*stride;
206	yp2 -= 2*stride;	234	yp2 -= 2*stride;
207	}	235	}
208	}	236	}
209	RESTORE_STACK;	237	RESTORE_STACK;
210	}	238	}
		239	#endif /* OVERRIDE_clt_mdct_forward */
211	#endif	240	#endif
212		241
		242	#ifndef OVERRIDE_clt_mdct_backward
213	void clt_mdct_backward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,	243	void clt_mdct_backward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,
214	const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)	244	const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
215	{	245	{
216	int i;	246	int i;
217	int N, N2, N4;	247	int N, N2, N4;
218	kiss_twiddle_scalar sine;	248	const kiss_twiddle_scalar *trig;
219	/* VARDECL(kiss_fft_scalar, f2);	249
220	SAVE_STACK; */
221	N = l->n;	250	N = l->n;
222	N >>= shift;	251	trig = l->trig;
		252	for (i=0;i<shift;i++)
		253	{
		254	N >>= 1;
		255	trig += N;
		256	}
223	N2 = N>>1;	257	N2 = N>>1;
224	N4 = N>>2;	258	N4 = N>>2;
225	/* ALLOC(f2, N2, kiss_fft_scalar); */
226	kiss_fft_scalar f2[N2]; /* worst case 3840b */
227	/* sin(x) ~= x here */
228	#ifdef FIXED_POINT
229	sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
230	#else
231	sine = (kiss_twiddle_scalar)2PI(.125f)/N;
232	#endif
233		259
234	/* Pre-rotate */	260	/* Pre-rotate */
235	{	261	{
236	/* Temp pointers to make it really clear to the compiler what we're doing */	262	/* Temp pointers to make it really clear to the compiler what we're doing */
237	const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;	263	const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
238	const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);	264	const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
239	kiss_fft_scalar * OPUS_RESTRICT yp = f2;	265	kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
240	const kiss_twiddle_scalar *t = &l->trig[0];	266	const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
		267	const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
241	for(i=0;i<N4;i++)	268	for(i=0;i<N4;i++)
242	{	269	{
		270	int rev;
243	kiss_fft_scalar yr, yi;	271	kiss_fft_scalar yr, yi;
244	yr = -S_MUL(xp2, t[i<<shift]) + S_MUL(xp1,t[(N4-i)<<shift]);	272	rev = *bitrev++;
245	yi = -S_MUL(xp2, t[(N4-i)<<shift]) - S_MUL(xp1,t[i<<shift]);	273	yr = S_MUL(xp2, t[i]) + S_MUL(xp1, t[N4+i]);
246	/* works because the cos is nearly one */	274	yi = S_MUL(xp1, t[i]) - S_MUL(xp2, t[N4+i]);
247	*yp++ = yr - S_MUL(yi,sine);	275	/* We swap real and imag because we use an FFT instead of an IFFT. */
248	*yp++ = yi + S_MUL(yr,sine);	276	yp[2*rev+1] = yr;
		277	yp[2*rev] = yi;
		278	/* Storing the pre-rotation directly in the bitrev order. */
249	xp1+=2*stride;	279	xp1+=2*stride;
250	xp2-=2*stride;	280	xp2-=2*stride;
251	}	281	}
252	}	282	}
253		283
254	/* Inverse N/4 complex FFT. This one should not downscale even in fixed-point */	284	opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
255	opus_ifft(l->kfft[shift], (kiss_fft_cpx )f2, (kiss_fft_cpx )(out+(overlap>>1)));
256		285
257	/* Post-rotate and de-shuffle from both ends of the buffer at once to make	286	/* Post-rotate and de-shuffle from both ends of the buffer at once to make
258	it in-place. */	287	it in-place. */
259	{	288	{
260	kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);	289	kiss_fft_scalar * yp0 = out+(overlap>>1);
261	kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;	290	kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
262	const kiss_twiddle_scalar *t = &l->trig[0];	291	const kiss_twiddle_scalar *t = &trig[0];
263	/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the	292	/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
264	middle pair will be computed twice. */	293	middle pair will be computed twice. */
265	for(i=0;i<(N4+1)>>1;i++)	294	for(i=0;i<(N4+1)>>1;i++)
266	{	295	{
267	kiss_fft_scalar re, im, yr, yi;	296	kiss_fft_scalar re, im, yr, yi;
268	kiss_twiddle_scalar t0, t1;	297	kiss_twiddle_scalar t0, t1;
269	re = yp0[0];	298	/* We swap real and imag because we're using an FFT instead of an IFFT. */
270	im = yp0[1];	299	re = yp0[1];
271	t0 = t[i<<shift];	300	im = yp0[0];
272	t1 = t[(N4-i)<<shift];	301	t0 = t[i];
		302	t1 = t[N4+i];
273	/* We'd scale up by 2 here, but instead it's done when mixing the windows */	303	/* We'd scale up by 2 here, but instead it's done when mixing the windows */
274	yr = S_MUL(re,t0) - S_MUL(im,t1);	304	yr = S_MUL(re,t0) + S_MUL(im,t1);
275	yi = S_MUL(im,t0) + S_MUL(re,t1);	305	yi = S_MUL(re,t1) - S_MUL(im,t0);
276	re = yp1[0];	306	/* We swap real and imag because we're using an FFT instead of an IFFT. */
277	im = yp1[1];	307	re = yp1[1];
278	/* works because the cos is nearly one */	308	im = yp1[0];
279	yp0[0] = -(yr - S_MUL(yi,sine));	309	yp0[0] = yr;
280	yp1[1] = yi + S_MUL(yr,sine);	310	yp1[1] = yi;
281		311
282	t0 = t[(N4-i-1)<<shift];	312	t0 = t[(N4-i-1)];
283	t1 = t[(i+1)<<shift];	313	t1 = t[(N2-i-1)];
284	/* We'd scale up by 2 here, but instead it's done when mixing the windows */	314	/* We'd scale up by 2 here, but instead it's done when mixing the windows */
285	yr = S_MUL(re,t0) - S_MUL(im,t1);	315	yr = S_MUL(re,t0) + S_MUL(im,t1);
286	yi = S_MUL(im,t0) + S_MUL(re,t1);	316	yi = S_MUL(re,t1) - S_MUL(im,t0);
287	/* works because the cos is nearly one */	317	yp1[0] = yr;
288	yp1[0] = -(yr - S_MUL(yi,sine));	318	yp0[1] = yi;
289	yp0[1] = yi + S_MUL(yr,sine);
290	yp0 += 2;	319	yp0 += 2;
291	yp1 -= 2;	320	yp1 -= 2;
292	}	321	}
@@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scala
310	wp2--;	339	wp2--;
311	}	340	}
312	}	341	}
313	/* RESTORE_STACK; */
314	}	342	}
		343	#endif /* OVERRIDE_clt_mdct_backward */