1 files changed, 110 insertions, 81 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 72ea180568..7fa8eaf6bf 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -53,18 +53,20 @@
 #include "mathops.h"
 #include "stack_alloc.h"
+#if defined(MIPSr1_ASM)
+#include "mips/mdct_mipsr1.h"
+#endif
 #ifdef CUSTOM_MODES
 int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
 {
   int i;
-   int N4;
   kiss_twiddle_scalar *trig;
-#if defined(FIXED_POINT)
+   int shift;
   int N2=N>>1;
-#endif
   l->n = N;
-   N4 = N>>2;
   l->maxshift = maxshift;
   for (i=0;i<=maxshift;i++)
   {
@@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
         return 0;
 #endif
   }
-   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
+   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
   if (l->trig==NULL)
     return 0;
-   /* We have enough points that sine isn't necessary */
+   for (shift=0;shift<=maxshift;shift++)
+   {
+      /* We have enough points that sine isn't necessary */
 #if defined(FIXED_POINT)
-   for (i=0;i<=N4;i++)
+#if 1
-      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
+      for (i=0;i<N2;i++)
+         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
 #else
-   for (i=0;i<=N4;i++)
+      for (i=0;i<N2;i++)
-      trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
+         trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N))));
 #endif
+#else
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
+#endif
+      trig += N2;
+      N2 >>= 1;
+      N >>= 1;
+   }
   return 1;
 }
@@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l)
 #if 0
 /* Forward MDCT trashes the input array */
+#ifndef OVERRIDE_clt_mdct_forward
 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const opus_val16 *window, int overlap, int shift, int stride)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
   VARDECL(kiss_fft_scalar, f);
-   VARDECL(kiss_fft_scalar, f2);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+#endif
   SAVE_STACK;
+   scale = st->scale;
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
   ALLOC(f, N2, kiss_fft_scalar);
-   ALLOC(f2, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold */
@@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
   /* Pre-rotation */
   {
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      for(i=0;i<N4;i++)
      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
-         re = yp[0];
+         t0 = t[i];
-         im = yp[1];
+         t1 = t[N4+i];
-         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
+         re = *yp++;
-         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
+         im = *yp++;
-         /* works because the cos is nearly one */
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
-         *yp++ = yr + S_MUL(yi,sine);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
-         *yp++ = yi - S_MUL(yr,sine);
+         yc.r = yr;
+         yc.i = yi;
+         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
+         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+         f2[st->bitrev[i]] = yc;
      }
   }
-   /* N/4 complex FFT, down-scales by 4/N */
+   /* N/4 complex FFT, does not downscale anymore */
-   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
+   opus_fft_impl(st, f2);
   /* Post-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Temp pointers to make it really clear to the compiler what we're doing */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
-         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
-         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
-         /* works because the cos is nearly one */
+         *yp1 = yr;
-         *yp1 = yr - S_MUL(yi,sine);
+         *yp2 = yi;
-         *yp2 = yi + S_MUL(yr,sine);;
+         fp++;
-         fp += 2;
         yp1 += 2*stride;
         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
 }
+#endif /* OVERRIDE_clt_mdct_forward */
 #endif
+#ifndef OVERRIDE_clt_mdct_backward
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
+   const kiss_twiddle_scalar *trig;
-/*   VARDECL(kiss_fft_scalar, f2);
-   SAVE_STACK; */
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
-/*   ALLOC(f2, N2, kiss_fft_scalar); */
-   kiss_fft_scalar f2[N2]; /* worst case 3840b */
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
   /* Pre-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
      for(i=0;i<N4;i++)
      {
+         int rev;
         kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
+         rev = *bitrev++;
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
-         /* works because the cos is nearly one */
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
-         *yp++ = yr - S_MUL(yi,sine);
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
-         *yp++ = yi + S_MUL(yr,sine);
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
         xp1+=2*stride;
         xp2-=2*stride;
      }
   }
-   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
-   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
      it in-place. */
   {
-      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
         middle pair will be computed twice. */
      for(i=0;i<(N4+1)>>1;i++)
      {
         kiss_fft_scalar re, im, yr, yi;
         kiss_twiddle_scalar t0, t1;
-         re = yp0[0];
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
-         im = yp0[1];
+         re = yp0[1];
-         t0 = t[i<<shift];
+         im = yp0[0];
-         t1 = t[(N4-i)<<shift];
+         t0 = t[i];
+         t1 = t[N4+i];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
-         re = yp1[0];
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
-         im = yp1[1];
+         re = yp1[1];
-         /* works because the cos is nearly one */
+         im = yp1[0];
-         yp0[0] = -(yr - S_MUL(yi,sine));
+         yp0[0] = yr;
-         yp1[1] = yi + S_MUL(yr,sine);
+         yp1[1] = yi;
-         t0 = t[(N4-i-1)<<shift];
+         t0 = t[(N4-i-1)];
-         t1 = t[(i+1)<<shift];
+         t1 = t[(N2-i-1)];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
-         /* works because the cos is nearly one */
+         yp1[0] = yr;
-         yp1[0] = -(yr - S_MUL(yi,sine));
+         yp0[1] = yi;
-         yp0[1] = yi + S_MUL(yr,sine);
         yp0 += 2;
         yp1 -= 2;
      }
@@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
         wp2--;
      }
   }
-/*   RESTORE_STACK; */
 }
+#endif /* OVERRIDE_clt_mdct_backward */

diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c index 72ea180568..7fa8eaf6bf 100644 --- a/lib/rbcodec/codecs/libopus/celt/mdct.c +++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -53,18 +53,20 @@
53	#include "mathops.h"	53	#include "mathops.h"
54	#include "stack_alloc.h"	54	#include "stack_alloc.h"
55		55
		56	#if defined(MIPSr1_ASM)
		57	#include "mips/mdct_mipsr1.h"
		58	#endif
		59
		60
56	#ifdef CUSTOM_MODES	61	#ifdef CUSTOM_MODES
57		62
58	int clt_mdct_init(mdct_lookup *l,int N, int maxshift)	63	int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
59	{	64	{
60	int i;	65	int i;
61	int N4;
62	kiss_twiddle_scalar *trig;	66	kiss_twiddle_scalar *trig;
63	#if defined(FIXED_POINT)	67	int shift;
64	int N2=N>>1;	68	int N2=N>>1;
65	#endif
66	l->n = N;	69	l->n = N;
67	N4 = N>>2;
68	l->maxshift = maxshift;	70	l->maxshift = maxshift;
69	for (i=0;i<=maxshift;i++)	71	for (i=0;i<=maxshift;i++)
70	{	72	{
@@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
77	return 0;	79	return 0;
78	#endif	80	#endif
79	}	81	}
80	l->trig = trig = (kiss_twiddle_scalar)opus_alloc((N4+1)sizeof(kiss_twiddle_scalar));	82	l->trig = trig = (kiss_twiddle_scalar)opus_alloc((N-(N2>>maxshift))sizeof(kiss_twiddle_scalar));
81	if (l->trig==NULL)	83	if (l->trig==NULL)
82	return 0;	84	return 0;
83	/* We have enough points that sine isn't necessary */	85	for (shift=0;shift<=maxshift;shift++)
		86	{
		87	/* We have enough points that sine isn't necessary */
84	#if defined(FIXED_POINT)	88	#if defined(FIXED_POINT)
85	for (i=0;i<=N4;i++)	89	#if 1
86	trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));	90	for (i=0;i<N2;i++)
		91	trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
87	#else	92	#else
88	for (i=0;i<=N4;i++)	93	for (i=0;i<N2;i++)
89	trig[i] = (kiss_twiddle_scalar)cos(2PIi/N);	94	trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768cos(2M_PI*(i+.125)/N))));
90	#endif	95	#endif
		96	#else
		97	for (i=0;i<N2;i++)
		98	trig[i] = (kiss_twiddle_scalar)cos(2PI(i+.125)/N);
		99	#endif
		100	trig += N2;
		101	N2 >>= 1;
		102	N >>= 1;
		103	}
91	return 1;	104	return 1;
92	}	105	}
93		106
@@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l)
103		116
104	#if 0	117	#if 0
105	/* Forward MDCT trashes the input array */	118	/* Forward MDCT trashes the input array */
		119	#ifndef OVERRIDE_clt_mdct_forward
106	void clt_mdct_forward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,	120	void clt_mdct_forward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,
107	const opus_val16 *window, int overlap, int shift, int stride)	121	const opus_val16 *window, int overlap, int shift, int stride)
108	{	122	{
109	int i;	123	int i;
110	int N, N2, N4;	124	int N, N2, N4;
111	kiss_twiddle_scalar sine;
112	VARDECL(kiss_fft_scalar, f);	125	VARDECL(kiss_fft_scalar, f);
113	VARDECL(kiss_fft_scalar, f2);	126	VARDECL(kiss_fft_cpx, f2);
		127	const kiss_fft_state *st = l->kfft[shift];
		128	const kiss_twiddle_scalar *trig;
		129	opus_val16 scale;
		130	#ifdef FIXED_POINT
		131	/* Allows us to scale with MULT16_32_Q16(), which is faster than
		132	MULT16_32_Q15() on ARM. */
		133	int scale_shift = st->scale_shift-1;
		134	#endif
114	SAVE_STACK;	135	SAVE_STACK;
		136	scale = st->scale;
		137
115	N = l->n;	138	N = l->n;
116	N >>= shift;	139	trig = l->trig;
		140	for (i=0;i<shift;i++)
		141	{
		142	N >>= 1;
		143	trig += N;
		144	}
117	N2 = N>>1;	145	N2 = N>>1;
118	N4 = N>>2;	146	N4 = N>>2;
		147
119	ALLOC(f, N2, kiss_fft_scalar);	148	ALLOC(f, N2, kiss_fft_scalar);
120	ALLOC(f2, N2, kiss_fft_scalar);	149	ALLOC(f2, N4, kiss_fft_cpx);
121	/* sin(x) ~= x here */
122	#ifdef FIXED_POINT
123	sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
124	#else
125	sine = (kiss_twiddle_scalar)2PI(.125f)/N;
126	#endif
127		150
128	/* Consider the input to be composed of four blocks: [a, b, c, d] */	151	/* Consider the input to be composed of four blocks: [a, b, c, d] */
129	/* Window, shuffle, fold */	152	/* Window, shuffle, fold */
@@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar
168	/* Pre-rotation */	191	/* Pre-rotation */
169	{	192	{
170	kiss_fft_scalar * OPUS_RESTRICT yp = f;	193	kiss_fft_scalar * OPUS_RESTRICT yp = f;
171	const kiss_twiddle_scalar *t = &l->trig[0];	194	const kiss_twiddle_scalar *t = &trig[0];
172	for(i=0;i<N4;i++)	195	for(i=0;i<N4;i++)
173	{	196	{
		197	kiss_fft_cpx yc;
		198	kiss_twiddle_scalar t0, t1;
174	kiss_fft_scalar re, im, yr, yi;	199	kiss_fft_scalar re, im, yr, yi;
175	re = yp[0];	200	t0 = t[i];
176	im = yp[1];	201	t1 = t[N4+i];
177	yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);	202	re = *yp++;
178	yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);	203	im = *yp++;
179	/* works because the cos is nearly one */	204	yr = S_MUL(re,t0) - S_MUL(im,t1);
180	*yp++ = yr + S_MUL(yi,sine);	205	yi = S_MUL(im,t0) + S_MUL(re,t1);
181	*yp++ = yi - S_MUL(yr,sine);	206	yc.r = yr;
		207	yc.i = yi;
		208	yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
		209	yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
		210	f2[st->bitrev[i]] = yc;
182	}	211	}
183	}	212	}
184		213
185	/* N/4 complex FFT, down-scales by 4/N */	214	/* N/4 complex FFT, does not downscale anymore */
186	opus_fft(l->kfft[shift], (kiss_fft_cpx )f, (kiss_fft_cpx )f2);	215	opus_fft_impl(st, f2);
187		216
188	/* Post-rotate */	217	/* Post-rotate */
189	{	218	{
190	/* Temp pointers to make it really clear to the compiler what we're doing */	219	/* Temp pointers to make it really clear to the compiler what we're doing */
191	const kiss_fft_scalar * OPUS_RESTRICT fp = f2;	220	const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
192	kiss_fft_scalar * OPUS_RESTRICT yp1 = out;	221	kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
193	kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);	222	kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
194	const kiss_twiddle_scalar *t = &l->trig[0];	223	const kiss_twiddle_scalar *t = &trig[0];
195	/* Temp pointers to make it really clear to the compiler what we're doing */	224	/* Temp pointers to make it really clear to the compiler what we're doing */
196	for(i=0;i<N4;i++)	225	for(i=0;i<N4;i++)
197	{	226	{
198	kiss_fft_scalar yr, yi;	227	kiss_fft_scalar yr, yi;
199	yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);	228	yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
200	yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);	229	yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
201	/* works because the cos is nearly one */	230	*yp1 = yr;
202	*yp1 = yr - S_MUL(yi,sine);	231	*yp2 = yi;
203	*yp2 = yi + S_MUL(yr,sine);;	232	fp++;
204	fp += 2;
205	yp1 += 2*stride;	233	yp1 += 2*stride;
206	yp2 -= 2*stride;	234	yp2 -= 2*stride;
207	}	235	}
208	}	236	}
209	RESTORE_STACK;	237	RESTORE_STACK;
210	}	238	}
		239	#endif /* OVERRIDE_clt_mdct_forward */
211	#endif	240	#endif
212		241
		242	#ifndef OVERRIDE_clt_mdct_backward
213	void clt_mdct_backward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,	243	void clt_mdct_backward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scalar * OPUS_RESTRICT out,
214	const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)	244	const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
215	{	245	{
216	int i;	246	int i;
217	int N, N2, N4;	247	int N, N2, N4;
218	kiss_twiddle_scalar sine;	248	const kiss_twiddle_scalar *trig;
219	/* VARDECL(kiss_fft_scalar, f2);	249
220	SAVE_STACK; */
221	N = l->n;	250	N = l->n;
222	N >>= shift;	251	trig = l->trig;
		252	for (i=0;i<shift;i++)
		253	{
		254	N >>= 1;
		255	trig += N;
		256	}
223	N2 = N>>1;	257	N2 = N>>1;
224	N4 = N>>2;	258	N4 = N>>2;
225	/* ALLOC(f2, N2, kiss_fft_scalar); */
226	kiss_fft_scalar f2[N2]; /* worst case 3840b */
227	/* sin(x) ~= x here */
228	#ifdef FIXED_POINT
229	sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
230	#else
231	sine = (kiss_twiddle_scalar)2PI(.125f)/N;
232	#endif
233		259
234	/* Pre-rotate */	260	/* Pre-rotate */
235	{	261	{
236	/* Temp pointers to make it really clear to the compiler what we're doing */	262	/* Temp pointers to make it really clear to the compiler what we're doing */
237	const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;	263	const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
238	const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);	264	const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
239	kiss_fft_scalar * OPUS_RESTRICT yp = f2;	265	kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
240	const kiss_twiddle_scalar *t = &l->trig[0];	266	const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
		267	const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
241	for(i=0;i<N4;i++)	268	for(i=0;i<N4;i++)
242	{	269	{
		270	int rev;
243	kiss_fft_scalar yr, yi;	271	kiss_fft_scalar yr, yi;
244	yr = -S_MUL(xp2, t[i<<shift]) + S_MUL(xp1,t[(N4-i)<<shift]);	272	rev = *bitrev++;
245	yi = -S_MUL(xp2, t[(N4-i)<<shift]) - S_MUL(xp1,t[i<<shift]);	273	yr = S_MUL(xp2, t[i]) + S_MUL(xp1, t[N4+i]);
246	/* works because the cos is nearly one */	274	yi = S_MUL(xp1, t[i]) - S_MUL(xp2, t[N4+i]);
247	*yp++ = yr - S_MUL(yi,sine);	275	/* We swap real and imag because we use an FFT instead of an IFFT. */
248	*yp++ = yi + S_MUL(yr,sine);	276	yp[2*rev+1] = yr;
		277	yp[2*rev] = yi;
		278	/* Storing the pre-rotation directly in the bitrev order. */
249	xp1+=2*stride;	279	xp1+=2*stride;
250	xp2-=2*stride;	280	xp2-=2*stride;
251	}	281	}
252	}	282	}
253		283
254	/* Inverse N/4 complex FFT. This one should not downscale even in fixed-point */	284	opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
255	opus_ifft(l->kfft[shift], (kiss_fft_cpx )f2, (kiss_fft_cpx )(out+(overlap>>1)));
256		285
257	/* Post-rotate and de-shuffle from both ends of the buffer at once to make	286	/* Post-rotate and de-shuffle from both ends of the buffer at once to make
258	it in-place. */	287	it in-place. */
259	{	288	{
260	kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);	289	kiss_fft_scalar * yp0 = out+(overlap>>1);
261	kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;	290	kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
262	const kiss_twiddle_scalar *t = &l->trig[0];	291	const kiss_twiddle_scalar *t = &trig[0];
263	/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the	292	/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
264	middle pair will be computed twice. */	293	middle pair will be computed twice. */
265	for(i=0;i<(N4+1)>>1;i++)	294	for(i=0;i<(N4+1)>>1;i++)
266	{	295	{
267	kiss_fft_scalar re, im, yr, yi;	296	kiss_fft_scalar re, im, yr, yi;
268	kiss_twiddle_scalar t0, t1;	297	kiss_twiddle_scalar t0, t1;
269	re = yp0[0];	298	/* We swap real and imag because we're using an FFT instead of an IFFT. */
270	im = yp0[1];	299	re = yp0[1];
271	t0 = t[i<<shift];	300	im = yp0[0];
272	t1 = t[(N4-i)<<shift];	301	t0 = t[i];
		302	t1 = t[N4+i];
273	/* We'd scale up by 2 here, but instead it's done when mixing the windows */	303	/* We'd scale up by 2 here, but instead it's done when mixing the windows */
274	yr = S_MUL(re,t0) - S_MUL(im,t1);	304	yr = S_MUL(re,t0) + S_MUL(im,t1);
275	yi = S_MUL(im,t0) + S_MUL(re,t1);	305	yi = S_MUL(re,t1) - S_MUL(im,t0);
276	re = yp1[0];	306	/* We swap real and imag because we're using an FFT instead of an IFFT. */
277	im = yp1[1];	307	re = yp1[1];
278	/* works because the cos is nearly one */	308	im = yp1[0];
279	yp0[0] = -(yr - S_MUL(yi,sine));	309	yp0[0] = yr;
280	yp1[1] = yi + S_MUL(yr,sine);	310	yp1[1] = yi;
281		311
282	t0 = t[(N4-i-1)<<shift];	312	t0 = t[(N4-i-1)];
283	t1 = t[(i+1)<<shift];	313	t1 = t[(N2-i-1)];
284	/* We'd scale up by 2 here, but instead it's done when mixing the windows */	314	/* We'd scale up by 2 here, but instead it's done when mixing the windows */
285	yr = S_MUL(re,t0) - S_MUL(im,t1);	315	yr = S_MUL(re,t0) + S_MUL(im,t1);
286	yi = S_MUL(im,t0) + S_MUL(re,t1);	316	yi = S_MUL(re,t1) - S_MUL(im,t0);
287	/* works because the cos is nearly one */	317	yp1[0] = yr;
288	yp1[0] = -(yr - S_MUL(yi,sine));	318	yp0[1] = yi;
289	yp0[1] = yi + S_MUL(yr,sine);
290	yp0 += 2;	319	yp0 += 2;
291	yp1 -= 2;	320	yp1 -= 2;
292	}	321	}
@@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup l, kiss_fft_scalar in, kiss_fft_scala
310	wp2--;	339	wp2--;
311	}	340	}
312	}	341	}
313	/* RESTORE_STACK; */
314	}	342	}
		343	#endif /* OVERRIDE_clt_mdct_backward */