Merge from branches/mdctexp - faster ifft+imdct in codec lib

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24712 a1c6a512-1295-4272-9138-f99709370657
author: Dave Hooper <dave@beermex.com> 2010-02-17 00:49:53 +0000
committer: Dave Hooper <dave@beermex.com> 2010-02-17 00:49:53 +0000
commit: 42774d3128b91d5a37344cb40d56d3c4d147e5f2 (patch)
tree: bf336b407992ec9a5e454556f3351e3f8a0d10de /apps/codecs/lib/mdct.c
parent: 62257ebc38bc0a3095b25dd0f58c4c8215edf602 (diff)
download: rockbox-42774d3128b91d5a37344cb40d56d3c4d147e5f2.tar.gz
rockbox-42774d3128b91d5a37344cb40d56d3c4d147e5f2.zip
1 files changed, 414 insertions, 0 deletions
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c
new file mode 100644
index 0000000000..03baa4db4a
--- /dev/null
+++ b/apps/codecs/lib/mdct.c
@@ -0,0 +1,414 @@
+/*
+ * Fixed Point IMDCT 
+ * Copyright (c) 2002 The FFmpeg Project.
+ * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "codeclib.h"
+#include "mdct.h"
+#include "asm_arm.h"
+#include "asm_mcf5249.h"
+#include "codeclib_misc.h"
+#include "mdct_lookup.h"
+/**
+ * Compute the middle half of the inverse MDCT of size N = 2^nbits
+ * thus excluding the parts that can be derived by symmetry
+ * @param output N/2 samples
+ * @param input N/2 samples
+ *
+ * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
+ *                                          not overlap or intersect at all)
+ */
+void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
+{
+    int n8, n4, n2, n, j;
+    const fixed32 *in1, *in2;
+    
+    n = 1 << nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+    FFTComplex *z = (FFTComplex *)output;
+    /* pre rotation */
+    in1 = input;
+    in2 = input + n2 - 1;
+    
+    /* revtab comes from the fft; revtab table is sized for N=4096 size fft = 2^12.
+       The fft is size N/4 so s->nbits-2, so our shift needs to be (12-(nbits-2)) */
+    const int revtab_shift = (14- nbits);
+    
+    /* bitreverse reorder the input and rotate;   result here is in OUTPUT ... */
+    /* (note that when using the current split radix, the bitreverse ordering is
+        complex, meaning that this reordering cannot easily be done in-place) */
+    /* Using the following pdf, you can see that it is possible to rearrange
+       the 'classic' pre/post rotate with an alternative one that enables
+       us to use fewer distinct twiddle factors.
+       http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf
+       
+       For prerotation, the factors are just sin,cos(2PI*i/N)
+       For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
+       
+       Therefore, prerotation can immediately reuse the same twiddles as fft
+       (for postrotation it's still a bit complex, so this is still using
+        an mdct-local set of twiddles to do that part)
+       */
+    const int32_t *T = sincos_lookup0;
+    const int step = 2<<(12-nbits);
+    const uint16_t * p_revtab=revtab;
+    {
+        const uint16_t * const p_revtab_end = p_revtab + n8;
+        while(LIKELY(p_revtab < p_revtab_end))
+        {
+            j = (*p_revtab)>>revtab_shift;
+            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
+            T += step;
+            in1 += 2;
+            in2 -= 2;
+            p_revtab++;
+            j = (*p_revtab)>>revtab_shift;
+            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
+            T += step;
+            in1 += 2;
+            in2 -= 2;
+            p_revtab++;
+        }
+    }
+    {
+        const uint16_t * const p_revtab_end = p_revtab + n8;
+        while(LIKELY(p_revtab < p_revtab_end))
+        {
+            j = (*p_revtab)>>revtab_shift;
+            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
+            T -= step;
+            in1 += 2;
+            in2 -= 2;
+            p_revtab++;
+            j = (*p_revtab)>>revtab_shift;
+            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
+            T -= step;
+            in1 += 2;
+            in2 -= 2;
+            p_revtab++;
+        }
+    }
+    /* ... and so fft runs in OUTPUT buffer */
+    ff_fft_calc_c(nbits-2, z);
+    /* post rotation + reordering.  now keeps the result within the OUTPUT buffer */
+    switch( nbits )
+    {
+        default:
+        {
+            fixed32 * z1 = (fixed32 *)(&z[0]);
+            fixed32 * z2 = (fixed32 *)(&z[n4-1]);
+            int magic_step = step>>2;
+            int newstep;
+            if(n<=1024)
+            {
+                T = sincos_lookup0 + magic_step;
+                newstep = step>>1;
+            }
+            else
+            {   
+                 T = sincos_lookup1;
+                 newstep = 2;
+            }
+        
+            while(z1<z2)
+            {
+                fixed32 r0,i0,r1,i1;
+                XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
+                XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
+                z1[0] = r0;
+                z1[1] = i0;
+                z2[0] = r1;
+                z2[1] = i1;
+                z1+=2;
+                z2-=2;
+            }
+            
+            break;
+        }
+        case 12: /* n=4096 */
+        {
+            /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
+            const int32_t * V = sincos_lookup1;
+            T = sincos_lookup0;
+            int32_t t0,t1,v0,v1;
+            fixed32 * z1 = (fixed32 *)(&z[0]);
+            fixed32 * z2 = (fixed32 *)(&z[n4-1]);
+            t0 = T[0]>>1; t1=T[1]>>1;
+        
+            while(z1<z2)
+            {
+                fixed32 r0,i0,r1,i1;
+                t0 += (v0 = (V[0]>>1));
+                t1 += (v1 = (V[1]>>1));
+                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
+                T+=2;
+                v0 += (t0 = (T[0]>>1));
+                v1 += (t1 = (T[1]>>1));
+                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
+                z1[0] = r0;
+                z1[1] = i0;
+                z2[0] = r1;
+                z2[1] = i1;
+                z1+=2;
+                z2-=2;
+                V+=2;
+            }
+            
+            break;
+        }
+        
+        case 13: /* n = 8192 */
+        {
+            /* weight linear interpolation between sincos_lookup0 and sincos_lookup1
+               specifically: 25:75 for first twiddle and 75:25 for second twiddle */
+            const int32_t * V = sincos_lookup1;
+            T = sincos_lookup0;
+            int32_t t0,t1,v0,v1,q0,q1;
+            fixed32 * z1 = (fixed32 *)(&z[0]);
+            fixed32 * z2 = (fixed32 *)(&z[n4-1]);
+            t0 = T[0]; t1=T[1];
+        
+            while(z1<z2)
+            {
+                fixed32 r0,i0,r1,i1;
+                v0 = V[0]; v1 = V[1];
+                t0 += (q0 = (v0-t0)>>1);
+                t1 += (q1 = (v1-t1)>>1);
+                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
+                t0 = v0-q0;
+                t1 = v1-q1;
+                XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
+                z1[0] = r0;
+                z1[1] = i0;
+                z2[0] = r1;
+                z2[1] = i1;
+                z1+=2;
+                z2-=2;
+                T+=2;
+                
+                t0 = T[0]; t1 = T[1];
+                v0 += (q0 = (t0-v0)>>1);
+                v1 += (q1 = (t1-v1)>>1);
+                XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
+                v0 = t0-q0;
+                v1 = t1-q1;
+                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
+                z1[0] = r0;
+                z1[1] = i0;
+                z2[0] = r1;
+                z2[1] = i1;
+                z1+=2;
+                z2-=2;
+                V+=2;
+            }
+               
+            break;
+        }
+    }
+} 
+/**
+ * Compute inverse MDCT of size N = 2^nbits
+ * @param output N samples
+ * @param input N/2 samples
+ * "In-place" processing can be achieved provided that:
+ *            [0  ..  N/2-1 | N/2  ..  N-1 ]
+ *            <----input---->
+ *            <-----------output----------->
+ *
+ */
+void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
+{
+    const int n = (1<<nbits);
+    const int n2 = (n>>1);
+    const int n4 = (n>>2);
+    
+    ff_imdct_half(nbits,output+n2,input);
+    /* reflect the half imdct into the full N samples */
+    /* TODO: this could easily be optimised more! */
+    fixed32 * in_r, * in_r2, * out_r, * out_r2;
+    out_r = output;
+    out_r2 = output+n2-8;
+    in_r  = output+n2+n4-8;
+    while(out_r<out_r2)
+    {
+        out_r[0]     = -(out_r2[7] = in_r[7]);
+        out_r[1]     = -(out_r2[6] = in_r[6]);
+        out_r[2]     = -(out_r2[5] = in_r[5]);
+        out_r[3]     = -(out_r2[4] = in_r[4]);
+        out_r[4]     = -(out_r2[3] = in_r[3]);
+        out_r[5]     = -(out_r2[2] = in_r[2]);
+        out_r[6]     = -(out_r2[1] = in_r[1]);
+        out_r[7]     = -(out_r2[0] = in_r[0]);
+        in_r -= 8;
+        out_r += 8;
+        out_r2 -= 8;
+    }
+    in_r = output + n2+n4;
+    in_r2 = output + n-4;
+    out_r = output + n2;
+    out_r2 = output + n2 + n4 - 4;
+    while(in_r<in_r2)
+    {
+        register fixed32 t0,t1,t2,t3;
+        register fixed32 s0,s1,s2,s3;
+        //simultaneously do the following things:
+        // 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
+        // 2. reflect range from [n2+n4 .. n-1] inplace
+        //
+        //  [                      |                        ]
+        //   ^a ->            <- ^b ^c ->               <- ^d
+        //
+        //  #1: copy from ^c to ^a
+        //  #2: copy from ^d to ^b
+        //  #3: swap ^c and ^d in place
+        //
+        // #1 pt1 : load 4 words from ^c.
+        t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
+        // #1 pt2 : write to ^a
+        out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
+        // #2 pt1 : load 4 words from ^d
+        s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
+        // #2 pt2 : write to ^b
+        out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
+        // #3 pt1 : write words from #2 to ^c
+        in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
+        // #3 pt2 : write words from #1 to ^d
+        in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;
+        in_r += 4;
+        in_r2 -= 4;
+        out_r += 4;
+        out_r2 -= 4;
+    }
+}
+static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */
+/* Table of values of atan(2^-i) in 0.32 format fractions of pi where pi = 0xffffffff / 2 */
+static const unsigned long atan_table[] = {
+    0x1fffffff, /* +0.785398163 (or pi/4) */
+    0x12e4051d, /* +0.463647609 */
+    0x09fb385b, /* +0.244978663 */
+    0x051111d4, /* +0.124354995 */
+    0x028b0d43, /* +0.062418810 */
+    0x0145d7e1, /* +0.031239833 */
+    0x00a2f61e, /* +0.015623729 */
+    0x00517c55, /* +0.007812341 */
+    0x0028be53, /* +0.003906230 */
+    0x00145f2e, /* +0.001953123 */
+    0x000a2f98, /* +0.000976562 */
+    0x000517cc, /* +0.000488281 */
+    0x00028be6, /* +0.000244141 */
+    0x000145f3, /* +0.000122070 */
+    0x0000a2f9, /* +0.000061035 */
+    0x0000517c, /* +0.000030518 */
+    0x000028be, /* +0.000015259 */
+    0x0000145f, /* +0.000007629 */
+    0x00000a2f, /* +0.000003815 */
+    0x00000517, /* +0.000001907 */
+    0x0000028b, /* +0.000000954 */
+    0x00000145, /* +0.000000477 */
+    0x000000a2, /* +0.000000238 */
+    0x00000051, /* +0.000000119 */
+    0x00000028, /* +0.000000060 */
+    0x00000014, /* +0.000000030 */
+    0x0000000a, /* +0.000000015 */
+    0x00000005, /* +0.000000007 */
+    0x00000002, /* +0.000000004 */
+    0x00000001, /* +0.000000002 */
+    0x00000000, /* +0.000000001 */
+    0x00000000, /* +0.000000000 */
+};
+/**
+ * Implements sin and cos using CORDIC rotation.
+ *
+ * @param phase has range from 0 to 0xffffffff, representing 0 and
+ *        2*pi respectively.
+ * @param cos return address for cos
+ * @return sin of phase, value is a signed value from LONG_MIN to LONG_MAX,
+ *         representing -1 and 1 respectively.
+ *
+ *        Gives at least 24 bits precision (last 2-8 bits or so are probably off)
+ */
+long fsincos(unsigned long phase, fixed32 *cos)
+{
+    int32_t x, x1, y, y1;
+    unsigned long z, z1;
+    int i;
+    /* Setup initial vector */
+    x = cordic_circular_gain;
+    y = 0;
+    z = phase;
+    /* The phase has to be somewhere between 0..pi for this to work right */
+    if (z < 0xffffffff / 4) {
+        /* z in first quadrant, z += pi/2 to correct */
+        x = -x;
+        z += 0xffffffff / 4;
+    } else if (z < 3 * (0xffffffff / 4)) {
+        /* z in third quadrant, z -= pi/2 to correct */
+        z -= 0xffffffff / 4;
+    } else {
+        /* z in fourth quadrant, z -= 3pi/2 to correct */
+        x = -x;
+        z -= 3 * (0xffffffff / 4);
+    }
+    /* Each iteration adds roughly 1-bit of extra precision */
+    for (i = 0; i < 31; i++) {
+        x1 = x >> i;
+        y1 = y >> i;
+        z1 = atan_table[i];
+        /* Decided which direction to rotate vector. Pivot point is pi/2 */
+        if (z >= 0xffffffff / 4) {
+            x -= y1;
+            y += x1;
+            z -= z1;
+        } else {
+            x += y1;
+            y -= x1;
+            z += z1;
+        }
+    }
+    if (cos)
+        *cos = x;
+    return y;
+}
author	Dave Hooper <dave@beermex.com>	2010-02-17 00:49:53 +0000
committer	Dave Hooper <dave@beermex.com>	2010-02-17 00:49:53 +0000
commit	42774d3128b91d5a37344cb40d56d3c4d147e5f2 (patch)
tree	bf336b407992ec9a5e454556f3351e3f8a0d10de /apps/codecs/lib/mdct.c
parent	62257ebc38bc0a3095b25dd0f58c4c8215edf602 (diff)
download	rockbox-42774d3128b91d5a37344cb40d56d3c4d147e5f2.tar.gz rockbox-42774d3128b91d5a37344cb40d56d3c4d147e5f2.zip

diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c new file mode 100644 index 0000000000..03baa4db4a --- /dev/null +++ b/apps/codecs/lib/mdct.c
@@ -0,0 +1,414 @@
	1	/*
	2	* Fixed Point IMDCT
	3	* Copyright (c) 2002 The FFmpeg Project.
	4	* Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, write to the Free Software
	18	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	19	*/
	20
	21	#include "codeclib.h"
	22	#include "mdct.h"
	23	#include "asm_arm.h"
	24	#include "asm_mcf5249.h"
	25	#include "codeclib_misc.h"
	26	#include "mdct_lookup.h"
	27
	28	/**
	29	* Compute the middle half of the inverse MDCT of size N = 2^nbits
	30	* thus excluding the parts that can be derived by symmetry
	31	* @param output N/2 samples
	32	* @param input N/2 samples
	33	*
	34	* NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
	35	* not overlap or intersect at all)
	36	*/
	37	void ff_imdct_half(unsigned int nbits, fixed32 output, const fixed32 input)
	38	{
	39	int n8, n4, n2, n, j;
	40	const fixed32 in1, in2;
	41
	42	n = 1 << nbits;
	43
	44	n2 = n >> 1;
	45	n4 = n >> 2;
	46	n8 = n >> 3;
	47
	48	FFTComplex z = (FFTComplex )output;
	49
	50	/* pre rotation */
	51	in1 = input;
	52	in2 = input + n2 - 1;
	53
	54	/* revtab comes from the fft; revtab table is sized for N=4096 size fft = 2^12.
	55	The fft is size N/4 so s->nbits-2, so our shift needs to be (12-(nbits-2)) */
	56	const int revtab_shift = (14- nbits);
	57
	58	/* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
	59	/* (note that when using the current split radix, the bitreverse ordering is
	60	complex, meaning that this reordering cannot easily be done in-place) */
	61	/* Using the following pdf, you can see that it is possible to rearrange
	62	the 'classic' pre/post rotate with an alternative one that enables
	63	us to use fewer distinct twiddle factors.
	64	http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf
	65
	66	For prerotation, the factors are just sin,cos(2PI*i/N)
	67	For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
	68
	69	Therefore, prerotation can immediately reuse the same twiddles as fft
	70	(for postrotation it's still a bit complex, so this is still using
	71	an mdct-local set of twiddles to do that part)
	72	*/
	73	const int32_t *T = sincos_lookup0;
	74	const int step = 2<<(12-nbits);
	75	const uint16_t * p_revtab=revtab;
	76	{
	77	const uint16_t * const p_revtab_end = p_revtab + n8;
	78	while(LIKELY(p_revtab < p_revtab_end))
	79	{
	80	j = (*p_revtab)>>revtab_shift;
	81	XNPROD31(in2, in1, T[1], T[0], &z[j].re, &z[j].im );
	82	T += step;
	83	in1 += 2;
	84	in2 -= 2;
	85	p_revtab++;
	86	j = (*p_revtab)>>revtab_shift;
	87	XNPROD31(in2, in1, T[1], T[0], &z[j].re, &z[j].im );
	88	T += step;
	89	in1 += 2;
	90	in2 -= 2;
	91	p_revtab++;
	92	}
	93	}
	94	{
	95	const uint16_t * const p_revtab_end = p_revtab + n8;
	96	while(LIKELY(p_revtab < p_revtab_end))
	97	{
	98	j = (*p_revtab)>>revtab_shift;
	99	XNPROD31(in2, in1, T[0], T[1], &z[j].re, &z[j].im);
	100	T -= step;
	101	in1 += 2;
	102	in2 -= 2;
	103	p_revtab++;
	104	j = (*p_revtab)>>revtab_shift;
	105	XNPROD31(in2, in1, T[0], T[1], &z[j].re, &z[j].im);
	106	T -= step;
	107	in1 += 2;
	108	in2 -= 2;
	109	p_revtab++;
	110	}
	111	}
	112
	113
	114	/* ... and so fft runs in OUTPUT buffer */
	115	ff_fft_calc_c(nbits-2, z);
	116
	117	/* post rotation + reordering. now keeps the result within the OUTPUT buffer */
	118	switch( nbits )
	119	{
	120	default:
	121	{
	122	fixed32 * z1 = (fixed32 *)(&z[0]);
	123	fixed32 * z2 = (fixed32 *)(&z[n4-1]);
	124	int magic_step = step>>2;
	125	int newstep;
	126	if(n<=1024)
	127	{
	128	T = sincos_lookup0 + magic_step;
	129	newstep = step>>1;
	130	}
	131	else
	132	{
	133	T = sincos_lookup1;
	134	newstep = 2;
	135	}
	136
	137	while(z1<z2)
	138	{
	139	fixed32 r0,i0,r1,i1;
	140	XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
	141	XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
	142	z1[0] = r0;
	143	z1[1] = i0;
	144	z2[0] = r1;
	145	z2[1] = i1;
	146	z1+=2;
	147	z2-=2;
	148	}
	149
	150	break;
	151	}
	152
	153	case 12: /* n=4096 */
	154	{
	155	/* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
	156	const int32_t * V = sincos_lookup1;
	157	T = sincos_lookup0;
	158	int32_t t0,t1,v0,v1;
	159	fixed32 * z1 = (fixed32 *)(&z[0]);
	160	fixed32 * z2 = (fixed32 *)(&z[n4-1]);
	161
	162	t0 = T[0]>>1; t1=T[1]>>1;
	163
	164	while(z1<z2)
	165	{
	166	fixed32 r0,i0,r1,i1;
	167	t0 += (v0 = (V[0]>>1));
	168	t1 += (v1 = (V[1]>>1));
	169	XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
	170	T+=2;
	171	v0 += (t0 = (T[0]>>1));
	172	v1 += (t1 = (T[1]>>1));
	173	XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
	174	z1[0] = r0;
	175	z1[1] = i0;
	176	z2[0] = r1;
	177	z2[1] = i1;
	178	z1+=2;
	179	z2-=2;
	180	V+=2;
	181	}
	182
	183	break;
	184	}
	185
	186	case 13: /* n = 8192 */
	187	{
	188	/* weight linear interpolation between sincos_lookup0 and sincos_lookup1
	189	specifically: 25:75 for first twiddle and 75:25 for second twiddle */
	190	const int32_t * V = sincos_lookup1;
	191	T = sincos_lookup0;
	192	int32_t t0,t1,v0,v1,q0,q1;
	193	fixed32 * z1 = (fixed32 *)(&z[0]);
	194	fixed32 * z2 = (fixed32 *)(&z[n4-1]);
	195
	196	t0 = T[0]; t1=T[1];
	197
	198	while(z1<z2)
	199	{
	200	fixed32 r0,i0,r1,i1;
	201	v0 = V[0]; v1 = V[1];
	202	t0 += (q0 = (v0-t0)>>1);
	203	t1 += (q1 = (v1-t1)>>1);
	204	XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
	205	t0 = v0-q0;
	206	t1 = v1-q1;
	207	XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
	208	z1[0] = r0;
	209	z1[1] = i0;
	210	z2[0] = r1;
	211	z2[1] = i1;
	212	z1+=2;
	213	z2-=2;
	214	T+=2;
	215
	216	t0 = T[0]; t1 = T[1];
	217	v0 += (q0 = (t0-v0)>>1);
	218	v1 += (q1 = (t1-v1)>>1);
	219	XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
	220	v0 = t0-q0;
	221	v1 = t1-q1;
	222	XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
	223	z1[0] = r0;
	224	z1[1] = i0;
	225	z2[0] = r1;
	226	z2[1] = i1;
	227	z1+=2;
	228	z2-=2;
	229	V+=2;
	230	}
	231
	232	break;
	233	}
	234	}
	235	}
	236
	237	/**
	238	* Compute inverse MDCT of size N = 2^nbits
	239	* @param output N samples
	240	* @param input N/2 samples
	241	* "In-place" processing can be achieved provided that:
	242	* [0 .. N/2-1 \| N/2 .. N-1 ]
	243	* <----input---->
	244	* <-----------output----------->
	245	*
	246	*/
	247	void ff_imdct_calc(unsigned int nbits, fixed32 output, const fixed32 input)
	248	{
	249	const int n = (1<<nbits);
	250	const int n2 = (n>>1);
	251	const int n4 = (n>>2);
	252
	253	ff_imdct_half(nbits,output+n2,input);
	254
	255	/* reflect the half imdct into the full N samples */
	256	/* TODO: this could easily be optimised more! */
	257	fixed32 * in_r, * in_r2, * out_r, * out_r2;
	258
	259	out_r = output;
	260	out_r2 = output+n2-8;
	261	in_r = output+n2+n4-8;
	262	while(out_r<out_r2)
	263	{
	264	out_r[0] = -(out_r2[7] = in_r[7]);
	265	out_r[1] = -(out_r2[6] = in_r[6]);
	266	out_r[2] = -(out_r2[5] = in_r[5]);
	267	out_r[3] = -(out_r2[4] = in_r[4]);
	268	out_r[4] = -(out_r2[3] = in_r[3]);
	269	out_r[5] = -(out_r2[2] = in_r[2]);
	270	out_r[6] = -(out_r2[1] = in_r[1]);
	271	out_r[7] = -(out_r2[0] = in_r[0]);
	272	in_r -= 8;
	273	out_r += 8;
	274	out_r2 -= 8;
	275	}
	276
	277	in_r = output + n2+n4;
	278	in_r2 = output + n-4;
	279	out_r = output + n2;
	280	out_r2 = output + n2 + n4 - 4;
	281	while(in_r<in_r2)
	282	{
	283	register fixed32 t0,t1,t2,t3;
	284	register fixed32 s0,s1,s2,s3;
	285
	286	//simultaneously do the following things:
	287	// 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
	288	// 2. reflect range from [n2+n4 .. n-1] inplace
	289	//
	290	// [ \| ]
	291	// ^a -> <- ^b ^c -> <- ^d
	292	//
	293	// #1: copy from ^c to ^a
	294	// #2: copy from ^d to ^b
	295	// #3: swap ^c and ^d in place
	296	//
	297	// #1 pt1 : load 4 words from ^c.
	298	t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
	299	// #1 pt2 : write to ^a
	300	out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
	301	// #2 pt1 : load 4 words from ^d
	302	s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
	303	// #2 pt2 : write to ^b
	304	out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
	305	// #3 pt1 : write words from #2 to ^c
	306	in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
	307	// #3 pt2 : write words from #1 to ^d
	308	in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;
	309
	310	in_r += 4;
	311	in_r2 -= 4;
	312	out_r += 4;
	313	out_r2 -= 4;
	314	}
	315	}
	316
	317	static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */
	318
	319	/* Table of values of atan(2^-i) in 0.32 format fractions of pi where pi = 0xffffffff / 2 */
	320	static const unsigned long atan_table[] = {
	321	0x1fffffff, /* +0.785398163 (or pi/4) */
	322	0x12e4051d, /* +0.463647609 */
	323	0x09fb385b, /* +0.244978663 */
	324	0x051111d4, /* +0.124354995 */
	325	0x028b0d43, /* +0.062418810 */
	326	0x0145d7e1, /* +0.031239833 */
	327	0x00a2f61e, /* +0.015623729 */
	328	0x00517c55, /* +0.007812341 */
	329	0x0028be53, /* +0.003906230 */
	330	0x00145f2e, /* +0.001953123 */
	331	0x000a2f98, /* +0.000976562 */
	332	0x000517cc, /* +0.000488281 */
	333	0x00028be6, /* +0.000244141 */
	334	0x000145f3, /* +0.000122070 */
	335	0x0000a2f9, /* +0.000061035 */
	336	0x0000517c, /* +0.000030518 */
	337	0x000028be, /* +0.000015259 */
	338	0x0000145f, /* +0.000007629 */
	339	0x00000a2f, /* +0.000003815 */
	340	0x00000517, /* +0.000001907 */
	341	0x0000028b, /* +0.000000954 */
	342	0x00000145, /* +0.000000477 */
	343	0x000000a2, /* +0.000000238 */
	344	0x00000051, /* +0.000000119 */
	345	0x00000028, /* +0.000000060 */
	346	0x00000014, /* +0.000000030 */
	347	0x0000000a, /* +0.000000015 */
	348	0x00000005, /* +0.000000007 */
	349	0x00000002, /* +0.000000004 */
	350	0x00000001, /* +0.000000002 */
	351	0x00000000, /* +0.000000001 */
	352	0x00000000, /* +0.000000000 */
	353	};
	354
	355	/**
	356	* Implements sin and cos using CORDIC rotation.
	357	*
	358	* @param phase has range from 0 to 0xffffffff, representing 0 and
	359	* 2*pi respectively.
	360	* @param cos return address for cos
	361	* @return sin of phase, value is a signed value from LONG_MIN to LONG_MAX,
	362	* representing -1 and 1 respectively.
	363	*
	364	* Gives at least 24 bits precision (last 2-8 bits or so are probably off)
	365	*/
	366
	367	long fsincos(unsigned long phase, fixed32 *cos)
	368	{
	369	int32_t x, x1, y, y1;
	370	unsigned long z, z1;
	371	int i;
	372
	373	/* Setup initial vector */
	374	x = cordic_circular_gain;
	375	y = 0;
	376	z = phase;
	377
	378	/* The phase has to be somewhere between 0..pi for this to work right */
	379	if (z < 0xffffffff / 4) {
	380	/* z in first quadrant, z += pi/2 to correct */
	381	x = -x;
	382	z += 0xffffffff / 4;
	383	} else if (z < 3 * (0xffffffff / 4)) {
	384	/* z in third quadrant, z -= pi/2 to correct */
	385	z -= 0xffffffff / 4;
	386	} else {
	387	/* z in fourth quadrant, z -= 3pi/2 to correct */
	388	x = -x;
	389	z -= 3 * (0xffffffff / 4);
	390	}
	391
	392	/* Each iteration adds roughly 1-bit of extra precision */
	393	for (i = 0; i < 31; i++) {
	394	x1 = x >> i;
	395	y1 = y >> i;
	396	z1 = atan_table[i];
	397
	398	/* Decided which direction to rotate vector. Pivot point is pi/2 */
	399	if (z >= 0xffffffff / 4) {
	400	x -= y1;
	401	y += x1;
	402	z -= z1;
	403	} else {
	404	x += y1;
	405	y -= x1;
	406	z += z1;
	407	}
	408	}
	409
	410	if (cos)
	411	*cos = x;
	412
	413	return y;
	414	}