path: root/apps/codecs/lib/mdct.c
Diffstat (limited to 'apps/codecs/lib/mdct.c')
-rw-r--r--  apps/codecs/lib/mdct.c  644
1 file changed, 0 insertions, 644 deletions
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c
deleted file mode 100644
index 777aec4a55..0000000000
--- a/apps/codecs/lib/mdct.c
+++ /dev/null
@@ -1,644 +0,0 @@
1/*
2 * Fixed Point IMDCT
3 * Copyright (c) 2002 The FFmpeg Project.
4 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include "codeclib.h"
22#include "mdct.h"
23#include "codeclib_misc.h"
24#include "mdct_lookup.h"
25
26#ifndef ICODE_ATTR_TREMOR_MDCT
27#define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR
28#endif
29
30/**
31 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
32 * thus excluding the parts that can be derived by symmetry.
33 * @param output N/2 samples
34 * @param input N/2 samples
35 *
36 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
37 * not overlap or intersect at all)
38 */
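/* Typical use (illustrative sketch only, not code from this file): for a
 * 2048-point MDCT, nbits is 11 and each buffer holds N/2 = 1024 samples:
 *
 *     fixed32 in[1024], out[1024];
 *     ff_imdct_half(11, out, in);   // in/out must not overlap, see NOTE above
 */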
39void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
40void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
41{
42 int n8, n4, n2, n, j;
43 const fixed32 *in1, *in2;
44 (void)j;
45 n = 1 << nbits;
46
47 n2 = n >> 1;
48 n4 = n >> 2;
49 n8 = n >> 3;
50
51 FFTComplex *z = (FFTComplex *)output;
52
53 /* pre rotation */
54 in1 = input;
55 in2 = input + n2 - 1;
56
57    /* revtab comes from the fft; the revtab table is sized for an N=4096 fft, i.e. 2^12.
58       Our fft is of size N/4, i.e. nbits-2 bits, so the shift needs to be 12-(nbits-2) = 14-nbits. */
59    const int revtab_shift = (14 - nbits);
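    /* Worked example (for illustration): nbits = 11 means N = 2048 and an
       N/4 = 512-point fft (nbits-2 = 9 bits), so revtab_shift = 12 - 9 = 3,
       i.e. each 12-bit revtab entry is shifted down to a 9-bit index. */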
60
61 /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
62 /* (note that when using the current split radix, the bitreverse ordering is
63 complex, meaning that this reordering cannot easily be done in-place) */
64    /* Using the following pdf, you can see that it is possible to replace
65       the 'classic' pre/post rotation with an alternative one that enables
66       us to use fewer distinct twiddle factors.
67 http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf
68
69 For prerotation, the factors are just sin,cos(2PI*i/N)
70 For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
71
72 Therefore, prerotation can immediately reuse the same twiddles as fft
73       (for postrotation it's still a bit more involved: we reuse the fft trig tables
74 where we can, or a special table for N=2048, or interpolate between
75 trig tables for N>2048)
76 */
77 const int32_t *T = sincos_lookup0;
78 const int step = 2<<(12-nbits);
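    /* For illustration: the sincos table appears to be laid out at N = 4096
       resolution (an assumption consistent with step = 2<<(12-nbits)), so step
       simply strides across it: nbits = 12 -> step = 2 (consecutive cos/sin
       pairs), nbits = 11 -> step = 4, nbits = 10 -> step = 8, and so on. */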
79 const uint16_t * p_revtab=revtab;
80 {
81 const uint16_t * const p_revtab_end = p_revtab + n8;
82#ifdef CPU_COLDFIRE
83 asm volatile ("move.l (%[in2]), %%d0\n\t"
84 "move.l (%[in1]), %%d1\n\t"
85 "bra.s 1f\n\t"
86 "0:\n\t"
87 "movem.l (%[T]), %%d2-%%d3\n\t"
88
89 "addq.l #8, %[in1]\n\t"
90 "subq.l #8, %[in2]\n\t"
91
92 "lea (%[step]*4, %[T]), %[T]\n\t"
93
94 "mac.l %%d0, %%d3, (%[T]), %%d4, %%acc0;"
95 "msac.l %%d1, %%d2, (4, %[T]), %%d5, %%acc0;"
96 "mac.l %%d1, %%d3, (%[in1]), %%d1, %%acc1;"
97 "mac.l %%d0, %%d2, (%[in2]), %%d0, %%acc1;"
98
99 "addq.l #8, %[in1]\n\t"
100 "subq.l #8, %[in2]\n\t"
101
102 "mac.l %%d0, %%d5, %%acc2;"
103 "msac.l %%d1, %%d4, (%[p_revtab])+, %%d2, %%acc2;"
104 "mac.l %%d1, %%d5, (%[in1]), %%d1, %%acc3;"
105 "mac.l %%d0, %%d4, (%[in2]), %%d0, %%acc3;"
106
107 "clr.l %%d3\n\t"
108 "move.w %%d2, %%d3\n\t"
109 "eor.l %%d3, %%d2\n\t"
110 "swap %%d2\n\t"
111 "lsr.l %[revtab_shift], %%d2\n\t"
112
113 "movclr.l %%acc0, %%d4;"
114 "movclr.l %%acc1, %%d5;"
115 "lsl.l #3, %%d2\n\t"
116 "lea (%%d2, %[z]), %%a1\n\t"
117 "movem.l %%d4-%%d5, (%%a1)\n\t"
118
119 "lsr.l %[revtab_shift], %%d3\n\t"
120
121 "movclr.l %%acc2, %%d4;"
122 "movclr.l %%acc3, %%d5;"
123 "lsl.l #3, %%d3\n\t"
124 "lea (%%d3, %[z]), %%a1\n\t"
125 "movem.l %%d4-%%d5, (%%a1)\n\t"
126
127 "lea (%[step]*4, %[T]), %[T]\n\t"
128
129 "1:\n\t"
130 "cmp.l %[p_revtab_end], %[p_revtab]\n\t"
131 "bcs.s 0b\n\t"
132 : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T),
133 [p_revtab] "+a" (p_revtab)
134 : [z] "a" (z), [step] "d" (step), [revtab_shift] "d" (revtab_shift),
135 [p_revtab_end] "r" (p_revtab_end)
136 : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory");
137#else
138 while(LIKELY(p_revtab < p_revtab_end))
139 {
140 j = (*p_revtab)>>revtab_shift;
141 XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
142 T += step;
143 in1 += 2;
144 in2 -= 2;
145 p_revtab++;
146 j = (*p_revtab)>>revtab_shift;
147 XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
148 T += step;
149 in1 += 2;
150 in2 -= 2;
151 p_revtab++;
152 }
153#endif
154 }
155 {
156 const uint16_t * const p_revtab_end = p_revtab + n8;
157#ifdef CPU_COLDFIRE
158 asm volatile ("move.l (%[in2]), %%d0\n\t"
159 "move.l (%[in1]), %%d1\n\t"
160 "bra.s 1f\n\t"
161 "0:\n\t"
162 "movem.l (%[T]), %%d2-%%d3\n\t"
163
164 "addq.l #8, %[in1]\n\t"
165 "subq.l #8, %[in2]\n\t"
166
167 "lea (%[step]*4, %[T]), %[T]\n\t"
168
169 "mac.l %%d0, %%d2, (%[T]), %%d4, %%acc0;"
170 "msac.l %%d1, %%d3, (4, %[T]), %%d5, %%acc0;"
171 "mac.l %%d1, %%d2, (%[in1]), %%d1, %%acc1;"
172 "mac.l %%d0, %%d3, (%[in2]), %%d0, %%acc1;"
173
174 "addq.l #8, %[in1]\n\t"
175 "subq.l #8, %[in2]\n\t"
176
177 "mac.l %%d0, %%d4, %%acc2;"
178 "msac.l %%d1, %%d5, (%[p_revtab])+, %%d2, %%acc2;"
179 "mac.l %%d1, %%d4, (%[in1]), %%d1, %%acc3;"
180 "mac.l %%d0, %%d5, (%[in2]), %%d0, %%acc3;"
181
182 "clr.l %%d3\n\t"
183 "move.w %%d2, %%d3\n\t"
184 "eor.l %%d3, %%d2\n\t"
185 "swap %%d2\n\t"
186 "lsr.l %[revtab_shift], %%d2\n\t"
187
188 "movclr.l %%acc0, %%d4;"
189 "movclr.l %%acc1, %%d5;"
190 "lsl.l #3, %%d2\n\t"
191 "lea (%%d2, %[z]), %%a1\n\t"
192 "movem.l %%d4-%%d5, (%%a1)\n\t"
193
194 "lsr.l %[revtab_shift], %%d3\n\t"
195
196 "movclr.l %%acc2, %%d4;"
197 "movclr.l %%acc3, %%d5;"
198 "lsl.l #3, %%d3\n\t"
199 "lea (%%d3, %[z]), %%a1\n\t"
200 "movem.l %%d4-%%d5, (%%a1)\n\t"
201
202 "lea (%[step]*4, %[T]), %[T]\n\t"
203
204 "1:\n\t"
205 "cmp.l %[p_revtab_end], %[p_revtab]\n\t"
206 "bcs.s 0b\n\t"
207 : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T),
208 [p_revtab] "+a" (p_revtab)
209 : [z] "a" (z), [step] "d" (-step), [revtab_shift] "d" (revtab_shift),
210 [p_revtab_end] "r" (p_revtab_end)
211 : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory");
212#else
213 while(LIKELY(p_revtab < p_revtab_end))
214 {
215 j = (*p_revtab)>>revtab_shift;
216 XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
217 T -= step;
218 in1 += 2;
219 in2 -= 2;
220 p_revtab++;
221 j = (*p_revtab)>>revtab_shift;
222 XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
223 T -= step;
224 in1 += 2;
225 in2 -= 2;
226 p_revtab++;
227 }
228#endif
229 }
230
231
232 /* ... and so fft runs in OUTPUT buffer */
233 ff_fft_calc_c(nbits-2, z);
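    /* For illustration: the fft works on n4 complex values, i.e. an
       (nbits-2)-bit transform -- e.g. nbits = 11 gives a 512-point complex
       fft, run in place on the OUTPUT buffer. */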
234
235    /* post rotation + reordering; the result now stays within the OUTPUT buffer */
236 switch( nbits )
237 {
238 default:
239 {
240 fixed32 * z1 = (fixed32 *)(&z[0]);
241 int magic_step = step>>2;
242 int newstep;
243 if(n<=1024)
244 {
245 T = sincos_lookup0 + magic_step;
246 newstep = step>>1;
247 }
248 else
249 {
250 T = sincos_lookup1;
251 newstep = 2;
252 }
253
254#ifdef CPU_COLDFIRE
255 fixed32 * z2 = (fixed32 *)(&z[n4]);
256 int c = n4;
257 if (newstep == 2)
258 {
259 asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
260 "addq.l #8, %[z1]\n\t"
261 "movem.l (%[T]), %%d2-%%d3\n\t"
262 "addq.l #8, %[T]\n\t"
263 "bra.s 1f\n\t"
264 "0:\n\t"
265 "msac.l %%d1, %%d2, (%[T])+, %%a3, %%acc0\n\t"
266 "mac.l %%d0, %%d3, (%[T])+, %%a4, %%acc0\n\t"
267
268 "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t"
269 "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t"
270
271 "msac.l %%d1, %%a4, (%[T])+, %%d2, %%acc2\n\t"
272 "mac.l %%d0, %%a3, (%[T])+, %%d3, %%acc2\n\t"
273 "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t"
274 "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t"
275
276 "movclr.l %%acc0, %%a3\n\t"
277 "movclr.l %%acc3, %%a4\n\t"
278 "movem.l %%a3-%%a4, (-16, %[z1])\n\t"
279
280 "movclr.l %%acc1, %%a4\n\t"
281 "movclr.l %%acc2, %%a3\n\t"
282 "movem.l %%a3-%%a4, (%[z2])\n\t"
283
284 "subq.l #2, %[n]\n\t"
285 "1:\n\t"
286 "bhi.s 0b\n\t"
287 : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c)
288 :
289 : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory");
290 }
291 else
292 {
293 asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
294 "addq.l #8, %[z1]\n\t"
295 "movem.l (%[T]), %%d2-%%d3\n\t"
296 "lea (%[newstep]*4, %[T]), %[T]\n\t"
297 "bra.s 1f\n\t"
298 "0:\n\t"
299 "msac.l %%d1, %%d2, (%[T]), %%a3, %%acc0\n\t"
300 "mac.l %%d0, %%d3, (4, %[T]), %%a4, %%acc0\n\t"
301 "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t"
302 "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t"
303
304 "lea (%[newstep]*4, %[T]), %[T]\n\t"
305 "msac.l %%d1, %%a4, (%[T]), %%d2, %%acc2\n\t"
306 "mac.l %%d0, %%a3, (4, %[T]), %%d3, %%acc2\n\t"
307 "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t"
308 "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t"
309
310 "lea (%[newstep]*4, %[T]), %[T]\n\t"
311
312 "movclr.l %%acc0, %%a3\n\t"
313 "movclr.l %%acc3, %%a4\n\t"
314 "movem.l %%a3-%%a4, (-16, %[z1])\n\t"
315
316 "movclr.l %%acc1, %%a4\n\t"
317 "movclr.l %%acc2, %%a3\n\t"
318 "movem.l %%a3-%%a4, (%[z2])\n\t"
319
320 "subq.l #2, %[n]\n\t"
321 "1:\n\t"
322 "bhi.s 0b\n\t"
323 : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c)
324 : [newstep] "d" (newstep)
325 : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory");
326 }
327#else
328 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
329 while(z1<z2)
330 {
331 fixed32 r0,i0,r1,i1;
332 XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
333 XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
334 z1[0] = -r0;
335 z1[1] = -i0;
336 z2[0] = -r1;
337 z2[1] = -i1;
338 z1+=2;
339 z2-=2;
340 }
341#endif
342 break;
343 }
344
345 case 12: /* n=4096 */
346 {
347 /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
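            /* Sketch of the idea (assuming, as the 50:50 blend suggests, that
               sincos_lookup1 holds angles half a step past sincos_lookup0):
               t0 = (T[0]>>1) + (V[0]>>1) below is the fixed-point average of
               the two tables, approximating the intermediate twiddle angles
               this transform size needs. */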
348 const int32_t * V = sincos_lookup1;
349 T = sincos_lookup0;
350 int32_t t0,t1,v0,v1;
351 fixed32 * z1 = (fixed32 *)(&z[0]);
352 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
353
354 t0 = T[0]>>1; t1=T[1]>>1;
355
356 while(z1<z2)
357 {
358 fixed32 r0,i0,r1,i1;
359 t0 += (v0 = (V[0]>>1));
360 t1 += (v1 = (V[1]>>1));
361 XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
362 T+=2;
363 v0 += (t0 = (T[0]>>1));
364 v1 += (t1 = (T[1]>>1));
365 XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
366 z1[0] = -r0;
367 z1[1] = -i0;
368 z2[0] = -r1;
369 z2[1] = -i1;
370 z1+=2;
371 z2-=2;
372 V+=2;
373 }
374
375 break;
376 }
377
378 case 13: /* n = 8192 */
379 {
380            /* weighted linear interpolation between sincos_lookup0 and sincos_lookup1
381 specifically: 25:75 for first twiddle and 75:25 for second twiddle */
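            /* Note (illustrative): t0 += (v0 - t0) >> 1 below is, up to
               rounding, t0 = (t0 + v0) / 2; together with the half-step offset
               between the two tables (same assumption as in the n=4096 case)
               this produces the finer-grained twiddle angles N = 8192 needs. */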
382 const int32_t * V = sincos_lookup1;
383 T = sincos_lookup0;
384 int32_t t0,t1,v0,v1,q0,q1;
385 fixed32 * z1 = (fixed32 *)(&z[0]);
386 fixed32 * z2 = (fixed32 *)(&z[n4-1]);
387
388 t0 = T[0]; t1=T[1];
389
390 while(z1<z2)
391 {
392 fixed32 r0,i0,r1,i1;
393 v0 = V[0]; v1 = V[1];
394 t0 += (q0 = (v0-t0)>>1);
395 t1 += (q1 = (v1-t1)>>1);
396 XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
397 t0 = v0-q0;
398 t1 = v1-q1;
399 XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
400 z1[0] = -r0;
401 z1[1] = -i0;
402 z2[0] = -r1;
403 z2[1] = -i1;
404 z1+=2;
405 z2-=2;
406 T+=2;
407
408 t0 = T[0]; t1 = T[1];
409 v0 += (q0 = (t0-v0)>>1);
410 v1 += (q1 = (t1-v1)>>1);
411 XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
412 v0 = t0-q0;
413 v1 = t1-q1;
414 XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
415 z1[0] = -r0;
416 z1[1] = -i0;
417 z2[0] = -r1;
418 z2[1] = -i1;
419 z1+=2;
420 z2-=2;
421 V+=2;
422 }
423
424 break;
425 }
426 }
427}
428
429/**
430 * Compute inverse MDCT of size N = 2^nbits
431 * @param output N samples
432 * @param input N/2 samples
433 * "In-place" processing can be achieved provided that:
434 * [0 .. N/2-1 | N/2 .. N-1 ]
435 * <----input---->
436 * <-----------output----------->
437 *
438 * The result of ff_imdct_half is to put the 'half' imdct here
439 *
440 * N/2 N-1
441 * <--half imdct-->
442 *
443 * We want it here for the full imdct:
444 * N/4 3N/4-1
445 * <-------------->
446 *
447 * In addition we need to apply two symmetries to get the full imdct:
448 *
449 * <AAAAAA> <DDDDDD>
450 * <BBBBBB><CCCCCC>
451 *
452 * D is a reflection of C
453 * A is a reflection of B (but with sign flipped)
454 *
455 * We process the symmetries at the same time as we 'move' the half imdct
456 * from [N/2,N-1] to [N/4,3N/4-1]
457 *
458 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
459 * This would require being able to use revtab 'inplace' (since the input
460 * and output of imdct_half would then overlap somewhat)
461 */
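/* In index form (a sketch of the intent, for 0 <= i < N/4): once BBBB has been
 * moved into place, the two symmetries described above amount to
 *
 *     output[i]         = -output[N/2 - 1 - i];    A is B reflected, sign-flipped
 *     output[N - 1 - i] =  output[N/2 + i];        D is C reflected
 *
 * The loops below realise this with block copies and in-place swaps.
 */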
462void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
463#ifndef CPU_ARM
464void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
465{
466 const int n = (1<<nbits);
467 const int n2 = (n>>1);
468 const int n4 = (n>>2);
469
470    /* tell imdct_half to put its output in [N/2..N-1], i.e. at output+n2 */
471 ff_imdct_half(nbits,output+n2,input);
472
473 fixed32 * in_r, * in_r2, * out_r, * out_r2;
474
475 /* Copy BBBB to AAAA, reflected and sign-flipped.
476 Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
477 out_r = output;
478 out_r2 = output+n2-8;
479 in_r = output+n2+n4-8;
480 while(out_r<out_r2)
481 {
482#if defined CPU_COLDFIRE
483 asm volatile(
484 "movem.l (%[in_r]), %%d0-%%d7\n\t"
485 "movem.l %%d0-%%d7, (%[out_r2])\n\t"
486 "neg.l %%d7\n\t"
487 "move.l %%d7, (%[out_r])+\n\t"
488 "neg.l %%d6\n\t"
489 "move.l %%d6, (%[out_r])+\n\t"
490 "neg.l %%d5\n\t"
491 "move.l %%d5, (%[out_r])+\n\t"
492 "neg.l %%d4\n\t"
493 "move.l %%d4, (%[out_r])+\n\t"
494 "neg.l %%d3\n\t"
495 "move.l %%d3, (%[out_r])+\n\t"
496 "neg.l %%d2\n\t"
497 "move.l %%d2, (%[out_r])+\n\t"
498 "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
499 "neg.l %%d1\n\t"
500 "move.l %%d1, (%[out_r])+\n\t"
501 "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
502 "neg.l %%d0\n\t"
503 "move.l %%d0, (%[out_r])+\n\t"
504 : [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
505 :
506 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
507#else
508 out_r[0] = -(out_r2[7] = in_r[7]);
509 out_r[1] = -(out_r2[6] = in_r[6]);
510 out_r[2] = -(out_r2[5] = in_r[5]);
511 out_r[3] = -(out_r2[4] = in_r[4]);
512 out_r[4] = -(out_r2[3] = in_r[3]);
513 out_r[5] = -(out_r2[2] = in_r[2]);
514 out_r[6] = -(out_r2[1] = in_r[1]);
515 out_r[7] = -(out_r2[0] = in_r[0]);
516 in_r -= 8;
517 out_r += 8;
518 out_r2 -= 8;
519#endif
520 }
521 in_r = output + n2+n4;
522 in_r2 = output + n-4;
523 out_r = output + n2;
524 out_r2 = output + n2 + n4 - 4;
525 while(in_r<in_r2)
526 {
527#if defined CPU_COLDFIRE
528 asm volatile(
529 "movem.l (%[in_r]), %%d0-%%d3\n\t"
530 "movem.l %%d0-%%d3, (%[out_r])\n\t"
531 "movem.l (%[in_r2]), %%d4-%%d7\n\t"
532 "movem.l %%d4-%%d7, (%[out_r2])\n\t"
533 "move.l %%d0, %%a3\n\t"
534 "move.l %%d3, %%d0\n\t"
535 "move.l %%d1, %%d3\n\t"
536 "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
537 "move.l %%d7, %%d1\n\t"
538 "move.l %%d6, %%d2\n\t"
539 "move.l %%d5, %%d3\n\t"
540 "movem.l %%d1-%%d4, (%[in_r])\n\t"
541 "lea.l (4*4, %[in_r]), %[in_r]\n\t"
542 "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
543 "lea.l (4*4, %[out_r]), %[out_r]\n\t"
544 "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
545 : [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
546 [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
547 :
548 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
549#else
550 register fixed32 t0,t1,t2,t3;
551 register fixed32 s0,s1,s2,s3;
552
553    /* Copy and reflect CCCC to DDDD. Because CCCC is already where
554       we actually want to put DDDD, this is a bit complicated.
555 * So simultaneously do the following things:
556 * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
557 * 2. reflect range from [n2+n4 .. n-1] inplace
558 *
559 * [ | ]
560 * ^a -> <- ^b ^c -> <- ^d
561 *
562 * #1: copy from ^c to ^a
563 * #2: copy from ^d to ^b
564 * #3: swap ^c and ^d in place
565 */
566 /* #1 pt1 : load 4 words from ^c. */
567 t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
568 /* #1 pt2 : write to ^a */
569 out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
570 /* #2 pt1 : load 4 words from ^d */
571 s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
572 /* #2 pt2 : write to ^b */
573 out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
574 /* #3 pt1 : write words from #2 to ^c */
575 in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
576 /* #3 pt2 : write words from #1 to ^d */
577 in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;
578
579 in_r += 4;
580 in_r2 -= 4;
581 out_r += 4;
582 out_r2 -= 4;
583#endif
584 }
585}
586#else
587/* Follows the same structure as the canonical version above */
588void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
589{
590 const int n = (1<<nbits);
591 const int n2 = (n>>1);
592 const int n4 = (n>>2);
593
594 ff_imdct_half(nbits,output+n2,input);
595
596 fixed32 * in_r, * in_r2, * out_r, * out_r2;
597
598 out_r = output;
599 out_r2 = output+n2;
600 in_r = output+n2+n4;
601 while(out_r<out_r2)
602 {
603 asm volatile(
604 "ldmdb %[in_r]!, {r0-r7}\n\t"
605 "stmdb %[out_r2]!, {r0-r7}\n\t"
606 "rsb r8,r0,#0\n\t"
607 "rsb r0,r7,#0\n\t"
608 "rsb r7,r1,#0\n\t"
609 "rsb r1,r6,#0\n\t"
610 "rsb r6,r2,#0\n\t"
611 "rsb r2,r5,#0\n\t"
612 "rsb r5,r3,#0\n\t"
613 "rsb r3,r4,#0\n\t"
614 "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
615 : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
616 :
617 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
618 }
619 in_r = output + n2+n4;
620 in_r2 = output + n;
621 out_r = output + n2;
622 out_r2 = output + n2 + n4;
623 while(in_r<in_r2)
624 {
625 asm volatile(
626 "ldmia %[in_r], {r0-r3}\n\t"
627 "stmia %[out_r]!, {r0-r3}\n\t"
628 "ldmdb %[in_r2], {r5-r8}\n\t"
629 "stmdb %[out_r2]!, {r5-r8}\n\t"
630 "mov r4,r0\n\t"
631 "mov r0,r3\n\t"
632 "mov r3,r1\n\t"
633 "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
634 "mov r4,r8\n\t"
635 "mov r8,r5\n\t"
636 "mov r5,r7\n\t"
637 "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
638 :
639 [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
640 :
641 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
642 }
643}
644#endif