Patch by Mohamed Tarek from FS#10182 - convert codec to fixed-point using patches submitted to the ffmpeg mailing list in 2007 by Ian Braithwaite.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@20901 a1c6a512-1295-4272-9138-f99709370657
author: Dave Chapman <dave@dchapman.com> 2009-05-10 22:26:02 +0000
committer: Dave Chapman <dave@dchapman.com> 2009-05-10 22:26:02 +0000
commit: fc28cb4ed5adf4a0bc548af38ca6de95bbf027e5 (patch)
tree: 5902a50d7efe84a5f3e6270c464dbf54572a744a /apps/codecs/libcook/cook_fixp_mdct.h
parent: 3a0a9915eb802d558c0399d17a5ac045934d6be1 (diff)
download: rockbox-fc28cb4ed5adf4a0bc548af38ca6de95bbf027e5.tar.gz
rockbox-fc28cb4ed5adf4a0bc548af38ca6de95bbf027e5.zip
1 files changed, 545 insertions, 0 deletions
diff --git a/apps/codecs/libcook/cook_fixp_mdct.h b/apps/codecs/libcook/cook_fixp_mdct.h
new file mode 100644
index 0000000000..dcd6d96227
--- /dev/null
+++ b/apps/codecs/libcook/cook_fixp_mdct.h
@@ -0,0 +1,545 @@
+/*
+ * The following (normalized modified discrete cosine transform)
+ * is taken from the OggVorbis 'TREMOR' source code.
+ *
+ * It has been modified for the ffmpeg cook fixed point decoder.
+ */
+/********************************************************************
+ *                                                                  *
+ * THE OggVorbis 'TREMOR' SOURCE CODE IS (C) COPYRIGHT 1994-2002    *
+ * BY THE Xiph.Org FOUNDATION http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *********************************************************************
+ function: normalized modified discrete cosine transform
+           power of two length transform only [64 <= n ]
+ last mod: $Id: mdct.c 14281 2004-12-30 12:11:32Z henry $
+ Original algorithm adapted long ago from _The use of multirate filter
+ banks for coding of high quality digital audio_, by T. Sporer,
+ K. Brandenburg and B. Edler, collection of the European Signal
+ Processing Conference (EUSIPCO), Amsterdam, June 1992, Vol.1, pp
+ 211-214
+ The below code implements an algorithm that no longer looks much like
+ that presented in the paper, but the basic structure remains if you
+ dig deep enough to see it.
+ This module DOES NOT INCLUDE code to generate/apply the window
+ function.  Everybody has their own weird favorite including me... I
+ happen to like the properties of y=sin(.5PI*sin^2(x)), but others may
+ vehemently disagree.
+ ********************************************************************/
+#define STIN static inline /* shorthand used to declare the transform helpers below */
+typedef int32_t ogg_int32_t; /* 32-bit signed type name kept from the Tremor source */
+#define DATA_TYPE ogg_int32_t /* element type of the buffers the transform works on */
+#define REG_TYPE  register ogg_int32_t /* type of the scratch temporaries (r0, r1, ...) */
+#define LOOKUP_T const uint16_t /* element type through which sincos_lookup is read */
+static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
+  return fixp_mult_su(x, y) >> 1; /* the result of fixp_mult_su(), which is
+                                   * what MULT31() returns, shifted right
+                                   * by one bit.  fixp_mult_su() is defined
+                                   * outside this header.
+                                   * NOTE(review): presumably a signed by
+                                   * unsigned fixed-point multiply; confirm
+                                   * its scaling and return type. */
+}
+static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
+  return fixp_mult_su(x, y); /* plain wrapper: forwards both operands to
+                              * fixp_mult_su(), which is defined outside
+                              * this header, and returns its result.
+                              * NOTE(review): the coefficient y is usually
+                              * a LOOKUP_T table entry at the call sites;
+                              * confirm the operand order the helper
+                              * expects. */
+}
+/*
+ * This should be used as a memory barrier, forcing all cached values in
+ * registers to be written back to memory.  Might or might not be beneficial
+ * depending on the architecture and compiler.
+ *
+ * In this header the macro expands to nothing, so every MB() call below is
+ * only a marker and generates no code.
+ */
+#define MB()
+/*
+ * The XPROD functions are meant to optimize the cross products found all
+ * over the place in mdct.c by forcing memory operation ordering to avoid
+ * unnecessary register reloads as soon as memory is being written to.
+ * However this is only beneficial on CPUs with a sane number of general
+ * purpose registers which excludes the Intel x86.  On Intel, better let the
+ * compiler actually reload registers directly from original memory by using
+ * macros.
+ *
+ * Every form takes two inputs (a, b), two coefficients (t, v) and two
+ * output pointers (x, y) and stores:
+ *   XPROD32, XPROD31:  *x = a*t + b*v   and   *y = b*t - a*v
+ *   XNPROD31:          *x = a*t - b*v   and   *y = b*t + a*v
+ * where each product is MULT32() in XPROD32 and MULT31() in the *31 forms.
+ *
+ * NOTE(review): the macro forms expand to a bare brace block and mention
+ * each of _a, _b, _t and _v twice, so their arguments must be free of side
+ * effects.
+ */
+#ifdef __i386__
+#define XPROD32(_a, _b, _t, _v, _x, _y)         \
+  { *(_x)=MULT32(_a,_t)+MULT32(_b,_v);          \
+    *(_y)=MULT32(_b,_t)-MULT32(_a,_v); }
+#define XPROD31(_a, _b, _t, _v, _x, _y)         \
+  { *(_x)=MULT31(_a,_t)+MULT31(_b,_v);          \
+    *(_y)=MULT31(_b,_t)-MULT31(_a,_v); }
+#define XNPROD31(_a, _b, _t, _v, _x, _y)        \
+  { *(_x)=MULT31(_a,_t)-MULT31(_b,_v);          \
+    *(_y)=MULT31(_b,_t)+MULT31(_a,_v); }
+#else
+static inline void XPROD32(ogg_int32_t  a, ogg_int32_t  b,
+                           ogg_int32_t  t, ogg_int32_t  v,
+                           ogg_int32_t *x, ogg_int32_t *y)
+{ /* function form; a and b are copies, so x or y may alias the caller's inputs */
+  *x = MULT32(a, t) + MULT32(b, v);
+  *y = MULT32(b, t) - MULT32(a, v);
+}
+static inline void XPROD31(ogg_int32_t  a, ogg_int32_t  b,
+                           ogg_int32_t  t, ogg_int32_t  v,
+                           ogg_int32_t *x, ogg_int32_t *y)
+{ /* same shape as XPROD32, built from MULT31 products */
+  *x = MULT31(a, t) + MULT31(b, v);
+  *y = MULT31(b, t) - MULT31(a, v);
+}
+static inline void XNPROD31(ogg_int32_t  a, ogg_int32_t  b,
+                            ogg_int32_t  t, ogg_int32_t  v,
+                            ogg_int32_t *x, ogg_int32_t *y)
+{ /* XPROD31 with the signs of the two b*v / a*v terms flipped */
+  *x = MULT31(a, t) - MULT31(b, v);
+  *y = MULT31(b, t) + MULT31(a, v);
+}
+#endif
+/* 8 point butterfly (in place)
+ *
+ * x must point at 8 readable and writable elements.  All eight inputs are
+ * first combined into the temporaries r0..r7 (sum and difference of each
+ * x[k+4], x[k] pair) and only then is x overwritten, which is what makes
+ * the in-place update safe.  Only additions and subtractions are used; no
+ * table lookups or multiplies.
+ */
+STIN void mdct_butterfly_8(DATA_TYPE *x){
+  REG_TYPE r0   = x[4] + x[0];
+  REG_TYPE r1   = x[4] - x[0];
+  REG_TYPE r2   = x[5] + x[1];
+  REG_TYPE r3   = x[5] - x[1];
+  REG_TYPE r4   = x[6] + x[2];
+  REG_TYPE r5   = x[6] - x[2];
+  REG_TYPE r6   = x[7] + x[3];
+  REG_TYPE r7   = x[7] - x[3];
+           x[0] = r5   + r3;
+           x[1] = r7   - r1;
+           x[2] = r5   - r3;
+           x[3] = r7   + r1;
+           x[4] = r4   - r0;
+           x[5] = r6   - r2;
+           x[6] = r4   + r0;
+           x[7] = r6   + r2;
+           MB();
+}
+/* 16 point butterfly (in place, 4 register)
+ *
+ * x must point at 16 elements.  Each element of the lower half x[0..7] is
+ * paired with the element 8 positions above it: the sum is accumulated
+ * into the upper half x[8..15] and a difference-derived value is stored in
+ * the lower half.  For pairs 0/1 and 4/5 the differences are combined and
+ * scaled by cPI2_8 through MULT31; for pairs 2/3 and 6/7 they are stored
+ * directly (swapped for 2/3).  Both halves are then finished with
+ * mdct_butterfly_8().  cPI2_8 is defined outside this header.
+ */
+STIN void mdct_butterfly_16(DATA_TYPE *x){
+  REG_TYPE r0, r1;
+           r0 = x[ 0] - x[ 8]; x[ 8] += x[ 0];
+           r1 = x[ 1] - x[ 9]; x[ 9] += x[ 1];
+           x[ 0] = MULT31((r0 + r1) , cPI2_8);
+           x[ 1] = MULT31((r1 - r0) , cPI2_8);
+           MB();
+           r0 = x[10] - x[ 2]; x[10] += x[ 2];
+           r1 = x[ 3] - x[11]; x[11] += x[ 3];
+           x[ 2] = r1; x[ 3] = r0; /* note the swap: r1 goes to x[2], r0 to x[3] */
+           MB();
+           r0 = x[12] - x[ 4]; x[12] += x[ 4];
+           r1 = x[13] - x[ 5]; x[13] += x[ 5];
+           x[ 4] = MULT31((r0 - r1) , cPI2_8);
+           x[ 5] = MULT31((r0 + r1) , cPI2_8);
+           MB();
+           r0 = x[14] - x[ 6]; x[14] += x[ 6];
+           r1 = x[15] - x[ 7]; x[15] += x[ 7];
+           x[ 6] = r0; x[ 7] = r1;
+           MB();
+           mdct_butterfly_8(x);
+           mdct_butterfly_8(x+8);
+}
+/* 32 point butterfly (in place, 4 register)
+ *
+ * x must point at 32 elements.  Each element of the lower half x[0..15] is
+ * paired with the element 16 positions above it, two pairs at a time,
+ * working from the top pair (14/15) down to the bottom pair (0/1).  The
+ * sums are accumulated into the upper half x[16..31]; the two differences
+ * r0, r1 of each step are written back to the lower half either directly,
+ * scaled by cPI2_8, or rotated with XPROD31 / XNPROD31 using the constants
+ * cPI1_8 and cPI3_8 (all three constants are defined outside this header).
+ * Both halves are then finished with mdct_butterfly_16().
+ */
+STIN void mdct_butterfly_32(DATA_TYPE *x){
+  REG_TYPE r0, r1;
+           r0 = x[30] - x[14]; x[30] += x[14];
+           r1 = x[31] - x[15]; x[31] += x[15];
+           x[14] = r0; x[15] = r1;
+           MB();
+           r0 = x[28] - x[12]; x[28] += x[12];
+           r1 = x[29] - x[13]; x[29] += x[13];
+           XNPROD31( r0, r1, cPI1_8, cPI3_8, &x[12], &x[13] );
+           MB();
+           r0 = x[26] - x[10]; x[26] += x[10];
+           r1 = x[27] - x[11]; x[27] += x[11];
+           x[10] = MULT31((r0 - r1) , cPI2_8);
+           x[11] = MULT31((r0 + r1) , cPI2_8);
+           MB();
+           r0 = x[24] - x[ 8]; x[24] += x[ 8];
+           r1 = x[25] - x[ 9]; x[25] += x[ 9];
+           XNPROD31( r0, r1, cPI3_8, cPI1_8, &x[ 8], &x[ 9] );
+           MB();
+           r0 = x[22] - x[ 6]; x[22] += x[ 6];
+           r1 = x[ 7] - x[23]; x[23] += x[ 7];
+           x[ 6] = r1; x[ 7] = r0; /* note the swap: r1 goes to x[6], r0 to x[7] */
+           MB();
+           r0 = x[ 4] - x[20]; x[20] += x[ 4];
+           r1 = x[ 5] - x[21]; x[21] += x[ 5];
+           XPROD31 ( r0, r1, cPI3_8, cPI1_8, &x[ 4], &x[ 5] );
+           MB();
+           r0 = x[ 2] - x[18]; x[18] += x[ 2];
+           r1 = x[ 3] - x[19]; x[19] += x[ 3];
+           x[ 2] = MULT31((r1 + r0) , cPI2_8);
+           x[ 3] = MULT31((r1 - r0) , cPI2_8);
+           MB();
+           r0 = x[ 0] - x[16]; x[16] += x[ 0];
+           r1 = x[ 1] - x[17]; x[17] += x[ 1];
+           XPROD31 ( r0, r1, cPI1_8, cPI3_8, &x[ 0], &x[ 1] );
+           MB();
+           mdct_butterfly_16(x);
+           mdct_butterfly_16(x+16);
+}
+/* N/stage point generic N stage butterfly (in place, 2 register)
+ *
+ * x       start of a block of `points` elements, updated in place.
+ * points  number of elements in the block.
+ * step    stride, in table entries, between the coefficient pairs
+ *         T[0], T[1] read from sincos_lookup.
+ *
+ * x1 starts 8 elements before the end of the block and x2 8 elements
+ * before its midpoint; both move down 8 elements per loop iteration.  In
+ * every pair the sum is accumulated into the x1 (upper half) element and
+ * the rotated differences are stored through x2 (lower half).
+ *
+ * The four loops are bounded by the table pointer T, not by x1/x2: T is
+ * swept up to entry 2048, back down to entry 0, up again and down again,
+ * advancing 4*step entries per iteration.  The sweeps therefore process
+ * 16384/step elements of each half in total.
+ * NOTE(review): this relies on the caller passing a step that matches
+ * points (16384/step == points/2) and on sincos_lookup holding at least
+ * 2050 entries, since T[0] and T[1] are read at entry 2048; confirm both
+ * against the table definition and the callers.
+ */
+STIN void mdct_butterfly_generic(DATA_TYPE *x,int points,int step){
+  LOOKUP_T *T   = sincos_lookup;
+  DATA_TYPE *x1        = x + points      - 8;
+  DATA_TYPE *x2        = x + (points>>1) - 8;
+  REG_TYPE   r0;
+  REG_TYPE   r1;
+  //av_log(0, 0, "bfly: points=%d, step=%d\n", points, step);
+  do{ /* sweep 1: T ascends from the table start until it reaches entry 2048 */
+    r0 = x1[6] - x2[6]; x1[6] += x2[6];
+    r1 = x2[7] - x1[7]; x1[7] += x2[7];
+    XPROD31( r1, r0, T[0], T[1], &x2[6], &x2[7] ); T+=step;
+    r0 = x1[4] - x2[4]; x1[4] += x2[4];
+    r1 = x2[5] - x1[5]; x1[5] += x2[5];
+    XPROD31( r1, r0, T[0], T[1], &x2[4], &x2[5] ); T+=step;
+    r0 = x1[2] - x2[2]; x1[2] += x2[2];
+    r1 = x2[3] - x1[3]; x1[3] += x2[3];
+    XPROD31( r1, r0, T[0], T[1], &x2[2], &x2[3] ); T+=step;
+    r0 = x1[0] - x2[0]; x1[0] += x2[0];
+    r1 = x2[1] - x1[1]; x1[1] += x2[1];
+    XPROD31( r1, r0, T[0], T[1], &x2[0], &x2[1] ); T+=step;
+    x1-=8; x2-=8;
+  }while(T<sincos_lookup+2048);
+  do{ /* sweep 2: T descends back to the table start */
+    r0 = x1[6] - x2[6]; x1[6] += x2[6];
+    r1 = x1[7] - x2[7]; x1[7] += x2[7];
+    XNPROD31( r0, r1, T[0], T[1], &x2[6], &x2[7] ); T-=step;
+    r0 = x1[4] - x2[4]; x1[4] += x2[4];
+    r1 = x1[5] - x2[5]; x1[5] += x2[5];
+    XNPROD31( r0, r1, T[0], T[1], &x2[4], &x2[5] ); T-=step;
+    r0 = x1[2] - x2[2]; x1[2] += x2[2];
+    r1 = x1[3] - x2[3]; x1[3] += x2[3];
+    XNPROD31( r0, r1, T[0], T[1], &x2[2], &x2[3] ); T-=step;
+    r0 = x1[0] - x2[0]; x1[0] += x2[0];
+    r1 = x1[1] - x2[1]; x1[1] += x2[1];
+    XNPROD31( r0, r1, T[0], T[1], &x2[0], &x2[1] ); T-=step;
+    x1-=8; x2-=8;
+  }while(T>sincos_lookup);
+  do{ /* sweep 3: T ascends again; both differences are taken as x2 - x1 */
+    r0 = x2[6] - x1[6]; x1[6] += x2[6];
+    r1 = x2[7] - x1[7]; x1[7] += x2[7];
+    XPROD31( r0, r1, T[0], T[1], &x2[6], &x2[7] ); T+=step;
+    r0 = x2[4] - x1[4]; x1[4] += x2[4];
+    r1 = x2[5] - x1[5]; x1[5] += x2[5];
+    XPROD31( r0, r1, T[0], T[1], &x2[4], &x2[5] ); T+=step;
+    r0 = x2[2] - x1[2]; x1[2] += x2[2];
+    r1 = x2[3] - x1[3]; x1[3] += x2[3];
+    XPROD31( r0, r1, T[0], T[1], &x2[2], &x2[3] ); T+=step;
+    r0 = x2[0] - x1[0]; x1[0] += x2[0];
+    r1 = x2[1] - x1[1]; x1[1] += x2[1];
+    XPROD31( r0, r1, T[0], T[1], &x2[0], &x2[1] ); T+=step;
+    x1-=8; x2-=8;
+  }while(T<sincos_lookup+2048);
+  do{ /* sweep 4: T descends to the table start, which ends the function */
+    r0 = x1[6] - x2[6]; x1[6] += x2[6];
+    r1 = x2[7] - x1[7]; x1[7] += x2[7];
+    XNPROD31( r1, r0, T[0], T[1], &x2[6], &x2[7] ); T-=step;
+    r0 = x1[4] - x2[4]; x1[4] += x2[4];
+    r1 = x2[5] - x1[5]; x1[5] += x2[5];
+    XNPROD31( r1, r0, T[0], T[1], &x2[4], &x2[5] ); T-=step;
+    r0 = x1[2] - x2[2]; x1[2] += x2[2];
+    r1 = x2[3] - x1[3]; x1[3] += x2[3];
+    XNPROD31( r1, r0, T[0], T[1], &x2[2], &x2[3] ); T-=step;
+    r0 = x1[0] - x2[0]; x1[0] += x2[0];
+    r1 = x2[1] - x1[1]; x1[1] += x2[1];
+    XNPROD31( r1, r0, T[0], T[1], &x2[0], &x2[1] ); T-=step;
+    x1-=8; x2-=8;
+  }while(T>sincos_lookup);
+}
+STIN void mdct_butterflies(DATA_TYPE *x,int points,int shift){ /*
+   * Runs the complete butterfly network over x[0..points-1] in place.
+   *
+   * x       block to transform.
+   * points  number of elements in the block.
+   * shift   size-dependent value computed by the caller; it sets both the
+   *         number of generic levels (8 - shift - 1) and the table stride
+   *         handed to mdct_butterfly_generic().
+   *
+   * Level i splits the block into 2^i sub-blocks of points>>i elements and
+   * runs the generic butterfly on each with stride 8<<(i+shift).  After
+   * the generic levels every 32-element chunk is finished with
+   * mdct_butterfly_32().
+   */
+  int stages=8-shift;
+  int i,j;
+  
+  for(i=0;--stages>0;i++){ /* pre-decrement: the body runs stages-1 times */
+    for(j=0;j<(1<<i);j++)
+      mdct_butterfly_generic(x+(points>>i)*j,points>>i,8<<(i+shift));
+  }
+  for(j=0;j<points;j+=32)
+    mdct_butterfly_32(x+j);
+}
+static unsigned char bitrev[16]={0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15}; /*
+ * bitrev[k] is the 4-bit value k with its bit order reversed.
+ * NOTE(review): nothing visible in this header writes to the table, so it
+ * could be declared const.
+ */
+STIN int bitrev12(int x){ /*
+   * Reverses the bit order of a 12-bit value: each of the three nibbles is
+   * reversed through the table and the top and bottom nibbles trade places.
+   * Only valid for 0 <= x <= 0xfff: x>>8 indexes the 16-entry table
+   * directly, without masking.
+   */
+  return bitrev[x>>8]|(bitrev[(x&0x0f0)>>4]<<4)|(((int)bitrev[x&0x00f])<<8);
+}
+STIN void mdct_bitreverse(DATA_TYPE *x,int n,int step,int shift){ /*
+   * Bit-reversed reordering pass combined with a rotation by table
+   * coefficients.
+   *
+   * x      start of the buffer.  Inputs are read relative to the midpoint
+   *        x + n/2, at offsets derived from bitrev12(); results are written
+   *        to the region before the midpoint, from both ends towards the
+   *        middle (w0 ascending, w1 descending).
+   * n      only n>>1 is used, to locate the midpoint.
+   * step   stride, in table entries, between coefficient pairs read from
+   *        sincos_lookup (starting at entry step/2).
+   * shift  right shift that scales the 12-bit reversed index down to an
+   *        offset for this buffer size.
+   *
+   * The first loop runs while T ascends towards Ttop (2048 entries above
+   * its start); the second walks T back down and stops when w0 meets w1.
+   * r3 is reused: it first holds the reversed index and is then
+   * overwritten as the second XPROD32 output.
+   */
+  int          bit   = 0;
+  DATA_TYPE   *w0    = x;
+  DATA_TYPE   *w1    = x = w0+(n>>1); /* x is re-pointed at the midpoint here */
+  LOOKUP_T    *T = sincos_lookup+(step>>1);
+  LOOKUP_T    *Ttop  = T+2048;
+  DATA_TYPE    r2;
+  //av_log(0, 0, "brev: shift=%d, step=%d\n", shift, step);
+  do{
+    DATA_TYPE r3     = bitrev12(bit++);
+    DATA_TYPE *x0    = x + ((r3 ^ 0xfff)>>shift) -1;
+    DATA_TYPE *x1    = x + (r3>>shift);
+    REG_TYPE  r0     = x0[0]  + x1[0];
+    REG_TYPE  r1     = x1[1]  - x0[1];
+              XPROD32( r0, r1, T[1], T[0], &r2, &r3 ); T+=step;
+              w1    -= 4;
+              r0     = (x0[1] + x1[1])>>1;
+              r1     = (x0[0] - x1[0])>>1;
+              w0[0]  = r0     + r2;
+              w0[1]  = r1     + r3;
+              w1[2]  = r0     - r2;
+              w1[3]  = r3     - r1;
+              r3     = bitrev12(bit++);
+              x0     = x + ((r3 ^ 0xfff)>>shift) -1;
+              x1     = x + (r3>>shift);
+              r0     = x0[0]  + x1[0];
+              r1     = x1[1]  - x0[1];
+              XPROD32( r0, r1, T[1], T[0], &r2, &r3 ); T+=step;
+              r0     = (x0[1] + x1[1])>>1;
+              r1     = (x0[0] - x1[0])>>1;
+              w0[2]  = r0     + r2;
+              w0[3]  = r1     + r3;
+              w1[0]  = r0     - r2;
+              w1[1]  = r3     - r1;
+              w0    += 4;
+  }while(T<Ttop);
+  do{ /* second half: T is stepped down before each use and T[0], T[1] swap roles */
+    DATA_TYPE r3     = bitrev12(bit++);
+    DATA_TYPE *x0    = x + ((r3 ^ 0xfff)>>shift) -1;
+    DATA_TYPE *x1    = x + (r3>>shift);
+    REG_TYPE  r0     = x0[0]  + x1[0];
+    REG_TYPE  r1     = x1[1]  - x0[1];
+              T-=step; XPROD32( r0, r1, T[0], T[1], &r2, &r3 );
+              w1    -= 4;
+              r0     = (x0[1] + x1[1])>>1;
+              r1     = (x0[0] - x1[0])>>1;
+              w0[0]  = r0     + r2;
+              w0[1]  = r1     + r3;
+              w1[2]  = r0     - r2;
+              w1[3]  = r3     - r1;
+              r3     = bitrev12(bit++);
+              x0     = x + ((r3 ^ 0xfff)>>shift) -1;
+              x1     = x + (r3>>shift);
+              r0     = x0[0]  + x1[0];
+              r1     = x1[1]  - x0[1];
+              T-=step; XPROD32( r0, r1, T[0], T[1], &r2, &r3 );
+              r0     = (x0[1] + x1[1])>>1;
+              r1     = (x0[0] - x1[0])>>1;
+              w0[2]  = r0     + r2;
+              w0[3]  = r1     + r3;
+              w1[0]  = r0     - r2;
+              w1[1]  = r3     - r1;
+              w0    += 4;
+  }while(w0<w1);
+}
+STIN void cook_mdct_backward(int n, DATA_TYPE *in, DATA_TYPE *out){ /*
+   * Backward MDCT entry point of this header.
+   *
+   * n    transform size.  The file header states power-of-two sizes with
+   *      64 <= n only.
+   * in   input buffer; elements in[0 .. n/2-1] are read and none are
+   *      written.
+   * out  output buffer; elements out[0 .. n-1] are written, and the upper
+   *      half doubles as working storage for the intermediate passes.
+   *
+   * NOTE(review): n is not validated.  The shift search below starts at
+   * bit 6 and never stops sensibly if n has no bit set at position 6 or
+   * above, and a size above 8192 would make shift negative; confirm that
+   * callers only pass supported sizes.
+   */
+  int n2=n>>1;
+  int n4=n>>2;
+  DATA_TYPE *iX;
+  DATA_TYPE *oX;
+  LOOKUP_T *T;
+  int shift;
+  int step;
+  for (shift=6;!(n&(1<<shift));shift++); /* position of the lowest set bit >= 6 */
+  shift=13-shift; /* 0 for n == 8192, growing by one for each halving of n */
+  step=4<<shift; /* table stride for the first rotation and the bit-reverse pass */
+  //step=16;
+  //av_log(0, 0, "mdct: shift=%d, step=%d\n", shift, step);
+   
+  /* rotate: the odd-indexed inputs, taken from the top of in[] downwards,
+   * are rotated into out[n2 .. n2+n4-1], filled from the top downwards.
+   * T ascends while iX is in the upper quarter of in[], then descends with
+   * the two coefficients swapped. */
+  iX            = in+n2-7;
+  oX            = out+n2+n4;
+  T             = sincos_lookup;
+  do{
+    oX-=4;
+    XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;
+    XPROD31( iX[0], iX[2], T[0], T[1], &oX[0], &oX[1] ); T+=step;
+    iX-=8;
+  }while(iX>=in+n4);
+  do{
+    oX-=4;
+    XPROD31( iX[4], iX[6], T[1], T[0], &oX[2], &oX[3] ); T-=step;
+    XPROD31( iX[0], iX[2], T[1], T[0], &oX[0], &oX[1] ); T-=step;
+    iX-=8;
+  }while(iX>=in);
+  iX            = in+n2-8; /* second pass: even-indexed inputs, written upwards from out+n2+n4 */
+  oX            = out+n2+n4;
+  T             = sincos_lookup;
+  do{
+    T+=step; XNPROD31( iX[6], iX[4], T[0], T[1], &oX[0], &oX[1] );
+    T+=step; XNPROD31( iX[2], iX[0], T[0], T[1], &oX[2], &oX[3] );
+    iX-=8;
+    oX+=4;
+  }while(iX>=in+n4);
+  do{
+    T-=step; XNPROD31( iX[6], iX[4], T[1], T[0], &oX[0], &oX[1] );
+    T-=step; XNPROD31( iX[2], iX[0], T[1], T[0], &oX[2], &oX[3] );
+    iX-=8;
+    oX+=4;
+  }while(iX>=in);
+  mdct_butterflies(out+n2,n2,shift); /* butterfly network over the upper half of out[] */
+  mdct_bitreverse(out,n,step,shift); /* reads the upper half, writes the lower half */
+  /* rotate: the lower half of out[] is rotated into the upper half, using
+   * a quarter of the previous table stride. */
+  step>>=2;
+  //step=4;
+  {
+    DATA_TYPE *oX1=out+n2+n4;
+    DATA_TYPE *oX2=out+n2+n4;
+    DATA_TYPE *iX =out;
+    T=sincos_lookup+(step>>1);
+    do{ /* oX1 fills downwards and oX2 upwards from out+n2+n4 while iX reads upwards from out */
+      oX1-=4;
+      XPROD31( iX[0], -iX[1], T[0], T[1], &oX1[3], &oX2[0] ); T+=step;
+      XPROD31( iX[2], -iX[3], T[0], T[1], &oX1[2], &oX2[1] ); T+=step;
+      XPROD31( iX[4], -iX[5], T[0], T[1], &oX1[1], &oX2[2] ); T+=step;
+      XPROD31( iX[6], -iX[7], T[0], T[1], &oX1[0], &oX2[3] ); T+=step;
+      oX2+=4;
+      iX+=8;
+    }while(iX<oX1);
+    iX=out+n2+n4;
+    oX1=out+n4;
+    oX2=oX1;
+    do{ /* out[0 .. n4-1] becomes a copy of out[n2 .. n2+n4-1]; out[n4 .. n2-1] its negated mirror image */
+      oX1-=4;
+      iX-=4;
+      oX2[0] = -(oX1[3] = iX[3]);
+      oX2[1] = -(oX1[2] = iX[2]);
+      oX2[2] = -(oX1[1] = iX[1]);
+      oX2[3] = -(oX1[0] = iX[0]);
+      oX2+=4;
+    }while(oX2<iX);
+    iX=out+n2+n4;
+    oX1=out+n2+n4;
+    oX2=out+n2;
+    do{ /* out[n2 .. n2+n4-1] is overwritten with out[n2+n4 .. n-1] in reverse order */
+      oX1-=4;
+      oX1[0]= iX[3];
+      oX1[1]= iX[2];
+      oX1[2]= iX[1];
+      oX1[3]= iX[0];
+      iX+=4;
+    }while(oX1>oX2);
+  }
+}
author	Dave Chapman <dave@dchapman.com>	2009-05-10 22:26:02 +0000
committer	Dave Chapman <dave@dchapman.com>	2009-05-10 22:26:02 +0000
commit	fc28cb4ed5adf4a0bc548af38ca6de95bbf027e5 (patch)
tree	5902a50d7efe84a5f3e6270c464dbf54572a744a /apps/codecs/libcook/cook_fixp_mdct.h
parent	3a0a9915eb802d558c0399d17a5ac045934d6be1 (diff)
download	rockbox-fc28cb4ed5adf4a0bc548af38ca6de95bbf027e5.tar.gz rockbox-fc28cb4ed5adf4a0bc548af38ca6de95bbf027e5.zip

diff --git a/apps/codecs/libcook/cook_fixp_mdct.h b/apps/codecs/libcook/cook_fixp_mdct.h new file mode 100644 index 0000000000..dcd6d96227 --- /dev/null +++ b/apps/codecs/libcook/cook_fixp_mdct.h
@@ -0,0 +1,545 @@
	1	/*
	2	* The following (normalized modified discrete cosine transform)
	3	* is taken from the OggVorbis 'TREMOR' source code.
	4	*
	5	* It has been modified for the ffmpeg cook fixed point decoder.
	6	*/
	7
	8	/********************************************************************
	9	* *
	10	* THE OggVorbis 'TREMOR' SOURCE CODE IS (C) COPYRIGHT 1994-2002 *
	11	* BY THE Xiph.Org FOUNDATION http://www.xiph.org/ *
	12	* *
	13	********************************************************************
	14
	15	Redistribution and use in source and binary forms, with or without
	16	modification, are permitted provided that the following conditions
	17	are met:
	18
	19	- Redistributions of source code must retain the above copyright
	20	notice, this list of conditions and the following disclaimer.
	21
	22	- Redistributions in binary form must reproduce the above copyright
	23	notice, this list of conditions and the following disclaimer in the
	24	documentation and/or other materials provided with the distribution.
	25
	26	- Neither the name of the Xiph.org Foundation nor the names of its
	27	contributors may be used to endorse or promote products derived from
	28	this software without specific prior written permission.
	29
	30	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	31	``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	32	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	33	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION
	34	OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	35	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	36	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	37	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	38	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	39	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	40	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	41
	42	*********************************************************************
	43
	44	function: normalized modified discrete cosine transform
	45	power of two length transform only [64 <= n ]
	46	last mod: $Id: mdct.c 14281 2004-12-30 12:11:32Z henry $
	47
	48	Original algorithm adapted long ago from _The use of multirate filter
	49	banks for coding of high quality digital audio_, by T. Sporer,
	50	K. Brandenburg and B. Edler, collection of the European Signal
	51	Processing Conference (EUSIPCO), Amsterdam, June 1992, Vol.1, pp
	52	211-214
	53
	54	The below code implements an algorithm that no longer looks much like
	55	that presented in the paper, but the basic structure remains if you
	56	dig deep enough to see it.
	57
	58	This module DOES NOT INCLUDE code to generate/apply the window
	59	function. Everybody has their own weird favorite including me... I
	60	happen to like the properties of y=sin(.5PI*sin^2(x)), but others may
	61	vehemently disagree.
	62
	63	********************************************************************/
	64
	65	#define STIN static inline
	66
	67	typedef int32_t ogg_int32_t;
	68
	69	#define DATA_TYPE ogg_int32_t
	70	#define REG_TYPE register ogg_int32_t
	71	#define LOOKUP_T const uint16_t
	72
	73	static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
	74	return fixp_mult_su(x, y) >> 1;
	75	}
	76
	77	static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
	78	return fixp_mult_su(x, y);
	79	}
	80
	81	/*
	82	* This should be used as a memory barrier, forcing all cached values in
	83	* registers to be written back to memory. Might or might not be beneficial
	84	* depending on the architecture and compiler.
	85	*/
	86	#define MB()
	87
	88	/*
	89	* The XPROD functions are meant to optimize the cross products found all
	90	* over the place in mdct.c by forcing memory operation ordering to avoid
	91	* unnecessary register reloads as soon as memory is being written to.
	92	* However this is only beneficial on CPUs with a sane number of general
	93	* purpose registers which exclude the Intel x86. On Intel, better let the
	94	* compiler actually reload registers directly from original memory by using
	95	* macros.
	96	*/
	97
	98	#ifdef __i386__
	99
	100	#define XPROD32(_a, _b, _t, _v, _x, _y) \
	101	{ *(_x)=MULT32(_a,_t)+MULT32(_b,_v); \
	102	*(_y)=MULT32(_b,_t)-MULT32(_a,_v); }
	103	#define XPROD31(_a, _b, _t, _v, _x, _y) \
	104	{ *(_x)=MULT31(_a,_t)+MULT31(_b,_v); \
	105	*(_y)=MULT31(_b,_t)-MULT31(_a,_v); }
	106	#define XNPROD31(_a, _b, _t, _v, _x, _y) \
	107	{ *(_x)=MULT31(_a,_t)-MULT31(_b,_v); \
	108	*(_y)=MULT31(_b,_t)+MULT31(_a,_v); }
	109
	110	#else
	111
	112	static inline void XPROD32(ogg_int32_t a, ogg_int32_t b,
	113	ogg_int32_t t, ogg_int32_t v,
	114	ogg_int32_t *x, ogg_int32_t *y)
	115	{
	116	*x = MULT32(a, t) + MULT32(b, v);
	117	*y = MULT32(b, t) - MULT32(a, v);
	118	}
	119
	120	static inline void XPROD31(ogg_int32_t a, ogg_int32_t b,
	121	ogg_int32_t t, ogg_int32_t v,
	122	ogg_int32_t *x, ogg_int32_t *y)
	123	{
	124	*x = MULT31(a, t) + MULT31(b, v);
	125	*y = MULT31(b, t) - MULT31(a, v);
	126	}
	127
	128	static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
	129	ogg_int32_t t, ogg_int32_t v,
	130	ogg_int32_t *x, ogg_int32_t *y)
	131	{
	132	*x = MULT31(a, t) - MULT31(b, v);
	133	*y = MULT31(b, t) + MULT31(a, v);
	134	}
	135
	136	#endif
	137
	138
	139	/* 8 point butterfly (in place) */
	140	STIN void mdct_butterfly_8(DATA_TYPE *x){
	141
	142	REG_TYPE r0 = x[4] + x[0];
	143	REG_TYPE r1 = x[4] - x[0];
	144	REG_TYPE r2 = x[5] + x[1];
	145	REG_TYPE r3 = x[5] - x[1];
	146	REG_TYPE r4 = x[6] + x[2];
	147	REG_TYPE r5 = x[6] - x[2];
	148	REG_TYPE r6 = x[7] + x[3];
	149	REG_TYPE r7 = x[7] - x[3];
	150
	151	x[0] = r5 + r3;
	152	x[1] = r7 - r1;
	153	x[2] = r5 - r3;
	154	x[3] = r7 + r1;
	155	x[4] = r4 - r0;
	156	x[5] = r6 - r2;
	157	x[6] = r4 + r0;
	158	x[7] = r6 + r2;
	159	MB();
	160	}
	161
	162	/* 16 point butterfly (in place, 4 register) */
	163	STIN void mdct_butterfly_16(DATA_TYPE *x){
	164
	165	REG_TYPE r0, r1;
	166
	167	r0 = x[ 0] - x[ 8]; x[ 8] += x[ 0];
	168	r1 = x[ 1] - x[ 9]; x[ 9] += x[ 1];
	169	x[ 0] = MULT31((r0 + r1) , cPI2_8);
	170	x[ 1] = MULT31((r1 - r0) , cPI2_8);
	171	MB();
	172
	173	r0 = x[10] - x[ 2]; x[10] += x[ 2];
	174	r1 = x[ 3] - x[11]; x[11] += x[ 3];
	175	x[ 2] = r1; x[ 3] = r0;
	176	MB();
	177
	178	r0 = x[12] - x[ 4]; x[12] += x[ 4];
	179	r1 = x[13] - x[ 5]; x[13] += x[ 5];
	180	x[ 4] = MULT31((r0 - r1) , cPI2_8);
	181	x[ 5] = MULT31((r0 + r1) , cPI2_8);
	182	MB();
	183
	184	r0 = x[14] - x[ 6]; x[14] += x[ 6];
	185	r1 = x[15] - x[ 7]; x[15] += x[ 7];
	186	x[ 6] = r0; x[ 7] = r1;
	187	MB();
	188
	189	mdct_butterfly_8(x);
	190	mdct_butterfly_8(x+8);
	191	}
	192
	193	/* 32 point butterfly (in place, 4 register) */
	194	STIN void mdct_butterfly_32(DATA_TYPE *x){
	195
	196	REG_TYPE r0, r1;
	197
	198	r0 = x[30] - x[14]; x[30] += x[14];
	199	r1 = x[31] - x[15]; x[31] += x[15];
	200	x[14] = r0; x[15] = r1;
	201	MB();
	202
	203	r0 = x[28] - x[12]; x[28] += x[12];
	204	r1 = x[29] - x[13]; x[29] += x[13];
	205	XNPROD31( r0, r1, cPI1_8, cPI3_8, &x[12], &x[13] );
	206	MB();
	207
	208	r0 = x[26] - x[10]; x[26] += x[10];
	209	r1 = x[27] - x[11]; x[27] += x[11];
	210	x[10] = MULT31((r0 - r1) , cPI2_8);
	211	x[11] = MULT31((r0 + r1) , cPI2_8);
	212	MB();
	213
	214	r0 = x[24] - x[ 8]; x[24] += x[ 8];
	215	r1 = x[25] - x[ 9]; x[25] += x[ 9];
	216	XNPROD31( r0, r1, cPI3_8, cPI1_8, &x[ 8], &x[ 9] );
	217	MB();
	218
	219	r0 = x[22] - x[ 6]; x[22] += x[ 6];
	220	r1 = x[ 7] - x[23]; x[23] += x[ 7];
	221	x[ 6] = r1; x[ 7] = r0;
	222	MB();
	223
	224	r0 = x[ 4] - x[20]; x[20] += x[ 4];
	225	r1 = x[ 5] - x[21]; x[21] += x[ 5];
	226	XPROD31 ( r0, r1, cPI3_8, cPI1_8, &x[ 4], &x[ 5] );
	227	MB();
	228
	229	r0 = x[ 2] - x[18]; x[18] += x[ 2];
	230	r1 = x[ 3] - x[19]; x[19] += x[ 3];
	231	x[ 2] = MULT31((r1 + r0) , cPI2_8);
	232	x[ 3] = MULT31((r1 - r0) , cPI2_8);
	233	MB();
	234
	235	r0 = x[ 0] - x[16]; x[16] += x[ 0];
	236	r1 = x[ 1] - x[17]; x[17] += x[ 1];
	237	XPROD31 ( r0, r1, cPI1_8, cPI3_8, &x[ 0], &x[ 1] );
	238	MB();
	239
	240	mdct_butterfly_16(x);
	241	mdct_butterfly_16(x+16);
	242	}
	243
	244	/* N/stage point generic N stage butterfly (in place, 2 register) */
	245	STIN void mdct_butterfly_generic(DATA_TYPE *x,int points,int step){
	246
	247	LOOKUP_T *T = sincos_lookup;
	248	DATA_TYPE *x1 = x + points - 8;
	249	DATA_TYPE *x2 = x + (points>>1) - 8;
	250	REG_TYPE r0;
	251	REG_TYPE r1;
	252
	253	//av_log(0, 0, "bfly: points=%d, step=%d\n", points, step);
	254
	255	do{
	256	r0 = x1[6] - x2[6]; x1[6] += x2[6];
	257	r1 = x2[7] - x1[7]; x1[7] += x2[7];
	258	XPROD31( r1, r0, T[0], T[1], &x2[6], &x2[7] ); T+=step;
	259
	260	r0 = x1[4] - x2[4]; x1[4] += x2[4];
	261	r1 = x2[5] - x1[5]; x1[5] += x2[5];
	262	XPROD31( r1, r0, T[0], T[1], &x2[4], &x2[5] ); T+=step;
	263
	264	r0 = x1[2] - x2[2]; x1[2] += x2[2];
	265	r1 = x2[3] - x1[3]; x1[3] += x2[3];
	266	XPROD31( r1, r0, T[0], T[1], &x2[2], &x2[3] ); T+=step;
	267
	268	r0 = x1[0] - x2[0]; x1[0] += x2[0];
	269	r1 = x2[1] - x1[1]; x1[1] += x2[1];
	270	XPROD31( r1, r0, T[0], T[1], &x2[0], &x2[1] ); T+=step;
	271
	272	x1-=8; x2-=8;
	273	}while(T<sincos_lookup+2048);
	274	do{
	275	r0 = x1[6] - x2[6]; x1[6] += x2[6];
	276	r1 = x1[7] - x2[7]; x1[7] += x2[7];
	277	XNPROD31( r0, r1, T[0], T[1], &x2[6], &x2[7] ); T-=step;
	278
	279	r0 = x1[4] - x2[4]; x1[4] += x2[4];
	280	r1 = x1[5] - x2[5]; x1[5] += x2[5];
	281	XNPROD31( r0, r1, T[0], T[1], &x2[4], &x2[5] ); T-=step;
	282
	283	r0 = x1[2] - x2[2]; x1[2] += x2[2];
	284	r1 = x1[3] - x2[3]; x1[3] += x2[3];
	285	XNPROD31( r0, r1, T[0], T[1], &x2[2], &x2[3] ); T-=step;
	286
	287	r0 = x1[0] - x2[0]; x1[0] += x2[0];
	288	r1 = x1[1] - x2[1]; x1[1] += x2[1];
	289	XNPROD31( r0, r1, T[0], T[1], &x2[0], &x2[1] ); T-=step;
	290
	291	x1-=8; x2-=8;
	292	}while(T>sincos_lookup);
	293	do{
	294	r0 = x2[6] - x1[6]; x1[6] += x2[6];
	295	r1 = x2[7] - x1[7]; x1[7] += x2[7];
	296	XPROD31( r0, r1, T[0], T[1], &x2[6], &x2[7] ); T+=step;
	297
	298	r0 = x2[4] - x1[4]; x1[4] += x2[4];
	299	r1 = x2[5] - x1[5]; x1[5] += x2[5];
	300	XPROD31( r0, r1, T[0], T[1], &x2[4], &x2[5] ); T+=step;
	301
	302	r0 = x2[2] - x1[2]; x1[2] += x2[2];
	303	r1 = x2[3] - x1[3]; x1[3] += x2[3];
	304	XPROD31( r0, r1, T[0], T[1], &x2[2], &x2[3] ); T+=step;
	305
	306	r0 = x2[0] - x1[0]; x1[0] += x2[0];
	307	r1 = x2[1] - x1[1]; x1[1] += x2[1];
	308	XPROD31( r0, r1, T[0], T[1], &x2[0], &x2[1] ); T+=step;
	309
	310	x1-=8; x2-=8;
	311	}while(T<sincos_lookup+2048);
	312	do{
	313	r0 = x1[6] - x2[6]; x1[6] += x2[6];
	314	r1 = x2[7] - x1[7]; x1[7] += x2[7];
	315	XNPROD31( r1, r0, T[0], T[1], &x2[6], &x2[7] ); T-=step;
	316
	317	r0 = x1[4] - x2[4]; x1[4] += x2[4];
	318	r1 = x2[5] - x1[5]; x1[5] += x2[5];
	319	XNPROD31( r1, r0, T[0], T[1], &x2[4], &x2[5] ); T-=step;
	320
	321	r0 = x1[2] - x2[2]; x1[2] += x2[2];
	322	r1 = x2[3] - x1[3]; x1[3] += x2[3];
	323	XNPROD31( r1, r0, T[0], T[1], &x2[2], &x2[3] ); T-=step;
	324
	325	r0 = x1[0] - x2[0]; x1[0] += x2[0];
	326	r1 = x2[1] - x1[1]; x1[1] += x2[1];
	327	XNPROD31( r1, r0, T[0], T[1], &x2[0], &x2[1] ); T-=step;
	328
	329	x1-=8; x2-=8;
	330	}while(T>sincos_lookup);
	331	}
	332
	333	STIN void mdct_butterflies(DATA_TYPE *x,int points,int shift){
	334
	335	int stages=8-shift;
	336	int i,j;
	337
	338	for(i=0;--stages>0;i++){
	339	for(j=0;j<(1<<i);j++)
	340	mdct_butterfly_generic(x+(points>>i)*j,points>>i,8<<(i+shift));
	341	}
	342
	343	for(j=0;j<points;j+=32)
	344	mdct_butterfly_32(x+j);
	345
	346	}
	347
	348	static unsigned char bitrev[16]={0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15};
	349
	350	STIN int bitrev12(int x){
	351	return bitrev[x>>8]|(bitrev[(x&0x0f0)>>4]<<4)|(((int)bitrev[x&0x00f])<<8);
	352	}
	353
	354	STIN void mdct_bitreverse(DATA_TYPE *x,int n,int step,int shift){
	355
	356	int bit = 0;
	357	DATA_TYPE *w0 = x;
	358	DATA_TYPE *w1 = x = w0+(n>>1);
	359	LOOKUP_T *T = sincos_lookup+(step>>1);
	360	LOOKUP_T *Ttop = T+2048;
	361	DATA_TYPE r2;
	362
	363	//av_log(0, 0, "brev: shift=%d, step=%d\n", shift, step);
	364
	365	do{
	366	DATA_TYPE r3 = bitrev12(bit++);
	367	DATA_TYPE *x0 = x + ((r3 ^ 0xfff)>>shift) -1;
	368	DATA_TYPE *x1 = x + (r3>>shift);
	369
	370	REG_TYPE r0 = x0[0] + x1[0];
	371	REG_TYPE r1 = x1[1] - x0[1];
	372
	373	XPROD32( r0, r1, T[1], T[0], &r2, &r3 ); T+=step;
	374
	375	w1 -= 4;
	376
	377	r0 = (x0[1] + x1[1])>>1;
	378	r1 = (x0[0] - x1[0])>>1;
	379	w0[0] = r0 + r2;
	380	w0[1] = r1 + r3;
	381	w1[2] = r0 - r2;
	382	w1[3] = r3 - r1;
	383
	384	r3 = bitrev12(bit++);
	385	x0 = x + ((r3 ^ 0xfff)>>shift) -1;
	386	x1 = x + (r3>>shift);
	387
	388	r0 = x0[0] + x1[0];
	389	r1 = x1[1] - x0[1];
	390
	391	XPROD32( r0, r1, T[1], T[0], &r2, &r3 ); T+=step;
	392
	393	r0 = (x0[1] + x1[1])>>1;
	394	r1 = (x0[0] - x1[0])>>1;
	395	w0[2] = r0 + r2;
	396	w0[3] = r1 + r3;
	397	w1[0] = r0 - r2;
	398	w1[1] = r3 - r1;
	399
	400	w0 += 4;
	401	}while(T<Ttop);
	402	do{
	403	DATA_TYPE r3 = bitrev12(bit++);
	404	DATA_TYPE *x0 = x + ((r3 ^ 0xfff)>>shift) -1;
	405	DATA_TYPE *x1 = x + (r3>>shift);
	406
	407	REG_TYPE r0 = x0[0] + x1[0];
	408	REG_TYPE r1 = x1[1] - x0[1];
	409
	410	T-=step; XPROD32( r0, r1, T[0], T[1], &r2, &r3 );
	411
	412	w1 -= 4;
	413
	414	r0 = (x0[1] + x1[1])>>1;
	415	r1 = (x0[0] - x1[0])>>1;
	416	w0[0] = r0 + r2;
	417	w0[1] = r1 + r3;
	418	w1[2] = r0 - r2;
	419	w1[3] = r3 - r1;
	420
	421	r3 = bitrev12(bit++);
	422	x0 = x + ((r3 ^ 0xfff)>>shift) -1;
	423	x1 = x + (r3>>shift);
	424
	425	r0 = x0[0] + x1[0];
	426	r1 = x1[1] - x0[1];
	427
	428	T-=step; XPROD32( r0, r1, T[0], T[1], &r2, &r3 );
	429
	430	r0 = (x0[1] + x1[1])>>1;
	431	r1 = (x0[0] - x1[0])>>1;
	432	w0[2] = r0 + r2;
	433	w0[3] = r1 + r3;
	434	w1[0] = r0 - r2;
	435	w1[1] = r3 - r1;
	436
	437	w0 += 4;
	438	}while(w0<w1);
	439	}
	440
	441	STIN void cook_mdct_backward(int n, DATA_TYPE *in, DATA_TYPE *out){
	442	int n2=n>>1;
	443	int n4=n>>2;
	444	DATA_TYPE *iX;
	445	DATA_TYPE *oX;
	446	LOOKUP_T *T;
	447	int shift;
	448	int step;
	449
	450	for (shift=6;!(n&(1<<shift));shift++);
	451
	452	shift=13-shift;
	453	step=4<<shift;
	454	//step=16;
	455	//av_log(0, 0, "mdct: shift=%d, step=%d\n", shift, step);
	456
	457	/* rotate */
	458
	459	iX = in+n2-7;
	460	oX = out+n2+n4;
	461	T = sincos_lookup;
	462
	463	do{
	464	oX-=4;
	465	XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;
	466	XPROD31( iX[0], iX[2], T[0], T[1], &oX[0], &oX[1] ); T+=step;
	467	iX-=8;
	468	}while(iX>=in+n4);
	469	do{
	470	oX-=4;
	471	XPROD31( iX[4], iX[6], T[1], T[0], &oX[2], &oX[3] ); T-=step;
	472	XPROD31( iX[0], iX[2], T[1], T[0], &oX[0], &oX[1] ); T-=step;
	473	iX-=8;
	474	}while(iX>=in);
	475
	476	iX = in+n2-8;
	477	oX = out+n2+n4;
	478	T = sincos_lookup;
	479
	480	do{
	481	T+=step; XNPROD31( iX[6], iX[4], T[0], T[1], &oX[0], &oX[1] );
	482	T+=step; XNPROD31( iX[2], iX[0], T[0], T[1], &oX[2], &oX[3] );
	483	iX-=8;
	484	oX+=4;
	485	}while(iX>=in+n4);
	486	do{
	487	T-=step; XNPROD31( iX[6], iX[4], T[1], T[0], &oX[0], &oX[1] );
	488	T-=step; XNPROD31( iX[2], iX[0], T[1], T[0], &oX[2], &oX[3] );
	489	iX-=8;
	490	oX+=4;
	491	}while(iX>=in);
	492
	493	mdct_butterflies(out+n2,n2,shift);
	494	mdct_bitreverse(out,n,step,shift);
	495
	496	/* rotate */
	497
	498	step>>=2;
	499	//step=4;
	500	{
	501	DATA_TYPE *oX1=out+n2+n4;
	502	DATA_TYPE *oX2=out+n2+n4;
	503	DATA_TYPE *iX =out;
	504
	505	T=sincos_lookup+(step>>1);
	506	do{
	507	oX1-=4;
	508	XPROD31( iX[0], -iX[1], T[0], T[1], &oX1[3], &oX2[0] ); T+=step;
	509	XPROD31( iX[2], -iX[3], T[0], T[1], &oX1[2], &oX2[1] ); T+=step;
	510	XPROD31( iX[4], -iX[5], T[0], T[1], &oX1[1], &oX2[2] ); T+=step;
	511	XPROD31( iX[6], -iX[7], T[0], T[1], &oX1[0], &oX2[3] ); T+=step;
	512	oX2+=4;
	513	iX+=8;
	514	}while(iX<oX1);
	515
	516	iX=out+n2+n4;
	517	oX1=out+n4;
	518	oX2=oX1;
	519
	520	do{
	521	oX1-=4;
	522	iX-=4;
	523
	524	oX2[0] = -(oX1[3] = iX[3]);
	525	oX2[1] = -(oX1[2] = iX[2]);
	526	oX2[2] = -(oX1[1] = iX[1]);
	527	oX2[3] = -(oX1[0] = iX[0]);
	528
	529	oX2+=4;
	530	}while(oX2<iX);
	531
	532	iX=out+n2+n4;
	533	oX1=out+n2+n4;
	534	oX2=out+n2;
	535
	536	do{
	537	oX1-=4;
	538	oX1[0]= iX[3];
	539	oX1[1]= iX[2];
	540	oX1[2]= iX[1];
	541	oX1[3]= iX[0];
	542	iX+=4;
	543	}while(oX1>oX2);
	544	}
	545	}