From 1ddef375df58c96327ec05f386f544d2c98007cf Mon Sep 17 00:00:00 2001 From: Marcoen Hirschberg Date: Thu, 20 Sep 2007 12:27:24 +0000 Subject: move (i)MDCT and FFT code to separate files git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14776 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libwma/SOURCES | 2 + apps/codecs/libwma/fft.c | 152 +++++++++++++++++++ apps/codecs/libwma/fft.h | 22 +++ apps/codecs/libwma/mdct.c | 164 +++++++++++++++++++++ apps/codecs/libwma/mdct.h | 23 +++ apps/codecs/libwma/wmadeci.c | 331 +----------------------------------------- apps/codecs/libwma/wmafixed.h | 66 +++++++++ 7 files changed, 432 insertions(+), 328 deletions(-) create mode 100644 apps/codecs/libwma/fft.c create mode 100644 apps/codecs/libwma/fft.h create mode 100644 apps/codecs/libwma/mdct.c create mode 100644 apps/codecs/libwma/mdct.h diff --git a/apps/codecs/libwma/SOURCES b/apps/codecs/libwma/SOURCES index 967577d0db..b9d4cc1882 100644 --- a/apps/codecs/libwma/SOURCES +++ b/apps/codecs/libwma/SOURCES @@ -1,3 +1,5 @@ wmadeci.c wmafixed.c bitstream.c +fft.c +mdct.c diff --git a/apps/codecs/libwma/fft.c b/apps/codecs/libwma/fft.c new file mode 100644 index 0000000000..3def74d92c --- /dev/null +++ b/apps/codecs/libwma/fft.c @@ -0,0 +1,152 @@ +/* + * WMA compatible decoder + * Copyright (c) 2002 The FFmpeg Project. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "wmadec.h" +#include "wmafixed.h" + +FFTComplex exptab0[512] IBSS_ATTR; + +/* butter fly op */ +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ +{\ + fixed32 ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ +} + + +int fft_calc_unscaled(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + // FFTComplex *exptab = s->exptab; + int l; + fixed32 tmp_re, tmp_im; + int tabshift = 10-ln; + + np = 1 << ln; + + + /* pass 0 */ + + p=&z[0]; + j=(np >> 1); + do + { + BF(p[0].re, p[0].im, p[1].re, p[1].im, + p[0].re, p[0].im, p[1].re, p[1].im); + p+=2; + } + while (--j != 0); + + /* pass 1 */ + + + p=&z[0]; + j=np >> 2; + if (s->inverse) + { + do + { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, -p[3].im, p[3].re); + p+=4; + } + while (--j != 0); + } + else + { + do + { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, p[3].im, -p[3].re); + p+=4; + } + while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + do + { + p = z; + q = z + nloops; + for (j = 0; j < nblocks; ++j) + { + BF(p->re, p->im, q->re, q->im, + p->re, p->im, q->re, q->im); + + p++; + q++; + for(l = nblocks; l < np2; l += nblocks) + { + CMUL(&tmp_re, &tmp_im, exptab0[(l<re, q->im); + //CMUL(&tmp_re, &tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); + BF(p->re, p->im, q->re, q->im, + p->re, p->im, tmp_re, tmp_im); + p++; + q++; + } + + p += nloops; + q += nloops; + } + nblocks = nblocks >> 1; + nloops = nloops << 1; + } + while (nblocks != 0); + return 0; +} + +int fft_init_global() +{ + int i, n; + fixed32 c1, s1, s2; + + n=1<<10; + s2 = 1 ? 1 : -1; + for(i=0;i<(n/2);++i) + { + fixed32 ifix = itofix32(i); + fixed32 nfix = itofix32(n); + fixed32 res = fixdiv32(ifix,nfix); + + s1 = fsincos(res<<16, &c1); + + exptab0[i].re = c1; + exptab0[i].im = s1*s2; + } + + return 0; +} + diff --git a/apps/codecs/libwma/fft.h b/apps/codecs/libwma/fft.h new file mode 100644 index 0000000000..dd962cc171 --- /dev/null +++ b/apps/codecs/libwma/fft.h @@ -0,0 +1,22 @@ +/* + * WMA compatible decoder + * Copyright (c) 2002 The FFmpeg Project. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +int fft_calc_unscaled(FFTContext *s, FFTComplex *z); +int fft_init_global(); + diff --git a/apps/codecs/libwma/mdct.c b/apps/codecs/libwma/mdct.c new file mode 100644 index 0000000000..00a160ecef --- /dev/null +++ b/apps/codecs/libwma/mdct.c @@ -0,0 +1,164 @@ +/* + * WMA compatible decoder + * Copyright (c) 2002 The FFmpeg Project. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "wmadec.h" +#include "wmafixed.h" +#include "fft.h" + +fixed32 *tcosarray[5], *tsinarray[5]; +fixed32 tcos0[1024], tcos1[512], tcos2[256], tcos3[128], tcos4[64]; //these are the sin and cos rotations used by the MDCT +fixed32 tsin0[1024], tsin1[512], tsin2[256], tsin3[128], tsin4[64]; + +uint16_t revtab0[1024]; + +/** + * init MDCT or IMDCT computation. + */ +int ff_mdct_init(MDCTContext *s, int nbits, int inverse) +{ + int n, n4, i; + // fixed32 alpha; + + + memset(s, 0, sizeof(*s)); + n = 1 << nbits; //nbits ranges from 12 to 8 inclusive + s->nbits = nbits; + s->n = n; + n4 = n >> 2; + s->tcos = tcosarray[12-nbits]; + s->tsin = tsinarray[12-nbits]; + for(i=0;i> nbits; + //ip = fixdiv32(ip,itofix32(n)); // PJJ optimize + //alpha = fixmul32(TWO_M_PI_F, ip); + //s->tcos[i] = -fixcos32(alpha); //alpha between 0 and pi/2 + //s->tsin[i] = -fixsin32(alpha); + + s->tsin[i] = - fsincos(ip<<16, &(s->tcos[i])); //I can't remember why this works, but it seems to agree for ~24 bits, maybe more! + s->tcos[i] *=-1; + } + (&s->fft)->nbits = nbits-2; + + (&s->fft)->inverse = inverse; + + return 0; + +} + +/** + * Compute inverse MDCT of size N = 2^nbits + * @param output N samples + * @param input N/2 samples + * @param tmp N/2 samples + */ +void ff_imdct_calc(MDCTContext *s, + fixed32 *output, + fixed32 *input) +{ + int k, n8, n4, n2, n, j,scale; + const fixed32 *tcos = s->tcos; + const fixed32 *tsin = s->tsin; + const fixed32 *in1, *in2; + FFTComplex *z1 = (FFTComplex *)output; + FFTComplex *z2 = (FFTComplex *)input; + int revtabshift = 12 - s->nbits; + + n = 1 << s->nbits; + + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 1; + + for(k = 0; k < n4; k++) + { + j=revtab0[k<fft, z1); + + /* post rotation + reordering */ + + for(k = 0; k < n4; k++) + { + CMUL(&z2[k].re, &z2[k].im, (z1[k].re), (z1[k].im), tcos[k], tsin[k]); + } + + for(k = 0; k < n8; k++) + { + fixed32 r1,r2,r3,r4,r1n,r2n,r3n; + + r1 = z2[n8 + k].im; + r1n = r1 * -1; + r2 = z2[n8-1-k].re; + r2n = r2 * -1; + r3 = z2[k+n8].re; + r3n = r3 * -1; + r4 = z2[n8-k-1].im; + + output[2*k] = r1n; + output[n2-1-2*k] = r1; + + output[2*k+1] = r2; + output[n2-1-2*k-1] = r2n; + + output[n2 + 2*k]= r3n; + output[n-1- 2*k]= r3n; + + output[n2 + 2*k+1]= r4; + output[n-2 - 2 * k] = r4; + } +} + +int mdct_init_global() +{ + int i,j,m; + /* init MDCT */ + /*TODO: figure out how to fold this up into one array*/ + tcosarray[0] = tcos0; tcosarray[1] = tcos1; tcosarray[2] = tcos2; tcosarray[3] = tcos3;tcosarray[4] = tcos4; + tsinarray[0] = tsin0; tsinarray[1] = tsin1; tsinarray[2] = tsin2; tsinarray[3] = tsin3;tsinarray[4] = tsin4; + /* init the MDCT bit reverse table here rather then in fft_init */ + + for(i=0;i<1024;i++) /*hard coded to a 2048 bit rotation*/ + { /*smaller sizes can reuse the largest*/ + m=0; + for(j=0;j<10;j++) + { + m |= ((i >> j) & 1) << (10-j-1); + } + + revtab0[i]=m; + } + + fft_init_global(); + + return 0; +} + diff --git a/apps/codecs/libwma/mdct.h b/apps/codecs/libwma/mdct.h new file mode 100644 index 0000000000..1c050204bd --- /dev/null +++ b/apps/codecs/libwma/mdct.h @@ -0,0 +1,23 @@ +/* + * WMA compatible decoder + * Copyright (c) 2002 The FFmpeg Project. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +int ff_mdct_init(MDCTContext *s, int nbits, int inverse); +void ff_imdct_calc(MDCTContext *s, fixed32 *output, fixed32 *input); +int mdct_init_global(); + diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c index 99a098ea1f..bc3c11f9f2 100644 --- a/apps/codecs/libwma/wmadeci.c +++ b/apps/codecs/libwma/wmadeci.c @@ -28,6 +28,7 @@ #include "wmadec.h" #include "wmafixed.h" #include "bitstream.h" +#include "mdct.h" #define VLCBITS 7 /*7 is the lowest without glitching*/ @@ -40,71 +41,6 @@ #define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS) -#ifdef CPU_ARM -static inline -void CMUL(fixed32 *x, fixed32 *y, - fixed32 a, fixed32 b, - fixed32 t, fixed32 v) -{ - /* This version loses one bit of precision. Could be solved at the cost - * of 2 extra cycles if it becomes an issue. */ - int x1, y1, l; - asm( - "smull %[l], %[y1], %[b], %[t] \n" - "smlal %[l], %[y1], %[a], %[v] \n" - "rsb %[b], %[b], #0 \n" - "smull %[l], %[x1], %[a], %[t] \n" - "smlal %[l], %[x1], %[b], %[v] \n" - : [l] "=&r" (l), [x1]"=&r" (x1), [y1]"=&r" (y1), [b] "+r" (b) - : [a] "r" (a), [t] "r" (t), [v] "r" (v) - : "cc" - ); - *x = x1 << 1; - *y = y1 << 1; -} -#elif defined CPU_COLDFIRE -static inline -void CMUL(fixed32 *x, fixed32 *y, - fixed32 a, fixed32 b, - fixed32 t, fixed32 v) -{ - asm volatile ("mac.l %[a], %[t], %%acc0;" - "msac.l %[b], %[v], %%acc0;" - "mac.l %[b], %[t], %%acc1;" - "mac.l %[a], %[v], %%acc1;" - "movclr.l %%acc0, %[a];" - "move.l %[a], (%[x]);" - "movclr.l %%acc1, %[a];" - "move.l %[a], (%[y]);" - : [a] "+&r" (a) - : [x] "a" (x), [y] "a" (y), - [b] "r" (b), [t] "r" (t), [v] "r" (v) - : "cc", "memory"); -} -#else -// PJJ : reinstate macro -void CMUL(fixed32 *pre, - fixed32 *pim, - fixed32 are, - fixed32 aim, - fixed32 bre, - fixed32 bim) -{ - //int64_t x,y; - fixed32 _aref = are; - fixed32 _aimf = aim; - fixed32 _bref = bre; - fixed32 _bimf = bim; - fixed32 _r1 = fixmul32b(_bref, _aref); - fixed32 _r2 = fixmul32b(_bimf, _aimf); - fixed32 _r3 = fixmul32b(_bref, _aimf); - fixed32 _r4 = fixmul32b(_bimf, _aref); - *pre = _r1 - _r2; - *pim = _r3 + _r4; - -} -#endif - typedef struct CoefVLCTable { int n; /* total number of codes */ @@ -121,13 +57,6 @@ fixed32 coefsarray[MAX_CHANNELS][BLOCK_MAX_SIZE] IBSS_ATTR; //static variables that replace malloced stuff fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128]; //these are the MDCT reconstruction windows -fixed32 *tcosarray[5], *tsinarray[5]; -fixed32 tcos0[1024], tcos1[512], tcos2[256], tcos3[128], tcos4[64]; //these are the sin and cos rotations used by the MDCT -fixed32 tsin0[1024], tsin1[512], tsin2[256], tsin3[128], tsin4[64]; - -FFTComplex exptab0[512] IBSS_ATTR; -uint16_t revtab0[1024]; - uint16_t *runtabarray[2], *levtabarray[2]; //these are VLC lookup tables uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336]; //these could be made smaller since only one can be 1336 @@ -146,225 +75,6 @@ VLC_TYPE vlcbuf4[540][2]; #include "wmadata.h" // PJJ -/* butter fly op */ -#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ -{\ - fixed32 ax, ay, bx, by;\ - bx=pre1;\ - by=pim1;\ - ax=qre1;\ - ay=qim1;\ - pre = (bx + ax);\ - pim = (by + ay);\ - qre = (bx - ax);\ - qim = (by - ay);\ -} - - -int fft_calc_unscaled(FFTContext *s, FFTComplex *z) -{ - int ln = s->nbits; - int j, np, np2; - int nblocks, nloops; - register FFTComplex *p, *q; - // FFTComplex *exptab = s->exptab; - int l; - fixed32 tmp_re, tmp_im; - int tabshift = 10-ln; - - np = 1 << ln; - - - /* pass 0 */ - - p=&z[0]; - j=(np >> 1); - do - { - BF(p[0].re, p[0].im, p[1].re, p[1].im, - p[0].re, p[0].im, p[1].re, p[1].im); - p+=2; - } - while (--j != 0); - - /* pass 1 */ - - - p=&z[0]; - j=np >> 2; - if (s->inverse) - { - do - { - BF(p[0].re, p[0].im, p[2].re, p[2].im, - p[0].re, p[0].im, p[2].re, p[2].im); - BF(p[1].re, p[1].im, p[3].re, p[3].im, - p[1].re, p[1].im, -p[3].im, p[3].re); - p+=4; - } - while (--j != 0); - } - else - { - do - { - BF(p[0].re, p[0].im, p[2].re, p[2].im, - p[0].re, p[0].im, p[2].re, p[2].im); - BF(p[1].re, p[1].im, p[3].re, p[3].im, - p[1].re, p[1].im, p[3].im, -p[3].re); - p+=4; - } - while (--j != 0); - } - /* pass 2 .. ln-1 */ - - nblocks = np >> 3; - nloops = 1 << 2; - np2 = np >> 1; - do - { - p = z; - q = z + nloops; - for (j = 0; j < nblocks; ++j) - { - BF(p->re, p->im, q->re, q->im, - p->re, p->im, q->re, q->im); - - p++; - q++; - for(l = nblocks; l < np2; l += nblocks) - { - CMUL(&tmp_re, &tmp_im, exptab0[(l<re, q->im); - //CMUL(&tmp_re, &tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); - BF(p->re, p->im, q->re, q->im, - p->re, p->im, tmp_re, tmp_im); - p++; - q++; - } - - p += nloops; - q += nloops; - } - nblocks = nblocks >> 1; - nloops = nloops << 1; - } - while (nblocks != 0); - return 0; -} - -/** - * init MDCT or IMDCT computation. - */ -int ff_mdct_init(MDCTContext *s, int nbits, int inverse) -{ - int n, n4, i; - // fixed32 alpha; - - - memset(s, 0, sizeof(*s)); - n = 1 << nbits; //nbits ranges from 12 to 8 inclusive - s->nbits = nbits; - s->n = n; - n4 = n >> 2; - s->tcos = tcosarray[12-nbits]; - s->tsin = tsinarray[12-nbits]; - for(i=0;i> nbits; - //ip = fixdiv32(ip,itofix32(n)); // PJJ optimize - //alpha = fixmul32(TWO_M_PI_F, ip); - //s->tcos[i] = -fixcos32(alpha); //alpha between 0 and pi/2 - //s->tsin[i] = -fixsin32(alpha); - - s->tsin[i] = - fsincos(ip<<16, &(s->tcos[i])); //I can't remember why this works, but it seems to agree for ~24 bits, maybe more! - s->tcos[i] *=-1; - } - (&s->fft)->nbits = nbits-2; - - (&s->fft)->inverse = inverse; - - return 0; - -} - -/** - * Compute inverse MDCT of size N = 2^nbits - * @param output N samples - * @param input N/2 samples - * @param tmp N/2 samples - */ -void ff_imdct_calc(MDCTContext *s, - fixed32 *output, - fixed32 *input) -{ - int k, n8, n4, n2, n, j,scale; - const fixed32 *tcos = s->tcos; - const fixed32 *tsin = s->tsin; - const fixed32 *in1, *in2; - FFTComplex *z1 = (FFTComplex *)output; - FFTComplex *z2 = (FFTComplex *)input; - int revtabshift = 12 - s->nbits; - - n = 1 << s->nbits; - - n2 = n >> 1; - n4 = n >> 2; - n8 = n >> 3; - - - /* pre rotation */ - in1 = input; - in2 = input + n2 - 1; - - for(k = 0; k < n4; k++) - { - j=revtab0[k<fft, z1); - - /* post rotation + reordering */ - - for(k = 0; k < n4; k++) - { - CMUL(&z2[k].re, &z2[k].im, (z1[k].re), (z1[k].im), tcos[k], tsin[k]); - } - - for(k = 0; k < n8; k++) - { - fixed32 r1,r2,r3,r4,r1n,r2n,r3n; - - r1 = z2[n8 + k].im; - r1n = r1 * -1; - r2 = z2[n8-1-k].re; - r2n = r2 * -1; - r3 = z2[k+n8].re; - r3n = r3 * -1; - r4 = z2[n8-k-1].im; - - output[2*k] = r1n; - output[n2-1-2*k] = r1; - - output[2*k+1] = r2; - output[n2-1-2*k-1] = r2n; - - output[n2 + 2*k]= r3n; - output[n-1- 2*k]= r3n; - - output[n2 + 2*k+1]= r4; - output[n-2 - 2 * k] = r4; - } - - - - -} - /* * Helper functions for wma_window. @@ -524,7 +234,7 @@ static void init_coef_vlc(VLC *vlc, int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) { //WMADecodeContext *s = avctx->priv_data; - int i, m, j, flags1, flags2; + int i, flags1, flags2; fixed32 *window; uint8_t *extradata; fixed64 bps1; @@ -800,10 +510,7 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) } } - /* init MDCT */ - /*TODO: figure out how to fold this up into one array*/ - tcosarray[0] = tcos0; tcosarray[1] = tcos1; tcosarray[2] = tcos2; tcosarray[3] = tcos3;tcosarray[4] = tcos4; - tsinarray[0] = tsin0; tsinarray[1] = tsin1; tsinarray[2] = tsin2; tsinarray[3] = tsin3;tsinarray[4] = tsin4; + mdct_init_global(); s->mdct_tmp = mdct_tmp; /* temporary storage for imdct */ for(i = 0; i < s->nb_block_sizes; ++i) @@ -811,38 +518,6 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) ff_mdct_init(&s->mdct_ctx[i], s->frame_len_bits - i + 1, 1); } - { - int i, n; - fixed32 c1, s1, s2; - - n=1<<10; - s2 = 1 ? 1 : -1; - for(i=0;i<(n/2);++i) - { - fixed32 ifix = itofix32(i); - fixed32 nfix = itofix32(n); - fixed32 res = fixdiv32(ifix,nfix); - - s1 = fsincos(res<<16, &c1); - - exptab0[i].re = c1; - exptab0[i].im = s1*s2; - } - } - - /* init the MDCT bit reverse table here rather then in fft_init */ - - for(i=0;i<1024;i++) /*hard coded to a 2048 bit rotation*/ - { /*smaller sizes can reuse the largest*/ - m=0; - for(j=0;j<10;j++) - { - m |= ((i >> j) & 1) << (10-j-1); - } - - revtab0[i]=m; - } - /*ffmpeg uses malloc to only allocate as many window sizes as needed. However, we're really only interested in the worst case memory usage. * In the worst case you can have 5 window sizes, 128 doubling up 2048 * Smaller windows are handled differently. diff --git a/apps/codecs/libwma/wmafixed.h b/apps/codecs/libwma/wmafixed.h index db7529f681..0a6e8f61e0 100644 --- a/apps/codecs/libwma/wmafixed.h +++ b/apps/codecs/libwma/wmafixed.h @@ -102,3 +102,69 @@ fixed32 fixmul32(fixed32 x, fixed32 y); fixed32 fixmul32b(fixed32 x, fixed32 y); #endif + +#ifdef CPU_ARM +static inline +void CMUL(fixed32 *x, fixed32 *y, + fixed32 a, fixed32 b, + fixed32 t, fixed32 v) +{ + /* This version loses one bit of precision. Could be solved at the cost + * of 2 extra cycles if it becomes an issue. */ + int x1, y1, l; + asm( + "smull %[l], %[y1], %[b], %[t] \n" + "smlal %[l], %[y1], %[a], %[v] \n" + "rsb %[b], %[b], #0 \n" + "smull %[l], %[x1], %[a], %[t] \n" + "smlal %[l], %[x1], %[b], %[v] \n" + : [l] "=&r" (l), [x1]"=&r" (x1), [y1]"=&r" (y1), [b] "+r" (b) + : [a] "r" (a), [t] "r" (t), [v] "r" (v) + : "cc" + ); + *x = x1 << 1; + *y = y1 << 1; +} +#elif defined CPU_COLDFIRE +static inline +void CMUL(fixed32 *x, fixed32 *y, + fixed32 a, fixed32 b, + fixed32 t, fixed32 v) +{ + asm volatile ("mac.l %[a], %[t], %%acc0;" + "msac.l %[b], %[v], %%acc0;" + "mac.l %[b], %[t], %%acc1;" + "mac.l %[a], %[v], %%acc1;" + "movclr.l %%acc0, %[a];" + "move.l %[a], (%[x]);" + "movclr.l %%acc1, %[a];" + "move.l %[a], (%[y]);" + : [a] "+&r" (a) + : [x] "a" (x), [y] "a" (y), + [b] "r" (b), [t] "r" (t), [v] "r" (v) + : "cc", "memory"); +} +#else +// PJJ : reinstate macro +static inline +void CMUL(fixed32 *pre, + fixed32 *pim, + fixed32 are, + fixed32 aim, + fixed32 bre, + fixed32 bim) +{ + //int64_t x,y; + fixed32 _aref = are; + fixed32 _aimf = aim; + fixed32 _bref = bre; + fixed32 _bimf = bim; + fixed32 _r1 = fixmul32b(_bref, _aref); + fixed32 _r2 = fixmul32b(_bimf, _aimf); + fixed32 _r3 = fixmul32b(_bref, _aimf); + fixed32 _r4 = fixmul32b(_bimf, _aref); + *pre = _r1 - _r2; + *pim = _r3 + _r4; + +} +#endif -- cgit v1.2.3