4 files changed, 182 insertions, 186 deletions
diff --git a/apps/codecs/libwma/wmadec.h b/apps/codecs/libwma/wmadec.h
index aaa06ee2a6..d22e435304 100644
--- a/apps/codecs/libwma/wmadec.h
+++ b/apps/codecs/libwma/wmadec.h
@@ -64,6 +64,25 @@
 #endif
 #endif
+#define VLCBITS 7       /*7 is the lowest without glitching*/
+#define VLCMAX ((22+VLCBITS-1)/VLCBITS)
+#define EXPVLCBITS 7
+#define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS)
+#define HGAINVLCBITS 9
+#define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS)
+typedef struct CoefVLCTable
+{
+    int n;                               /* total number of codes */ 
+    const uint32_t *huffcodes;           /* VLC bit values */
+    const uint8_t *huffbits;             /* VLC bit size */
+    const uint16_t *levels;              /* table to build run/level tables */
+}
+CoefVLCTable;
 typedef struct WMADecodeContext
 {
    GetBitContext gb;
diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c
index aa81b5b81f..d1baca4c99 100644
--- a/apps/codecs/libwma/wmadeci.c
+++ b/apps/codecs/libwma/wmadeci.c
@@ -28,39 +28,29 @@
 #include "wmadec.h"
 #include "wmafixed.h"
 #include "bitstream.h"
+#include "wmadata.h"
-#define VLCBITS 7       /*7 is the lowest without glitching*/
-#define VLCMAX ((22+VLCBITS-1)/VLCBITS)
-#define EXPVLCBITS 7
-#define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS)
-#define HGAINVLCBITS 9
-#define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS)
-typedef struct CoefVLCTable
-{
-    int n; /* total number of codes */
-    const uint32_t *huffcodes; /* VLC bit values */
-    const uint8_t *huffbits;   /* VLC bit size */
-    const uint16_t *levels; /* table to build run/level tables */
-}
-CoefVLCTable;
 static void wma_lsp_to_curve_init(WMADecodeContext *s, int frame_len);
+inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
+                         const fixed32 *window, int n);
+inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, 
+                        const fixed32 *src1, int len);
+                        
+/*declarations of statically allocated variables used to remove malloc calls*/
 fixed32 coefsarray[MAX_CHANNELS][BLOCK_MAX_SIZE] IBSS_ATTR;
 /*decode and window into IRAM on targets with at least 80KB of codec IRAM*/
 fixed32 frame_out_buf[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] IBSS_ATTR_WMA_LARGE_IRAM;
-//static variables that replace malloced stuff
+/*MDCT reconstruction windows*/
-fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128];    //these are the MDCT reconstruction windows
+fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128];    
-uint16_t *runtabarray[2], *levtabarray[2];                                        //these are VLC lookup tables
+/*VLC lookup tables*/
+uint16_t *runtabarray[2], *levtabarray[2];                                        
-uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336];                //these could be made smaller since only one can be 1336
+/*these could be made smaller since only one can be 1336*/
+uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336];               
 #define VLCBUF1SIZE 4598
 #define VLCBUF2SIZE 3574
@@ -76,141 +66,6 @@ VLC_TYPE vlcbuf4[VLCBUF4SIZE][2];
-#include "wmadata.h" // PJJ
-/*
- * Helper functions for wma_window.
- *
- *
- */
-#ifdef CPU_ARM
-static inline
-void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
-                         const fixed32 *window, int n)
-{
-    /* Block sizes are always power of two */
-    asm volatile (
-        "0:"
-        "ldmia %[d]!, {r0, r1};"
-        "ldmia %[w]!, {r4, r5};"
-        /* consume the first data and window value so we can use those
-         * registers again */
-        "smull r8, r9, r0, r4;"
-        "ldmia %[dst], {r0, r4};"
-        "add   r0, r0, r9, lsl #1;"  /* *dst=*dst+(r9<<1)*/
-        "smull r8, r9, r1, r5;"
-        "add   r1, r4, r9, lsl #1;"
-        "stmia %[dst]!, {r0, r1};"
-        "subs  %[n], %[n], #2;"
-        "bne   0b;"
-        : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
-        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
-}
-static inline
-void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
-                         int len)
-{
-    /* Block sizes are always power of two */
-    asm volatile (
-        "add   %[s1], %[s1], %[n], lsl #2;"
-        "0:"
-        "ldmia %[s0]!, {r0, r1};"
-        "ldmdb %[s1]!, {r4, r5};"
-        "smull r8, r9, r0, r5;"
-        "mov   r0, r9, lsl #1;"
-        "smull r8, r9, r1, r4;"
-        "mov   r1, r9, lsl #1;"
-        "stmia %[dst]!, {r0, r1};"
-        "subs  %[n], %[n], #2;"
-        "bne   0b;"
-        : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
-        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
-}
-#elif defined(CPU_COLDFIRE)
-static inline
-void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
-                         const fixed32 *window, int n)
-{
-    /* Block sizes are always power of two. Smallest block is always way bigger
-     * than four too.*/
-    asm volatile (
-        "0:"
-        "movem.l (%[d]), %%d0-%%d3;"
-        "movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;"
-        "mac.l %%d0, %%d4, %%acc0;"
-        "mac.l %%d1, %%d5, %%acc1;"
-        "mac.l %%d2, %%a0, %%acc2;"
-        "mac.l %%d3, %%a1, %%acc3;"
-        "lea.l (16, %[d]), %[d];"
-        "lea.l (16, %[w]), %[w];"
-        "movclr.l %%acc0, %%d0;"
-        "movclr.l %%acc1, %%d1;"
-        "movclr.l %%acc2, %%d2;"
-        "movclr.l %%acc3, %%d3;"
-        "movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;"
-        "add.l %%d4, %%d0;"
-        "add.l %%d5, %%d1;"
-        "add.l %%a0, %%d2;"
-        "add.l %%a1, %%d3;"
-        "movem.l %%d0-%%d3, (%[dst]);"
-        "lea.l (16, %[dst]), %[dst];"
-        "subq.l #4, %[n];"
-        "jne 0b;"
-        : [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n)
-        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
-}
-static inline
-void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
-                         int len)
-{
-    /* Block sizes are always power of two. Smallest block is always way bigger
-     * than four too.*/
-    asm volatile (
-        "lea.l (-16, %[s1], %[n]*4), %[s1];"
-        "0:"
-        "movem.l (%[s0]), %%d0-%%d3;"
-        "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
-        "mac.l %%d0, %%a1, %%acc0;"
-        "mac.l %%d1, %%a0, %%acc1;"
-        "mac.l %%d2, %%d5, %%acc2;"
-        "mac.l %%d3, %%d4, %%acc3;"
-        "lea.l (16, %[s0]), %[s0];"
-        "lea.l (-16, %[s1]), %[s1];"
-        "movclr.l %%acc0, %%d0;"
-        "movclr.l %%acc1, %%d1;"
-        "movclr.l %%acc2, %%d2;"
-        "movclr.l %%acc3, %%d3;"
-        "movem.l %%d0-%%d3, (%[dst]);"
-        "lea.l (16, %[dst]), %[dst];"
-        "subq.l #4, %[n];"
-        "jne 0b;"
-        : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
-        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
-}
-#else
-static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
-}
-static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
-    int i;
-    src1 += len-1;
-    for(i=0; i<len; i++)
-        dst[i] = fixmul32b(src0[i], src1[-i]);
-}
-#endif
 /**
  * Apply MDCT window and add into output.
@@ -227,7 +82,9 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const
     int block_len, bsize, n;
     /* left part */
-     /*previous block was larger, so we'll use the size of the current block to set the window size*/
+     
+     /* previous block was larger, so we'll use the size of the current 
+      * block to set the window size*/
     if (s->block_len_bits <= s->prev_block_len_bits) {
         block_len = s->block_len;
         bsize = s->frame_len_bits - s->block_len_bits;
@@ -314,7 +171,7 @@ static void init_coef_vlc(VLC *vlc,
 int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
 {
-    //WMADecodeContext *s = avctx->priv_data;
+    
    int i, flags1, flags2;
    fixed32 *window;
    uint8_t *extradata;
@@ -608,10 +465,11 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
    }
    */
-    /*ffmpeg uses malloc to only allocate as many window sizes as needed.  However, we're really only interested in the worst case memory usage.
+    /* ffmpeg uses malloc to only allocate as many window sizes as needed.  
-    * In the worst case you can have 5 window sizes, 128 doubling up 2048
+    *  However, we're really only interested in the worst case memory usage.
-    * Smaller windows are handled differently.
+    *  In the worst case you can have 5 window sizes, 128 doubling up to 2048
-    * Since we don't have malloc, just statically allocate this
+    *  Smaller windows are handled differently.
+    *  Since we don't have malloc, just statically allocate this
    */
    fixed32 *temp[5];
    temp[0] = stat0;
@@ -626,19 +484,15 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
        int n, j;
        fixed32 alpha;
        n = 1 << (s->frame_len_bits - i);
-        //window = av_malloc(sizeof(fixed32) * n);
        window = temp[i];
+         
-        //fixed32 n2 = itofix32(n<<1);        //2x the window length
+         /* this calculates 0.5/(2*n) */
-        //alpha = fixdiv32(M_PI_F, n2);        //PI / (2x Window length) == PI<<(s->frame_len_bits - i+1)
+        alpha = (1<<15)>>(s->frame_len_bits - i+1);  
-        //alpha = M_PI_F>>(s->frame_len_bits - i+1);
-        alpha = (1<<15)>>(s->frame_len_bits - i+1);   /* this calculates 0.5/(2*n) */
        for(j=0;j<n;++j)
        {
            fixed32 j2 = itofix32(j) + 0x8000;
-            window[j] = fsincos(fixmul32(j2,alpha)<<16, 0);        //alpha between 0 and pi/2
+            /*alpha between 0 and pi/2*/
+            window[j] = fsincos(fixmul32(j2,alpha)<<16, 0); 
        }
        s->windows[i] = window;
@@ -663,6 +517,7 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
            s->noise_table = noisetable_exp;
        }
 #if 0
+/* We use a lookup table computed in advance, so no need to do this*/
        {
            unsigned int seed;
            fixed32 norm;
@@ -836,7 +691,9 @@ static void wma_lsp_to_curve(WMADecodeContext *s,
    *val_max_ptr = val_max;
 }
-/* decode exponents coded with LSP coefficients (same idea as Vorbis) */
+/* decode exponents coded with LSP coefficients (same idea as Vorbis)
+ * only used for low bitrate (< 16kbps) files
+ */
 static void decode_exp_lsp(WMADecodeContext *s, int ch)
 {
    fixed32 lsp_coefs[NB_LSP_COEFS];
@@ -858,7 +715,7 @@ static void decode_exp_lsp(WMADecodeContext *s, int ch)
                     lsp_coefs);
 }
-/* decode exponents coded with VLC codes */
+/* decode exponents coded with VLC codes - used for bitrate >= 32kbps*/
 static int decode_exp_vlc(WMADecodeContext *s, int ch)
 {
    int last_exp, n, code;
@@ -879,7 +736,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch)
    if (s->version == 1)        //wmav1 only
    {
        last_exp = get_bits(&s->gb, 5) + 10;
-        /* XXX: use a table */
        v = pow_10_to_yover16_ptr[last_exp];
        max_scale = v;
        n = *ptr++;
@@ -901,7 +758,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch)
        }
        /* NOTE: this offset is the same as MPEG4 AAC ! */
        last_exp += code - 60;
-        /* XXX: use a table */
        v = pow_10_to_yover16_ptr[last_exp];
        if (v > max_scale)
        {
@@ -1136,7 +993,7 @@ static int wma_decode_block(WMADecodeContext *s, int32_t *scratch_buffer)
            for(;;)
            {
                code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, VLCMAX);
-                //code = get_vlc(&s->gb, coef_vlc);
                if (code < 0)
                {
                    return -8;
@@ -1228,7 +1085,9 @@ static int wma_decode_block(WMADecodeContext *s, int32_t *scratch_buffer)
            if (s->use_noise_coding)
-            {
+            {   
+                /*This case is only used for low bitrates (typically less than 32kbps)*/
+                
                /*TODO:  mult should be converted to 32 bit to speed up noise coding*/
                mult = fixdiv64(pow_table[total_gain+20],Fixed32To64(s->max_exponent[ch]));
diff --git a/apps/codecs/libwma/wmafixed.c b/apps/codecs/libwma/wmafixed.c
index 3c96700076..5569309145 100644
--- a/apps/codecs/libwma/wmafixed.c
+++ b/apps/codecs/libwma/wmafixed.c
@@ -63,20 +63,139 @@ fixed64 Fixed32To64(fixed32 x)
  return (fixed64)x;
 }
 /*
-    Not performance senstitive code here
+ * Helper functions for wma_window.
+ *
+ *
+ */
-*/
+#ifdef CPU_ARM
+inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
+                         const fixed32 *window, int n)
+{
+    /* Block sizes are always power of two */
+    asm volatile (
+        "0:"
+        "ldmia %[d]!, {r0, r1};"
+        "ldmia %[w]!, {r4, r5};"
+        /* consume the first data and window value so we can use those
+         * registers again */
+        "smull r8, r9, r0, r4;"
+        "ldmia %[dst], {r0, r4};"
+        "add   r0, r0, r9, lsl #1;"  /* *dst=*dst+(r9<<1)*/
+        "smull r8, r9, r1, r5;"
+        "add   r1, r4, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"
+        "subs  %[n], %[n], #2;"
+        "bne   0b;"
+        : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
+        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two */
+    asm volatile (
+        "add   %[s1], %[s1], %[n], lsl #2;"
+        "0:"
+        "ldmia %[s0]!, {r0, r1};"
+        "ldmdb %[s1]!, {r4, r5};"
+        "smull r8, r9, r0, r5;"
+        "mov   r0, r9, lsl #1;"
+        "smull r8, r9, r1, r4;"
+        "mov   r1, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"
+        "subs  %[n], %[n], #2;"
+        "bne   0b;"
+        : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
+        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+#elif defined(CPU_COLDFIRE)
-fixed64 fixmul64byfixed(fixed64 x, fixed32 y)
+inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
+                         const fixed32 *window, int n)
 {
+    /* Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.*/
+    asm volatile (
+        "0:"
+        "movem.l (%[d]), %%d0-%%d3;"
+        "movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;"
+        "mac.l %%d0, %%d4, %%acc0;"
+        "mac.l %%d1, %%d5, %%acc1;"
+        "mac.l %%d2, %%a0, %%acc2;"
+        "mac.l %%d3, %%a1, %%acc3;"
+        "lea.l (16, %[d]), %[d];"
+        "lea.l (16, %[w]), %[w];"
+        "movclr.l %%acc0, %%d0;"
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;"
+        "add.l %%d4, %%d0;"
+        "add.l %%d5, %%d1;"
+        "add.l %%a0, %%d2;"
+        "add.l %%a1, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"
+        "jne 0b;"
+        : [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
-    //return x * y;
+inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
-     return (x * y);
+                         int len)
- // return (fixed64) fixmul32(Fixed32From64(x),y);
+{
+    /* Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.*/
+    asm volatile (
+        "lea.l (-16, %[s1], %[n]*4), %[s1];"
+        "0:"
+        "movem.l (%[s0]), %%d0-%%d3;"
+        "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
+        "mac.l %%d0, %%a1, %%acc0;"
+        "mac.l %%d1, %%a0, %%acc1;"
+        "mac.l %%d2, %%d5, %%acc2;"
+        "mac.l %%d3, %%d4, %%acc3;"
+        "lea.l (16, %[s0]), %[s0];"
+        "lea.l (-16, %[s1]), %[s1];"
+        "movclr.l %%acc0, %%d0;"
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"
+        "jne 0b;"
+        : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
 }
+#else
+inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
+}
+inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
+    int i;
+    src1 += len-1;
+    for(i=0; i<len; i++)
+        dst[i] = fixmul32b(src0[i], src1[-i]);
+}
+#endif
+/*
+    Not performance sensitive code here
+*/
 fixed32 fixdiv32(fixed32 x, fixed32 y)
 {
diff --git a/apps/codecs/libwma/wmafixed.h b/apps/codecs/libwma/wmafixed.h
index fb31cf8bfa..6b5137e044 100644
--- a/apps/codecs/libwma/wmafixed.h
+++ b/apps/codecs/libwma/wmafixed.h
@@ -47,7 +47,6 @@ fixed64 IntTo64(int x);
 int IntFrom64(fixed64 x);
 fixed32 Fixed32From64(fixed64 x);
 fixed64 Fixed32To64(fixed32 x);
-fixed64 fixmul64byfixed(fixed64 x, fixed32 y);
 fixed32 fixdiv32(fixed32 x, fixed32 y);
 fixed64 fixdiv64(fixed64 x, fixed64 y);
 fixed32 fixsqrt32(fixed32 x);

diff --git a/apps/codecs/libwma/wmadec.h b/apps/codecs/libwma/wmadec.h index aaa06ee2a6..d22e435304 100644 --- a/apps/codecs/libwma/wmadec.h +++ b/apps/codecs/libwma/wmadec.h
@@ -64,6 +64,25 @@
64	#endif	64	#endif
65	#endif	65	#endif
66		66
		67	#define VLCBITS 7 /*7 is the lowest without glitching*/
		68	#define VLCMAX ((22+VLCBITS-1)/VLCBITS)
		69
		70	#define EXPVLCBITS 7
		71	#define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS)
		72
		73	#define HGAINVLCBITS 9
		74	#define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS)
		75
		76
		77	typedef struct CoefVLCTable
		78	{
		79	int n; /* total number of codes */
		80	const uint32_t *huffcodes; /* VLC bit values */
		81	const uint8_t *huffbits; /* VLC bit size */
		82	const uint16_t *levels; /* table to build run/level tables */
		83	}
		84	CoefVLCTable;
		85
67	typedef struct WMADecodeContext	86	typedef struct WMADecodeContext
68	{	87	{
69	GetBitContext gb;	88	GetBitContext gb;


diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c index aa81b5b81f..d1baca4c99 100644 --- a/apps/codecs/libwma/wmadeci.c +++ b/apps/codecs/libwma/wmadeci.c
@@ -28,39 +28,29 @@
28	#include "wmadec.h"	28	#include "wmadec.h"
29	#include "wmafixed.h"	29	#include "wmafixed.h"
30	#include "bitstream.h"	30	#include "bitstream.h"
		31	#include "wmadata.h"
31		32
32		33
33	#define VLCBITS 7 /*7 is the lowest without glitching*/
34	#define VLCMAX ((22+VLCBITS-1)/VLCBITS)
35
36	#define EXPVLCBITS 7
37	#define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS)
38
39	#define HGAINVLCBITS 9
40	#define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS)
41
42
43	typedef struct CoefVLCTable
44	{
45	int n; /* total number of codes */
46	const uint32_t *huffcodes; /* VLC bit values */
47	const uint8_t *huffbits; /* VLC bit size */
48	const uint16_t *levels; /* table to build run/level tables */
49	}
50	CoefVLCTable;
51
52	static void wma_lsp_to_curve_init(WMADecodeContext *s, int frame_len);	34	static void wma_lsp_to_curve_init(WMADecodeContext *s, int frame_len);
		35	inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
		36	const fixed32 *window, int n);
		37	inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0,
		38	const fixed32 *src1, int len);
		39
		40	/*declarations of statically allocated variables used to remove malloc calls*/
53		41
54	fixed32 coefsarray[MAX_CHANNELS][BLOCK_MAX_SIZE] IBSS_ATTR;	42	fixed32 coefsarray[MAX_CHANNELS][BLOCK_MAX_SIZE] IBSS_ATTR;
55	/*decode and window into IRAM on targets with at least 80KB of codec IRAM*/	43	/*decode and window into IRAM on targets with at least 80KB of codec IRAM*/
56	fixed32 frame_out_buf[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] IBSS_ATTR_WMA_LARGE_IRAM;	44	fixed32 frame_out_buf[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] IBSS_ATTR_WMA_LARGE_IRAM;
57		45
58	//static variables that replace malloced stuff	46	/*MDCT reconstruction windows*/
59	fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128]; //these are the MDCT reconstruction windows	47	fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128];
60		48
61	uint16_t *runtabarray[2], *levtabarray[2]; //these are VLC lookup tables	49	/*VLC lookup tables*/
		50	uint16_t *runtabarray[2], *levtabarray[2];
62		51
63	uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336]; //these could be made smaller since only one can be 1336	52	/*these could be made smaller since only one can be 1336*/
		53	uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336];
64		54
65	#define VLCBUF1SIZE 4598	55	#define VLCBUF1SIZE 4598
66	#define VLCBUF2SIZE 3574	56	#define VLCBUF2SIZE 3574
@@ -76,141 +66,6 @@ VLC_TYPE vlcbuf4[VLCBUF4SIZE][2];
76		66
77		67
78		68
79	#include "wmadata.h" // PJJ
80
81
82
83	/*
84	* Helper functions for wma_window.
85	*
86	*
87	*/
88
89	#ifdef CPU_ARM
90	static inline
91	void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
92	const fixed32 *window, int n)
93	{
94	/* Block sizes are always power of two */
95	asm volatile (
96	"0:"
97	"ldmia %[d]!, {r0, r1};"
98	"ldmia %[w]!, {r4, r5};"
99	/* consume the first data and window value so we can use those
100	* registers again */
101	"smull r8, r9, r0, r4;"
102	"ldmia %[dst], {r0, r4};"
103	"add r0, r0, r9, lsl #1;" /* *dst=*dst+(r9<<1)*/
104	"smull r8, r9, r1, r5;"
105	"add r1, r4, r9, lsl #1;"
106	"stmia %[dst]!, {r0, r1};"
107	"subs %[n], %[n], #2;"
108	"bne 0b;"
109	: [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
110	: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
111	}
112
113	static inline
114	void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
115	int len)
116	{
117	/* Block sizes are always power of two */
118	asm volatile (
119	"add %[s1], %[s1], %[n], lsl #2;"
120	"0:"
121	"ldmia %[s0]!, {r0, r1};"
122	"ldmdb %[s1]!, {r4, r5};"
123	"smull r8, r9, r0, r5;"
124	"mov r0, r9, lsl #1;"
125	"smull r8, r9, r1, r4;"
126	"mov r1, r9, lsl #1;"
127	"stmia %[dst]!, {r0, r1};"
128	"subs %[n], %[n], #2;"
129	"bne 0b;"
130	: [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
131	: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
132	}
133
134	#elif defined(CPU_COLDFIRE)
135
136	static inline
137	void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
138	const fixed32 *window, int n)
139	{
140	/* Block sizes are always power of two. Smallest block is always way bigger
141	* than four too.*/
142	asm volatile (
143	"0:"
144	"movem.l (%[d]), %%d0-%%d3;"
145	"movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;"
146	"mac.l %%d0, %%d4, %%acc0;"
147	"mac.l %%d1, %%d5, %%acc1;"
148	"mac.l %%d2, %%a0, %%acc2;"
149	"mac.l %%d3, %%a1, %%acc3;"
150	"lea.l (16, %[d]), %[d];"
151	"lea.l (16, %[w]), %[w];"
152	"movclr.l %%acc0, %%d0;"
153	"movclr.l %%acc1, %%d1;"
154	"movclr.l %%acc2, %%d2;"
155	"movclr.l %%acc3, %%d3;"
156	"movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;"
157	"add.l %%d4, %%d0;"
158	"add.l %%d5, %%d1;"
159	"add.l %%a0, %%d2;"
160	"add.l %%a1, %%d3;"
161	"movem.l %%d0-%%d3, (%[dst]);"
162	"lea.l (16, %[dst]), %[dst];"
163	"subq.l #4, %[n];"
164	"jne 0b;"
165	: [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n)
166	: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
167	}
168
169	static inline
170	void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
171	int len)
172	{
173	/* Block sizes are always power of two. Smallest block is always way bigger
174	* than four too.*/
175	asm volatile (
176	"lea.l (-16, %[s1], %[n]*4), %[s1];"
177	"0:"
178	"movem.l (%[s0]), %%d0-%%d3;"
179	"movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
180	"mac.l %%d0, %%a1, %%acc0;"
181	"mac.l %%d1, %%a0, %%acc1;"
182	"mac.l %%d2, %%d5, %%acc2;"
183	"mac.l %%d3, %%d4, %%acc3;"
184	"lea.l (16, %[s0]), %[s0];"
185	"lea.l (-16, %[s1]), %[s1];"
186	"movclr.l %%acc0, %%d0;"
187	"movclr.l %%acc1, %%d1;"
188	"movclr.l %%acc2, %%d2;"
189	"movclr.l %%acc3, %%d3;"
190	"movem.l %%d0-%%d3, (%[dst]);"
191	"lea.l (16, %[dst]), %[dst];"
192	"subq.l #4, %[n];"
193	"jne 0b;"
194	: [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
195	: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
196	}
197
198	#else
199
200	static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
201	int i;
202	for(i=0; i<len; i++)
203	dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
204	}
205
206	static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
207	int i;
208	src1 += len-1;
209	for(i=0; i<len; i++)
210	dst[i] = fixmul32b(src0[i], src1[-i]);
211	}
212
213	#endif
214		69
215	/**	70	/**
216	* Apply MDCT window and add into output.	71	* Apply MDCT window and add into output.
@@ -227,7 +82,9 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const
227	int block_len, bsize, n;	82	int block_len, bsize, n;
228		83
229	/* left part */	84	/* left part */
230	/*previous block was larger, so we'll use the size of the current block to set the window size*/	85
		86	/* previous block was larger, so we'll use the size of the current
		87	* block to set the window size*/
231	if (s->block_len_bits <= s->prev_block_len_bits) {	88	if (s->block_len_bits <= s->prev_block_len_bits) {
232	block_len = s->block_len;	89	block_len = s->block_len;
233	bsize = s->frame_len_bits - s->block_len_bits;	90	bsize = s->frame_len_bits - s->block_len_bits;
@@ -314,7 +171,7 @@ static void init_coef_vlc(VLC *vlc,
314		171
315	int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)	172	int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
316	{	173	{
317	//WMADecodeContext *s = avctx->priv_data;	174
318	int i, flags1, flags2;	175	int i, flags1, flags2;
319	fixed32 *window;	176	fixed32 *window;
320	uint8_t *extradata;	177	uint8_t *extradata;
@@ -608,10 +465,11 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
608	}	465	}
609	*/	466	*/
610		467
611	/*ffmpeg uses malloc to only allocate as many window sizes as needed. However, we're really only interested in the worst case memory usage.	468	/* ffmpeg uses malloc to only allocate as many window sizes as needed.
612	* In the worst case you can have 5 window sizes, 128 doubling up 2048	469	* However, we're really only interested in the worst case memory usage.
613	* Smaller windows are handled differently.	470	* In the worst case you can have 5 window sizes, 128 doubling up 2048
614	* Since we don't have malloc, just statically allocate this	471	* Smaller windows are handled differently.
		472	* Since we don't have malloc, just statically allocate this
615	*/	473	*/
616	fixed32 *temp[5];	474	fixed32 *temp[5];
617	temp[0] = stat0;	475	temp[0] = stat0;
@@ -626,19 +484,15 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
626	int n, j;	484	int n, j;
627	fixed32 alpha;	485	fixed32 alpha;
628	n = 1 << (s->frame_len_bits - i);	486	n = 1 << (s->frame_len_bits - i);
629	//window = av_malloc(sizeof(fixed32) * n);
630	window = temp[i];	487	window = temp[i];
631		488
632	//fixed32 n2 = itofix32(n<<1); //2x the window length	489	/* this calculates 0.5/(2*n) */
633	//alpha = fixdiv32(M_PI_F, n2); //PI / (2x Window length) == PI<<(s->frame_len_bits - i+1)	490	alpha = (1<<15)>>(s->frame_len_bits - i+1);
634
635	//alpha = M_PI_F>>(s->frame_len_bits - i+1);
636	alpha = (1<<15)>>(s->frame_len_bits - i+1); /* this calculates 0.5/(2*n) */
637	for(j=0;j<n;++j)	491	for(j=0;j<n;++j)
638	{	492	{
639	fixed32 j2 = itofix32(j) + 0x8000;	493	fixed32 j2 = itofix32(j) + 0x8000;
640	window[j] = fsincos(fixmul32(j2,alpha)<<16, 0); //alpha between 0 and pi/2	494	/*alpha between 0 and pi/2*/
641		495	window[j] = fsincos(fixmul32(j2,alpha)<<16, 0);
642	}	496	}
643	s->windows[i] = window;	497	s->windows[i] = window;
644		498
@@ -663,6 +517,7 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx)
663	s->noise_table = noisetable_exp;	517	s->noise_table = noisetable_exp;
664	}	518	}
665	#if 0	519	#if 0
		520	/* We use a lookup table computed in advance, so no need to do this*/
666	{	521	{
667	unsigned int seed;	522	unsigned int seed;
668	fixed32 norm;	523	fixed32 norm;
@@ -836,7 +691,9 @@ static void wma_lsp_to_curve(WMADecodeContext *s,
836	*val_max_ptr = val_max;	691	*val_max_ptr = val_max;
837	}	692	}
838		693
839	/* decode exponents coded with LSP coefficients (same idea as Vorbis) */	694	/* decode exponents coded with LSP coefficients (same idea as Vorbis)
		695	* only used for low bitrate (< 16kbps) files
		696	*/
840	static void decode_exp_lsp(WMADecodeContext *s, int ch)	697	static void decode_exp_lsp(WMADecodeContext *s, int ch)
841	{	698	{
842	fixed32 lsp_coefs[NB_LSP_COEFS];	699	fixed32 lsp_coefs[NB_LSP_COEFS];
@@ -858,7 +715,7 @@ static void decode_exp_lsp(WMADecodeContext *s, int ch)
858	lsp_coefs);	715	lsp_coefs);
859	}	716	}
860		717
861	/* decode exponents coded with VLC codes */	718	/* decode exponents coded with VLC codes - used for bitrate >= 32kbps*/
862	static int decode_exp_vlc(WMADecodeContext *s, int ch)	719	static int decode_exp_vlc(WMADecodeContext *s, int ch)
863	{	720	{
864	int last_exp, n, code;	721	int last_exp, n, code;
@@ -879,7 +736,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch)
879	if (s->version == 1) //wmav1 only	736	if (s->version == 1) //wmav1 only
880	{	737	{
881	last_exp = get_bits(&s->gb, 5) + 10;	738	last_exp = get_bits(&s->gb, 5) + 10;
882	/* XXX: use a table */	739
883	v = pow_10_to_yover16_ptr[last_exp];	740	v = pow_10_to_yover16_ptr[last_exp];
884	max_scale = v;	741	max_scale = v;
885	n = *ptr++;	742	n = *ptr++;
@@ -901,7 +758,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch)
901	}	758	}
902	/* NOTE: this offset is the same as MPEG4 AAC ! */	759	/* NOTE: this offset is the same as MPEG4 AAC ! */
903	last_exp += code - 60;	760	last_exp += code - 60;
904	/* XXX: use a table */	761
905	v = pow_10_to_yover16_ptr[last_exp];	762	v = pow_10_to_yover16_ptr[last_exp];
906	if (v > max_scale)	763	if (v > max_scale)
907	{	764	{
@@ -1136,7 +993,7 @@ static int wma_decode_block(WMADecodeContext *s, int32_t *scratch_buffer)
1136	for(;;)	993	for(;;)
1137	{	994	{
1138	code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, VLCMAX);	995	code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, VLCMAX);
1139	//code = get_vlc(&s->gb, coef_vlc);	996
1140	if (code < 0)	997	if (code < 0)
1141	{	998	{
1142	return -8;	999	return -8;
@@ -1228,7 +1085,9 @@ static int wma_decode_block(WMADecodeContext *s, int32_t *scratch_buffer)
1228		1085
1229		1086
1230	if (s->use_noise_coding)	1087	if (s->use_noise_coding)
1231	{	1088	{
		1089	/*This case is only used for low bitrates (typically less than 32kbps)*/
		1090
1232	/*TODO: mult should be converted to 32 bit to speed up noise coding*/	1091	/*TODO: mult should be converted to 32 bit to speed up noise coding*/
1233		1092
1234	mult = fixdiv64(pow_table[total_gain+20],Fixed32To64(s->max_exponent[ch]));	1093	mult = fixdiv64(pow_table[total_gain+20],Fixed32To64(s->max_exponent[ch]));


diff --git a/apps/codecs/libwma/wmafixed.c b/apps/codecs/libwma/wmafixed.c index 3c96700076..5569309145 100644 --- a/apps/codecs/libwma/wmafixed.c +++ b/apps/codecs/libwma/wmafixed.c
@@ -63,20 +63,139 @@ fixed64 Fixed32To64(fixed32 x)
63	return (fixed64)x;	63	return (fixed64)x;
64	}	64	}
65		65
		66
66	/*	67	/*
67	Not performance senstitive code here	68	* Helper functions for wma_window.
		69	*
		70	*
		71	*/
68		72
69	*/	73	#ifdef CPU_ARM
		74	inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
		75	const fixed32 *window, int n)
		76	{
		77	/* Block sizes are always power of two */
		78	asm volatile (
		79	"0:"
		80	"ldmia %[d]!, {r0, r1};"
		81	"ldmia %[w]!, {r4, r5};"
		82	/* consume the first data and window value so we can use those
		83	* registers again */
		84	"smull r8, r9, r0, r4;"
		85	"ldmia %[dst], {r0, r4};"
		86	"add r0, r0, r9, lsl #1;" /* dst=dst+(r9<<1)*/
		87	"smull r8, r9, r1, r5;"
		88	"add r1, r4, r9, lsl #1;"
		89	"stmia %[dst]!, {r0, r1};"
		90	"subs %[n], %[n], #2;"
		91	"bne 0b;"
		92	: [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
		93	: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
		94	}
		95
		96	inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
		97	int len)
		98	{
		99	/* Block sizes are always power of two */
		100	asm volatile (
		101	"add %[s1], %[s1], %[n], lsl #2;"
		102	"0:"
		103	"ldmia %[s0]!, {r0, r1};"
		104	"ldmdb %[s1]!, {r4, r5};"
		105	"smull r8, r9, r0, r5;"
		106	"mov r0, r9, lsl #1;"
		107	"smull r8, r9, r1, r4;"
		108	"mov r1, r9, lsl #1;"
		109	"stmia %[dst]!, {r0, r1};"
		110	"subs %[n], %[n], #2;"
		111	"bne 0b;"
		112	: [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
		113	: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
		114	}
70		115
		116	#elif defined(CPU_COLDFIRE)
71		117
72	fixed64 fixmul64byfixed(fixed64 x, fixed32 y)	118	inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
		119	const fixed32 *window, int n)
73	{	120	{
		121	/* Block sizes are always power of two. Smallest block is always way bigger
		122	* than four too.*/
		123	asm volatile (
		124	"0:"
		125	"movem.l (%[d]), %%d0-%%d3;"
		126	"movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;"
		127	"mac.l %%d0, %%d4, %%acc0;"
		128	"mac.l %%d1, %%d5, %%acc1;"
		129	"mac.l %%d2, %%a0, %%acc2;"
		130	"mac.l %%d3, %%a1, %%acc3;"
		131	"lea.l (16, %[d]), %[d];"
		132	"lea.l (16, %[w]), %[w];"
		133	"movclr.l %%acc0, %%d0;"
		134	"movclr.l %%acc1, %%d1;"
		135	"movclr.l %%acc2, %%d2;"
		136	"movclr.l %%acc3, %%d3;"
		137	"movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;"
		138	"add.l %%d4, %%d0;"
		139	"add.l %%d5, %%d1;"
		140	"add.l %%a0, %%d2;"
		141	"add.l %%a1, %%d3;"
		142	"movem.l %%d0-%%d3, (%[dst]);"
		143	"lea.l (16, %[dst]), %[dst];"
		144	"subq.l #4, %[n];"
		145	"jne 0b;"
		146	: [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n)
		147	: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
		148	}
74		149
75	//return x * y;	150	inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
76	return (x * y);	151	int len)
77	// return (fixed64) fixmul32(Fixed32From64(x),y);	152	{
		153	/* Block sizes are always power of two. Smallest block is always way bigger
		154	* than four too.*/
		155	asm volatile (
		156	"lea.l (-16, %[s1], %[n]*4), %[s1];"
		157	"0:"
		158	"movem.l (%[s0]), %%d0-%%d3;"
		159	"movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
		160	"mac.l %%d0, %%a1, %%acc0;"
		161	"mac.l %%d1, %%a0, %%acc1;"
		162	"mac.l %%d2, %%d5, %%acc2;"
		163	"mac.l %%d3, %%d4, %%acc3;"
		164	"lea.l (16, %[s0]), %[s0];"
		165	"lea.l (-16, %[s1]), %[s1];"
		166	"movclr.l %%acc0, %%d0;"
		167	"movclr.l %%acc1, %%d1;"
		168	"movclr.l %%acc2, %%d2;"
		169	"movclr.l %%acc3, %%d3;"
		170	"movem.l %%d0-%%d3, (%[dst]);"
		171	"lea.l (16, %[dst]), %[dst];"
		172	"subq.l #4, %[n];"
		173	"jne 0b;"
		174	: [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
		175	: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
78	}	176	}
79		177
		178	#else
		179
		180	inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
		181	int i;
		182	for(i=0; i<len; i++)
		183	dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
		184	}
		185
		186	inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
		187	int i;
		188	src1 += len-1;
		189	for(i=0; i<len; i++)
		190	dst[i] = fixmul32b(src0[i], src1[-i]);
		191	}
		192
		193	#endif
		194
		195	/*
		196	Not performance senstitive code here
		197
		198	*/
80		199
81	fixed32 fixdiv32(fixed32 x, fixed32 y)	200	fixed32 fixdiv32(fixed32 x, fixed32 y)
82	{	201	{


diff --git a/apps/codecs/libwma/wmafixed.h b/apps/codecs/libwma/wmafixed.h
index fb31cf8bfa..6b5137e044 100644
--- a/apps/codecs/libwma/wmafixed.h
+++ b/apps/codecs/libwma/wmafixed.h
@@ -47,7 +47,6 @@ fixed64 IntTo64(int x);
47	int IntFrom64(fixed64 x);	47	int IntFrom64(fixed64 x);
48	fixed32 Fixed32From64(fixed64 x);	48	fixed32 Fixed32From64(fixed64 x);
49	fixed64 Fixed32To64(fixed32 x);	49	fixed64 Fixed32To64(fixed32 x);
50	fixed64 fixmul64byfixed(fixed64 x, fixed32 y);
51	fixed32 fixdiv32(fixed32 x, fixed32 y);	50	fixed32 fixdiv32(fixed32 x, fixed32 y);
52	fixed64 fixdiv64(fixed64 x, fixed64 y);	51	fixed64 fixdiv64(fixed64 x, fixed64 y);
53	fixed32 fixsqrt32(fixed32 x);	52	fixed32 fixsqrt32(fixed32 x);