From 87d59ab56c30eadc4691a41ba7540cca868c9b50 Mon Sep 17 00:00:00 2001
From: Andree Buschmann <AndreeBuschmann@t-online.de>
Date: Mon, 26 Jul 2010 21:43:07 +0000
Subject: Submit part of FS#11498. Major speedup for WMA Professional on ARM
 and Coldfire CPUs. Introduce asm routines for multiplications, move arrays
 with major impact on decoding speed to IRAM. Speeds up decoding by 25% on
 PP5022 and 34% on mcf5249.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27582 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libwmapro/mdct_tables.c |   9 +-
 apps/codecs/libwmapro/wmapro_math.h | 192 +++++++++++++++++++++++++++++++++---
 apps/codecs/libwmapro/wmaprodec.c   |  43 +++++---
 apps/codecs/libwmapro/wmaprodec.h   |  26 +++++
 4 files changed, 240 insertions(+), 30 deletions(-)

diff --git a/apps/codecs/libwmapro/mdct_tables.c b/apps/codecs/libwmapro/mdct_tables.c
index b87d1b4045..dd8b2a451c 100644
--- a/apps/codecs/libwmapro/mdct_tables.c
+++ b/apps/codecs/libwmapro/mdct_tables.c
@@ -1,3 +1,4 @@
+#include "wmaprodec.h"
 #include <inttypes.h>
   
 /* Tables for fixed-point trig tables for windowing and mdct */
@@ -689,7 +690,7 @@ const int32_t sine_4096[] = {
     0x8000078F, 0x800003DC, 0x80000164, 0x80000028
 };
 
-const int32_t sine_2048[] = {
+const int32_t sine_2048[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = {
     0xFFF36F03, 0xFFDA4D09, 0xFFC12B0F, 0xFFA80918, 0xFF8EE725, 0xFF75C536, 
     0xFF5CA34B, 0xFF438168, 0xFF2A5F8C, 0xFF113DB7, 0xFEF81BEE, 0xFEDEFA2D, 
     0xFEC5D876, 0xFEACB6CE, 0xFE939531, 0xFE7A73A2, 0xFE615224, 0xFE4830B4, 
@@ -1208,7 +1209,7 @@ const int32_t sine_1024[] = {
     0x800078E8, 0x80003DB0, 0x80001636, 0x80000279
 };
 
-const int32_t sine_512[] = {
+const int32_t sine_512[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = {
     0xFFCDBC0C, 0xFF693440, 0xFF04ACD1, 0xFEA025FE, 0xFE3BA003, 0xFDD71B21, 
     0xFD72978F, 0xFD0E1595, 0xFCA9956C, 0xFC45174E, 0xFBE09B85, 0xFB7C223F, 
     0xFB17ABC2, 0xFAB33854, 0xFA4EC823, 0xF9EA5B74, 0xF985F28F, 0xF9218DA0, 
@@ -1297,7 +1298,7 @@ const int32_t sine_512[] = {
     0x800058D4, 0x800009DF
 };
 
-const int32_t sine_256[] = {
+const int32_t sine_256[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = {
     0xFF9B781E, 0xFED26950, 0xFE095D6B, 0xFD40565E, 0xFC775617, 0xFBAE5E8D, 
     0xFAE571A2, 0xFA1C9157, 0xF953BF94, 0xF88AFE40, 0xF7C24F61, 0xF6F9B4C9, 
     0xF6313075, 0xF568C463, 0xF4A07264, 0xF3D83C74, 0xF3102493, 0xF2482C8D, 
@@ -1343,7 +1344,7 @@ const int32_t sine_256[] = {
     0x80078E60, 0x8003DAF2, 0x8001634E, 0x8000277B
 };
 
-const int32_t sine_128[] = {
+const int32_t sine_128[] ICONST_ATTR = {
     0xFF36F079, 0xFDA4D929, 0xFC12D91C, 0xFA80FFCF, 0xF8EF5CBC, 0xF75DFF6C, 
     0xF5CCF73F, 0xF43C53CB, 0xF2AC2474, 0xF11C7895, 0xEF8D5FC8, 0xEDFEE931, 
     0xEC71244A, 0xEAE4208A, 0xE957ED00, 0xE7CC9912, 0xE642341D, 0xE4B8CD16, 
diff --git a/apps/codecs/libwmapro/wmapro_math.h b/apps/codecs/libwmapro/wmapro_math.h
index 71cc3d33d7..30b9a987ee 100644
--- a/apps/codecs/libwmapro/wmapro_math.h
+++ b/apps/codecs/libwmapro/wmapro_math.h
@@ -3,21 +3,187 @@
 
 #include <inttypes.h>
 
+/* rockbox: not used
 #define fixtof16(x)       (float)((float)(x) / (float)(1 << 16))
 #define fixtof31(x)       (float)((float)(x) / (float)(1 << 31))
 #define ftofix16(x)       ((int32_t)((x) * (float)(1 << 16) + ((x) < 0 ? -0.5:0.5)))
 #define ftofix31(x)       ((int32_t)((x) * (float)(1 << 31) + ((x) < 0 ? -0.5:0.5)))
+*/
 
-static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt)
-{
-    int64_t temp;
-    temp = x;
-    temp *= y;
+#if defined(CPU_ARM)
+    /* Calculates: result = (X*Y)>>Z (low 32 bits). Note: Z is evaluated twice. */
+    #define fixmulshift(X,Y,Z) \
+    ({ \
+        int32_t lo; \
+        int32_t hi; \
+        asm volatile ( \
+            "smull %[lo], %[hi], %[x], %[y] \n\t"   /* multiply */ \
+            "mov   %[lo], %[lo], lsr %[shr] \n\t"   /* lo >>= Z */ \
+            "orr   %[lo], %[lo], %[hi], lsl %[shl]" /* lo |= (hi << (32-Z)) */ \
+            : [lo]"=&r"(lo), [hi]"=&r"(hi) \
+            : [x]"r"(X), [y]"r"(Y), [shr]"r"(Z), [shl]"r"(32-(Z))); \
+        lo; \
+    })
+     
+    /* Calculates: result = (X*Y)>>16, i.e. bits 16..47 of the 64-bit product */
+    #define fixmul16(X,Y) \
+     ({ \
+        int32_t lo; \
+        int32_t hi; \
+        asm volatile ( \
+           "smull %[lo], %[hi], %[x], %[y] \n\t" /* hi:lo = x*y (signed) */ \
+           "mov   %[lo], %[lo], lsr #16    \n\t" /* lo >>= 16 (logical) */ \
+           "orr   %[lo], %[lo], %[hi], lsl #16"  /* lo |= (hi << 16) */ \
+           : [lo]"=&r"(lo), [hi]"=&r"(hi) \
+           : [x]"r"(X), [y]"r"(Y)); \
+        lo; \
+     })
+     
+    /* Calculates: result = (X*Y)>>24, i.e. bits 24..55 of the 64-bit product */
+    #define fixmul24(X,Y) \
+     ({ \
+        int32_t lo; \
+        int32_t hi; \
+        asm volatile ( \
+           "smull %[lo], %[hi], %[x], %[y] \n\t" /* hi:lo = x*y (signed) */ \
+           "mov   %[lo], %[lo], lsr #24    \n\t" /* lo >>= 24 (logical) */ \
+           "orr   %[lo], %[lo], %[hi], lsl #8"   /* lo |= (hi << 8) */ \
+           : [lo]"=&r"(lo), [hi]"=&r"(hi) \
+           : [x]"r"(X), [y]"r"(Y)); \
+        lo; \
+     })
+     
+    /* Calculates: result = (X*Y)>>31, i.e. bits 31..62 of the 64-bit product */
+    #define fixmul31(X,Y) \
+     ({ \
+        int32_t lo; \
+        int32_t hi; \
+        asm volatile ( \
+           "smull %[lo], %[hi], %[x], %[y] \n\t" /* hi:lo = x*y (signed) */ \
+           "mov   %[lo], %[lo], lsr #31    \n\t" /* lo >>= 31 (logical) */ \
+           "orr   %[lo], %[lo], %[hi], lsl #1"   /* lo |= (hi << 1) */ \
+           : [lo]"=&r"(lo), [hi]"=&r"(hi) \
+           : [x]"r"(X), [y]"r"(Y)); \
+        lo; \
+     })
+#elif defined(CPU_COLDFIRE)
+    /* Calculates: result = (X*Y)>>Z. X is copied: the asm overwrites %[x]. */
+    #define fixmulshift(X,Y,Z) \
+    ({ \
+        int32_t t1, t2; \
+        int32_t xlo = (X); /* becomes the low half of the product */ \
+        asm volatile ( \
+            "mac.l   %[x],%[y],%%acc0\n\t" /* multiply */ \
+            "mulu.l  %[y],%[x]       \n\t" /* get lower half, avoid emac stall */ \
+            "movclr.l %%acc0,%[t1]   \n\t" /* get higher half */ \
+            "moveq.l #31,%[t2]       \n\t" \
+            "sub.l   %[sh],%[t2]     \n\t" /* t2 = 31 - shift */ \
+            "ble.s   1f              \n\t" \
+            "asl.l   %[t2],%[t1]     \n\t" /* hi <<= 31 - shift */ \
+            "lsr.l   %[sh],%[x]      \n\t" /* (unsigned)lo >>= shift */ \
+            "or.l    %[x],%[t1]      \n\t" /* combine result */ \
+            "bra.s   2f              \n\t" \
+         "1:                         \n\t" \
+            "neg.l   %[t2]           \n\t" /* t2 = shift - 31 */ \
+            "asr.l   %[t2],%[t1]     \n\t" /* hi >>= t2 */ \
+         "2:                         \n" \
+        : [t1]"=&d"(t1), [t2]"=&d"(t2), [x]"+&d"(xlo) \
+        : [y] "d"((Y)), [sh]"d"((Z))); \
+        t1; \
+    })
 
-    temp >>= shamt;
+    /* Calculates: result = (X*Y)>>16. X is copied: the asm overwrites %[x]. */
+    #define fixmul16(X,Y) \
+    ({ \
+        int32_t t1, t2, xlo = (X); /* xlo becomes the low product half */ \
+        asm volatile ( \
+            "mac.l   %[x],%[y],%%acc0\n\t" /* multiply */ \
+            "mulu.l  %[y],%[x]       \n\t" /* get lower half, avoid emac stall */ \
+            "movclr.l %%acc0,%[t1]   \n\t" /* get higher half */ \
+            "moveq.l #15,%[t2]       \n\t" \
+            "asl.l   %[t2],%[t1]     \n\t" /* hi <<= 15, plus one free */ \
+            "moveq.l #16,%[t2]       \n\t" \
+            "lsr.l   %[t2],%[x]      \n\t" /* (unsigned)lo >>= 16 */ \
+            "or.l    %[x],%[t1]      \n\t" /* combine result */ \
+            : [t1]"=&d"(t1), [t2]"=&d"(t2), [x]"+d"(xlo) \
+            : [y] "d" ((Y))); \
+        t1; \
+    })
+    
+    /* Calculates: result = (X*Y)>>24. X is copied: the asm overwrites %[x]. */
+    #define fixmul24(X,Y) \
+    ({ \
+        int32_t t1, t2, xlo = (X); /* xlo becomes the low product half */ \
+        asm volatile ( \
+            "mac.l   %[x],%[y],%%acc0\n\t" /* multiply */ \
+            "mulu.l  %[y],%[x]       \n\t" /* get lower half, avoid emac stall */ \
+            "movclr.l %%acc0,%[t1]   \n\t" /* get higher half */ \
+            "moveq.l #7,%[t2]        \n\t" \
+            "asl.l   %[t2],%[t1]     \n\t" /* hi <<= 7, plus one free */ \
+            "moveq.l #24,%[t2]       \n\t" \
+            "lsr.l   %[t2],%[x]      \n\t" /* (unsigned)lo >>= 24 */ \
+            "or.l    %[x],%[t1]      \n\t" /* combine result */ \
+            : [t1]"=&d"(t1), [t2]"=&d"(t2), [x]"+d"(xlo) \
+            : [y] "d" ((Y))); \
+        t1; \
+    })
 
-    return (int32_t)temp;
-}
+    /* Calculates: result = (X*Y)>>31 (EMAC is set to fractional mode in decode_init) */
+    #define fixmul31(X,Y) \
+    ({ \
+       int32_t t; \
+       asm volatile ( \
+          "mac.l %[x], %[y], %%acc0\n\t"   /* multiply */ \
+          "movclr.l %%acc0, %[t]\n\t"      /* get higher half as result */ \
+          : [t] "=d" (t) \
+          : [x] "r" ((X)), [y] "r" ((Y))); \
+       t; \
+    })
+#else
+    /* Portable fallback: result = (x*y)>>shamt, using a 64-bit product. */
+    static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt)
+    {
+        /* Widen one operand first so the multiply is done in 64 bits. */
+        const int64_t product = (int64_t)x * y;
+        const int64_t shifted = product >> shamt;
+
+        /* Only the low 32 bits of the shifted product are returned. */
+        return (int32_t)shifted;
+    }
+    
+    /* Portable fallback: result = (x*y)>>31, using a 64-bit product. */
+    static inline int32_t fixmul31(int32_t x, int32_t y)
+    {
+        /* Widen one operand first so the multiply is done in 64 bits. */
+        const int64_t product = (int64_t)x * y;
+        const int64_t shifted = product >> 31;
+
+        /* Only the low 32 bits of the shifted product are returned. */
+        return (int32_t)shifted;
+    }
+    
+    /* Portable fallback: result = (x*y)>>24, using a 64-bit product. */
+    static inline int32_t fixmul24(int32_t x, int32_t y)
+    {
+        /* Widen one operand first so the multiply is done in 64 bits. */
+        const int64_t product = (int64_t)x * y;
+        const int64_t shifted = product >> 24;
+
+        /* Only the low 32 bits of the shifted product are returned. */
+        return (int32_t)shifted;
+    }
+    
+    /* Portable fallback: result = (x*y)>>16, using a 64-bit product. */
+    static inline int32_t fixmul16(int32_t x, int32_t y)
+    {
+        /* Widen one operand first so the multiply is done in 64 bits. */
+        const int64_t product = (int64_t)x * y;
+        const int64_t shifted = product >> 16;
+
+        /* Only the low 32 bits of the shifted product are returned. */
+        return (int32_t)shifted;
+    }
+#endif /* CPU_COLDFIRE, CPU_ARM */
 
 #ifdef CPU_COLDFIRE
 static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0, 
@@ -62,18 +228,18 @@ static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
         int32_t s1 = src1[j];
         int32_t wi = -win[i];
         int32_t wj = -win[j];
-        dst[i] = fixmulshift(s0,wj,31) - fixmulshift(s1,wi,31);
-        dst[j] = fixmulshift(s0,wi,31) + fixmulshift(s1,wj,31);
+        dst[i] = fixmul31(s0, wj) - fixmul31(s1, wi);
+        dst[j] = fixmul31(s0, wi) + fixmul31(s1, wj);
     }
 }
 #endif
 
-static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, int32_t mul,
-                                        int len, int shift)
+static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, 
+                                        int32_t mul, int len)
 {
     int i;
     for(i=0; i<len; i++)
-        dst[i] = fixmulshift(src[i],mul,shift);   
+        dst[i] = fixmul24(src[i], mul);   
 }
 
 static inline int av_clip(int a, int amin, int amax)
diff --git a/apps/codecs/libwmapro/wmaprodec.c b/apps/codecs/libwmapro/wmaprodec.c
index b7879a2644..1f65157f64 100644
--- a/apps/codecs/libwmapro/wmaprodec.c
+++ b/apps/codecs/libwmapro/wmaprodec.c
@@ -133,6 +133,7 @@
 #define WMAPRO_BLOCK_MAX_BITS 12                                           ///< log2 of max block size
 #define WMAPRO_BLOCK_MAX_SIZE (1 << WMAPRO_BLOCK_MAX_BITS)                 ///< maximum block size
 #define WMAPRO_BLOCK_SIZES    (WMAPRO_BLOCK_MAX_BITS - BLOCK_MIN_BITS + 1) ///< possible block sizes
+#define WMAPRO_OUT_BUF_SIZE   (WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2)
 
 
 #define VLCBITS            9
@@ -151,6 +152,12 @@ static VLC              vec1_vlc;         ///< 1 coefficient per symbol
 static VLC              coef_vlc[2];      ///< coefficient run length vlc codes
 //static float            sin64[33];        ///< sinus table for decorrelation
 
+/* Global defined arrays to allow IRAM usage for some models. */
+static int32_t g_tmp[WMAPRO_BLOCK_MAX_SIZE] IBSS_ATTR_WMAPRO_LARGE_IRAM;
+static int32_t g_out_ch0[WMAPRO_OUT_BUF_SIZE] IBSS_ATTR;
+static int32_t g_out_ch1[WMAPRO_OUT_BUF_SIZE] IBSS_ATTR_WMAPRO_LARGE_IRAM;
+static int32_t g_out_multichannel[WMAPRO_MAX_CHANNELS-2][WMAPRO_OUT_BUF_SIZE];
+
 /**
  * @brief frame specific decoder context for a single channel
  */
@@ -171,8 +178,8 @@ typedef struct {
     int8_t   scale_factor_idx;                        ///< index for the transmitted scale factor values (used for resampling)
     int*     scale_factors;                           ///< pointer to the scale factor values used for decoding
     uint8_t  table_idx;                               ///< index in sf_offsets for the scale factor reference block
-    int32_t*   coeffs;                                  ///< pointer to the subframe decode buffer
-    DECLARE_ALIGNED(16, int32_t, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
+    int32_t* coeffs;                                  ///< pointer to the subframe decode buffer
+    int32_t* out;                                     ///< output buffer
 } WMAProChannelCtx;
 
 /**
@@ -195,7 +202,7 @@ typedef struct WMAProDecodeCtx {
     uint8_t          frame_data[MAX_FRAMESIZE +
                       FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
     PutBitContext    pb;                            ///< context for filling the frame_data buffer
-    DECLARE_ALIGNED(16, int32_t, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT input buffer
+    int32_t*         tmp;                           ///< IMDCT input buffer
 
     /* frame size dependent frame information (set during initialization) */
     uint32_t         decode_flags;                  ///< used compression features
@@ -229,8 +236,8 @@ typedef struct WMAProDecodeCtx {
     uint32_t         frame_num;                     ///< current frame number
     GetBitContext    gb;                            ///< bitstream reader context
     int              buf_bit_size;                  ///< buffer size in bits
-    int32_t*           samples;
-    int32_t*           samples_end;                   ///< maximum samplebuffer pointer
+    int32_t*         samples;
+    int32_t*         samples_end;                   ///< maximum samplebuffer pointer
     uint8_t          drc_gain;                      ///< gain for the DRC tool
     int8_t           skip_frame;                    ///< skip output step
     int8_t           parsed_all_subframes;          ///< all subframes decoded?
@@ -287,6 +294,15 @@ int decode_init(asf_waveformatex_t *wfx)
     int i;
     int log2_max_num_subframes;
     int num_possible_block_sizes;
+    
+    /* Use globally defined array. Allows IRAM usage for models with large IRAM. */
+    s->tmp = g_tmp;
+    
+    /* Use globally defined arrays. Allows IRAM usage for up to 2 channels. */
+    s->channel[0].out = g_out_ch0;
+    s->channel[1].out = g_out_ch1;
+    for (i=2; i<WMAPRO_MAX_CHANNELS; ++i)
+        s->channel[i].out = g_out_multichannel[i-2];
 
 #if defined(CPU_COLDFIRE)
     coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
@@ -657,9 +673,9 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s,
                 chgroup->decorrelation_matrix[y + i * chgroup->num_channels] =
                                                (v1 * cosv) + (v2 * sinv);
                 chgroup->fixdecorrelation_matrix[y + x * chgroup->num_channels] =
-                                               fixmulshift(f1, fixsinv, 31) - fixmulshift(f2, fixcosv, 31);
+                                               fixmul31(f1, fixsinv) - fixmul31(f2, fixcosv);
                 chgroup->fixdecorrelation_matrix[y + i * chgroup->num_channels] =
-                                               fixmulshift(f1, fixcosv, 31) + fixmulshift(f2, fixsinv, 31);
+                                               fixmul31(f1, fixcosv) + fixmul31(f2, fixsinv);
                                                
             }
         }
@@ -1009,20 +1025,21 @@ static void inverse_channel_transform(WMAProDecodeCtx *s)
                             data_ptr = data;
                                 
                             while (data_ptr < data_end)
-                                sum += fixmulshift(*data_ptr++, *mat++, 16);
+                                sum += fixmul16(*data_ptr++, *mat++);
 
                             (*ch)[y] = sum;
                         }
                     }
                 } else if (s->num_channels == 2) {
 
+                    /* Scale with sqrt(2). 0x016A09E6 = (sqrt(2)*(1<<24)) */
                     int len = FFMIN(sfb[1], s->subframe_len) - sfb[0];
                     vector_fixmul_scalar(ch_data[0] + sfb[0],
                                          ch_data[0] + sfb[0],
-                                         0x00016A00, len,16);
+                                         0x016A09E6, len);
                     vector_fixmul_scalar(ch_data[1] + sfb[0],
-		                                 ch_data[1] + sfb[0],
-		                                 0x00016A00, len,16);
+                                         ch_data[1] + sfb[0],
+                                         0x016A09E6, len);
 
                 }
             }
@@ -1049,7 +1066,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
             winlen = s->subframe_len;
         }
   
-        window = sine_windows[av_log2(winlen) - BLOCK_MIN_BITS];     
+        window = sine_windows[av_log2(winlen) - BLOCK_MIN_BITS];
             
         winlen >>= 1;
 
@@ -1261,7 +1278,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
                    
                 vector_fixmul_scalar(s->tmp+start, 
                                      s->channel[c].coeffs + start,
-                                     quant, end-start, 24);
+                                     quant, end-start);
 
                
             }
diff --git a/apps/codecs/libwmapro/wmaprodec.h b/apps/codecs/libwmapro/wmaprodec.h
index 40f3a60db6..3203dda583 100644
--- a/apps/codecs/libwmapro/wmaprodec.h
+++ b/apps/codecs/libwmapro/wmaprodec.h
@@ -1,5 +1,31 @@
+#include "codeclib.h"
 #include "../libasf/asf.h"
 
+#if   (CONFIG_CPU == MCF5250) || defined(CPU_S5L870X)
+/* Enough IRAM but performance suffers with ICODE_ATTR. */
+#define IBSS_ATTR_WMAPRO_LARGE_IRAM   IBSS_ATTR
+#define ICODE_ATTR_WMAPRO_LARGE_IRAM
+#define ICONST_ATTR_WMAPRO_LARGE_IRAM ICONST_ATTR
+#define ICONST_ATTR_WMAPRO_WIN_VS_TMP
+
+#elif (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024)
+/* Enough IRAM to move additional data and code to it. */
+#define IBSS_ATTR_WMAPRO_LARGE_IRAM   IBSS_ATTR
+#define ICODE_ATTR_WMAPRO_LARGE_IRAM  ICODE_ATTR
+#define ICONST_ATTR_WMAPRO_LARGE_IRAM ICONST_ATTR
+#define ICONST_ATTR_WMAPRO_WIN_VS_TMP
+
+#else
+/* Not enough IRAM available. */
+#define IBSS_ATTR_WMAPRO_LARGE_IRAM
+#define ICODE_ATTR_WMAPRO_LARGE_IRAM
+#define ICONST_ATTR_WMAPRO_LARGE_IRAM
+/* Models with large IRAM put tmp to IRAM rather than window coefficients as
+ * this is the fastest option. On models with smaller IRAM the 2nd-best option
+ * is to move the window coefficients to IRAM. */
+#define ICONST_ATTR_WMAPRO_WIN_VS_TMP ICONST_ATTR
+#endif
+
 int decode_init(asf_waveformatex_t *wfx);
 int decode_packet(asf_waveformatex_t *wfx,
                   void *data, int *data_size, void* pktdata, int size);
-- 
cgit v1.2.3