From 87d59ab56c30eadc4691a41ba7540cca868c9b50 Mon Sep 17 00:00:00 2001 From: Andree Buschmann Date: Mon, 26 Jul 2010 21:43:07 +0000 Subject: Submit part of FS#11498. Major speedup for WMA Professional on ARM and Coldfire CPUs. Introduce asm routines for multiplications, move arrays with major impact on decoding speed to IRAM. Speeds up decoding by 25% on PP5022 and 34% on mcf5249. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27582 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libwmapro/mdct_tables.c | 9 +- apps/codecs/libwmapro/wmapro_math.h | 192 +++++++++++++++++++++++++++++++++--- apps/codecs/libwmapro/wmaprodec.c | 43 +++++--- apps/codecs/libwmapro/wmaprodec.h | 26 +++++ 4 files changed, 240 insertions(+), 30 deletions(-) diff --git a/apps/codecs/libwmapro/mdct_tables.c b/apps/codecs/libwmapro/mdct_tables.c index b87d1b4045..dd8b2a451c 100644 --- a/apps/codecs/libwmapro/mdct_tables.c +++ b/apps/codecs/libwmapro/mdct_tables.c @@ -1,3 +1,4 @@ +#include "wmaprodec.h" #include /* Tables for fixed-point trig tables for windowing and mdct */ @@ -689,7 +690,7 @@ const int32_t sine_4096[] = { 0x8000078F, 0x800003DC, 0x80000164, 0x80000028 }; -const int32_t sine_2048[] = { +const int32_t sine_2048[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = { 0xFFF36F03, 0xFFDA4D09, 0xFFC12B0F, 0xFFA80918, 0xFF8EE725, 0xFF75C536, 0xFF5CA34B, 0xFF438168, 0xFF2A5F8C, 0xFF113DB7, 0xFEF81BEE, 0xFEDEFA2D, 0xFEC5D876, 0xFEACB6CE, 0xFE939531, 0xFE7A73A2, 0xFE615224, 0xFE4830B4, @@ -1208,7 +1209,7 @@ const int32_t sine_1024[] = { 0x800078E8, 0x80003DB0, 0x80001636, 0x80000279 }; -const int32_t sine_512[] = { +const int32_t sine_512[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = { 0xFFCDBC0C, 0xFF693440, 0xFF04ACD1, 0xFEA025FE, 0xFE3BA003, 0xFDD71B21, 0xFD72978F, 0xFD0E1595, 0xFCA9956C, 0xFC45174E, 0xFBE09B85, 0xFB7C223F, 0xFB17ABC2, 0xFAB33854, 0xFA4EC823, 0xF9EA5B74, 0xF985F28F, 0xF9218DA0, @@ -1297,7 +1298,7 @@ const int32_t sine_512[] = { 0x800058D4, 0x800009DF }; -const int32_t sine_256[] = { +const 
int32_t sine_256[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = { 0xFF9B781E, 0xFED26950, 0xFE095D6B, 0xFD40565E, 0xFC775617, 0xFBAE5E8D, 0xFAE571A2, 0xFA1C9157, 0xF953BF94, 0xF88AFE40, 0xF7C24F61, 0xF6F9B4C9, 0xF6313075, 0xF568C463, 0xF4A07264, 0xF3D83C74, 0xF3102493, 0xF2482C8D, @@ -1343,7 +1344,7 @@ const int32_t sine_256[] = { 0x80078E60, 0x8003DAF2, 0x8001634E, 0x8000277B }; -const int32_t sine_128[] = { +const int32_t sine_128[] ICONST_ATTR = { 0xFF36F079, 0xFDA4D929, 0xFC12D91C, 0xFA80FFCF, 0xF8EF5CBC, 0xF75DFF6C, 0xF5CCF73F, 0xF43C53CB, 0xF2AC2474, 0xF11C7895, 0xEF8D5FC8, 0xEDFEE931, 0xEC71244A, 0xEAE4208A, 0xE957ED00, 0xE7CC9912, 0xE642341D, 0xE4B8CD16, diff --git a/apps/codecs/libwmapro/wmapro_math.h b/apps/codecs/libwmapro/wmapro_math.h index 71cc3d33d7..30b9a987ee 100644 --- a/apps/codecs/libwmapro/wmapro_math.h +++ b/apps/codecs/libwmapro/wmapro_math.h @@ -3,21 +3,187 @@ #include +/* rockbox: not used #define fixtof16(x) (float)((float)(x) / (float)(1 << 16)) #define fixtof31(x) (float)((float)(x) / (float)(1 << 31)) #define ftofix16(x) ((int32_t)((x) * (float)(1 << 16) + ((x) < 0 ? -0.5:0.5))) #define ftofix31(x) ((int32_t)((x) * (float)(1 << 31) + ((x) < 0 ? 
-0.5:0.5))) +*/ -static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt) -{ - int64_t temp; - temp = x; - temp *= y; +#if defined(CPU_ARM) + /* Calculates: result = (X*Y)>>Z. Z may be an arbitrary expression; it is parenthesized where it is subtracted from 32. */ + #define fixmulshift(X,Y,Z) \ + ({ \ + int32_t lo; \ + int32_t hi; \ + asm volatile ( \ + "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \ + "mov %[lo], %[lo], lsr %[shr] \n\t" /* lo >>= Z */ \ + "orr %[lo], %[lo], %[hi], lsl %[shl]" /* lo |= (hi << (32-Z)) */ \ + : [lo]"=&r"(lo), [hi]"=&r"(hi) \ + : [x]"r"(X), [y]"r"(Y), [shr]"r"(Z), [shl]"r"(32-(Z))); \ + lo; \ + }) + + /* Calculates: result = (X*Y)>>16 */ + #define fixmul16(X,Y) \ + ({ \ + int32_t lo; \ + int32_t hi; \ + asm volatile ( \ + "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \ + "mov %[lo], %[lo], lsr #16 \n\t" /* lo >>= 16 */ \ + "orr %[lo], %[lo], %[hi], lsl #16" /* lo |= (hi << 16) */ \ + : [lo]"=&r"(lo), [hi]"=&r"(hi) \ + : [x]"r"(X), [y]"r"(Y)); \ + lo; \ + }) + + /* Calculates: result = (X*Y)>>24 */ + #define fixmul24(X,Y) \ + ({ \ + int32_t lo; \ + int32_t hi; \ + asm volatile ( \ + "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \ + "mov %[lo], %[lo], lsr #24 \n\t" /* lo >>= 24 */ \ + "orr %[lo], %[lo], %[hi], lsl #8" /* lo |= (hi << 8) */ \ + : [lo]"=&r"(lo), [hi]"=&r"(hi) \ + : [x]"r"(X), [y]"r"(Y)); \ + lo; \ + }) + + /* Calculates: result = (X*Y)>>31 */ + #define fixmul31(X,Y) \ + ({ \ + int32_t lo; \ + int32_t hi; \ + asm volatile ( \ + "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \ + "mov %[lo], %[lo], lsr #31 \n\t" /* lo >>= 31 */ \ + "orr %[lo], %[lo], %[hi], lsl #1" /* lo |= (hi << 1) */ \ + : [lo]"=&r"(lo), [hi]"=&r"(hi) \ + : [x]"r"(X), [y]"r"(Y)); \ + lo; \ + }) +#elif defined(CPU_COLDFIRE) + /* Calculates: result = (X*Y)>>Z */ + #define fixmulshift(X,Y,Z) \ + ({ \ + int32_t t1; \ + int32_t t2, lo = (X); /* private copy of X: the asm overwrites %[x]; %[sh] is still read afterwards, hence early-clobber */ \ + asm volatile ( \ + "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \ + "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \ + "movclr.l %%acc0,%[t1] \n\t" /* get higher 
half */ \ + "moveq.l #31,%[t2] \n\t" \ + "sub.l %[sh],%[t2] \n\t" /* t2 = 31 - shift */ \ + "ble.s 1f \n\t" \ + "asl.l %[t2],%[t1] \n\t" /* hi <<= 31 - shift */ \ + "lsr.l %[sh],%[x] \n\t" /* (unsigned)lo >>= shift */ \ + "or.l %[x],%[t1] \n\t" /* combine result */ \ + "bra.s 2f \n\t" \ + "1: \n\t" \ + "neg.l %[t2] \n\t" /* t2 = shift - 31 */ \ + "asr.l %[t2],%[t1] \n\t" /* hi >>= t2 */ \ + "2: \n" \ + : [t1]"=&d"(t1), [t2]"=&d"(t2), [x]"+&d"(lo) \ + : [y] "d"((Y)), [sh]"d"((Z))); \ + t1; \ + }) - temp >>= shamt; + /* Calculates: result = (X*Y)>>16 */ + #define fixmul16(X,Y) \ + ({ \ + int32_t t1, t2, lo = (X); /* private copy of X: the asm overwrites %[x] */ \ + asm volatile ( \ + "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \ + "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \ + "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \ + "moveq.l #15,%[t2] \n\t" \ + "asl.l %[t2],%[t1] \n\t" /* hi <<= 15, plus one free */ \ + "moveq.l #16,%[t2] \n\t" \ + "lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 16 */ \ + "or.l %[x],%[t1] \n\t" /* combine result */ \ + : [t1]"=&d"(t1), [t2]"=&d"(t2), [x]"+d"(lo) \ + : [y] "d" ((Y))); \ + t1; \ + }) + + /* Calculates: result = (X*Y)>>24 */ + #define fixmul24(X,Y) \ + ({ \ + int32_t t1, t2, lo = (X); /* private copy of X: the asm overwrites %[x] */ \ + asm volatile ( \ + "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \ + "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \ + "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \ + "moveq.l #7,%[t2] \n\t" \ + "asl.l %[t2],%[t1] \n\t" /* hi <<= 7, plus one free */ \ + "moveq.l #24,%[t2] \n\t" \ + "lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 24 */ \ + "or.l %[x],%[t1] \n\t" /* combine result */ \ + : [t1]"=&d"(t1), [t2]"=&d"(t2), [x]"+d"(lo) \ + : [y] "d" ((Y))); \ + t1; \ + }) - return (int32_t)temp; -} + /* Calculates: result = (X*Y)>>31 (the EMAC is run in fractional mode, so the high word already carries one left shift) */ + #define fixmul31(X,Y) \ + ({ \ + int32_t t; \ + asm volatile ( \ + "mac.l %[x], %[y], %%acc0\n\t" /* multiply */ \ + "movclr.l %%acc0, %[t]\n\t" /* get higher half as result */ \ + : [t] "=d" (t) \ + : [x] "r" ((X)), [y] "r" ((Y))); \ + t; 
\ + }) +#else + static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt) + { + int64_t temp; + temp = x; + temp *= y; + + temp >>= shamt; + + return (int32_t)temp; + } + + static inline int32_t fixmul31(int32_t x, int32_t y) + { + int64_t temp; + temp = x; + temp *= y; + + temp >>= 31; + + return (int32_t)temp; + } + + static inline int32_t fixmul24(int32_t x, int32_t y) + { + int64_t temp; + temp = x; + temp *= y; + + temp >>= 24; + + return (int32_t)temp; + } + + static inline int32_t fixmul16(int32_t x, int32_t y) + { + int64_t temp; + temp = x; + temp *= y; + + temp >>= 16; + + return (int32_t)temp; + } +#endif /* CPU_COLDFIRE, CPU_ARM */ #ifdef CPU_COLDFIRE static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0, @@ -62,18 +228,18 @@ static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0, int32_t s1 = src1[j]; int32_t wi = -win[i]; int32_t wj = -win[j]; - dst[i] = fixmulshift(s0,wj,31) - fixmulshift(s1,wi,31); - dst[j] = fixmulshift(s0,wi,31) + fixmulshift(s1,wj,31); + dst[i] = fixmul31(s0, wj) - fixmul31(s1, wi); + dst[j] = fixmul31(s0, wi) + fixmul31(s1, wj); } } #endif -static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, int32_t mul, - int len, int shift) +static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, + int32_t mul, int len) { int i; for(i=0; itmp = g_tmp; + + /* Use globally defined arrays. Allows IRAM usage for up to 2 channels. 
*/ + s->channel[0].out = g_out_ch0; + s->channel[1].out = g_out_ch1; + for (i=2; ichannel[i].out = g_out_multichannel[i-2]; #if defined(CPU_COLDFIRE) coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE); @@ -657,9 +673,9 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s, chgroup->decorrelation_matrix[y + i * chgroup->num_channels] = (v1 * cosv) + (v2 * sinv); chgroup->fixdecorrelation_matrix[y + x * chgroup->num_channels] = - fixmulshift(f1, fixsinv, 31) - fixmulshift(f2, fixcosv, 31); + fixmul31(f1, fixsinv) - fixmul31(f2, fixcosv); chgroup->fixdecorrelation_matrix[y + i * chgroup->num_channels] = - fixmulshift(f1, fixcosv, 31) + fixmulshift(f2, fixsinv, 31); + fixmul31(f1, fixcosv) + fixmul31(f2, fixsinv); } } @@ -1009,20 +1025,21 @@ static void inverse_channel_transform(WMAProDecodeCtx *s) data_ptr = data; while (data_ptr < data_end) - sum += fixmulshift(*data_ptr++, *mat++, 16); + sum += fixmul16(*data_ptr++, *mat++); (*ch)[y] = sum; } } } else if (s->num_channels == 2) { + /* Scale with sqrt(2). 
0x016A09E6 = (sqrt(2)*(1<<24)) */ int len = FFMIN(sfb[1], s->subframe_len) - sfb[0]; vector_fixmul_scalar(ch_data[0] + sfb[0], ch_data[0] + sfb[0], - 0x00016A00, len,16); + 0x016A09E6, len); vector_fixmul_scalar(ch_data[1] + sfb[0], - ch_data[1] + sfb[0], - 0x00016A00, len,16); + ch_data[1] + sfb[0], + 0x016A09E6, len); } } @@ -1049,7 +1066,7 @@ static void wmapro_window(WMAProDecodeCtx *s) winlen = s->subframe_len; } - window = sine_windows[av_log2(winlen) - BLOCK_MIN_BITS]; + window = sine_windows[av_log2(winlen) - BLOCK_MIN_BITS]; winlen >>= 1; @@ -1261,7 +1278,7 @@ static int decode_subframe(WMAProDecodeCtx *s) vector_fixmul_scalar(s->tmp+start, s->channel[c].coeffs + start, - quant, end-start, 24); + quant, end-start); } diff --git a/apps/codecs/libwmapro/wmaprodec.h b/apps/codecs/libwmapro/wmaprodec.h index 40f3a60db6..3203dda583 100644 --- a/apps/codecs/libwmapro/wmaprodec.h +++ b/apps/codecs/libwmapro/wmaprodec.h @@ -1,5 +1,31 @@ +#include "codeclib.h" #include "../libasf/asf.h" +#if (CONFIG_CPU == MCF5250) || defined(CPU_S5L870X) +/* Enough IRAM but performance suffers with ICODE_ATTR. */ +#define IBSS_ATTR_WMAPRO_LARGE_IRAM IBSS_ATTR +#define ICODE_ATTR_WMAPRO_LARGE_IRAM +#define ICONST_ATTR_WMAPRO_LARGE_IRAM ICONST_ATTR +#define ICONST_ATTR_WMAPRO_WIN_VS_TMP + +#elif (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024) +/* Enough IRAM to move additional data and code to it. */ +#define IBSS_ATTR_WMAPRO_LARGE_IRAM IBSS_ATTR +#define ICODE_ATTR_WMAPRO_LARGE_IRAM ICODE_ATTR +#define ICONST_ATTR_WMAPRO_LARGE_IRAM ICONST_ATTR +#define ICONST_ATTR_WMAPRO_WIN_VS_TMP + +#else +/* Not enough IRAM available. */ +#define IBSS_ATTR_WMAPRO_LARGE_IRAM +#define ICODE_ATTR_WMAPRO_LARGE_IRAM +#define ICONST_ATTR_WMAPRO_LARGE_IRAM +/* Models with large IRAM put tmp to IRAM rather than window coefficients as + * this is the fastest option. On models with smaller IRAM the 2nd-best option + * is to move the window coefficients to IRAM. 
*/ +#define ICONST_ATTR_WMAPRO_WIN_VS_TMP ICONST_ATTR +#endif + int decode_init(asf_waveformatex_t *wfx); int decode_packet(asf_waveformatex_t *wfx, void *data, int *data_size, void* pktdata, int size); -- cgit v1.2.3