summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andree Buschmann <AndreeBuschmann@t-online.de> 2010-07-26 21:43:07 +0000
committer: Andree Buschmann <AndreeBuschmann@t-online.de> 2010-07-26 21:43:07 +0000
commit: 87d59ab56c30eadc4691a41ba7540cca868c9b50 (patch)
tree: ba0b3196b5dcabc7a4c5296c1ea9027f033f023f
parent: dbeb1ee07269942997a3676595a795d4aee547eb (diff)
download: rockbox-87d59ab56c30eadc4691a41ba7540cca868c9b50.tar.gz
rockbox-87d59ab56c30eadc4691a41ba7540cca868c9b50.zip
Submit part of FS#11498. Major speedup for WMA Professional on ARM and Coldfire CPUs. Introduce asm routines for multiplications, move arrays with major impact on decoding speed to IRAM. Speeds up decoding by 25% on PP5022 and 34% on mcf5249.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27582 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/libwmapro/mdct_tables.c | 9
-rw-r--r-- apps/codecs/libwmapro/wmapro_math.h | 192
-rw-r--r-- apps/codecs/libwmapro/wmaprodec.c | 43
-rw-r--r-- apps/codecs/libwmapro/wmaprodec.h | 26
4 files changed, 240 insertions, 30 deletions
diff --git a/apps/codecs/libwmapro/mdct_tables.c b/apps/codecs/libwmapro/mdct_tables.c
index b87d1b4045..dd8b2a451c 100644
--- a/apps/codecs/libwmapro/mdct_tables.c
+++ b/apps/codecs/libwmapro/mdct_tables.c
@@ -1,3 +1,4 @@
1#include "wmaprodec.h"
1#include <inttypes.h> 2#include <inttypes.h>
2 3
3/* Tables for fixed-point trig tables for windowing and mdct */ 4/* Tables for fixed-point trig tables for windowing and mdct */
@@ -689,7 +690,7 @@ const int32_t sine_4096[] = {
689 0x8000078F, 0x800003DC, 0x80000164, 0x80000028 690 0x8000078F, 0x800003DC, 0x80000164, 0x80000028
690}; 691};
691 692
692const int32_t sine_2048[] = { 693const int32_t sine_2048[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = {
693 0xFFF36F03, 0xFFDA4D09, 0xFFC12B0F, 0xFFA80918, 0xFF8EE725, 0xFF75C536, 694 0xFFF36F03, 0xFFDA4D09, 0xFFC12B0F, 0xFFA80918, 0xFF8EE725, 0xFF75C536,
694 0xFF5CA34B, 0xFF438168, 0xFF2A5F8C, 0xFF113DB7, 0xFEF81BEE, 0xFEDEFA2D, 695 0xFF5CA34B, 0xFF438168, 0xFF2A5F8C, 0xFF113DB7, 0xFEF81BEE, 0xFEDEFA2D,
695 0xFEC5D876, 0xFEACB6CE, 0xFE939531, 0xFE7A73A2, 0xFE615224, 0xFE4830B4, 696 0xFEC5D876, 0xFEACB6CE, 0xFE939531, 0xFE7A73A2, 0xFE615224, 0xFE4830B4,
@@ -1208,7 +1209,7 @@ const int32_t sine_1024[] = {
1208 0x800078E8, 0x80003DB0, 0x80001636, 0x80000279 1209 0x800078E8, 0x80003DB0, 0x80001636, 0x80000279
1209}; 1210};
1210 1211
1211const int32_t sine_512[] = { 1212const int32_t sine_512[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = {
1212 0xFFCDBC0C, 0xFF693440, 0xFF04ACD1, 0xFEA025FE, 0xFE3BA003, 0xFDD71B21, 1213 0xFFCDBC0C, 0xFF693440, 0xFF04ACD1, 0xFEA025FE, 0xFE3BA003, 0xFDD71B21,
1213 0xFD72978F, 0xFD0E1595, 0xFCA9956C, 0xFC45174E, 0xFBE09B85, 0xFB7C223F, 1214 0xFD72978F, 0xFD0E1595, 0xFCA9956C, 0xFC45174E, 0xFBE09B85, 0xFB7C223F,
1214 0xFB17ABC2, 0xFAB33854, 0xFA4EC823, 0xF9EA5B74, 0xF985F28F, 0xF9218DA0, 1215 0xFB17ABC2, 0xFAB33854, 0xFA4EC823, 0xF9EA5B74, 0xF985F28F, 0xF9218DA0,
@@ -1297,7 +1298,7 @@ const int32_t sine_512[] = {
1297 0x800058D4, 0x800009DF 1298 0x800058D4, 0x800009DF
1298}; 1299};
1299 1300
1300const int32_t sine_256[] = { 1301const int32_t sine_256[] ICONST_ATTR_WMAPRO_WIN_VS_TMP = {
1301 0xFF9B781E, 0xFED26950, 0xFE095D6B, 0xFD40565E, 0xFC775617, 0xFBAE5E8D, 1302 0xFF9B781E, 0xFED26950, 0xFE095D6B, 0xFD40565E, 0xFC775617, 0xFBAE5E8D,
1302 0xFAE571A2, 0xFA1C9157, 0xF953BF94, 0xF88AFE40, 0xF7C24F61, 0xF6F9B4C9, 1303 0xFAE571A2, 0xFA1C9157, 0xF953BF94, 0xF88AFE40, 0xF7C24F61, 0xF6F9B4C9,
1303 0xF6313075, 0xF568C463, 0xF4A07264, 0xF3D83C74, 0xF3102493, 0xF2482C8D, 1304 0xF6313075, 0xF568C463, 0xF4A07264, 0xF3D83C74, 0xF3102493, 0xF2482C8D,
@@ -1343,7 +1344,7 @@ const int32_t sine_256[] = {
1343 0x80078E60, 0x8003DAF2, 0x8001634E, 0x8000277B 1344 0x80078E60, 0x8003DAF2, 0x8001634E, 0x8000277B
1344}; 1345};
1345 1346
1346const int32_t sine_128[] = { 1347const int32_t sine_128[] ICONST_ATTR = {
1347 0xFF36F079, 0xFDA4D929, 0xFC12D91C, 0xFA80FFCF, 0xF8EF5CBC, 0xF75DFF6C, 1348 0xFF36F079, 0xFDA4D929, 0xFC12D91C, 0xFA80FFCF, 0xF8EF5CBC, 0xF75DFF6C,
1348 0xF5CCF73F, 0xF43C53CB, 0xF2AC2474, 0xF11C7895, 0xEF8D5FC8, 0xEDFEE931, 1349 0xF5CCF73F, 0xF43C53CB, 0xF2AC2474, 0xF11C7895, 0xEF8D5FC8, 0xEDFEE931,
1349 0xEC71244A, 0xEAE4208A, 0xE957ED00, 0xE7CC9912, 0xE642341D, 0xE4B8CD16, 1350 0xEC71244A, 0xEAE4208A, 0xE957ED00, 0xE7CC9912, 0xE642341D, 0xE4B8CD16,
diff --git a/apps/codecs/libwmapro/wmapro_math.h b/apps/codecs/libwmapro/wmapro_math.h
index 71cc3d33d7..30b9a987ee 100644
--- a/apps/codecs/libwmapro/wmapro_math.h
+++ b/apps/codecs/libwmapro/wmapro_math.h
@@ -3,21 +3,187 @@
3 3
4#include <inttypes.h> 4#include <inttypes.h>
5 5
6/* rockbox: not used
6#define fixtof16(x) (float)((float)(x) / (float)(1 << 16)) 7#define fixtof16(x) (float)((float)(x) / (float)(1 << 16))
7#define fixtof31(x) (float)((float)(x) / (float)(1 << 31)) 8#define fixtof31(x) (float)((float)(x) / (float)(1 << 31))
8#define ftofix16(x) ((int32_t)((x) * (float)(1 << 16) + ((x) < 0 ? -0.5:0.5))) 9#define ftofix16(x) ((int32_t)((x) * (float)(1 << 16) + ((x) < 0 ? -0.5:0.5)))
9#define ftofix31(x) ((int32_t)((x) * (float)(1 << 31) + ((x) < 0 ? -0.5:0.5))) 10#define ftofix31(x) ((int32_t)((x) * (float)(1 << 31) + ((x) < 0 ? -0.5:0.5)))
11*/
10 12
11static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt) 13#if defined(CPU_ARM)
12{ 14 /* Calculates: result = (X*Y)>>Z */
13 int64_t temp; 15 #define fixmulshift(X,Y,Z) \
14 temp = x; 16 ({ \
15 temp *= y; 17 int32_t lo; \
18 int32_t hi; \
19 asm volatile ( \
20 "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \
21 "mov %[lo], %[lo], lsr %[shr] \n\t" /* lo >>= Z */ \
22 "orr %[lo], %[lo], %[hi], lsl %[shl]" /* lo |= (hi << (32-Z)) */ \
23 : [lo]"=&r"(lo), [hi]"=&r"(hi) \
24 : [x]"r"(X), [y]"r"(Y), [shr]"r"(Z), [shl]"r"(32-Z)); \
25 lo; \
26 })
27
28 /* Calculates: result = (X*Y)>>16 */
29 #define fixmul16(X,Y) \
30 ({ \
31 int32_t lo; \
32 int32_t hi; \
33 asm volatile ( \
34 "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \
35 "mov %[lo], %[lo], lsr #16 \n\t" /* lo >>= 16 */ \
36 "orr %[lo], %[lo], %[hi], lsl #16" /* lo |= (hi << 16) */ \
37 : [lo]"=&r"(lo), [hi]"=&r"(hi) \
38 : [x]"r"(X), [y]"r"(Y)); \
39 lo; \
40 })
41
42 /* Calculates: result = (X*Y)>>24 */
43 #define fixmul24(X,Y) \
44 ({ \
45 int32_t lo; \
46 int32_t hi; \
47 asm volatile ( \
48 "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \
49 "mov %[lo], %[lo], lsr #24 \n\t" /* lo >>= 24 */ \
50 "orr %[lo], %[lo], %[hi], lsl #8" /* lo |= (hi << 8) */ \
51 : [lo]"=&r"(lo), [hi]"=&r"(hi) \
52 : [x]"r"(X), [y]"r"(Y)); \
53 lo; \
54 })
55
56 /* Calculates: result = (X*Y)>>31 */
57 #define fixmul31(X,Y) \
58 ({ \
59 int32_t lo; \
60 int32_t hi; \
61 asm volatile ( \
62 "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \
63 "mov %[lo], %[lo], lsr #31 \n\t" /* lo >>= 31 */ \
64 "orr %[lo], %[lo], %[hi], lsl #1" /* lo |= (hi << 1) */ \
65 : [lo]"=&r"(lo), [hi]"=&r"(hi) \
66 : [x]"r"(X), [y]"r"(Y)); \
67 lo; \
68 })
69#elif defined(CPU_COLDFIRE)
70 /* Calculates: result = (X*Y)>>Z */
71 #define fixmulshift(X,Y,Z) \
72 ({ \
73 int32_t t1; \
74 int32_t t2; \
75 asm volatile ( \
76 "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \
77 "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \
78 "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \
79 "moveq.l #31,%[t2] \n\t" \
80 "sub.l %[sh],%[t2] \n\t" /* t2 = 31 - shift */ \
81 "ble.s 1f \n\t" \
82 "asl.l %[t2],%[t1] \n\t" /* hi <<= 31 - shift */ \
83 "lsr.l %[sh],%[x] \n\t" /* (unsigned)lo >>= shift */ \
84 "or.l %[x],%[t1] \n\t" /* combine result */ \
85 "bra.s 2f \n\t" \
86 "1: \n\t" \
87 "neg.l %[t2] \n\t" /* t2 = shift - 31 */ \
88 "asr.l %[t2],%[t1] \n\t" /* hi >>= t2 */ \
89 "2: \n" \
90 : [t1]"=&d"(t1), [t2]"=&d"(t2) \
91 : [x] "d"((X)), [y] "d"((Y)), [sh]"d"((Z))); \
92 t1; \
93 })
16 94
17 temp >>= shamt; 95 /* Calculates: result = (X*Y)>>16 */
96 #define fixmul16(X,Y) \
97 ({ \
98 int32_t t1, t2; \
99 asm volatile ( \
100 "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \
101 "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \
102 "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \
103 "moveq.l #15,%[t2] \n\t" \
104 "asl.l %[t2],%[t1] \n\t" /* hi <<= 15, plus one free */ \
105 "moveq.l #16,%[t2] \n\t" \
106 "lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 16 */ \
107 "or.l %[x],%[t1] \n\t" /* combine result */ \
108 : [t1]"=&d"(t1), [t2]"=&d"(t2) \
109 : [x] "d" ((X)), [y] "d" ((Y))); \
110 t1; \
111 })
112
113 /* Calculates: result = (X*Y)>>24 */
114 #define fixmul24(X,Y) \
115 ({ \
116 int32_t t1, t2; \
117 asm volatile ( \
118 "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \
119 "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \
120 "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \
121 "moveq.l #7,%[t2] \n\t" \
122 "asl.l %[t2],%[t1] \n\t" /* hi <<= 7, plus one free */ \
123 "moveq.l #24,%[t2] \n\t" \
124 "lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 24 */ \
125 "or.l %[x],%[t1] \n\t" /* combine result */ \
126 : [t1]"=&d"(t1), [t2]"=&d"(t2) \
127 : [x] "d" ((X)), [y] "d" ((Y))); \
128 t1; \
129 })
18 130
19 return (int32_t)temp; 131 /* Calculates: result = (X*Y)>>31 */
20} 132 #define fixmul31(X,Y) \
133 ({ \
134 int32_t t; \
135 asm volatile ( \
136 "mac.l %[x], %[y], %%acc0\n\t" /* multiply */ \
137 "movclr.l %%acc0, %[t]\n\t" /* get higher half as result */ \
138 : [t] "=d" (t) \
139 : [x] "r" ((X)), [y] "r" ((Y))); \
140 t; \
141 })
142#else
143 static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt)
144 {
145 int64_t temp;
146 temp = x;
147 temp *= y;
148
149 temp >>= shamt;
150
151 return (int32_t)temp;
152 }
153
154 static inline int32_t fixmul31(int32_t x, int32_t y)
155 {
156 int64_t temp;
157 temp = x;
158 temp *= y;
159
160 temp >>= 31;
161
162 return (int32_t)temp;
163 }
164
165 static inline int32_t fixmul24(int32_t x, int32_t y)
166 {
167 int64_t temp;
168 temp = x;
169 temp *= y;
170
171 temp >>= 24;
172
173 return (int32_t)temp;
174 }
175
176 static inline int32_t fixmul16(int32_t x, int32_t y)
177 {
178 int64_t temp;
179 temp = x;
180 temp *= y;
181
182 temp >>= 16;
183
184 return (int32_t)temp;
185 }
186#endif /* CPU_COLDFIRE, CPU_ARM */
21 187
22#ifdef CPU_COLDFIRE 188#ifdef CPU_COLDFIRE
23static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0, 189static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
@@ -62,18 +228,18 @@ static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
62 int32_t s1 = src1[j]; 228 int32_t s1 = src1[j];
63 int32_t wi = -win[i]; 229 int32_t wi = -win[i];
64 int32_t wj = -win[j]; 230 int32_t wj = -win[j];
65 dst[i] = fixmulshift(s0,wj,31) - fixmulshift(s1,wi,31); 231 dst[i] = fixmul31(s0, wj) - fixmul31(s1, wi);
66 dst[j] = fixmulshift(s0,wi,31) + fixmulshift(s1,wj,31); 232 dst[j] = fixmul31(s0, wi) + fixmul31(s1, wj);
67 } 233 }
68} 234}
69#endif 235#endif
70 236
71static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, int32_t mul, 237static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src,
72 int len, int shift) 238 int32_t mul, int len)
73{ 239{
74 int i; 240 int i;
75 for(i=0; i<len; i++) 241 for(i=0; i<len; i++)
76 dst[i] = fixmulshift(src[i],mul,shift); 242 dst[i] = fixmul24(src[i], mul);
77} 243}
78 244
79static inline int av_clip(int a, int amin, int amax) 245static inline int av_clip(int a, int amin, int amax)
diff --git a/apps/codecs/libwmapro/wmaprodec.c b/apps/codecs/libwmapro/wmaprodec.c
index b7879a2644..1f65157f64 100644
--- a/apps/codecs/libwmapro/wmaprodec.c
+++ b/apps/codecs/libwmapro/wmaprodec.c
@@ -133,6 +133,7 @@
133#define WMAPRO_BLOCK_MAX_BITS 12 ///< log2 of max block size 133#define WMAPRO_BLOCK_MAX_BITS 12 ///< log2 of max block size
134#define WMAPRO_BLOCK_MAX_SIZE (1 << WMAPRO_BLOCK_MAX_BITS) ///< maximum block size 134#define WMAPRO_BLOCK_MAX_SIZE (1 << WMAPRO_BLOCK_MAX_BITS) ///< maximum block size
135#define WMAPRO_BLOCK_SIZES (WMAPRO_BLOCK_MAX_BITS - BLOCK_MIN_BITS + 1) ///< possible block sizes 135#define WMAPRO_BLOCK_SIZES (WMAPRO_BLOCK_MAX_BITS - BLOCK_MIN_BITS + 1) ///< possible block sizes
136#define WMAPRO_OUT_BUF_SIZE (WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2)
136 137
137 138
138#define VLCBITS 9 139#define VLCBITS 9
@@ -151,6 +152,12 @@ static VLC vec1_vlc; ///< 1 coefficient per symbol
151static VLC coef_vlc[2]; ///< coefficient run length vlc codes 152static VLC coef_vlc[2]; ///< coefficient run length vlc codes
152//static float sin64[33]; ///< sinus table for decorrelation 153//static float sin64[33]; ///< sinus table for decorrelation
153 154
155/* Global defined arrays to allow IRAM usage for some models. */
156static int32_t g_tmp[WMAPRO_BLOCK_MAX_SIZE] IBSS_ATTR_WMAPRO_LARGE_IRAM;
157static int32_t g_out_ch0[WMAPRO_OUT_BUF_SIZE] IBSS_ATTR;
158static int32_t g_out_ch1[WMAPRO_OUT_BUF_SIZE] IBSS_ATTR_WMAPRO_LARGE_IRAM;
159static int32_t g_out_multichannel[WMAPRO_MAX_CHANNELS-2][WMAPRO_OUT_BUF_SIZE];
160
154/** 161/**
155 * @brief frame specific decoder context for a single channel 162 * @brief frame specific decoder context for a single channel
156 */ 163 */
@@ -171,8 +178,8 @@ typedef struct {
171 int8_t scale_factor_idx; ///< index for the transmitted scale factor values (used for resampling) 178 int8_t scale_factor_idx; ///< index for the transmitted scale factor values (used for resampling)
172 int* scale_factors; ///< pointer to the scale factor values used for decoding 179 int* scale_factors; ///< pointer to the scale factor values used for decoding
173 uint8_t table_idx; ///< index in sf_offsets for the scale factor reference block 180 uint8_t table_idx; ///< index in sf_offsets for the scale factor reference block
174 int32_t* coeffs; ///< pointer to the subframe decode buffer 181 int32_t* coeffs; ///< pointer to the subframe decode buffer
175 DECLARE_ALIGNED(16, int32_t, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer 182 int32_t* out; ///< output buffer
176} WMAProChannelCtx; 183} WMAProChannelCtx;
177 184
178/** 185/**
@@ -195,7 +202,7 @@ typedef struct WMAProDecodeCtx {
195 uint8_t frame_data[MAX_FRAMESIZE + 202 uint8_t frame_data[MAX_FRAMESIZE +
196 FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data 203 FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
197 PutBitContext pb; ///< context for filling the frame_data buffer 204 PutBitContext pb; ///< context for filling the frame_data buffer
198 DECLARE_ALIGNED(16, int32_t, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT input buffer 205 int32_t* tmp; ///< IMDCT input buffer
199 206
200 /* frame size dependent frame information (set during initialization) */ 207 /* frame size dependent frame information (set during initialization) */
201 uint32_t decode_flags; ///< used compression features 208 uint32_t decode_flags; ///< used compression features
@@ -229,8 +236,8 @@ typedef struct WMAProDecodeCtx {
229 uint32_t frame_num; ///< current frame number 236 uint32_t frame_num; ///< current frame number
230 GetBitContext gb; ///< bitstream reader context 237 GetBitContext gb; ///< bitstream reader context
231 int buf_bit_size; ///< buffer size in bits 238 int buf_bit_size; ///< buffer size in bits
232 int32_t* samples; 239 int32_t* samples;
233 int32_t* samples_end; ///< maximum samplebuffer pointer 240 int32_t* samples_end; ///< maximum samplebuffer pointer
234 uint8_t drc_gain; ///< gain for the DRC tool 241 uint8_t drc_gain; ///< gain for the DRC tool
235 int8_t skip_frame; ///< skip output step 242 int8_t skip_frame; ///< skip output step
236 int8_t parsed_all_subframes; ///< all subframes decoded? 243 int8_t parsed_all_subframes; ///< all subframes decoded?
@@ -287,6 +294,15 @@ int decode_init(asf_waveformatex_t *wfx)
287 int i; 294 int i;
288 int log2_max_num_subframes; 295 int log2_max_num_subframes;
289 int num_possible_block_sizes; 296 int num_possible_block_sizes;
297
298 /* Use globally defined array. Allows IRAM usage for models with large IRAM. */
299 s->tmp = g_tmp;
300
301 /* Use globally defined arrays. Allows IRAM usage for up to 2 channels. */
302 s->channel[0].out = g_out_ch0;
303 s->channel[1].out = g_out_ch1;
304 for (i=2; i<WMAPRO_MAX_CHANNELS; ++i)
305 s->channel[i].out = g_out_multichannel[i-2];
290 306
291#if defined(CPU_COLDFIRE) 307#if defined(CPU_COLDFIRE)
292 coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE); 308 coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
@@ -657,9 +673,9 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s,
657 chgroup->decorrelation_matrix[y + i * chgroup->num_channels] = 673 chgroup->decorrelation_matrix[y + i * chgroup->num_channels] =
658 (v1 * cosv) + (v2 * sinv); 674 (v1 * cosv) + (v2 * sinv);
659 chgroup->fixdecorrelation_matrix[y + x * chgroup->num_channels] = 675 chgroup->fixdecorrelation_matrix[y + x * chgroup->num_channels] =
660 fixmulshift(f1, fixsinv, 31) - fixmulshift(f2, fixcosv, 31); 676 fixmul31(f1, fixsinv) - fixmul31(f2, fixcosv);
661 chgroup->fixdecorrelation_matrix[y + i * chgroup->num_channels] = 677 chgroup->fixdecorrelation_matrix[y + i * chgroup->num_channels] =
662 fixmulshift(f1, fixcosv, 31) + fixmulshift(f2, fixsinv, 31); 678 fixmul31(f1, fixcosv) + fixmul31(f2, fixsinv);
663 679
664 } 680 }
665 } 681 }
@@ -1009,20 +1025,21 @@ static void inverse_channel_transform(WMAProDecodeCtx *s)
1009 data_ptr = data; 1025 data_ptr = data;
1010 1026
1011 while (data_ptr < data_end) 1027 while (data_ptr < data_end)
1012 sum += fixmulshift(*data_ptr++, *mat++, 16); 1028 sum += fixmul16(*data_ptr++, *mat++);
1013 1029
1014 (*ch)[y] = sum; 1030 (*ch)[y] = sum;
1015 } 1031 }
1016 } 1032 }
1017 } else if (s->num_channels == 2) { 1033 } else if (s->num_channels == 2) {
1018 1034
1035 /* Scale with sqrt(2). 0x016A09E6 = (sqrt(2)*(1<<24)) */
1019 int len = FFMIN(sfb[1], s->subframe_len) - sfb[0]; 1036 int len = FFMIN(sfb[1], s->subframe_len) - sfb[0];
1020 vector_fixmul_scalar(ch_data[0] + sfb[0], 1037 vector_fixmul_scalar(ch_data[0] + sfb[0],
1021 ch_data[0] + sfb[0], 1038 ch_data[0] + sfb[0],
1022 0x00016A00, len,16); 1039 0x016A09E6, len);
1023 vector_fixmul_scalar(ch_data[1] + sfb[0], 1040 vector_fixmul_scalar(ch_data[1] + sfb[0],
1024 ch_data[1] + sfb[0], 1041 ch_data[1] + sfb[0],
1025 0x00016A00, len,16); 1042 0x016A09E6, len);
1026 1043
1027 } 1044 }
1028 } 1045 }
@@ -1049,7 +1066,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
1049 winlen = s->subframe_len; 1066 winlen = s->subframe_len;
1050 } 1067 }
1051 1068
1052 window = sine_windows[av_log2(winlen) - BLOCK_MIN_BITS]; 1069 window = sine_windows[av_log2(winlen) - BLOCK_MIN_BITS];
1053 1070
1054 winlen >>= 1; 1071 winlen >>= 1;
1055 1072
@@ -1261,7 +1278,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
1261 1278
1262 vector_fixmul_scalar(s->tmp+start, 1279 vector_fixmul_scalar(s->tmp+start,
1263 s->channel[c].coeffs + start, 1280 s->channel[c].coeffs + start,
1264 quant, end-start, 24); 1281 quant, end-start);
1265 1282
1266 1283
1267 } 1284 }
diff --git a/apps/codecs/libwmapro/wmaprodec.h b/apps/codecs/libwmapro/wmaprodec.h
index 40f3a60db6..3203dda583 100644
--- a/apps/codecs/libwmapro/wmaprodec.h
+++ b/apps/codecs/libwmapro/wmaprodec.h
@@ -1,5 +1,31 @@
1#include "codeclib.h"
1#include "../libasf/asf.h" 2#include "../libasf/asf.h"
2 3
4#if (CONFIG_CPU == MCF5250) || defined(CPU_S5L870X)
5/* Enough IRAM but performance suffers with ICODE_ATTR. */
6#define IBSS_ATTR_WMAPRO_LARGE_IRAM IBSS_ATTR
7#define ICODE_ATTR_WMAPRO_LARGE_IRAM
8#define ICONST_ATTR_WMAPRO_LARGE_IRAM ICONST_ATTR
9#define ICONST_ATTR_WMAPRO_WIN_VS_TMP
10
11#elif (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024)
12/* Enough IRAM to move additional data and code to it. */
13#define IBSS_ATTR_WMAPRO_LARGE_IRAM IBSS_ATTR
14#define ICODE_ATTR_WMAPRO_LARGE_IRAM ICODE_ATTR
15#define ICONST_ATTR_WMAPRO_LARGE_IRAM ICONST_ATTR
16#define ICONST_ATTR_WMAPRO_WIN_VS_TMP
17
18#else
19/* Not enough IRAM available. */
20#define IBSS_ATTR_WMAPRO_LARGE_IRAM
21#define ICODE_ATTR_WMAPRO_LARGE_IRAM
22#define ICONST_ATTR_WMAPRO_LARGE_IRAM
23/* Models with large IRAM put tmp to IRAM rather than window coefficients as
24 * this is the fastest option. On models with smaller IRAM the 2nd-best option
25 * is to move the window coefficients to IRAM. */
26#define ICONST_ATTR_WMAPRO_WIN_VS_TMP ICONST_ATTR
27#endif
28
3int decode_init(asf_waveformatex_t *wfx); 29int decode_init(asf_waveformatex_t *wfx);
4int decode_packet(asf_waveformatex_t *wfx, 30int decode_packet(asf_waveformatex_t *wfx,
5 void *data, int *data_size, void* pktdata, int size); 31 void *data, int *data_size, void* pktdata, int size);