summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndree Buschmann <AndreeBuschmann@t-online.de>2010-02-07 14:04:46 +0000
committerAndree Buschmann <AndreeBuschmann@t-online.de>2010-02-07 14:04:46 +0000
commitcc572378fc76cb9d78160c2ffeafb1e7b867c88a (patch)
tree9f1f682f2f31e150ff0781deefbbc1aa54a7f56e
parentcbf136d57adab7aac13c1b3cf05e8c7a3cfa9c3d (diff)
downloadrockbox-cc572378fc76cb9d78160c2ffeafb1e7b867c88a.tar.gz
rockbox-cc572378fc76cb9d78160c2ffeafb1e7b867c88a.zip
Submit FS#10974: Speed up mpc-codec through changing the dct32-algorithm. +5% on PP5020, +12% on M5.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24544 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libmusepack/synth_filter.c840
-rw-r--r--apps/codecs/libmusepack/synth_filter_arm.S658
2 files changed, 716 insertions, 782 deletions
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index d8196eb40f..587b88dada 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -43,37 +43,20 @@
43#undef _ 43#undef _
44 44
45#if defined(MPC_FIXED_POINT) 45#if defined(MPC_FIXED_POINT)
46 #if defined(OPTIMIZE_FOR_SPEED) 46 #if defined(CPU_ARM)
47 // round at compile time to +/- 2^14 as a pre-shift before 32=32x32-multiply 47 // do not up-scale D-values to achieve higher speed in smull/mlal
48 #define D(value) (MPC_SHR_RND(value, 3)) 48 // operations. saves ~14/8 = 1.75 cycles per multiplication
49 #define D(value) (value)
49 50
50 // round at runtime to +/- 2^17 as a pre-shift before 32=32x32-multiply 51 // in this configuration a post-shift by >>16 is needed after synthesis
51 // samples are 18.14 fixed point. 30.2 after this shift, whereas the 52 #else
52 // 15.2 bits are significant (not including sign) 53 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
53 #define MPC_V_PRESHIFT(X) MPC_SHR_RND(X, 12) 54 #define D(value) (value << (14))
54 55 #endif
55 // in this configuration a post-shift by >>1 is needed after synthesis
56 #else
57 #if defined(CPU_ARM)
58 // do not up-scale D-values to achieve higher speed in smull/mlal
59 // operations. saves ~14/8 = 1.75 cycles per multiplication
60 #define D(value) (value)
61
62 // in this configuration a post-shift by >>16 is needed after synthesis
63 #else
64 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
65 #define D(value) (value << (14))
66 #endif
67 // do not perform pre-shift
68 #define MPC_V_PRESHIFT(X) (X)
69 #endif
70#else 56#else
71 // IMPORTANT: internal scaling is somehow strange for floating point, therefore we scale the coefficients Di_opt 57 // IMPORTANT: internal scaling is somehow strange for floating point, therefore we scale the coefficients Di_opt
72 // by the correct amount to have proper scaled output 58 // by the correct amount to have proper scaled output
73 #define D(value) MAKE_MPC_SAMPLE((double)value*(double)(0x1000)) 59 #define D(value) MAKE_MPC_SAMPLE((double)value*(double)(0x1000))
74
75 // do not perform pre-shift
76 #define MPC_V_PRESHIFT(X) (X)
77#endif 60#endif
78 61
79// Di_opt coefficients are +/- 2^17 (pre-shifted by <<16) 62// Di_opt coefficients are +/- 2^17 (pre-shifted by <<16)
@@ -115,343 +98,376 @@ static const MPC_SAMPLE_FORMAT Di_opt [512] ICONST_ATTR = {
115 98
116#undef D 99#undef D
117 100
118// needed to prevent from internal overflow in calculate_V (see below) 101// DCT32-coefficients were expanded (<<) by DCT32_COEFFICIENT_EXPAND
119#define OVERFLOW_FIX 2 102#define DCT32_COEFFICIENT_EXPAND 31
120
121// V-coefficients were expanded (<<) by V_COEFFICIENT_EXPAND
122#define V_COEFFICIENT_EXPAND 27
123 103
124#if defined(MPC_FIXED_POINT) 104#if defined(MPC_FIXED_POINT)
125 #if defined(OPTIMIZE_FOR_SPEED) 105 // define 64=32x32-multiplication for DCT-coefficients with samples. Via usage of MPC_FRACT highly optimized assembler might be used
126 // define 32=32x32-multiplication for DCT-coefficients with samples, vcoef will be pre-shifted on creation 106 // MULTIPLY_FRACT will perform >>32 after multiplication, as coef were expanded by DCT32_COEFFICIENT_EXPAND we'll correct this on the result.
127 // samples are rounded to +/- 2^19 as pre-shift before 32=32x32-multiply 107 // Will lose 4 bits of accuracy on result in fract part without effect on final audio result
128 #define MPC_MULTIPLY_V(sample, vcoef) ( MPC_SHR_RND(sample, 12) * vcoef ) 108 #define MPC_DCT32_MUL(sample, coef) (MPC_MULTIPLY_FRACT(sample,coef) << (32-DCT32_COEFFICIENT_EXPAND))
129 109 #define MPC_DCT32_SHIFT(sample) (sample)
130 // pre- and postscale are used to avoid internal overflow in synthesis calculation
131 // samples are s15.0, v-coefs are 4.12 -> internal format is s19.12
132 #define MPC_MULTIPLY_V_PRESCALE(sample, vcoef) ( MPC_SHR_RND(sample, (12+OVERFLOW_FIX)) * vcoef )
133 #define MPC_MULTIPLY_V_POSTSCALE(sample, vcoef) ( MPC_SHR_RND(sample, (12-OVERFLOW_FIX)) * vcoef )
134 #define MPC_V_POSTSCALE(sample) (sample<<OVERFLOW_FIX)
135
136 // round to +/- 2^16 as pre-shift before 32=32x32-multiply
137 #define MPC_MAKE_INVCOS(value) (MPC_SHR_RND(value, 15))
138 #else
139 // define 64=32x32-multiplication for DCT-coefficients with samples. Via usage of MPC_FRACT highly optimized assembler might be used
140 // MULTIPLY_FRACT will do >>32 after multiplication, as V-coef were expanded by V_COEFFICIENT_EXPAND we'll correct this on the result.
141 // Will lose 5 bits of accuracy on result in fract part without effect on final audio result
142 #define MPC_MULTIPLY_V(sample, vcoef) ( (MPC_MULTIPLY_FRACT(sample, vcoef)) << (32-V_COEFFICIENT_EXPAND) )
143
144 // pre- and postscale are used to avoid internal overflow in synthesis calculation
145 // samples are s15.14, v-coefs are 4.27 -> internal format is s19.12
146 #define MPC_MULTIPLY_V_PRESCALE(sample, vcoef) ( (MPC_MULTIPLY_FRACT(sample, vcoef)) << (32-V_COEFFICIENT_EXPAND-OVERFLOW_FIX) )
147 #define MPC_MULTIPLY_V_POSTSCALE(sample, vcoef) ( (MPC_MULTIPLY_FRACT(sample, vcoef)) << (32-V_COEFFICIENT_EXPAND+OVERFLOW_FIX) )
148 #define MPC_V_POSTSCALE(sample) (sample<<OVERFLOW_FIX)
149
150 // directly use accurate 32bit-coefficients
151 #define MPC_MAKE_INVCOS(value) (value)
152 #endif
153#else 110#else
154 // for floating point use the standard multiplication macro 111 // for floating point use the standard multiplication macro
155 #define MPC_MULTIPLY_V (sample, vcoef) ( MPC_MULTIPLY(sample, vcoef) ) 112 #define MPC_DCT32_MUL(sample, coef) (MPC_MULTIPLY(sample, coef) )
156 #define MPC_MULTIPLY_V_PRESCALE (sample, vcoef) ( MPC_MULTIPLY(sample, vcoef) ) 113 #define MPC_DCT32_SHIFT(sample) (sample)
157 #define MPC_MULTIPLY_V_POSTSCALE(sample, vcoef) ( MPC_MULTIPLY(sample, vcoef) )
158 #define MPC_V_POSTSCALE(sample) (sample)
159
160 // downscale the accurate 32bit-coefficients and convert to float
161 #define MPC_MAKE_INVCOS(value) MAKE_MPC_SAMPLE((double)value/(double)(1<<V_COEFFICIENT_EXPAND))
162#endif 114#endif
163 115
164// define constants for DCT-synthesis 116/******************************************************************************
165// INVCOSxx = (0.5 / cos(xx*PI/64)) << 27, <<27 to saturate to +/- 2^31 117 * mpc_dct32(const int *in, int *out)
166#define INVCOS01 MPC_MAKE_INVCOS( 67189797) 118 *
167#define INVCOS02 MPC_MAKE_INVCOS( 67433575) 119 * mpc_dct32 is a dct32 with in[32]->dct[32] that contains the mirroring from
168#define INVCOS03 MPC_MAKE_INVCOS( 67843164) 120 * dct[32] to the expected out[64]. The symmetry is
169#define INVCOS04 MPC_MAKE_INVCOS( 68423604) 121 * out[16] = 0,
170#define INVCOS05 MPC_MAKE_INVCOS( 69182167) 122 * out[ 0..15] = dct[ 0..15],
171#define INVCOS06 MPC_MAKE_INVCOS( 70128577) 123 * out[32..17] = -dct[ 0..15],
172#define INVCOS07 MPC_MAKE_INVCOS( 71275330) 124 * out[33..48] = -dct[16..31],
173#define INVCOS08 MPC_MAKE_INVCOS( 72638111) 125 * out[63..48] = -dct[16..31].
170#define INVCOS05 MPC_MAKE_INVCOS( 69182167) 122 * out[ 0..15] = dct[16..31],
171#define INVCOS06 MPC_MAKE_INVCOS( 70128577) 123 * out[32..17] = -dct[16..31],
172#define INVCOS07 MPC_MAKE_INVCOS( 71275330) 124 * out[48..33] = -dct[ 0..15],
173#define INVCOS08 MPC_MAKE_INVCOS( 72638111) 125 * out[48..63] = -dct[ 0..15].
178#define INVCOS13 MPC_MAKE_INVCOS( 83551089)
179#define INVCOS14 MPC_MAKE_INVCOS( 86814950)
180#define INVCOS15 MPC_MAKE_INVCOS( 90571242)
181#define INVCOS16 MPC_MAKE_INVCOS( 94906266)
182#define INVCOS17 MPC_MAKE_INVCOS( 99929967)
183#define INVCOS18 MPC_MAKE_INVCOS( 105784321)
184#define INVCOS19 MPC_MAKE_INVCOS( 112655602)
185#define INVCOS20 MPC_MAKE_INVCOS( 120792764)
186#define INVCOS21 MPC_MAKE_INVCOS( 130535899)
187#define INVCOS22 MPC_MAKE_INVCOS( 142361749)
188#define INVCOS23 MPC_MAKE_INVCOS( 156959571)
189#define INVCOS24 MPC_MAKE_INVCOS( 175363913)
190#define INVCOS25 MPC_MAKE_INVCOS( 199201203)
191#define INVCOS26 MPC_MAKE_INVCOS( 231182936)
192#define INVCOS27 MPC_MAKE_INVCOS( 276190692)
193#define INVCOS28 MPC_MAKE_INVCOS( 343988688)
194#define INVCOS29 MPC_MAKE_INVCOS( 457361460)
195#define INVCOS30 MPC_MAKE_INVCOS( 684664578)
196#define INVCOS31 MPC_MAKE_INVCOS(1367679739)
197
198void 128void
199mpc_calculate_new_V ( const MPC_SAMPLE_FORMAT * Sample, MPC_SAMPLE_FORMAT * V ) 129mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v)
200ICODE_ATTR_MPC_LARGE_IRAM; 130ICODE_ATTR_MPC_LARGE_IRAM;
201 131
202void 132void
203mpc_calculate_new_V ( const MPC_SAMPLE_FORMAT * Sample, MPC_SAMPLE_FORMAT * V ) 133mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v)
204{ 134{
205 // Calculating new V-buffer values for left channel 135 MPC_SAMPLE_FORMAT t0, t1, t2, t3, t4, t5, t6, t7;
206 // calculate new V-values (ISO-11172-3, p. 39) 136 MPC_SAMPLE_FORMAT t8, t9, t10, t11, t12, t13, t14, t15;
207 // based upon fast-MDCT algorithm by Byeong Gi Lee 137 MPC_SAMPLE_FORMAT t16, t17, t18, t19, t20, t21, t22, t23;
208 MPC_SAMPLE_FORMAT A[16]; 138 MPC_SAMPLE_FORMAT t24, t25, t26, t27, t28, t29, t30, t31;
209 MPC_SAMPLE_FORMAT B[16]; 139 MPC_SAMPLE_FORMAT t32, t33, t34, t35, t36, t37, t38, t39;
210 MPC_SAMPLE_FORMAT tmp; 140 MPC_SAMPLE_FORMAT t40, t41, t42, t43, t44, t45, t46, t47;
211 141 MPC_SAMPLE_FORMAT t48, t49, t50, t51, t52, t53, t54, t55;
212 A[ 0] = Sample[ 0] + Sample[31]; 142 MPC_SAMPLE_FORMAT t56, t57, t58, t59, t60, t61, t62, t63;
213 A[ 1] = Sample[ 1] + Sample[30]; 143 MPC_SAMPLE_FORMAT t64, t65, t66, t67, t68, t69, t70, t71;
214 A[ 2] = Sample[ 2] + Sample[29]; 144 MPC_SAMPLE_FORMAT t72, t73, t74, t75, t76, t77, t78, t79;
215 A[ 3] = Sample[ 3] + Sample[28]; 145 MPC_SAMPLE_FORMAT t80, t81, t82, t83, t84, t85, t86, t87;
216 A[ 4] = Sample[ 4] + Sample[27]; 146 MPC_SAMPLE_FORMAT t88, t89, t90, t91, t92, t93, t94, t95;
217 A[ 5] = Sample[ 5] + Sample[26]; 147 MPC_SAMPLE_FORMAT t96, t97, t98, t99, t100, t101, t102, t103;
218 A[ 6] = Sample[ 6] + Sample[25]; 148 MPC_SAMPLE_FORMAT t104, t105, t106, t107, t108, t109, t110, t111;
219 A[ 7] = Sample[ 7] + Sample[24]; 149 MPC_SAMPLE_FORMAT t112, t113, t114, t115, t116, t117, t118, t119;
220 A[ 8] = Sample[ 8] + Sample[23]; 150 MPC_SAMPLE_FORMAT t120, t121, t122, t123, t124, t125, t126, t127;
221 A[ 9] = Sample[ 9] + Sample[22]; 151 MPC_SAMPLE_FORMAT t128, t129, t130, t131, t132, t133, t134, t135;
222 A[10] = Sample[10] + Sample[21]; 152 MPC_SAMPLE_FORMAT t136, t137, t138, t139, t140, t141, t142, t143;
223 A[11] = Sample[11] + Sample[20]; 153 MPC_SAMPLE_FORMAT t144, t145, t146, t147, t148, t149, t150, t151;
224 A[12] = Sample[12] + Sample[19]; 154 MPC_SAMPLE_FORMAT t152, t153, t154, t155, t156, t157, t158, t159;
225 A[13] = Sample[13] + Sample[18]; 155 MPC_SAMPLE_FORMAT t160, t161, t162, t163, t164, t165, t166, t167;
226 A[14] = Sample[14] + Sample[17]; 156 MPC_SAMPLE_FORMAT t168, t169, t170, t171, t172, t173, t174, t175;
227 A[15] = Sample[15] + Sample[16]; 157 MPC_SAMPLE_FORMAT t176;
228 // 16 adds 158
229 159 /* costab[i] = cos(PI / (2 * 32) * i) */
230 B[ 0] = A[ 0] + A[15]; 160#define costab01 (0x7fd8878e) /* 0.998795456 */
231 B[ 1] = A[ 1] + A[14]; 161#define costab02 (0x7f62368f) /* 0.995184727 */
232 B[ 2] = A[ 2] + A[13]; 162#define costab03 (0x7e9d55fc) /* 0.989176510 */
233 B[ 3] = A[ 3] + A[12]; 163#define costab04 (0x7d8a5f40) /* 0.980785280 */
234 B[ 4] = A[ 4] + A[11]; 164#define costab05 (0x7c29fbee) /* 0.970031253 */
235 B[ 5] = A[ 5] + A[10]; 165#define costab06 (0x7a7d055b) /* 0.956940336 */
236 B[ 6] = A[ 6] + A[ 9]; 166#define costab07 (0x78848414) /* 0.941544065 */
237 B[ 7] = A[ 7] + A[ 8]; 167#define costab08 (0x7641af3d) /* 0.923879533 */
238 B[ 8] = MPC_MULTIPLY_V((A[ 0] - A[15]), INVCOS02); 168#define costab09 (0x73b5ebd1) /* 0.903989293 */
239 B[ 9] = MPC_MULTIPLY_V((A[ 1] - A[14]), INVCOS06); 169#define costab10 (0x70e2cbc6) /* 0.881921264 */
240 B[10] = MPC_MULTIPLY_V((A[ 2] - A[13]), INVCOS10); 170#define costab11 (0x6dca0d14) /* 0.857728610 */
241 B[11] = MPC_MULTIPLY_V((A[ 3] - A[12]), INVCOS14); 171#define costab12 (0x6a6d98a4) /* 0.831469612 */
242 B[12] = MPC_MULTIPLY_V((A[ 4] - A[11]), INVCOS18); 172#define costab13 (0x66cf8120) /* 0.803207531 */
243 B[13] = MPC_MULTIPLY_V((A[ 5] - A[10]), INVCOS22); 173#define costab14 (0x62f201ac) /* 0.773010453 */
244 B[14] = MPC_MULTIPLY_V((A[ 6] - A[ 9]), INVCOS26); 174#define costab15 (0x5ed77c8a) /* 0.740951125 */
245 B[15] = MPC_MULTIPLY_V((A[ 7] - A[ 8]), INVCOS30); 175#define costab16 (0x5a82799a) /* 0.707106781 */
246 // 8 adds, 8 subs, 8 muls, 8 shifts 176#define costab17 (0x55f5a4d2) /* 0.671558955 */
247 177#define costab18 (0x5133cc94) /* 0.634393284 */
248 A[ 0] = B[ 0] + B[ 7]; 178#define costab19 (0x4c3fdff4) /* 0.595699304 */
249 A[ 1] = B[ 1] + B[ 6]; 179#define costab20 (0x471cece7) /* 0.555570233 */
250 A[ 2] = B[ 2] + B[ 5]; 180#define costab21 (0x41ce1e65) /* 0.514102744 */
251 A[ 3] = B[ 3] + B[ 4]; 181#define costab22 (0x3c56ba70) /* 0.471396737 */
252 A[ 4] = MPC_MULTIPLY_V((B[ 0] - B[ 7]), INVCOS04); 182#define costab23 (0x36ba2014) /* 0.427555093 */
253 A[ 5] = MPC_MULTIPLY_V((B[ 1] - B[ 6]), INVCOS12); 183#define costab24 (0x30fbc54d) /* 0.382683432 */
254 A[ 6] = MPC_MULTIPLY_V((B[ 2] - B[ 5]), INVCOS20); 184#define costab25 (0x2b1f34eb) /* 0.336889853 */
255 A[ 7] = MPC_MULTIPLY_V((B[ 3] - B[ 4]), INVCOS28); 185#define costab26 (0x25280c5e) /* 0.290284677 */
256 A[ 8] = B[ 8] + B[15]; 186#define costab27 (0x1f19f97b) /* 0.242980180 */
257 A[ 9] = B[ 9] + B[14]; 187#define costab28 (0x18f8b83c) /* 0.195090322 */
258 A[10] = B[10] + B[13]; 188#define costab29 (0x12c8106f) /* 0.146730474 */
259 A[11] = B[11] + B[12]; 189#define costab30 (0x0c8bd35e) /* 0.098017140 */
260 A[12] = MPC_MULTIPLY_V((B[ 8] - B[15]), INVCOS04); 190#define costab31 (0x0647d97c) /* 0.049067674 */
261 A[13] = MPC_MULTIPLY_V((B[ 9] - B[14]), INVCOS12); 191
262 A[14] = MPC_MULTIPLY_V((B[10] - B[13]), INVCOS20); 192 t0 = in[ 0] + in[31]; t16 = MPC_DCT32_MUL(in[ 0] - in[31], costab01);
263 A[15] = MPC_MULTIPLY_V((B[11] - B[12]), INVCOS28); 193 t1 = in[15] + in[16]; t17 = MPC_DCT32_MUL(in[15] - in[16], costab31);
264 // 8 adds, 8 subs, 8 muls, 8 shifts 194
265 195 t41 = t16 + t17;
266 B[ 0] = A[ 0] + A[ 3]; 196 t59 = MPC_DCT32_MUL(t16 - t17, costab02);
267 B[ 1] = A[ 1] + A[ 2]; 197 t33 = t0 + t1;
268 B[ 2] = MPC_MULTIPLY_V((A[ 0] - A[ 3]), INVCOS08); 198 t50 = MPC_DCT32_MUL(t0 - t1, costab02);
269 B[ 3] = MPC_MULTIPLY_V((A[ 1] - A[ 2]), INVCOS24); 199
270 B[ 4] = A[ 4] + A[ 7]; 200 t2 = in[ 7] + in[24]; t18 = MPC_DCT32_MUL(in[ 7] - in[24], costab15);
271 B[ 5] = A[ 5] + A[ 6]; 201 t3 = in[ 8] + in[23]; t19 = MPC_DCT32_MUL(in[ 8] - in[23], costab17);
272 B[ 6] = MPC_MULTIPLY_V((A[ 4] - A[ 7]), INVCOS08); 202
273 B[ 7] = MPC_MULTIPLY_V((A[ 5] - A[ 6]), INVCOS24); 203 t42 = t18 + t19;
274 B[ 8] = A[ 8] + A[11]; 204 t60 = MPC_DCT32_MUL(t18 - t19, costab30);
275 B[ 9] = A[ 9] + A[10]; 205 t34 = t2 + t3;
276 B[10] = MPC_MULTIPLY_V((A[ 8] - A[11]), INVCOS08); 206 t51 = MPC_DCT32_MUL(t2 - t3, costab30);
277 B[11] = MPC_MULTIPLY_V((A[ 9] - A[10]), INVCOS24); 207
278 B[12] = A[12] + A[15]; 208 t4 = in[ 3] + in[28]; t20 = MPC_DCT32_MUL(in[ 3] - in[28], costab07);
279 B[13] = A[13] + A[14]; 209 t5 = in[12] + in[19]; t21 = MPC_DCT32_MUL(in[12] - in[19], costab25);
280 B[14] = MPC_MULTIPLY_V((A[12] - A[15]), INVCOS08); 210
281 B[15] = MPC_MULTIPLY_V((A[13] - A[14]), INVCOS24); 211 t43 = t20 + t21;
282 // 8 adds, 8 subs, 8 muls, 8 shifts 212 t61 = MPC_DCT32_MUL(t20 - t21, costab14);
283 213 t35 = t4 + t5;
284 A[ 0] = B[ 0] + B[ 1]; 214 t52 = MPC_DCT32_MUL(t4 - t5, costab14);
285 A[ 1] = MPC_MULTIPLY_V((B[ 0] - B[ 1]), INVCOS16); 215
286 A[ 2] = B[ 2] + B[ 3]; 216 t6 = in[ 4] + in[27]; t22 = MPC_DCT32_MUL(in[ 4] - in[27], costab09);
287 A[ 3] = MPC_MULTIPLY_V((B[ 2] - B[ 3]), INVCOS16); 217 t7 = in[11] + in[20]; t23 = MPC_DCT32_MUL(in[11] - in[20], costab23);
288 A[ 4] = B[ 4] + B[ 5]; 218
289 A[ 5] = MPC_MULTIPLY_V((B[ 4] - B[ 5]), INVCOS16); 219 t44 = t22 + t23;
290 A[ 6] = B[ 6] + B[ 7]; 220 t62 = MPC_DCT32_MUL(t22 - t23, costab18);
291 A[ 7] = MPC_MULTIPLY_V((B[ 6] - B[ 7]), INVCOS16); 221 t36 = t6 + t7;
292 A[ 8] = B[ 8] + B[ 9]; 222 t53 = MPC_DCT32_MUL(t6 - t7, costab18);
293 A[ 9] = MPC_MULTIPLY_V((B[ 8] - B[ 9]), INVCOS16); 223
294 A[10] = B[10] + B[11]; 224 t8 = in[ 1] + in[30]; t24 = MPC_DCT32_MUL(in[ 1] - in[30], costab03);
295 A[11] = MPC_MULTIPLY_V((B[10] - B[11]), INVCOS16); 225 t9 = in[14] + in[17]; t25 = MPC_DCT32_MUL(in[14] - in[17], costab29);
296 A[12] = B[12] + B[13]; 226
297 A[13] = MPC_MULTIPLY_V((B[12] - B[13]), INVCOS16); 227 t45 = t24 + t25;
298 A[14] = B[14] + B[15]; 228 t63 = MPC_DCT32_MUL(t24 - t25, costab06);
299 A[15] = MPC_MULTIPLY_V((B[14] - B[15]), INVCOS16); 229 t37 = t8 + t9;
300 // 8 adds, 8 subs, 8 muls, 8 shifts 230 t54 = MPC_DCT32_MUL(t8 - t9, costab06);
301 231
302 // multiple used expressions: -(A[12] + A[14] + A[15]) 232 t10 = in[ 6] + in[25]; t26 = MPC_DCT32_MUL(in[ 6] - in[25], costab13);
303 V[48] = -A[ 0]; 233 t11 = in[ 9] + in[22]; t27 = MPC_DCT32_MUL(in[ 9] - in[22], costab19);
304 V[ 0] = A[ 1]; 234
305 V[40] = -A[ 2] - (V[ 8] = A[ 3]); 235 t46 = t26 + t27;
306 V[36] = -((V[ 4] = A[ 5] + (V[12] = A[ 7])) + A[ 6]); 236 t64 = MPC_DCT32_MUL(t26 - t27, costab26);
307 V[44] = - A[ 4] - A[ 6] - A[ 7]; 237 t38 = t10 + t11;
308 V[ 6] = (V[10] = A[11] + (V[14] = A[15])) + A[13]; 238 t55 = MPC_DCT32_MUL(t10 - t11, costab26);
309 V[38] = (V[34] = -(V[ 2] = A[ 9] + A[13] + A[15]) - A[14]) + A[ 9] - A[10] - A[11]; 239
310 V[46] = (tmp = -(A[12] + A[14] + A[15])) - A[ 8]; 240 t12 = in[ 2] + in[29]; t28 = MPC_DCT32_MUL(in[ 2] - in[29], costab05);
311 V[42] = tmp - A[10] - A[11]; 241 t13 = in[13] + in[18]; t29 = MPC_DCT32_MUL(in[13] - in[18], costab27);
312 // 9 adds, 9 subs 242
313 243 t47 = t28 + t29;
314 A[ 0] = MPC_MULTIPLY_V_PRESCALE((Sample[ 0] - Sample[31]), INVCOS01); 244 t65 = MPC_DCT32_MUL(t28 - t29, costab10);
315 A[ 1] = MPC_MULTIPLY_V_PRESCALE((Sample[ 1] - Sample[30]), INVCOS03); 245 t39 = t12 + t13;
316 A[ 2] = MPC_MULTIPLY_V_PRESCALE((Sample[ 2] - Sample[29]), INVCOS05); 246 t56 = MPC_DCT32_MUL(t12 - t13, costab10);
317 A[ 3] = MPC_MULTIPLY_V_PRESCALE((Sample[ 3] - Sample[28]), INVCOS07); 247
318 A[ 4] = MPC_MULTIPLY_V_PRESCALE((Sample[ 4] - Sample[27]), INVCOS09); 248 t14 = in[ 5] + in[26]; t30 = MPC_DCT32_MUL(in[ 5] - in[26], costab11);
319 A[ 5] = MPC_MULTIPLY_V_PRESCALE((Sample[ 5] - Sample[26]), INVCOS11); 249 t15 = in[10] + in[21]; t31 = MPC_DCT32_MUL(in[10] - in[21], costab21);
320 A[ 6] = MPC_MULTIPLY_V_PRESCALE((Sample[ 6] - Sample[25]), INVCOS13); 250
321 A[ 7] = MPC_MULTIPLY_V_PRESCALE((Sample[ 7] - Sample[24]), INVCOS15); 251 t48 = t30 + t31;
322 A[ 8] = MPC_MULTIPLY_V_PRESCALE((Sample[ 8] - Sample[23]), INVCOS17); 252 t66 = MPC_DCT32_MUL(t30 - t31, costab22);
323 A[ 9] = MPC_MULTIPLY_V_PRESCALE((Sample[ 9] - Sample[22]), INVCOS19); 253 t40 = t14 + t15;
324 A[10] = MPC_MULTIPLY_V_PRESCALE((Sample[10] - Sample[21]), INVCOS21); 254 t57 = MPC_DCT32_MUL(t14 - t15, costab22);
325 A[11] = MPC_MULTIPLY_V_PRESCALE((Sample[11] - Sample[20]), INVCOS23); 255
326 A[12] = MPC_MULTIPLY_V_PRESCALE((Sample[12] - Sample[19]), INVCOS25); 256 t69 = t33 + t34; t89 = MPC_DCT32_MUL(t33 - t34, costab04);
327 A[13] = MPC_MULTIPLY_V_PRESCALE((Sample[13] - Sample[18]), INVCOS27); 257 t70 = t35 + t36; t90 = MPC_DCT32_MUL(t35 - t36, costab28);
328 A[14] = MPC_MULTIPLY_V_PRESCALE((Sample[14] - Sample[17]), INVCOS29); 258 t71 = t37 + t38; t91 = MPC_DCT32_MUL(t37 - t38, costab12);
329 A[15] = MPC_MULTIPLY_V_PRESCALE((Sample[15] - Sample[16]), INVCOS31); 259 t72 = t39 + t40; t92 = MPC_DCT32_MUL(t39 - t40, costab20);
330 // 16 subs, 16 muls, 16 shifts 260 t73 = t41 + t42; t94 = MPC_DCT32_MUL(t41 - t42, costab04);
331 261 t74 = t43 + t44; t95 = MPC_DCT32_MUL(t43 - t44, costab28);
332 B[ 0] = A[ 0] + A[15]; 262 t75 = t45 + t46; t96 = MPC_DCT32_MUL(t45 - t46, costab12);
333 B[ 1] = A[ 1] + A[14]; 263 t76 = t47 + t48; t97 = MPC_DCT32_MUL(t47 - t48, costab20);
334 B[ 2] = A[ 2] + A[13]; 264
335 B[ 3] = A[ 3] + A[12]; 265 t78 = t50 + t51; t100 = MPC_DCT32_MUL(t50 - t51, costab04);
336 B[ 4] = A[ 4] + A[11]; 266 t79 = t52 + t53; t101 = MPC_DCT32_MUL(t52 - t53, costab28);
337 B[ 5] = A[ 5] + A[10]; 267 t80 = t54 + t55; t102 = MPC_DCT32_MUL(t54 - t55, costab12);
338 B[ 6] = A[ 6] + A[ 9]; 268 t81 = t56 + t57; t103 = MPC_DCT32_MUL(t56 - t57, costab20);
339 B[ 7] = A[ 7] + A[ 8]; 269
340 B[ 8] = MPC_MULTIPLY_V((A[ 0] - A[15]), INVCOS02); 270 t83 = t59 + t60; t106 = MPC_DCT32_MUL(t59 - t60, costab04);
341 B[ 9] = MPC_MULTIPLY_V((A[ 1] - A[14]), INVCOS06); 271 t84 = t61 + t62; t107 = MPC_DCT32_MUL(t61 - t62, costab28);
342 B[10] = MPC_MULTIPLY_V((A[ 2] - A[13]), INVCOS10); 272 t85 = t63 + t64; t108 = MPC_DCT32_MUL(t63 - t64, costab12);
343 B[11] = MPC_MULTIPLY_V((A[ 3] - A[12]), INVCOS14); 273 t86 = t65 + t66; t109 = MPC_DCT32_MUL(t65 - t66, costab20);
344 B[12] = MPC_MULTIPLY_V((A[ 4] - A[11]), INVCOS18); 274
345 B[13] = MPC_MULTIPLY_V((A[ 5] - A[10]), INVCOS22); 275 t113 = t69 + t70;
346 B[14] = MPC_MULTIPLY_V((A[ 6] - A[ 9]), INVCOS26); 276 t114 = t71 + t72;
347 B[15] = MPC_MULTIPLY_V((A[ 7] - A[ 8]), INVCOS30); 277
348 // 8 adds, 8 subs, 8 muls, 8 shift 278 /* 0 */ v[48] = -MPC_DCT32_SHIFT(t113 + t114);
349 279 /* 16 */ v[32] = -(v[ 0] = MPC_DCT32_SHIFT(MPC_DCT32_MUL(t113 - t114, costab16)));
350 A[ 0] = B[ 0] + B[ 7]; 280
351 A[ 1] = B[ 1] + B[ 6]; 281 t115 = t73 + t74;
352 A[ 2] = B[ 2] + B[ 5]; 282 t116 = t75 + t76;
353 A[ 3] = B[ 3] + B[ 4]; 283
354 A[ 4] = MPC_MULTIPLY_V((B[ 0] - B[ 7]), INVCOS04); 284 t32 = t115 + t116;
355 A[ 5] = MPC_MULTIPLY_V((B[ 1] - B[ 6]), INVCOS12); 285
356 A[ 6] = MPC_MULTIPLY_V((B[ 2] - B[ 5]), INVCOS20); 286 /* 1 */ v[49] = v[47] = -MPC_DCT32_SHIFT(t32);
357 A[ 7] = MPC_MULTIPLY_V((B[ 3] - B[ 4]), INVCOS28); 287
358 A[ 8] = B[ 8] + B[15]; 288 t118 = t78 + t79;
359 A[ 9] = B[ 9] + B[14]; 289 t119 = t80 + t81;
360 A[10] = B[10] + B[13]; 290
361 A[11] = B[11] + B[12]; 291 t58 = t118 + t119;
362 A[12] = MPC_MULTIPLY_V((B[ 8] - B[15]), INVCOS04); 292
363 A[13] = MPC_MULTIPLY_V((B[ 9] - B[14]), INVCOS12); 293 /* 2 */ v[50] = v[46] = -MPC_DCT32_SHIFT(t58);
364 A[14] = MPC_MULTIPLY_V((B[10] - B[13]), INVCOS20); 294
365 A[15] = MPC_MULTIPLY_V((B[11] - B[12]), INVCOS28); 295 t121 = t83 + t84;
366 // 8 adds, 8 subs, 8 muls, 8 shift 296 t122 = t85 + t86;
367 297
368 B[ 0] = A[ 0] + A[ 3]; 298 t67 = t121 + t122;
369 B[ 1] = A[ 1] + A[ 2]; 299
370 B[ 2] = MPC_MULTIPLY_V((A[ 0] - A[ 3]), INVCOS08); 300 t49 = (t67 * 2) - t32;
371 B[ 3] = MPC_MULTIPLY_V((A[ 1] - A[ 2]), INVCOS24); 301
372 B[ 4] = A[ 4] + A[ 7]; 302 /* 3 */ v[51] = v[45] = -MPC_DCT32_SHIFT(t49);
373 B[ 5] = A[ 5] + A[ 6]; 303
374 B[ 6] = MPC_MULTIPLY_V((A[ 4] - A[ 7]), INVCOS08); 304 t125 = t89 + t90;
375 B[ 7] = MPC_MULTIPLY_V((A[ 5] - A[ 6]), INVCOS24); 305 t126 = t91 + t92;
376 B[ 8] = A[ 8] + A[11]; 306
377 B[ 9] = A[ 9] + A[10]; 307 t93 = t125 + t126;
378 B[10] = MPC_MULTIPLY_V((A[ 8] - A[11]), INVCOS08); 308
379 B[11] = MPC_MULTIPLY_V((A[ 9] - A[10]), INVCOS24); 309 /* 4 */ v[52] = v[44] = -MPC_DCT32_SHIFT(t93);
380 B[12] = A[12] + A[15]; 310
381 B[13] = A[13] + A[14]; 311 t128 = t94 + t95;
382 B[14] = MPC_MULTIPLY_V((A[12] - A[15]), INVCOS08); 312 t129 = t96 + t97;
383 B[15] = MPC_MULTIPLY_V((A[13] - A[14]), INVCOS24); 313
384 // 8 adds, 8 subs, 8 muls, 8 shift 314 t98 = t128 + t129;
385 315
386 A[ 0] = MPC_V_POSTSCALE((B[ 0] + B[ 1])); 316 t68 = (t98 * 2) - t49;
387 A[ 1] = MPC_MULTIPLY_V_POSTSCALE((B[ 0] - B[ 1]), INVCOS16); 317
388 A[ 2] = MPC_V_POSTSCALE((B[ 2] + B[ 3])); 318 /* 5 */ v[53] = v[43] = -MPC_DCT32_SHIFT(t68);
389 A[ 3] = MPC_MULTIPLY_V_POSTSCALE((B[ 2] - B[ 3]), INVCOS16); 319
390 A[ 4] = MPC_V_POSTSCALE((B[ 4] + B[ 5])); 320 t132 = t100 + t101;
391 A[ 5] = MPC_MULTIPLY_V_POSTSCALE((B[ 4] - B[ 5]), INVCOS16); 321 t133 = t102 + t103;
392 A[ 6] = MPC_V_POSTSCALE((B[ 6] + B[ 7])); 322
393 A[ 7] = MPC_MULTIPLY_V_POSTSCALE((B[ 6] - B[ 7]), INVCOS16); 323 t104 = t132 + t133;
394 A[ 8] = MPC_V_POSTSCALE((B[ 8] + B[ 9])); 324
395 A[ 9] = MPC_MULTIPLY_V_POSTSCALE((B[ 8] - B[ 9]), INVCOS16); 325 t82 = (t104 * 2) - t58;
396 A[10] = MPC_V_POSTSCALE((B[10] + B[11])); 326
397 A[11] = MPC_MULTIPLY_V_POSTSCALE((B[10] - B[11]), INVCOS16); 327 /* 6 */ v[54] = v[42] = -MPC_DCT32_SHIFT(t82);
398 A[12] = MPC_V_POSTSCALE((B[12] + B[13])); 328
399 A[13] = MPC_MULTIPLY_V_POSTSCALE((B[12] - B[13]), INVCOS16); 329 t136 = t106 + t107;
400 A[14] = MPC_V_POSTSCALE((B[14] + B[15])); 330 t137 = t108 + t109;
401 A[15] = MPC_MULTIPLY_V_POSTSCALE((B[14] - B[15]), INVCOS16); 331
402 // 8 adds, 8 subs, 8 muls, 8 shift 332 t110 = t136 + t137;
403 333
404 // multiple used expressions: A[ 4]+A[ 6]+A[ 7], A[ 9]+A[13]+A[15] 334 t87 = (t110 * 2) - t67;
405 V[ 5] = (V[11] = (V[13] = A[ 7] + (V[15] = A[15])) + A[11]) + A[ 5] + A[13]; 335
406 V[ 7] = (V[ 9] = A[ 3] + A[11] + A[15]) + A[13]; 336 t77 = (t87 * 2) - t68;
407 V[33] = -(V[ 1] = A[ 1] + A[ 9] + A[13] + A[15]) - A[14]; 337
408 V[35] = -(V[ 3] = A[ 5] + A[ 7] + A[ 9] + A[13] + A[15]) - A[ 6] - A[14]; 338 /* 7 */ v[55] = v[41] = -MPC_DCT32_SHIFT(t77);
409 V[37] = (tmp = -(A[10] + A[11] + A[13] + A[14] + A[15])) - A[ 5] - A[ 6] - A[ 7]; 339
410 V[39] = tmp - A[ 2] - A[ 3]; 340 t141 = MPC_DCT32_MUL(t69 - t70, costab08);
411 V[41] = (tmp += A[13] - A[12]) - A[ 2] - A[ 3]; 341 t142 = MPC_DCT32_MUL(t71 - t72, costab24);
412 V[43] = tmp - A[ 4] - A[ 6] - A[ 7]; 342 t143 = t141 + t142;
413 V[47] = (tmp = -(A[ 8] + A[12] + A[14] + A[15])) - A[ 0]; 343
414 V[45] = tmp - A[ 4] - A[ 6] - A[ 7]; 344 /* 8 */ v[56] = v[40] = -MPC_DCT32_SHIFT(t143);
415 // 22 adds, 18 subs 345 /* 24 */ v[24] = -(v[ 8] = MPC_DCT32_SHIFT((MPC_DCT32_MUL(t141 - t142, costab16) * 2) - t143));
416 346
417 V[32] = -(V[ 0] = MPC_V_PRESHIFT(V[ 0])); 347 t144 = MPC_DCT32_MUL(t73 - t74, costab08);
418 V[31] = -(V[ 1] = MPC_V_PRESHIFT(V[ 1])); 348 t145 = MPC_DCT32_MUL(t75 - t76, costab24);
419 V[30] = -(V[ 2] = MPC_V_PRESHIFT(V[ 2])); 349 t146 = t144 + t145;
420 V[29] = -(V[ 3] = MPC_V_PRESHIFT(V[ 3])); 350
421 V[28] = -(V[ 4] = MPC_V_PRESHIFT(V[ 4])); 351 t88 = (t146 * 2) - t77;
422 V[27] = -(V[ 5] = MPC_V_PRESHIFT(V[ 5])); 352
423 V[26] = -(V[ 6] = MPC_V_PRESHIFT(V[ 6])); 353 /* 9 */ v[57] = v[39] = -MPC_DCT32_SHIFT(t88);
424 V[25] = -(V[ 7] = MPC_V_PRESHIFT(V[ 7])); 354
425 V[24] = -(V[ 8] = MPC_V_PRESHIFT(V[ 8])); 355 t148 = MPC_DCT32_MUL(t78 - t79, costab08);
426 V[23] = -(V[ 9] = MPC_V_PRESHIFT(V[ 9])); 356 t149 = MPC_DCT32_MUL(t80 - t81, costab24);
427 V[22] = -(V[10] = MPC_V_PRESHIFT(V[10])); 357 t150 = t148 + t149;
428 V[21] = -(V[11] = MPC_V_PRESHIFT(V[11])); 358
429 V[20] = -(V[12] = MPC_V_PRESHIFT(V[12])); 359 t105 = (t150 * 2) - t82;
430 V[19] = -(V[13] = MPC_V_PRESHIFT(V[13])); 360
431 V[18] = -(V[14] = MPC_V_PRESHIFT(V[14])); 361 /* 10 */ v[58] = v[38] = -MPC_DCT32_SHIFT(t105);
432 V[17] = -(V[15] = MPC_V_PRESHIFT(V[15])); 362
433 // 16 adds, 16 shifts (OPTIMIZE_FOR_SPEED only) 363 t152 = MPC_DCT32_MUL(t83 - t84, costab08);
434 364 t153 = MPC_DCT32_MUL(t85 - t86, costab24);
435 V[63] = (V[33] = MPC_V_PRESHIFT(V[33])); 365 t154 = t152 + t153;
436 V[62] = (V[34] = MPC_V_PRESHIFT(V[34])); 366
437 V[61] = (V[35] = MPC_V_PRESHIFT(V[35])); 367 t111 = (t154 * 2) - t87;
438 V[60] = (V[36] = MPC_V_PRESHIFT(V[36])); 368
439 V[59] = (V[37] = MPC_V_PRESHIFT(V[37])); 369 t99 = (t111 * 2) - t88;
440 V[58] = (V[38] = MPC_V_PRESHIFT(V[38])); 370
441 V[57] = (V[39] = MPC_V_PRESHIFT(V[39])); 371 /* 11 */ v[59] = v[37] = -MPC_DCT32_SHIFT(t99);
442 V[56] = (V[40] = MPC_V_PRESHIFT(V[40])); 372
443 V[55] = (V[41] = MPC_V_PRESHIFT(V[41])); 373 t157 = MPC_DCT32_MUL(t89 - t90, costab08);
444 V[54] = (V[42] = MPC_V_PRESHIFT(V[42])); 374 t158 = MPC_DCT32_MUL(t91 - t92, costab24);
445 V[53] = (V[43] = MPC_V_PRESHIFT(V[43])); 375 t159 = t157 + t158;
446 V[52] = (V[44] = MPC_V_PRESHIFT(V[44])); 376
447 V[51] = (V[45] = MPC_V_PRESHIFT(V[45])); 377 t127 = (t159 * 2) - t93;
448 V[50] = (V[46] = MPC_V_PRESHIFT(V[46])); 378
449 V[49] = (V[47] = MPC_V_PRESHIFT(V[47])); 379 /* 12 */ v[60] = v[36] = -MPC_DCT32_SHIFT(t127);
450 V[48] = (V[48] = MPC_V_PRESHIFT(V[48])); 380
451 // 16 adds, 16 shifts (OPTIMIZE_FOR_SPEED only) 381 t160 = (MPC_DCT32_MUL(t125 - t126, costab16) * 2) - t127;
452 382
453 // OPTIMIZE_FOR_SPEED total: 143 adds, 107 subs, 80 muls, 112 shifts 383 /* 20 */ v[28] = -(v[ 4] = MPC_DCT32_SHIFT(t160));
454 // total: 111 adds, 107 subs, 80 muls, 80 shifts 384 /* 28 */ v[20] = -(v[12] = MPC_DCT32_SHIFT((((MPC_DCT32_MUL(t157 - t158, costab16) * 2) - t159) * 2) - t160));
385
386 t161 = MPC_DCT32_MUL(t94 - t95, costab08);
387 t162 = MPC_DCT32_MUL(t96 - t97, costab24);
388 t163 = t161 + t162;
389
390 t130 = (t163 * 2) - t98;
391
392 t112 = (t130 * 2) - t99;
393
394 /* 13 */ v[61] = v[35] = -MPC_DCT32_SHIFT(t112);
395
396 t164 = (MPC_DCT32_MUL(t128 - t129, costab16) * 2) - t130;
397
398 t166 = MPC_DCT32_MUL(t100 - t101, costab08);
399 t167 = MPC_DCT32_MUL(t102 - t103, costab24);
400 t168 = t166 + t167;
401
402 t134 = (t168 * 2) - t104;
403
404 t120 = (t134 * 2) - t105;
405
406 /* 14 */ v[62] = v[34] = -MPC_DCT32_SHIFT(t120);
407
408 t135 = (MPC_DCT32_MUL(t118 - t119, costab16) * 2) - t120;
409
410 /* 18 */ v[30] = -(v[ 2] = MPC_DCT32_SHIFT(t135));
411
412 t169 = (MPC_DCT32_MUL(t132 - t133, costab16) * 2) - t134;
413
414 t151 = (t169 * 2) - t135;
415
416 /* 22 */ v[26] = -(v[ 6] = MPC_DCT32_SHIFT(t151));
417
418 t170 = (((MPC_DCT32_MUL(t148 - t149, costab16) * 2) - t150) * 2) - t151;
419
420 /* 26 */ v[22] = -(v[10] = MPC_DCT32_SHIFT(t170));
421 /* 30 */ v[18] = -(v[14] = MPC_DCT32_SHIFT((((((MPC_DCT32_MUL(t166 - t167, costab16) * 2) - t168) * 2) - t169) * 2) - t170));
422
423 t171 = MPC_DCT32_MUL(t106 - t107, costab08);
424 t172 = MPC_DCT32_MUL(t108 - t109, costab24);
425 t173 = t171 + t172;
426
427 t138 = (t173 * 2) - t110;
428
429 t123 = (t138 * 2) - t111;
430
431 t139 = (MPC_DCT32_MUL(t121 - t122, costab16) * 2) - t123;
432
433 t117 = (t123 * 2) - t112;
434
435 /* 15 */ v[63] = v[33] =-MPC_DCT32_SHIFT(t117);
436
437 t124 = (MPC_DCT32_MUL(t115 - t116, costab16) * 2) - t117;
438
439 /* 17 */ v[31] = -(v[ 1] = MPC_DCT32_SHIFT(t124));
440
441 t131 = (t139 * 2) - t124;
442
443 /* 19 */ v[29] = -(v[ 3] = MPC_DCT32_SHIFT(t131));
444
445 t140 = (t164 * 2) - t131;
446
447 /* 21 */ v[27] = -(v[ 5] = MPC_DCT32_SHIFT(t140));
448
449 t174 = (MPC_DCT32_MUL(t136 - t137, costab16) * 2) - t138;
450
451 t155 = (t174 * 2) - t139;
452
453 t147 = (t155 * 2) - t140;
454
455 /* 23 */ v[25] = -(v[ 7] = MPC_DCT32_SHIFT(t147));
456
457 t156 = (((MPC_DCT32_MUL(t144 - t145, costab16) * 2) - t146) * 2) - t147;
458
459 /* 25 */ v[23] = -(v[ 9] = MPC_DCT32_SHIFT(t156));
460
461 t175 = (((MPC_DCT32_MUL(t152 - t153, costab16) * 2) - t154) * 2) - t155;
462
463 t165 = (t175 * 2) - t156;
464
465 /* 27 */ v[21] = -(v[11] = MPC_DCT32_SHIFT(t165));
466
467 t176 = (((((MPC_DCT32_MUL(t161 - t162, costab16) * 2) - t163) * 2) - t164) * 2) - t165;
468
469 /* 29 */ v[19] = -(v[13] = MPC_DCT32_SHIFT(t176));
470 /* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176));
455} 471}
456 472
457#if defined(CPU_ARM) 473#if defined(CPU_ARM)
@@ -465,71 +481,57 @@ mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
465 const MPC_SAMPLE_FORMAT * V, 481 const MPC_SAMPLE_FORMAT * V,
466 const MPC_SAMPLE_FORMAT * D) 482 const MPC_SAMPLE_FORMAT * D)
467{ 483{
468 mpc_int32_t k; 484 mpc_int32_t k;
469 485
470 #if defined(OPTIMIZE_FOR_SPEED) 486#if defined(CPU_COLDFIRE)
471 // 32=32x32-multiply (FIXED_POINT) 487 // 64=32x32-multiply assembler for Coldfire
472 for ( k = 0; k < 32; k++, D += 16, V++ ) 488 for ( k = 0; k < 32; k++, D += 16, V++ )
473 { 489 {
474 *Data = V[ 0]*D[ 0] + V[ 96]*D[ 1] + V[128]*D[ 2] + V[224]*D[ 3] 490 asm volatile (
475 + V[256]*D[ 4] + V[352]*D[ 5] + V[384]*D[ 6] + V[480]*D[ 7] 491 "movem.l (%[D]), %%d0-%%d3 \n\t"
476 + V[512]*D[ 8] + V[608]*D[ 9] + V[640]*D[10] + V[736]*D[11] 492 "move.l (%[V]), %%a5 \n\t"
477 + V[768]*D[12] + V[864]*D[13] + V[896]*D[14] + V[992]*D[15]; 493 "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t"
478 *Data >>= 1; // post shift to compensate for pre-shifting 494 "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t"
479 Data += 1; 495 "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t"
480 // total: 32 * (16 muls, 15 adds) 496 "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t"
481 } 497 "movem.l (4*4, %[D]), %%d0-%%d3 \n\t"
482 #else 498 "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t"
483 #if defined(CPU_COLDFIRE) 499 "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t"
484 // 64=32x32-multiply assembler for Coldfire 500 "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t"
485 for ( k = 0; k < 32; k++, D += 16, V++ ) 501 "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t"
486 { 502 "movem.l (8*4, %[D]), %%d0-%%d3 \n\t"
487 asm volatile ( 503 "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t"
488 "movem.l (%[D]), %%d0-%%d3 \n\t" 504 "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t"
489 "move.l (%[V]), %%a5 \n\t" 505 "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t"
490 "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t" 506 "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t"
491 "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t" 507 "movem.l (12*4, %[D]), %%d0-%%d3 \n\t"
492 "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t" 508 "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t"
493 "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t" 509 "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t"
494 "movem.l (4*4, %[D]), %%d0-%%d3 \n\t" 510 "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t"
495 "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t" 511 "mac.l %%d3, %%a5, %%acc0 \n\t"
496 "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t" 512 "movclr.l %%acc0, %%d0 \n\t"
497 "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t" 513 "lsl.l #1, %%d0 \n\t"
498 "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t" 514 "move.l %%d0, (%[Data])+ \n"
499 "movem.l (8*4, %[D]), %%d0-%%d3 \n\t" 515 : [Data] "+a" (Data)
500 "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t" 516 : [V] "a" (V), [D] "a" (D)
501 "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t" 517 : "d0", "d1", "d2", "d3", "a5");
502 "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t" 518 }
503 "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t" 519#else
504 "movem.l (12*4, %[D]), %%d0-%%d3 \n\t" 520 // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C
505 "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t" 521 for ( k = 0; k < 32; k++, D += 16, V++ )
506 "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t" 522 {
507 "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t" 523 *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30)
508 "mac.l %%d3, %%a5, %%acc0 \n\t" 524 + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
509 "movclr.l %%acc0, %%d0 \n\t" 525 + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
510 "lsl.l #1, %%d0 \n\t" 526 + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
511 "move.l %%d0, (%[Data])+ \n" 527 + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
512 : [Data] "+a" (Data) 528 + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
513 : [V] "a" (V), [D] "a" (D) 529 + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
514 : "d0", "d1", "d2", "d3", "a5"); 530 + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
515 } 531 Data += 1;
516 #else 532 // total: 16 muls, 15 adds, 16 shifts
517 // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C 533 }
518 for ( k = 0; k < 32; k++, D += 16, V++ ) 534#endif /* COLDFIRE */
519 {
520 *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30)
521 + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
522 + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
523 + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
524 + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
525 + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
526 + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
527 + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
528 Data += 1;
529 // total: 16 muls, 15 adds, 16 shifts
530 }
531 #endif
532 #endif
533} 535}
534#endif /* CPU_ARM */ 536#endif /* CPU_ARM */
535 537
@@ -543,7 +545,7 @@ mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, cons
543 for ( n = 0; n < 36; n++, Y += 32, OutData += 32 ) 545 for ( n = 0; n < 36; n++, Y += 32, OutData += 32 )
544 { 546 {
545 V -= 64; 547 V -= 64;
546 mpc_calculate_new_V ( Y, V ); 548 mpc_dct32(Y, V);
547 mpc_decoder_windowing_D( OutData, V, Di_opt ); 549 mpc_decoder_windowing_D( OutData, V, Di_opt );
548 } 550 }
549 } 551 }
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 8c87b61609..51526040ca 100644
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -1,363 +1,295 @@
1/*************************************************************************** 1/***************************************************************************
2 * __________ __ ___. 2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___ 3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / 4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < 5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ 6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/ 7 * \/ \/ \/ \/ \/
8 * $Id$ 8 * $Id$
9 * 9 *
10 * Copyright (C) 2008 by Andree Buschmann 10 * Copyright (C) 2008 by Andree Buschmann
11 * 11 *
12 * This program is free software; you can redistribute it and/or 12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License 13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2 14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version. 15 * of the License, or (at your option) any later version.
16 * 16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied. 18 * KIND, either express or implied.
19 * 19 *
20 ****************************************************************************/ 20 ****************************************************************************/
21 21
22#include "mpc_config.h" 22#include "mpc_config.h"
23 23
24 .section .text, "ax", %progbits 24 .section .text, "ax", %progbits
25 25
26#if defined(OPTIMIZE_FOR_SPEED) 26/****************************************************************************
27/**************************************************************************** 27 * void mpc_decoder_windowing_D(...)
28 * void mpc_decoder_windowing_D(...) 28 *
29 * 29 * 2nd step within synthesis filter. Does the dewindowing.
30 * 2nd step within synthesis filter. Does the dewindowing. 30 * 64=32x32 multiplies
31 * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED) 31 * Uses un-shifted D[]-values. D[] will always be the second operand of
32 * Uses pre-shifted V[] and D[] values. D[] will always be the second operand 32 * smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
33 * of mul/mla to achieve higher speed as D[] has lower amplitude than V[]. 33 ****************************************************************************/
34 ****************************************************************************/ 34 .align 2
35 .align 2 35 .global mpc_decoder_windowing_D
36 .global mpc_decoder_windowing_D 36 .type mpc_decoder_windowing_D, %function
37 .type mpc_decoder_windowing_D, %function 37#if 0
38mpc_decoder_windowing_D: 38mpc_decoder_windowing_D:
39 /* r0 = Data[] */ 39 /* r0 = Data[] */
40 /* r1 = V[] */ 40 /* r1 = V[] */
41 /* r2 = D[] */ 41 /* r2 = D[] */
42 /* lr = counter */ 42 /* lr = counter */
43 43 /************************************************************************
44 stmfd sp!, {r4-r11, lr} 44 * Reference implementation.
45 45 ***********************************************************************/
46 mov lr, #32 46 stmfd sp!, {r4-r8, lr}
47.loop32: 47
48 ldmia r2!, { r3-r10 } /* load D[00..07] */ 48 mov lr, #32
49 ldr r11, [r1] /* 0 */ 49.loop32:
50 mul r12, r11, r3 50 ldmia r2!, { r3-r6 } /* load D[00..03] */
51 ldr r11, [r1, #96*4] /* 1 */ 51 ldr r7, [r1] /* 0 */
52 mla r12, r11, r4, r12 52 smull r8, r12, r7, r3
53 ldr r11, [r1, #128*4] /* 2 */ 53 ldr r7, [r1, #96*4] /* 1 */
54 mla r12, r11, r5, r12 54 smlal r8, r12, r7, r4
55 ldr r11, [r1, #224*4] /* 3 */ 55 ldr r7, [r1, #128*4] /* 2 */
56 mla r12, r11, r6, r12 56 smlal r8, r12, r7, r5
57 ldr r11, [r1, #256*4] /* 4 */ 57 ldr r7, [r1, #224*4] /* 3 */
58 mla r12, r11, r7, r12 58 smlal r8, r12, r7, r6
59 ldr r11, [r1, #352*4] /* 5 */ 59 ldmia r2!, { r3-r6 } /* load D[04..07] */
60 mla r12, r11, r8, r12 60 ldr r7, [r1, #256*4] /* 4 */
61 ldr r11, [r1, #384*4] /* 6 */ 61 smlal r8, r12, r7, r3
62 mla r12, r11, r9, r12 62 ldr r7, [r1, #352*4] /* 5 */
63 ldr r11, [r1, #480*4] /* 7 */ 63 smlal r8, r12, r7, r4
64 mla r12, r11, r10, r12 64 ldr r7, [r1, #384*4] /* 6 */
65 ldmia r2!, { r3-r10 } /* load D[08..15] */ 65 smlal r8, r12, r7, r5
66 ldr r11, [r1, #512*4] /* 8 */ 66 ldr r7, [r1, #480*4] /* 7 */
67 mla r12, r11, r3, r12 67 smlal r8, r12, r7, r6
68 ldr r11, [r1, #608*4] /* 9 */ 68 ldmia r2!, { r3-r6 } /* load D[08..11] */
69 mla r12, r11, r4, r12 69 ldr r7, [r1, #512*4] /* 8 */
70 ldr r11, [r1, #640*4] /* 10 */ 70 smlal r8, r12, r7, r3
71 mla r12, r11, r5, r12 71 ldr r7, [r1, #608*4] /* 9 */
72 ldr r11, [r1, #736*4] /* 11 */ 72 smlal r8, r12, r7, r4
73 mla r12, r11, r6, r12 73 ldr r7, [r1, #640*4] /* 10 */
74 ldr r11, [r1, #768*4] /* 12 */ 74 smlal r8, r12, r7, r5
75 mla r12, r11, r7, r12 75 ldr r7, [r1, #736*4] /* 11 */
76 ldr r11, [r1, #864*4] /* 13 */ 76 smlal r8, r12, r7, r6
77 mla r12, r11, r8, r12 77 ldmia r2!, { r3-r6 } /* load D[12..15] */
78 ldr r11, [r1, #896*4] /* 14 */ 78 ldr r7, [r1, #768*4] /* 12 */
79 mla r12, r11, r9, r12 79 smlal r8, r12, r7, r3
80 ldr r11, [r1, #992*4] /* 15 */ 80 ldr r7, [r1, #864*4] /* 13 */
81 mla r12, r11, r10, r12 81 smlal r8, r12, r7, r4
82 mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */ 82 ldr r7, [r1, #896*4] /* 14 */
83 str r12, [r0], #4 /* store Data */ 83 smlal r8, r12, r7, r5
84 add r1, r1, #4 /* V++ */ 84 ldr r7, [r1, #992*4] /* 15 */
85 85 smlal r8, r12, r7, r6
86 subs lr, lr, #1 86 mov r8, r8, lsr #16
87 bgt .loop32 87 orr r8, r8, r12, lsl #16 /* (lo>>16) || (hi<<16) */
88 88 str r8, [r0], #4 /* store Data */
89 ldmfd sp!, {r4-r11, pc} 89 add r1, r1, #4 /* V++ */
90.mpc_dewindowing_end: 90
91 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D 91 subs lr, lr, #1
92#else 92 bgt .loop32
93/**************************************************************************** 93
94 * void mpc_decoder_windowing_D(...) 94 ldmfd sp!, {r4-r8, pc}
95 * 95#else
96 * 2nd step within synthesis filter. Does the dewindowing. 96mpc_decoder_windowing_D:
97 * 64=32x32 multiplies 97 /* r0 = Data[] */
98 * Uses un-shifted D[]-values. D[] will always be the second operand of 98 /* r1 = V[] */
99 * smull/smlal to achieve higher speed as D[] has lower amplitude than V[]. 99 /* r2 = D[] */
100 ****************************************************************************/ 100 /* lr = counter */
101 .align 2 101 /************************************************************************
102 .global mpc_decoder_windowing_D 102 * Further speed up through making use of symmetries within D[]-window.
103 .type mpc_decoder_windowing_D, %function 103 * The row V[00] can be extracted as it has symmetries within this single
104#if 0 104 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
105mpc_decoder_windowing_D: 105 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
106 /* r0 = Data[] */ 106 * saved at the cost of 15 x 4 + 1 add's.
107 /* r1 = V[] */ 107 * The row V[16] can be extracted as it has symmetries within this single
108 /* r2 = D[] */ 108 * row. 8 smull/mlal and 8 ldr's can be saved.
109 /* lr = counter */ 109 ***********************************************************************/
110 /************************************************************************ 110 stmfd sp!, {r4-r11, lr}
111 * Reference implementation. 111
112 ***********************************************************************/ 112 /******************************************
113 stmfd sp!, {r4-r8, lr} 113 * row 0 with internal symmetry
114 114 *****************************************/
115 mov lr, #32 115 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
116.loop32: 116 ldmia r2!, { r3-r6 } /* load D[01..04] */
117 ldmia r2!, { r3-r6 } /* load D[00..03] */ 117 ldr r7 , [r1, #96*4] /* 1 */
118 ldr r7, [r1] /* 0 */ 118 ldr r10, [r1, #992*4] /* 15 */
119 smull r8, r12, r7, r3 119 rsb r10, r10, r7 /* V[01] - V[15] */
120 ldr r7, [r1, #96*4] /* 1 */ 120 smull r8, r9, r10, r3
121 smlal r8, r12, r7, r4 121 ldr r7 , [r1, #128*4] /* 2 */
122 ldr r7, [r1, #128*4] /* 2 */ 122 ldr r10, [r1, #896*4] /* 14 */
123 smlal r8, r12, r7, r5 123 add r10, r10, r7 /* V[02] + V[14] */
124 ldr r7, [r1, #224*4] /* 3 */ 124 smlal r8, r9, r10, r4
125 smlal r8, r12, r7, r6 125 ldr r7 , [r1, #224*4] /* 3 */
126 ldmia r2!, { r3-r6 } /* load D[04..07] */ 126 ldr r10, [r1, #864*4] /* 13 */
127 ldr r7, [r1, #256*4] /* 4 */ 127 rsb r10, r10, r7 /* V[03] - V[13] */
128 smlal r8, r12, r7, r3 128 smlal r8, r9, r10, r5
129 ldr r7, [r1, #352*4] /* 5 */ 129 ldr r7 , [r1, #256*4] /* 4 */
130 smlal r8, r12, r7, r4 130 ldr r10, [r1, #768*4] /* 12 */
131 ldr r7, [r1, #384*4] /* 6 */ 131 add r10, r10, r7 /* V[04] + V[12] */
132 smlal r8, r12, r7, r5 132 smlal r8, r9, r10, r6
133 ldr r7, [r1, #480*4] /* 7 */ 133 ldmia r2!, { r3-r6 } /* load D[05..08] */
134 smlal r8, r12, r7, r6 134 ldr r7 , [r1, #352*4] /* 5 */
135 ldmia r2!, { r3-r6 } /* load D[08..11] */ 135 ldr r10, [r1, #736*4] /* 11 */
136 ldr r7, [r1, #512*4] /* 8 */ 136 rsb r10, r10, r7 /* V[05] - V[11] */
137 smlal r8, r12, r7, r3 137 smlal r8, r9, r10, r3
138 ldr r7, [r1, #608*4] /* 9 */ 138 ldr r7 , [r1, #384*4] /* 6 */
139 smlal r8, r12, r7, r4 139 ldr r10, [r1, #640*4] /* 10 */
140 ldr r7, [r1, #640*4] /* 10 */ 140 add r10, r10, r7 /* V[06] + V[10] */
141 smlal r8, r12, r7, r5 141 smlal r8, r9, r10, r4
142 ldr r7, [r1, #736*4] /* 11 */ 142 ldr r7 , [r1, #480*4] /* 7 */
143 smlal r8, r12, r7, r6 143 ldr r10, [r1, #608*4] /* 9 */
144 ldmia r2!, { r3-r6 } /* load D[12..15] */ 144 rsb r10, r10, r7 /* V[07] - V[09] */
145 ldr r7, [r1, #768*4] /* 12 */ 145 smlal r8, r9, r10, r5
146 smlal r8, r12, r7, r3 146 ldr r10, [r1, #512*4] /* 8 */
147 ldr r7, [r1, #864*4] /* 13 */ 147 smlal r8, r9, r10, r6
148 smlal r8, r12, r7, r4 148 mov r8, r8, lsr #16
149 ldr r7, [r1, #896*4] /* 14 */ 149 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
150 smlal r8, r12, r7, r5 150 str r8, [r0], #4 /* store Data */
151 ldr r7, [r1, #992*4] /* 15 */ 151 add r1, r1, #4 /* V+=1, r1 = V[01] */
152 smlal r8, r12, r7, r6 152 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
153 mov r8, r8, lsr #16 153
154 orr r8, r8, r12, lsl #16 /* (lo>>16) || (hi<<16) */ 154 /******************************************
155 str r8, [r0], #4 /* store Data */ 155 * rows 01..15 are symmetric to rows 31..17
156 add r1, r1, #4 /* V++ */ 156 * r8 = lo, r9 = hi of 01..15
157 157 * r1 = V[01..15]
158 subs lr, lr, #1 158 * r10 = lo, r11 = hi of 31..17
159 bgt .loop32 159 * r12 = V[31..16]
160 160 *****************************************/
161 ldmfd sp!, {r4-r8, pc} 161 mov lr, #15
162#else 162 add r12, r1, #30*4 /* r12 = V[31] */
163mpc_decoder_windowing_D: 163.loop15:
164 /* r0 = Data[] */ 164 ldmia r2!, { r3-r6 } /* load D[00..03] */
165 /* r1 = V[] */ 165 ldr r7, [r12, #768*4] /* 12 */
166 /* r2 = D[] */ 166 smull r10, r11, r7, r6
167 /* lr = counter */ 167 ldr r7, [r12, #864*4] /* 13 */
168 /************************************************************************ 168 smlal r10, r11, r7, r5
169 * Further speed up through making use of symmetries within D[]-window. 169 ldr r7, [r12, #896*4] /* 14 */
170 * The row V[00] can be extracted as it has symmetries within this single 170 smlal r10, r11, r7, r4
171 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's. 171 ldr r7, [r12, #992*4] /* 15 */
172 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be 172 smlal r10, r11, r7, r3
173 * saved at the cost of 15 x 4 + 1 add's. 173 ldr r7, [r1] /* 0 */
174 * The row V[16] can be extracted as it has symmetries within this single 174 smull r8, r9, r7, r3
175 * row. 8 smull/mlal and 8 ldr's can be saved. 175 ldr r7, [r1, #96*4] /* 1 */
176 ***********************************************************************/ 176 smlal r8, r9, r7, r4
177 stmfd sp!, {r4-r11, lr} 177 ldr r7, [r1, #128*4] /* 2 */
178 178 smlal r8, r9, r7, r5
179 /****************************************** 179 ldr r7, [r1, #224*4] /* 3 */
180 * row 0 with internal symmetry 180 smlal r8, r9, r7, r6
181 *****************************************/ 181 ldmia r2!, { r3-r6 } /* load D[04..07] */
182 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ 182 ldr r7, [r1, #256*4] /* 4 */
183 ldmia r2!, { r3-r6 } /* load D[01..04] */ 183 smlal r8, r9, r7, r3
184 ldr r7 , [r1, #96*4] /* 1 */ 184 ldr r7, [r1, #352*4] /* 5 */
185 ldr r10, [r1, #992*4] /* 15 */ 185 smlal r8, r9, r7, r4
186 rsb r10, r10, r7 /* V[01] - V[15] */ 186 ldr r7, [r1, #384*4] /* 6 */
187 smull r8, r9, r10, r3 187 smlal r8, r9, r7, r5
188 ldr r7 , [r1, #128*4] /* 2 */ 188 ldr r7, [r1, #480*4] /* 7 */
189 ldr r10, [r1, #896*4] /* 14 */ 189 smlal r8, r9, r7, r6
190 add r10, r10, r7 /* V[02] + V[14] */ 190 ldr r7, [r12, #512*4] /* 8 */
191 smlal r8, r9, r10, r4 191 smlal r10, r11, r7, r6
192 ldr r7 , [r1, #224*4] /* 3 */ 192 ldr r7, [r12, #608*4] /* 9 */
193 ldr r10, [r1, #864*4] /* 13 */ 193 smlal r10, r11, r7, r5
194 rsb r10, r10, r7 /* V[03] - V[13] */ 194 ldr r7, [r12, #640*4] /* 10 */
195 smlal r8, r9, r10, r5 195 smlal r10, r11, r7, r4
196 ldr r7 , [r1, #256*4] /* 4 */ 196 ldr r7, [r12, #736*4] /* 11 */
197 ldr r10, [r1, #768*4] /* 12 */ 197 smlal r10, r11, r7, r3
198 add r10, r10, r7 /* V[04] + V[12] */ 198 ldmia r2!, { r3-r6 } /* load D[08..11] */
199 smlal r8, r9, r10, r6 199 ldr r7, [r12, #256*4] /* 4 */
200 ldmia r2!, { r3-r6 } /* load D[05..08] */ 200 smlal r10, r11, r7, r6
201 ldr r7 , [r1, #352*4] /* 5 */ 201 ldr r7, [r12, #352*4] /* 5 */
202 ldr r10, [r1, #736*4] /* 11 */ 202 smlal r10, r11, r7, r5
203 rsb r10, r10, r7 /* V[05] - V[11] */ 203 ldr r7, [r12, #384*4] /* 6 */
204 smlal r8, r9, r10, r3 204 smlal r10, r11, r7, r4
205 ldr r7 , [r1, #384*4] /* 6 */ 205 ldr r7, [r12, #480*4] /* 7 */
206 ldr r10, [r1, #640*4] /* 10 */ 206 smlal r10, r11, r7, r3
207 add r10, r10, r7 /* V[06] + V[10] */ 207 ldr r7, [r1, #512*4] /* 8 */
208 smlal r8, r9, r10, r4 208 smlal r8, r9, r7, r3
209 ldr r7 , [r1, #480*4] /* 7 */ 209 ldr r7, [r1, #608*4] /* 9 */
210 ldr r10, [r1, #608*4] /* 9 */ 210 smlal r8, r9, r7, r4
211 rsb r10, r10, r7 /* V[07] - V[09] */ 211 ldr r7, [r1, #640*4] /* 10 */
212 smlal r8, r9, r10, r5 212 smlal r8, r9, r7, r5
213 ldr r10, [r1, #512*4] /* 8 */ 213 ldr r7, [r1, #736*4] /* 11 */
214 smlal r8, r9, r10, r6 214 smlal r8, r9, r7, r6
215 mov r8, r8, lsr #16 215 ldmia r2!, { r3-r6 } /* load D[12..15] */
216 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ 216 ldr r7, [r1, #768*4] /* 12 */
217 str r8, [r0], #4 /* store Data */ 217 smlal r8, r9, r7, r3
218 add r1, r1, #4 /* V+=1, r1 = V[01] */ 218 ldr r7, [r1, #864*4] /* 13 */
219 add r2, r2, #7*4 /* D+=7, r2 = D[16] */ 219 smlal r8, r9, r7, r4
220 220 ldr r7, [r1, #896*4] /* 14 */
221 /****************************************** 221 smlal r8, r9, r7, r5
222 * rows 01..15 are symmetric to rows 31..17 237 add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */
223 * r8 = lo, r9 = hi of 01..15 223 smlal r8, r9, r7, r6
224 * r1 = V[01..15] 224 ldr r7, [r12] /* 0 */
225 * r10 = lo, r11 = hi of 31..17 225 smlal r10, r11, r7, r6
226 * r12 = V[31..16] 226 ldr r7, [r12, #96*4] /* 1 */
227 *****************************************/ 227 smlal r10, r11, r7, r5
228 mov lr, #15 228 ldr r7, [r12, #128*4] /* 2 */
229 add r12, r1, #30*4 /* r12 = V[31] */ 229 smlal r10, r11, r7, r4
230.loop15: 230 ldr r7, [r12, #224*4] /* 3 */
231 ldmia r2!, { r3-r6 } /* load D[00..03] */ 231 smlal r10, r11, r7, r3
232 ldr r7, [r12, #768*4] /* 12 */ 232 /* store Data[01..15] */
233 smull r10, r11, r7, r6 233 mov r8, r8, lsr #16
234 ldr r7, [r12, #864*4] /* 13 */ 234 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
235 smlal r10, r11, r7, r5 235 str r8, [r0] /* store Data */
236 ldr r7, [r12, #896*4] /* 14 */ 236 /* store Data[31..17] */
237 smlal r10, r11, r7, r4 237 add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */
238 ldr r7, [r12, #992*4] /* 15 */ 238 mov r10, r10, lsr #16
239 smlal r10, r11, r7, r3 239 orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */
240 ldr r7, [r1] /* 0 */ 240 rsb r10, r10, #0 /* r10 = -r10 */
241 smull r8, r9, r7, r3 241 str r10, [r0], #4 /* store Data */
242 ldr r7, [r1, #96*4] /* 1 */ 242 sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */
243 smlal r8, r9, r7, r4 243 /* correct addresses for next loop */
244 ldr r7, [r1, #128*4] /* 2 */ 244 sub r12, r12, #4 /* r12 = V-- */
245 smlal r8, r9, r7, r5 245 add r1, r1, #4 /* r1 = V++ */
246 ldr r7, [r1, #224*4] /* 3 */ 246 /* next loop */
247 smlal r8, r9, r7, r6 247 subs lr, lr, #1
248 ldmia r2!, { r3-r6 } /* load D[04..07] */ 248 bgt .loop15
249 ldr r7, [r1, #256*4] /* 4 */ 249
250 smlal r8, r9, r7, r3 250 /******************************************
251 ldr r7, [r1, #352*4] /* 5 */ 251 * V[16] with internal symmetry
252 smlal r8, r9, r7, r4 252 *****************************************/
253 ldr r7, [r1, #384*4] /* 6 */ 253 ldmia r2!, { r3-r6 } /* load D[00..03] */
254 smlal r8, r9, r7, r5 254 ldr r7 , [r1] /* 0 */
255 ldr r7, [r1, #480*4] /* 7 */ 255 ldr r10, [r1, #992*4] /* 15 */
256 smlal r8, r9, r7, r6 256 rsb r10, r10, r7 /* V[00] - V[15] */
257 ldr r7, [r12, #512*4] /* 8 */ 257 smull r8, r9, r10, r3
258 smlal r10, r11, r7, r6 258 ldr r7 , [r1, #96*4] /* 1 */
259 ldr r7, [r12, #608*4] /* 9 */ 259 ldr r10, [r1, #896*4] /* 14 */
260 smlal r10, r11, r7, r5 260 rsb r10, r10, r7 /* V[01] - V[14] */
261 ldr r7, [r12, #640*4] /* 10 */ 261 smlal r8, r9, r10, r4
262 smlal r10, r11, r7, r4 262 ldr r7 , [r1, #128*4] /* 2 */
263 ldr r7, [r12, #736*4] /* 11 */ 263 ldr r10, [r1, #864*4] /* 13 */
264 smlal r10, r11, r7, r3 264 rsb r10, r10, r7 /* V[02] - V[13] */
265 ldmia r2!, { r3-r6 } /* load D[08..11] */ 265 smlal r8, r9, r10, r5
266 ldr r7, [r12, #256*4] /* 4 */ 266 ldr r7 , [r1, #224*4] /* 3 */
267 smlal r10, r11, r7, r6 267 ldr r10, [r1, #768*4] /* 12 */
268 ldr r7, [r12, #352*4] /* 5 */ 268 rsb r10, r10, r7 /* V[03] - V[12] */
269 smlal r10, r11, r7, r5 269 smlal r8, r9, r10, r6
270 ldr r7, [r12, #384*4] /* 6 */ 270 ldmia r2!, { r3-r6 } /* load D[04..07] */
271 smlal r10, r11, r7, r4 271 ldr r7 , [r1, #256*4] /* 4 */
272 ldr r7, [r12, #480*4] /* 7 */ 272 ldr r10, [r1, #736*4] /* 11 */
273 smlal r10, r11, r7, r3 273 rsb r10, r10, r7 /* V[04] - V[11] */
274 ldr r7, [r1, #512*4] /* 8 */ 274 smlal r8, r9, r10, r3
275 smlal r8, r9, r7, r3 275 ldr r7 , [r1, #352*4] /* 5 */
276 ldr r7, [r1, #608*4] /* 9 */ 276 ldr r10, [r1, #640*4] /* 10 */
277 smlal r8, r9, r7, r4 277 rsb r10, r10, r7 /* V[05] - V[10] */
278 ldr r7, [r1, #640*4] /* 10 */ 278 smlal r8, r9, r10, r4
279 smlal r8, r9, r7, r5 279 ldr r7 , [r1, #384*4] /* 6 */
280 ldr r7, [r1, #736*4] /* 11 */ 280 ldr r10, [r1, #608*4] /* 9 */
281 smlal r8, r9, r7, r6 281 rsb r10, r10, r7 /* V[06] - V[09] */
282 ldmia r2!, { r3-r6 } /* load D[12..15] */ 282 smlal r8, r9, r10, r5
283 ldr r7, [r1, #768*4] /* 12 */ 283 ldr r7 , [r1, #480*4] /* 7 */
284 smlal r8, r9, r7, r3 284 ldr r10, [r1, #512*4] /* 8 */
285 ldr r7, [r1, #864*4] /* 13 */ 285 rsb r10, r10, r7 /* V[07] - V[08] */
286 smlal r8, r9, r7, r4 286 smlal r8, r9, r10, r6
287 ldr r7, [r1, #896*4] /* 14 */ 287 mov r8, r8, lsr #16
288 smlal r8, r9, r7, r5 288 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
289 ldr r7, [r1, #992*4] /* 15 */ 289 str r8, [r0], #4 /* store Data */
290 smlal r8, r9, r7, r6 290 add r1, r1, #4 /* V++ */
291 ldr r7, [r12] /* 0 */ 291
292 smlal r10, r11, r7, r6 292 ldmfd sp!, {r4-r11, pc}
293 ldr r7, [r12, #96*4] /* 1 */ 293#endif
294 smlal r10, r11, r7, r5 294.mpc_dewindowing_end:
295 ldr r7, [r12, #128*4] /* 2 */ 295 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
296 smlal r10, r11, r7, r4
297 ldr r7, [r12, #224*4] /* 3 */
298 smlal r10, r11, r7, r3
299 /* store Data[01..15] */
300 mov r8, r8, lsr #16
301 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
302 str r8, [r0] /* store Data */
303 /* store Data[31..17] */
304 add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */
305 mov r10, r10, lsr #16
306 orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */
307 rsb r10, r10, #0 /* r10 = -r10 */
308 str r10, [r0], #4 /* store Data */
309 sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */
310 /* correct addresses for next loop */ 244 sub r12, r12, #4 /* r12 = V-- */
311 sub r12, r12, #4 /* r12 = V-- */
312 add r1, r1, #4 /* r1 = V++ */
313 /* next loop */
314 subs lr, lr, #1
315 bgt .loop15
316
317 /******************************************
318 * V[16] with internal symmetry
319 *****************************************/
320 ldmia r2!, { r3-r6 } /* load D[00..03] */
321 ldr r7 , [r1] /* 0 */
322 ldr r10, [r1, #992*4] /* 15 */
323 rsb r10, r10, r7 /* V[00] - V[15] */
324 smull r8, r9, r10, r3
325 ldr r7 , [r1, #96*4] /* 1 */
326 ldr r10, [r1, #896*4] /* 14 */
327 rsb r10, r10, r7 /* V[01] - V[14] */
328 smlal r8, r9, r10, r4
329 ldr r7 , [r1, #128*4] /* 2 */
330 ldr r10, [r1, #864*4] /* 13 */
331 rsb r10, r10, r7 /* V[02] - V[13] */
332 smlal r8, r9, r10, r5
333 ldr r7 , [r1, #224*4] /* 3 */
334 ldr r10, [r1, #768*4] /* 12 */
335 rsb r10, r10, r7 /* V[03] - V[12] */
336 smlal r8, r9, r10, r6
337 ldmia r2!, { r3-r6 } /* load D[04..07] */
338 ldr r7 , [r1, #256*4] /* 4 */
339 ldr r10, [r1, #736*4] /* 11 */
340 rsb r10, r10, r7 /* V[04] - V[11] */
341 smlal r8, r9, r10, r3
342 ldr r7 , [r1, #352*4] /* 5 */
343 ldr r10, [r1, #640*4] /* 10 */
344 rsb r10, r10, r7 /* V[05] - V[10] */
345 smlal r8, r9, r10, r4
346 ldr r7 , [r1, #384*4] /* 6 */
347 ldr r10, [r1, #608*4] /* 9 */
348 rsb r10, r10, r7 /* V[06] - V[09] */
349 smlal r8, r9, r10, r5
350 ldr r7 , [r1, #480*4] /* 7 */
351 ldr r10, [r1, #512*4] /* 8 */
352 rsb r10, r10, r7 /* V[07] - V[08] */
353 smlal r8, r9, r10, r6
354 mov r8, r8, lsr #16
355 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
356 str r8, [r0], #4 /* store Data */
357 add r1, r1, #4 /* V++ */
358
359 ldmfd sp!, {r4-r11, pc}
360#endif
361.mpc_dewindowing_end:
362 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
363#endif