summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndree Buschmann <AndreeBuschmann@t-online.de>2008-06-15 12:17:22 +0000
committerAndree Buschmann <AndreeBuschmann@t-online.de>2008-06-15 12:17:22 +0000
commit16714539922fb8828180ff605df3586f3eab97c3 (patch)
tree1f2f67dc89313ce9b099222b353675ac170dbc75
parent23bca7e400a9ca5ae26abdae30508485047a058c (diff)
downloadrockbox-16714539922fb8828180ff605df3586f3eab97c3.tar.gz
rockbox-16714539922fb8828180ff605df3586f3eab97c3.zip
Musepack speed optimization. Speed up 64 bit precision synthesizer by another 1.5MHz through using symmetries within D[] filter coefficients. For ARM only.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17724 a1c6a512-1295-4272-9138-f99709370657
-rwxr-xr-xapps/codecs/libmusepack/synth_filter_arm.S204
1 files changed, 203 insertions, 1 deletions
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 41bfda740b..83867086aa 100755
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -99,12 +99,15 @@ mpc_decoder_windowing_D:
99 .align 2 99 .align 2
100 .global mpc_decoder_windowing_D 100 .global mpc_decoder_windowing_D
101 .type mpc_decoder_windowing_D, %function 101 .type mpc_decoder_windowing_D, %function
102#if 0
102mpc_decoder_windowing_D: 103mpc_decoder_windowing_D:
103 /* r0 = Data[] */ 104 /* r0 = Data[] */
104 /* r1 = V[] */ 105 /* r1 = V[] */
105 /* r2 = D[] */ 106 /* r2 = D[] */
106 /* lr = counter */ 107 /* lr = counter */
107 108 /************************************************************************
109 * Reference implementation.
110 ***********************************************************************/
108 stmfd sp!, {r4-r9, lr} 111 stmfd sp!, {r4-r9, lr}
109 112
110 mov lr, #32 113 mov lr, #32
@@ -154,6 +157,205 @@ mpc_decoder_windowing_D:
154 bgt .loop32 157 bgt .loop32
155 158
156 ldmfd sp!, {r4-r9, pc} 159 ldmfd sp!, {r4-r9, pc}
160#else
161mpc_decoder_windowing_D:
162 /* r0 = Data[] (output pointer, advanced as rows are stored) */
163 /* r1 = V[] (input pointer, advanced by one word per row) */
164 /* r2 = D[] (window coefficients, consumed via ldmia r2!) */
165 /* lr = counter (only used as such in the rows 01..15 loop) */
166 /************************************************************************
167 * Further speed up through making use of symmetries within D[]-window.
168 * The row V[00] can be extracted as it has symmetries within this single
169 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
170 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
171 * saved at the cost of 15 x 4 + 1 add's.
172 * The row V[16] can be extracted as it has symmetries within this single
173 * row. 8 smull/mlal and 8 ldr's can be saved.
174 ***********************************************************************/
175 stmfd sp!, {r4-r12, lr}
176
177 /******************************************
178 * row 0 with internal symmetry
179 *****************************************/
180 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
181 ldmia r2!, { r3-r6 } /* load D[01..04] */
182 ldr r7 , [r1, #96*4] /* 1 */
183 ldr r10, [r1, #992*4] /* 15 */
184 rsb r10, r10, r7 /* V[01] - V[15] */
185 smull r8, r9, r10, r3 /* start 64 bit sum: r8 = lo, r9 = hi */
186 ldr r7 , [r1, #128*4] /* 2 */
187 ldr r10, [r1, #896*4] /* 14 */
188 add r10, r10, r7 /* V[02] + V[14] */
189 smlal r8, r9, r10, r4
190 ldr r7 , [r1, #224*4] /* 3 */
191 ldr r10, [r1, #864*4] /* 13 */
192 rsb r10, r10, r7 /* V[03] - V[13] */
193 smlal r8, r9, r10, r5
194 ldr r7 , [r1, #256*4] /* 4 */
195 ldr r10, [r1, #768*4] /* 12 */
196 add r10, r10, r7 /* V[04] + V[12] */
197 smlal r8, r9, r10, r6
198 ldmia r2!, { r3-r6 } /* load D[05..08] */
199 ldr r7 , [r1, #352*4] /* 5 */
200 ldr r10, [r1, #736*4] /* 11 */
201 rsb r10, r10, r7 /* V[05] - V[11] */
202 smlal r8, r9, r10, r3
203 ldr r7 , [r1, #384*4] /* 6 */
204 ldr r10, [r1, #640*4] /* 10 */
205 add r10, r10, r7 /* V[06] + V[10] */
206 smlal r8, r9, r10, r4
207 ldr r7 , [r1, #480*4] /* 7 */
208 ldr r10, [r1, #608*4] /* 9 */
209 rsb r10, r10, r7 /* V[07] - V[09] */
210 smlal r8, r9, r10, r5
211 ldr r10, [r1, #512*4] /* 8, centre element without partner */
212 smlal r8, r9, r10, r6
213 mov r8, r8, lsr #16
214 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
215 str r8, [r0], #4 /* store Data[00], r0 = Data[01] */
216 add r1, r1, #4 /* V+=1, r1 = V[01] */
217 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
218
219 /******************************************
220 * rows 01..15 are symmetric to rows 31..17
221 * r8 = lo, r9 = hi of 01..15
222 * r1 = V[01..15]
223 * r10 = lo, r11 = hi of 31..17
224 * r12 = V[31..17]
225 *****************************************/
226 mov lr, #15 /* 15 row pairs */
227 add r12, r1, #30*4 /* r12 = V[31] */
228.loop15:
229 ldmia r2!, { r3-r6 } /* load D[00..03] of the current row */
230 ldr r7, [r12, #768*4] /* 12 */
231 smull r10, r11, r7, r6 /* mirrored row uses D[] in reverse order */
232 ldr r7, [r12, #864*4] /* 13 */
233 smlal r10, r11, r7, r5
234 ldr r7, [r12, #896*4] /* 14 */
235 smlal r10, r11, r7, r4
236 ldr r7, [r12, #992*4] /* 15 */
237 smlal r10, r11, r7, r3
238 ldr r7, [r1] /* 0 */
239 smull r8, r9, r7, r3
240 ldr r7, [r1, #96*4] /* 1 */
241 smlal r8, r9, r7, r4
242 ldr r7, [r1, #128*4] /* 2 */
243 smlal r8, r9, r7, r5
244 ldr r7, [r1, #224*4] /* 3 */
245 smlal r8, r9, r7, r6
246 ldmia r2!, { r3-r6 } /* load D[04..07] */
247 ldr r7, [r1, #256*4] /* 4 */
248 smlal r8, r9, r7, r3
249 ldr r7, [r1, #352*4] /* 5 */
250 smlal r8, r9, r7, r4
251 ldr r7, [r1, #384*4] /* 6 */
252 smlal r8, r9, r7, r5
253 ldr r7, [r1, #480*4] /* 7 */
254 smlal r8, r9, r7, r6
255 ldr r7, [r12, #512*4] /* 8 */
256 smlal r10, r11, r7, r6
257 ldr r7, [r12, #608*4] /* 9 */
258 smlal r10, r11, r7, r5
259 ldr r7, [r12, #640*4] /* 10 */
260 smlal r10, r11, r7, r4
261 ldr r7, [r12, #736*4] /* 11 */
262 smlal r10, r11, r7, r3
263 ldmia r2!, { r3-r6 } /* load D[08..11] */
264 ldr r7, [r12, #256*4] /* 4 */
265 smlal r10, r11, r7, r6
266 ldr r7, [r12, #352*4] /* 5 */
267 smlal r10, r11, r7, r5
268 ldr r7, [r12, #384*4] /* 6 */
269 smlal r10, r11, r7, r4
270 ldr r7, [r12, #480*4] /* 7 */
271 smlal r10, r11, r7, r3
272 ldr r7, [r1, #512*4] /* 8 */
273 smlal r8, r9, r7, r3
274 ldr r7, [r1, #608*4] /* 9 */
275 smlal r8, r9, r7, r4
276 ldr r7, [r1, #640*4] /* 10 */
277 smlal r8, r9, r7, r5
278 ldr r7, [r1, #736*4] /* 11 */
279 smlal r8, r9, r7, r6
280 ldmia r2!, { r3-r6 } /* load D[12..15] */
281 ldr r7, [r1, #768*4] /* 12 */
282 smlal r8, r9, r7, r3
283 ldr r7, [r1, #864*4] /* 13 */
284 smlal r8, r9, r7, r4
285 ldr r7, [r1, #896*4] /* 14 */
286 smlal r8, r9, r7, r5
287 ldr r7, [r1, #992*4] /* 15 */
288 smlal r8, r9, r7, r6
289 ldr r7, [r12] /* 0 */
290 smlal r10, r11, r7, r6
291 ldr r7, [r12, #96*4] /* 1 */
292 smlal r10, r11, r7, r5
293 ldr r7, [r12, #128*4] /* 2 */
294 smlal r10, r11, r7, r4
295 ldr r7, [r12, #224*4] /* 3 */
296 smlal r10, r11, r7, r3
297 /* store Data[01..15] */
298 mov r8, r8, lsr #16
299 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
300 str r8, [r0] /* store Data, r0 is advanced by the mirrored store below */
301 /* store Data[31..17] (negated sum of the mirrored row) */
302 add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words], points to mirrored slot */
303 mov r10, r10, lsr #16
304 orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */
305 rsb r10, r10, #0 /* r10 = -r10 */
306 str r10, [r0], #4 /* store Data, post-increment by one word */
307 sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words], net effect: Data++ */
308 /* correct addresses for next loop */
309 sub r12, r12, #4 /* r12 = V-- */
310 add r1, r1, #4 /* r1 = V++ */
311 /* next loop */
312 subs lr, lr, #1
313 bgt .loop15
314
315 /******************************************
316 * row 16 (V[16]) with internal symmetry
317 *****************************************/
318 ldmia r2!, { r3-r6 } /* load D[00..03] */
319 ldr r7 , [r1] /* 0 */
320 ldr r10, [r1, #992*4] /* 15 */
321 rsb r10, r10, r7 /* V[00] - V[15] */
322 smull r8, r9, r10, r3
323 ldr r7 , [r1, #96*4] /* 1 */
324 ldr r10, [r1, #896*4] /* 14 */
325 rsb r10, r10, r7 /* V[01] - V[14] */
326 smlal r8, r9, r10, r4
327 ldr r7 , [r1, #128*4] /* 2 */
328 ldr r10, [r1, #864*4] /* 13 */
329 rsb r10, r10, r7 /* V[02] - V[13] */
330 smlal r8, r9, r10, r5
331 ldr r7 , [r1, #224*4] /* 3 */
332 ldr r10, [r1, #768*4] /* 12 */
333 rsb r10, r10, r7 /* V[03] - V[12] */
334 smlal r8, r9, r10, r6
335 ldmia r2!, { r3-r6 } /* load D[04..07] */
336 ldr r7 , [r1, #256*4] /* 4 */
337 ldr r10, [r1, #736*4] /* 11 */
338 rsb r10, r10, r7 /* V[04] - V[11] */
339 smlal r8, r9, r10, r3
340 ldr r7 , [r1, #352*4] /* 5 */
341 ldr r10, [r1, #640*4] /* 10 */
342 rsb r10, r10, r7 /* V[05] - V[10] */
343 smlal r8, r9, r10, r4
344 ldr r7 , [r1, #384*4] /* 6 */
345 ldr r10, [r1, #608*4] /* 9 */
346 rsb r10, r10, r7 /* V[06] - V[09] */
347 smlal r8, r9, r10, r5
348 ldr r7 , [r1, #480*4] /* 7 */
349 ldr r10, [r1, #512*4] /* 8 */
350 rsb r10, r10, r7 /* V[07] - V[08] */
351 smlal r8, r9, r10, r6
352 mov r8, r8, lsr #16
353 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
354 str r8, [r0], #4 /* store Data[16]; the post-increment is not used afterwards */
355 add r1, r1, #4 /* V++ (r1 is not read again before return) */
356
357 ldmfd sp!, {r4-r12, pc}
358#endif
157.mpc_dewindowing_end: 359.mpc_dewindowing_end:
158 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D 360 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
159#endif 361#endif