diff options
Diffstat (limited to 'apps')
-rw-r--r-- | apps/codecs/libmusepack/synth_filter_arm.S | 210 |
1 files changed, 208 insertions, 2 deletions
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index 731a21ce21..5bdae93561 100644 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S | |||
@@ -92,7 +92,7 @@ mpc_decoder_windowing_D: | |||
92 | bgt .loop32 | 92 | bgt .loop32 |
93 | 93 | ||
94 | ldmpc regs=r4-r8 | 94 | ldmpc regs=r4-r8 |
95 | #else | 95 | #elif defined(CPU_ARM7TDMI) /* arm7 only */ |
96 | mpc_decoder_windowing_D: | 96 | mpc_decoder_windowing_D: |
97 | /* r0 = Data[] */ | 97 | /* r0 = Data[] */ |
98 | /* r1 = V[] */ | 98 | /* r1 = V[] */ |
@@ -106,6 +106,7 @@ mpc_decoder_windowing_D: | |||
106 | * saved at the cost of 15 x 4 + 1 add's. | 106 | * saved at the cost of 15 x 4 + 1 add's. |
107 | * The row V[16] can be extracted as it has symmetries within this single | 107 | * The row V[16] can be extracted as it has symmetries within this single |
108 | * row. 8 smull/mlal and 8 ldr's can be saved. | 108 | * row. 8 smull/mlal and 8 ldr's can be saved. |
109 | * Used for arm7 only. For arm9 and above see implementation below. | ||
109 | ***********************************************************************/ | 110 | ***********************************************************************/ |
110 | stmfd sp!, {r4-r11, lr} | 111 | stmfd sp!, {r4-r11, lr} |
111 | 112 | ||
@@ -152,7 +153,7 @@ mpc_decoder_windowing_D: | |||
152 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ | 153 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ |
153 | 154 | ||
154 | /****************************************** | 155 | /****************************************** |
155 | * rows 01..15 are symmetrc to rows 31..17 | 156 | * rows 01..15 are symmetric to rows 31..17 |
156 | * r8 = lo, r9 = hi of 01..15 | 157 | * r8 = lo, r9 = hi of 01..15 |
157 | * r1 = V[01..15] | 158 | * r1 = V[01..15] |
158 | * r10 = lo, r11 = hi of 31..17 | 159 | * r10 = lo, r11 = hi of 31..17 |
@@ -290,6 +291,211 @@ mpc_decoder_windowing_D: | |||
290 | add r1, r1, #4 /* V++ */ | 291 | add r1, r1, #4 /* V++ */ |
291 | 292 | ||
292 | ldmpc regs=r4-r11 | 293 | ldmpc regs=r4-r11 |
294 | #else /* arm9 and above */ | ||
295 | mpc_decoder_windowing_D: | ||
296 | /* r0 = Data[] (output, written with str) */ | ||
297 | /* r1 = V[] (input, only read) */ | ||
298 | /* r2 = D[] (window coefficients, only read) */ | ||
299 | /* lr = loop counter for rows 01..15 */ | ||
300 | /************************************************************************ | ||
301 | * Further speed up through making use of symmetries within D[]-window. | ||
302 | * The row V[00] can be extracted as it has symmetries within this single | ||
303 | * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's. | ||
304 | * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be | ||
305 | * saved at the cost of 15 x 4 + 1 add's. | ||
306 | * The row V[16] can be extracted as it has symmetries within this single | ||
307 | * row. 8 smull/mlal and 8 ldr's can be saved. | ||
308 | * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds | ||
309 | * up decoding even though several ldm-calls are replaced with ldr to free | ||
310 | * 2 registers. | ||
311 | ***********************************************************************/ | ||
312 | stmfd sp!, {r4-r11, lr} /* lr is saved as it is reused as loop counter */ | ||
313 | |||
314 | /****************************************** | ||
315 | * row 0 with internal symmetry | ||
316 | *****************************************/ | ||
317 | add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ | ||
318 | ldmia r2!, { r3-r6 } /* load D[01..04] */ | ||
319 | ldr r7 , [r1, #96*4] /* 1 */ | ||
320 | ldr r10, [r1, #992*4] /* 15 */ | ||
321 | ldr r11, [r1, #128*4] /* 2 */ | ||
322 | ldr r12, [r1, #896*4] /* 14 */ | ||
323 | rsb r10, r10, r7 /* V[01] - V[15] */ | ||
324 | smull r8, r9, r10, r3 | ||
325 | ldr r7 , [r1, #224*4] /* 3 */ | ||
326 | ldr r10, [r1, #864*4] /* 13 */ | ||
327 | add r12, r12, r11 /* V[02] + V[14] */ | ||
328 | smlal r8, r9, r12, r4 | ||
329 | ldr r11, [r1, #256*4] /* 4 */ | ||
330 | ldr r12, [r1, #768*4] /* 12 */ | ||
331 | rsb r10, r10, r7 /* V[03] - V[13] */ | ||
332 | smlal r8, r9, r10, r5 | ||
333 | ldr r7 , [r1, #352*4] /* 5 */ | ||
334 | ldr r10, [r1, #736*4] /* 11 */ | ||
335 | add r12, r12, r11 /* V[04] + V[12] */ | ||
336 | smlal r8, r9, r12, r6 | ||
337 | ldmia r2!, { r3-r6 } /* load D[05..08] */ | ||
338 | ldr r11, [r1, #384*4] /* 6 */ | ||
339 | ldr r12, [r1, #640*4] /* 10 */ | ||
340 | rsb r10, r10, r7 /* V[05] - V[11] */ | ||
341 | smlal r8, r9, r10, r3 | ||
342 | ldr r7 , [r1, #480*4] /* 7 */ | ||
343 | ldr r10, [r1, #608*4] /* 9 */ | ||
344 | add r12, r12, r11 /* V[06] + V[10] */ | ||
345 | smlal r8, r9, r12, r4 | ||
346 | ldr r11, [r1, #512*4] /* 8 */ | ||
347 | rsb r10, r10, r7 /* V[07] - V[09] */ | ||
348 | smlal r8, r9, r10, r5 | ||
349 | smlal r8, r9, r11, r6 | ||
350 | mov r8, r8, lsr #16 | ||
351 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
352 | str r8, [r0], #4 /* store Data */ | ||
353 | add r1, r1, #4 /* V+=1, r1 = V[01] */ | ||
354 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ | ||
355 | |||
356 | /****************************************** | ||
357 | * rows 01..15 are symmetric to rows 31..17 | ||
358 | * r8 = lo, r9 = hi of 01..15 | ||
359 | * r1 = V[01..15] | ||
360 | * r10 = lo, r11 = hi of 31..17 | ||
361 | * r12 = V[31..16] | ||
362 | *****************************************/ | ||
363 | mov lr, #15 | ||
364 | add r12, r1, #30*4 /* r12 = V[31] */ | ||
365 | .loop15: | ||
366 | ldmia r2!, { r3-r4 } /* load D[00..01] */ | ||
367 | ldr r7, [r12, #896*4] /* 14 */ | ||
368 | ldr r5, [r12, #992*4] /* 15 */ | ||
369 | smull r10, r11, r7, r4 | ||
370 | ldr r7, [r1] /* 0 */ | ||
371 | smlal r10, r11, r5, r3 | ||
372 | ldr r5, [r1, #96*4] /* 1 */ | ||
373 | smull r8, r9, r7, r3 | ||
374 | ldr r7, [r12, #768*4] /* 12 */ | ||
375 | smlal r8, r9, r5, r4 | ||
376 | ldmia r2!, { r3-r4 } /* load D[02..03] */ | ||
377 | ldr r5, [r12, #864*4] /* 13 */ | ||
378 | smlal r10, r11, r7, r4 | ||
379 | ldr r7, [r1, #128*4] /* 2 */ | ||
380 | smlal r10, r11, r5, r3 | ||
381 | ldr r5, [r1, #224*4] /* 3 */ | ||
382 | smlal r8, r9, r7, r3 | ||
383 | ldr r7, [r1, #256*4] /* 4 */ | ||
384 | smlal r8, r9, r5, r4 | ||
385 | ldmia r2!, { r3-r4 } /* load D[04..05] */ | ||
386 | ldr r5, [r1, #352*4] /* 5 */ | ||
387 | smlal r8, r9, r7, r3 | ||
388 | ldr r7, [r12, #640*4] /* 10 */ | ||
389 | smlal r8, r9, r5, r4 | ||
390 | ldr r5, [r12, #736*4] /* 11 */ | ||
391 | smlal r10, r11, r7, r4 | ||
392 | ldr r7, [r1, #384*4] /* 6 */ | ||
393 | smlal r10, r11, r5, r3 | ||
394 | ldmia r2!, { r3-r4 } /* load D[06..07] */ | ||
395 | ldr r5, [r1, #480*4] /* 7 */ | ||
396 | smlal r8, r9, r7, r3 | ||
397 | ldr r7, [r12, #512*4] /* 8 */ | ||
398 | smlal r8, r9, r5, r4 | ||
399 | ldr r5, [r12, #608*4] /* 9 */ | ||
400 | smlal r10, r11, r7, r4 | ||
401 | ldr r7, [r12, #384*4] /* 6 */ | ||
402 | smlal r10, r11, r5, r3 | ||
403 | ldmia r2!, { r3-r4 } /* load D[08..09] */ | ||
404 | ldr r5, [r12, #480*4] /* 7 */ | ||
405 | smlal r10, r11, r7, r4 | ||
406 | ldr r7, [r1, #512*4] /* 8 */ | ||
407 | smlal r10, r11, r5, r3 | ||
408 | ldr r5, [r1, #608*4] /* 9 */ | ||
409 | smlal r8, r9, r7, r3 | ||
410 | ldr r7, [r1, #640*4] /* 10 */ | ||
411 | smlal r8, r9, r5, r4 | ||
412 | ldmia r2!, { r3-r4 } /* load D[10..11] */ | ||
413 | ldr r5, [r1, #736*4] /* 11 */ | ||
414 | smlal r8, r9, r7, r3 | ||
415 | ldr r7, [r12, #256*4] /* 4 */ | ||
416 | smlal r8, r9, r5, r4 | ||
417 | ldr r5, [r12, #352*4] /* 5 */ | ||
418 | smlal r10, r11, r7, r4 | ||
419 | ldr r7, [r1, #768*4] /* 12 */ | ||
420 | smlal r10, r11, r5, r3 | ||
421 | ldmia r2!, { r3-r4 } /* load D[12..13] */ | ||
422 | ldr r5, [r1, #864*4] /* 13 */ | ||
423 | smlal r8, r9, r7, r3 | ||
424 | ldr r7, [r12, #128*4] /* 2 */ | ||
425 | smlal r8, r9, r5, r4 | ||
426 | ldr r5, [r12, #224*4] /* 3 */ | ||
427 | smlal r10, r11, r7, r4 | ||
428 | ldr r7, [r12] /* 0 */ | ||
429 | smlal r10, r11, r5, r3 | ||
430 | ldmia r2!, { r3-r4 } /* load D[14..15] */ | ||
431 | ldr r5, [r12, #96*4] /* 1 */ | ||
432 | smlal r10, r11, r7, r4 | ||
433 | ldr r7, [r1, #896*4] /* 14 */ | ||
434 | smlal r10, r11, r5, r3 | ||
435 | ldr r5, [r1, #992*4] /* 15 */ | ||
436 | smlal r8, r9, r7, r3 | ||
437 | smlal r8, r9, r5, r4 | ||
438 | /* store Data[01..15] */ | ||
439 | mov r8, r8, lsr #16 | ||
440 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
441 | str r8, [r0] /* store Data */ | ||
442 | /* store Data[31..17] */ | ||
443 | add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */ | ||
444 | mov r10, r10, lsr #16 | ||
445 | orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
446 | rsb r10, r10, #0 /* r10 = -r10 */ | ||
447 | str r10, [r0], #4 /* store Data */ | ||
448 | sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */ | ||
449 | /* correct addresses for next loop */ | ||
450 | sub r12, r12, #4 /* r12 = V-- */ | ||
451 | add r1, r1, #4 /* r1 = V++ */ | ||
452 | /* next loop */ | ||
453 | subs lr, lr, #1 | ||
454 | bgt .loop15 | ||
455 | |||
456 | /****************************************** | ||
457 | * V[16] with internal symmetry | ||
458 | *****************************************/ | ||
459 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
460 | ldr r7 , [r1] /* 0 */ | ||
461 | ldr r10, [r1, #992*4] /* 15 */ | ||
462 | ldr r11, [r1, #96*4] /* 1 */ | ||
463 | ldr r12, [r1, #896*4] /* 14 */ | ||
464 | rsb r10, r10, r7 /* V[00] - V[15] */ | ||
465 | smull r8, r9, r10, r3 | ||
466 | ldr r7 , [r1, #128*4] /* 2 */ | ||
467 | ldr r10, [r1, #864*4] /* 13 */ | ||
468 | rsb r12, r12, r11 /* V[01] - V[14] */ | ||
469 | smlal r8, r9, r12, r4 | ||
470 | ldr r11, [r1, #224*4] /* 3 */ | ||
471 | ldr r12, [r1, #768*4] /* 12 */ | ||
472 | rsb r10, r10, r7 /* V[02] - V[13] */ | ||
473 | smlal r8, r9, r10, r5 | ||
474 | ldr r7 , [r1, #256*4] /* 4 */ | ||
475 | ldr r10, [r1, #736*4] /* 11 */ | ||
476 | rsb r12, r12, r11 /* V[03] - V[12] */ | ||
477 | smlal r8, r9, r12, r6 | ||
478 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
479 | ldr r11, [r1, #352*4] /* 5 */ | ||
480 | ldr r12, [r1, #640*4] /* 10 */ | ||
481 | rsb r10, r10, r7 /* V[04] - V[11] */ | ||
482 | smlal r8, r9, r10, r3 | ||
483 | ldr r7 , [r1, #384*4] /* 6 */ | ||
484 | ldr r10, [r1, #608*4] /* 9 */ | ||
485 | rsb r12, r12, r11 /* V[05] - V[10] */ | ||
486 | smlal r8, r9, r12, r4 | ||
487 | ldr r11, [r1, #480*4] /* 7 */ | ||
488 | ldr r12, [r1, #512*4] /* 8 */ | ||
489 | rsb r10, r10, r7 /* V[06] - V[09] */ | ||
490 | smlal r8, r9, r10, r5 | ||
491 | rsb r12, r12, r11 /* V[07] - V[08] */ | ||
492 | smlal r8, r9, r12, r6 | ||
493 | mov r8, r8, lsr #16 | ||
494 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
495 | str r8, [r0], #4 /* store Data */ | ||
496 | add r1, r1, #4 /* V++ */ | ||
497 | |||
498 | ldmpc regs=r4-r11 | ||
293 | #endif | 499 | #endif |
294 | .mpc_dewindowing_end: | 500 | .mpc_dewindowing_end: |
295 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D | 501 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D |