summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNils Wallménius <nils@rockbox.org>2010-07-20 23:35:07 +0000
committerNils Wallménius <nils@rockbox.org>2010-07-20 23:35:07 +0000
commit4f5b390a6df9733b46e254a7e367e066a80ccb9b (patch)
treeb9e8696d7cb431ca739c9c3017189241eca39a84
parentf32294d6abff7c5952b3a0c079a54b53eb42eb40 (diff)
downloadrockbox-4f5b390a6df9733b46e254a7e367e066a80ccb9b.tar.gz
rockbox-4f5b390a6df9733b46e254a7e367e066a80ccb9b.zip
Convert inline coldfire assembler to a 'real' assembler function, with tweaks by Buschel. Speeds up mpc decoding by ~1% on h300.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27504 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libmusepack/SOURCES3
-rw-r--r--apps/codecs/libmusepack/synth_filter.c53
-rw-r--r--apps/codecs/libmusepack/synth_filter_coldfire.S78
3 files changed, 90 insertions, 44 deletions
diff --git a/apps/codecs/libmusepack/SOURCES b/apps/codecs/libmusepack/SOURCES
index 31848214e0..60d762afd2 100644
--- a/apps/codecs/libmusepack/SOURCES
+++ b/apps/codecs/libmusepack/SOURCES
@@ -9,3 +9,6 @@ synth_filter.c
9#if defined(CPU_ARM) 9#if defined(CPU_ARM)
10synth_filter_arm.S 10synth_filter_arm.S
11#endif 11#endif
12#if defined(CPU_COLDFIRE)
13synth_filter_coldfire.S
14#endif
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index 0f415a4838..9a79328106 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -472,7 +472,7 @@ mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v)
472 /* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176)); 472 /* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176));
473} 473}
474 474
475#if defined(CPU_ARM) 475#if defined(CPU_ARM) || defined(CPU_COLDFIRE)
476extern void 476extern void
477mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data, 477mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
478 const MPC_SAMPLE_FORMAT * V, 478 const MPC_SAMPLE_FORMAT * V,
@@ -485,57 +485,22 @@ mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
485{ 485{
486 mpc_int32_t k; 486 mpc_int32_t k;
487 487
488#if defined(CPU_COLDFIRE)
489 // 64=32x32-multiply assembler for Coldfire
490 for ( k = 0; k < 32; k++, D += 16, V++ )
491 {
492 asm volatile (
493 "movem.l (%[D]), %%d0-%%d3 \n\t"
494 "move.l (%[V]), %%a5 \n\t"
495 "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t"
496 "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t"
497 "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t"
498 "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t"
499 "movem.l (4*4, %[D]), %%d0-%%d3 \n\t"
500 "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t"
501 "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t"
502 "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t"
503 "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t"
504 "movem.l (8*4, %[D]), %%d0-%%d3 \n\t"
505 "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t"
506 "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t"
507 "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t"
508 "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t"
509 "movem.l (12*4, %[D]), %%d0-%%d3 \n\t"
510 "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t"
511 "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t"
512 "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t"
513 "mac.l %%d3, %%a5, %%acc0 \n\t"
514 "movclr.l %%acc0, %%d0 \n\t"
515 "lsl.l #1, %%d0 \n\t"
516 "move.l %%d0, (%[Data])+ \n"
517 : [Data] "+a" (Data)
518 : [V] "a" (V), [D] "a" (D)
519 : "d0", "d1", "d2", "d3", "a5");
520 }
521#else
522 // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C 488 // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C
523 for ( k = 0; k < 32; k++, D += 16, V++ ) 489 for ( k = 0; k < 32; k++, D += 16, V++ )
524 { 490 {
525 *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30) 491 *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30)
526 + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30) 492 + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
527 + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30) 493 + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
528 + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30) 494 + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
529 + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30) 495 + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
530 + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30) 496 + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
531 + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30) 497 + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
532 + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30); 498 + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
533 Data += 1; 499 Data += 1;
534 // total: 16 muls, 15 adds, 16 shifts 500 // total: 16 muls, 15 adds, 16 shifts
535 } 501 }
536#endif /* COLDFIRE */
537} 502}
538#endif /* CPU_ARM */ 503#endif /* CPU_ARM || CPU_COLDFIRE */
539 504
540static void 505static void
541mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y) 506mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y)
diff --git a/apps/codecs/libmusepack/synth_filter_coldfire.S b/apps/codecs/libmusepack/synth_filter_coldfire.S
new file mode 100644
index 0000000000..758ab3d496
--- /dev/null
+++ b/apps/codecs/libmusepack/synth_filter_coldfire.S
@@ -0,0 +1,78 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2005 by Thom Johansen
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22#include "config.h"
23/*
24 * void   (global symbol; declared 'extern' in synth_filter.c)
25 * mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
26 * const MPC_SAMPLE_FORMAT * V,
27 * const MPC_SAMPLE_FORMAT * D)
28 */
29
30#if defined(USE_IRAM)
31 .section .icode
32#else
33 .text
34#endif
35 .align 2
36 .global mpc_decoder_windowing_D
37 .type mpc_decoder_windowing_D, @function
38
39mpc_decoder_windowing_D: | windowing: writes 32 outputs to Data, each a 16-term sum of V[]*D[] products
40 lea.l (-9*4, %sp), %sp | reserve stack space for the 9 registers saved below
41 movem.l %d2-%d7/%a2-%a4, (%sp) | save some registers
42 movem.l (9*4+4, %sp), %a0-%a2 | a0 = Data, a1 = V, a2 = D (args lie above the 9 saved regs + return address)
43 moveq.l #32, %d0 | loop counter: 32 iterations, one stored output each
44
45 move.l (%a1), %a4 | preload V[0] for the first mac of the first iteration
46 0: | loop
47 movem.l (%a2), %d1-%d7/%a3 | load D[0..7]
48
49 mac.l %d1, %a4, ( 96*4, %a1), %a4, %acc0 | acc0 += D[0]*V[0]; a4 = V[96] (each mac also loads the V value for the next one)
50 mac.l %d2, %a4, (128*4, %a1), %a4, %acc0 | acc0 += D[1]*V[96]; a4 = V[128]
51 mac.l %d3, %a4, (224*4, %a1), %a4, %acc0 | acc0 += D[2]*V[128]; a4 = V[224]
52 mac.l %d4, %a4, (256*4, %a1), %a4, %acc0 | acc0 += D[3]*V[224]; a4 = V[256]
53 mac.l %d5, %a4, (352*4, %a1), %a4, %acc0 | acc0 += D[4]*V[256]; a4 = V[352]
54 mac.l %d6, %a4, (384*4, %a1), %a4, %acc0 | acc0 += D[5]*V[352]; a4 = V[384]
55 mac.l %d7, %a4, (480*4, %a1), %a4, %acc0 | acc0 += D[6]*V[384]; a4 = V[480]
56 mac.l %a3, %a4, (512*4, %a1), %a4, %acc0 | acc0 += D[7]*V[480]; a4 = V[512]
57 movem.l (8*4, %a2), %d1-%d7/%a3 | load D[8..15]
58 mac.l %d1, %a4, (608*4, %a1), %a4, %acc0 | acc0 += D[8]*V[512]; a4 = V[608]
59 mac.l %d2, %a4, (640*4, %a1), %a4, %acc0 | acc0 += D[9]*V[608]; a4 = V[640]
60 mac.l %d3, %a4, (736*4, %a1), %a4, %acc0 | acc0 += D[10]*V[640]; a4 = V[736]
61 mac.l %d4, %a4, (768*4, %a1), %a4, %acc0 | acc0 += D[11]*V[736]; a4 = V[768]
62 mac.l %d5, %a4, (864*4, %a1), %a4, %acc0 | acc0 += D[12]*V[768]; a4 = V[864]
63 mac.l %d6, %a4, (896*4, %a1), %a4, %acc0 | acc0 += D[13]*V[864]; a4 = V[896]
64 mac.l %d7, %a4, (992*4, %a1), %a4, %acc0 | acc0 += D[14]*V[896]; a4 = V[992]
65 mac.l %a3, %a4, ( 4, %a1), %a4, %acc0 | acc0 += D[15]*V[992]; a4 = V[1], i.e. V[0] of the next iteration
66
67 lea.l (16*4, %a2), %a2 | D += 16
68 addq.l #4, %a1 | V++
69 movclr.l %acc0, %d1 | fetch the sum and clear acc0 for the next iteration
70 lsl.l #1, %d1 | double the result; NOTE(review): presumably matches the C path's shift of 30 given the EMAC mode in use - confirm
71 move.l %d1, (%a0)+ | *Data++ = result
72 subq.l #1, %d0 | one output done
73 bne 0b | repeat until the counter reaches zero
74
75 movem.l (%sp), %d2-%d7/%a2-%a4 | restore stacked regs
76 lea.l (9*4, %sp), %sp | release the stack space reserved on entry
77 rts
78