From 4f5b390a6df9733b46e254a7e367e066a80ccb9b Mon Sep 17 00:00:00 2001 From: Nils Wallménius Date: Tue, 20 Jul 2010 23:35:07 +0000 Subject: Convert inline coldfire assembler to a 'real' assembler function, with tweaks by Buschel. Speeds up mpc decoding by ~1% on h300. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27504 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmusepack/SOURCES | 3 + apps/codecs/libmusepack/synth_filter.c | 53 +++-------------- apps/codecs/libmusepack/synth_filter_coldfire.S | 78 +++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 44 deletions(-) create mode 100644 apps/codecs/libmusepack/synth_filter_coldfire.S diff --git a/apps/codecs/libmusepack/SOURCES b/apps/codecs/libmusepack/SOURCES index 31848214e0..60d762afd2 100644 --- a/apps/codecs/libmusepack/SOURCES +++ b/apps/codecs/libmusepack/SOURCES @@ -9,3 +9,6 @@ synth_filter.c #if defined(CPU_ARM) synth_filter_arm.S #endif +#if defined(CPU_COLDFIRE) +synth_filter_coldfire.S +#endif diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c index 0f415a4838..9a79328106 100644 --- a/apps/codecs/libmusepack/synth_filter.c +++ b/apps/codecs/libmusepack/synth_filter.c @@ -472,7 +472,7 @@ mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v) /* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176)); } -#if defined(CPU_ARM) +#if defined(CPU_ARM) || defined(CPU_COLDFIRE) extern void mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data, const MPC_SAMPLE_FORMAT * V, @@ -485,57 +485,22 @@ mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data, { mpc_int32_t k; -#if defined(CPU_COLDFIRE) - // 64=32x32-multiply assembler for Coldfire - for ( k = 0; k < 32; k++, D += 16, V++ ) - { - asm volatile ( - "movem.l (%[D]), %%d0-%%d3 \n\t" - "move.l (%[V]), %%a5 \n\t" - "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t" - "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t" - "mac.l 
%%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (4*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (8*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (12*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, %%acc0 \n\t" - "movclr.l %%acc0, %%d0 \n\t" - "lsl.l #1, %%d0 \n\t" - "move.l %%d0, (%[Data])+ \n" - : [Data] "+a" (Data) - : [V] "a" (V), [D] "a" (D) - : "d0", "d1", "d2", "d3", "a5"); - } -#else // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C for ( k = 0; k < 32; k++, D += 16, V++ ) { *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30) - + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30) - + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30) - + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30) - + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30) - + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30) - + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30) - + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30); + + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30) + + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30) + + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30) + + MPC_MULTIPLY_EX(V[512],D[ 8],30) + 
MPC_MULTIPLY_EX(V[608],D[ 9],30) + + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30) + + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30) + + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30); Data += 1; // total: 16 muls, 15 adds, 16 shifts } -#endif /* COLDFIRE */ } -#endif /* CPU_ARM */ +#endif /* CPU_ARM || CPU_COLDFIRE */ static void mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y) diff --git a/apps/codecs/libmusepack/synth_filter_coldfire.S b/apps/codecs/libmusepack/synth_filter_coldfire.S new file mode 100644 index 0000000000..758ab3d496 --- /dev/null +++ b/apps/codecs/libmusepack/synth_filter_coldfire.S @@ -0,0 +1,78 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2005 by Thom Johansen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ *
+ ****************************************************************************/
+
+#include "config.h"
+/*
+ * void  (global symbol; synth_filter.c declares it extern for CPU_COLDFIRE)
+ * mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,    out: 32 results written
+ *                         const MPC_SAMPLE_FORMAT * V, in: words [0..992]+k read
+ *                         const MPC_SAMPLE_FORMAT * D) in: 32 rows of 16 words
+ */
+
+#if defined(USE_IRAM)
+    .section .icode
+#else
+    .text
+#endif
+    .align 2
+    .global mpc_decoder_windowing_D
+    .type mpc_decoder_windowing_D, @function
+
+mpc_decoder_windowing_D:
+    lea.l (-9*4, %sp), %sp                      | reserve stack space for 9 longwords
+    movem.l %d2-%d7/%a2-%a4, (%sp)              | save d2-d7 and a2-a4 (the 9 registers modified below)
+    movem.l (9*4+4, %sp), %a0-%a2               | args lie past the save area and return address: a0 = Data, a1 = V, a2 = D
+    moveq.l #32, %d0                            | loop counter: one pass per output word
+
+    move.l (%a1), %a4                           | preload a4 = V[0] as operand of the first mac
+0:                                              | loop: each mac multiplies by the current a4, then reloads a4
+    movem.l (%a2), %d1-%d7/%a3                  | d1-d7/a3 = D[0..7]
+
+    mac.l %d1, %a4, ( 96*4, %a1), %a4, %acc0    | acc0 += D[0] * V[0],    a4 = V[96]
+    mac.l %d2, %a4, (128*4, %a1), %a4, %acc0    | acc0 += D[1] * V[96],   a4 = V[128]
+    mac.l %d3, %a4, (224*4, %a1), %a4, %acc0    | acc0 += D[2] * V[128],  a4 = V[224]
+    mac.l %d4, %a4, (256*4, %a1), %a4, %acc0    | acc0 += D[3] * V[224],  a4 = V[256]
+    mac.l %d5, %a4, (352*4, %a1), %a4, %acc0    | acc0 += D[4] * V[256],  a4 = V[352]
+    mac.l %d6, %a4, (384*4, %a1), %a4, %acc0    | acc0 += D[5] * V[352],  a4 = V[384]
+    mac.l %d7, %a4, (480*4, %a1), %a4, %acc0    | acc0 += D[6] * V[384],  a4 = V[480]
+    mac.l %a3, %a4, (512*4, %a1), %a4, %acc0    | acc0 += D[7] * V[480],  a4 = V[512]
+    movem.l (8*4, %a2), %d1-%d7/%a3             | d1-d7/a3 = D[8..15]
+    mac.l %d1, %a4, (608*4, %a1), %a4, %acc0    | acc0 += D[8] * V[512],  a4 = V[608]
+    mac.l %d2, %a4, (640*4, %a1), %a4, %acc0    | acc0 += D[9] * V[608],  a4 = V[640]
+    mac.l %d3, %a4, (736*4, %a1), %a4, %acc0    | acc0 += D[10] * V[640], a4 = V[736]
+    mac.l %d4, %a4, (768*4, %a1), %a4, %acc0    | acc0 += D[11] * V[736], a4 = V[768]
+    mac.l %d5, %a4, (864*4, %a1), %a4, %acc0    | acc0 += D[12] * V[768], a4 = V[864]
+    mac.l %d6, %a4, (896*4, %a1), %a4, %acc0    | acc0 += D[13] * V[864], a4 = V[896]
+    mac.l %d7, %a4, (992*4, %a1), %a4, %acc0    | acc0 += D[14] * V[896], a4 = V[992]
+    mac.l %a3, %a4, (    4, %a1), %a4, %acc0    | acc0 += D[15] * V[992], a4 = V[1] = V[0] of the next pass (fetched but unused on the last pass)
+
+    lea.l (16*4, %a2), %a2                      | D += 16 words: next row of coefficients
+    addq.l #4, %a1                              | V += 1 word
+    movclr.l %acc0, %d1                         | d1 = accumulated sum, acc0 cleared for the next pass
+    lsl.l #1, %d1                               | NOTE(review): x2 presumably aligns the result with the 30 bit shift of the C version; depends on the EMAC mode in effect - confirm
+    move.l %d1, (%a0)+                          | store the result and advance Data
+    subq.l #1, %d0
+    bne 0b                                      | repeat until all 32 words are written
+
+    movem.l (%sp), %d2-%d7/%a2-%a4              | restore stacked regs
+    lea.l (9*4, %sp), %sp                       | release the save area
+    rts
+
-- 
cgit v1.2.3