From 4f5b390a6df9733b46e254a7e367e066a80ccb9b Mon Sep 17 00:00:00 2001
From: Nils Wallménius <nils@rockbox.org>
Date: Tue, 20 Jul 2010 23:35:07 +0000
Subject: Convert inline coldfire assembler to a 'real' assembler function,
 with tweaks by Buschel. Speeds up mpc decoding by ~1% on h300.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27504 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libmusepack/SOURCES                 |  3 +
 apps/codecs/libmusepack/synth_filter.c          | 53 +++--------------
 apps/codecs/libmusepack/synth_filter_coldfire.S | 78 +++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 44 deletions(-)
 create mode 100644 apps/codecs/libmusepack/synth_filter_coldfire.S

diff --git a/apps/codecs/libmusepack/SOURCES b/apps/codecs/libmusepack/SOURCES
index 31848214e0..60d762afd2 100644
--- a/apps/codecs/libmusepack/SOURCES
+++ b/apps/codecs/libmusepack/SOURCES
@@ -9,3 +9,6 @@ synth_filter.c
 #if defined(CPU_ARM)
 synth_filter_arm.S
 #endif
+#if defined(CPU_COLDFIRE)
+synth_filter_coldfire.S
+#endif
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index 0f415a4838..9a79328106 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -472,7 +472,7 @@ mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v)
   /* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176));
 }
 
-#if defined(CPU_ARM)
+#if defined(CPU_ARM) || defined(CPU_COLDFIRE)
 extern void
 mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data, 
                         const MPC_SAMPLE_FORMAT * V,
@@ -485,57 +485,22 @@ mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
 {
     mpc_int32_t k;
     
-#if defined(CPU_COLDFIRE)
-    // 64=32x32-multiply assembler for Coldfire
-    for ( k = 0; k < 32; k++, D += 16, V++ ) 
-    {
-        asm volatile (
-        "movem.l (%[D]), %%d0-%%d3                    \n\t"
-        "move.l (%[V]), %%a5                          \n\t"
-        "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t"
-        "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t"
-        "movem.l (4*4, %[D]), %%d0-%%d3               \n\t"
-        "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t"
-        "movem.l (8*4, %[D]), %%d0-%%d3               \n\t"
-        "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t"
-        "movem.l (12*4, %[D]), %%d0-%%d3              \n\t"
-        "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t"
-        "mac.l %%d3, %%a5, %%acc0                     \n\t"
-        "movclr.l %%acc0, %%d0                        \n\t"
-        "lsl.l #1, %%d0                               \n\t"
-        "move.l %%d0, (%[Data])+                      \n"
-        : [Data] "+a" (Data)
-        : [V] "a" (V), [D] "a" (D)
-        : "d0", "d1", "d2", "d3", "a5");
-    }
-#else
     // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C
     for ( k = 0; k < 32; k++, D += 16, V++ )
     {
         *Data = MPC_MULTIPLY_EX(V[  0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30)
-          + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
-          + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
-          + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
-          + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
-          + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
-          + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
-          + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
+              + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
+              + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
+              + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
+              + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
+              + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
+              + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
+              + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
         Data += 1;
         // total: 16 muls, 15 adds, 16 shifts
     }
-#endif /* COLDFIRE */
 }
-#endif /* CPU_ARM */
+#endif /* CPU_ARM || CPU_COLDFIRE */
 
 static void 
 mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y)
diff --git a/apps/codecs/libmusepack/synth_filter_coldfire.S b/apps/codecs/libmusepack/synth_filter_coldfire.S
new file mode 100644
index 0000000000..758ab3d496
--- /dev/null
+++ b/apps/codecs/libmusepack/synth_filter_coldfire.S
@@ -0,0 +1,78 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by Thom Johansen 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+/*
+ * void
+ * mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data, 
+ *                         const MPC_SAMPLE_FORMAT * V,
+ *                         const MPC_SAMPLE_FORMAT * D)
+ */
+
+#if defined(USE_IRAM)
+    .section .icode
+#else
+    .text
+#endif
+    .align 2
+    .global     mpc_decoder_windowing_D
+    .type       mpc_decoder_windowing_D, @function
+
+mpc_decoder_windowing_D:                 /* writes 32 longwords to Data, each built from 16 macs of V and D values */
+    lea.l    (-9*4, %sp), %sp             /* reserve 9 longwords of stack */
+    movem.l  %d2-%d7/%a2-%a4, (%sp)       | save some registers
+    movem.l  (9*4+4, %sp), %a0-%a2        | a0 = Data, a1 = V, a2 = D
+    moveq.l  #32, %d0                     | loop counter
+
+    move.l   (%a1), %a4                   /* preload the longword at a1 for the first mac below */
+    0:                                    | loop
+    movem.l  (%a2), %d1-%d7/%a3           /* fetch the first 8 longwords at a2 */
+
+    mac.l %d1, %a4, ( 96*4, %a1), %a4, %acc0  /* acc0 += d1 * a4, then a4 = longword at a1 + 96*4; NOTE(review): acc0 is never cleared before the first pass, presumably zero on entry -- confirm */
+    mac.l %d2, %a4, (128*4, %a1), %a4, %acc0  /* same pattern: each mac also loads the a4 operand of the following mac */
+    mac.l %d3, %a4, (224*4, %a1), %a4, %acc0
+    mac.l %d4, %a4, (256*4, %a1), %a4, %acc0
+    mac.l %d5, %a4, (352*4, %a1), %a4, %acc0
+    mac.l %d6, %a4, (384*4, %a1), %a4, %acc0
+    mac.l %d7, %a4, (480*4, %a1), %a4, %acc0
+    mac.l %a3, %a4, (512*4, %a1), %a4, %acc0
+    movem.l (8*4, %a2), %d1-%d7/%a3           /* fetch the next 8 longwords, at a2 + 8*4 */
+    mac.l %d1, %a4, (608*4, %a1), %a4, %acc0
+    mac.l %d2, %a4, (640*4, %a1), %a4, %acc0
+    mac.l %d3, %a4, (736*4, %a1), %a4, %acc0
+    mac.l %d4, %a4, (768*4, %a1), %a4, %acc0
+    mac.l %d5, %a4, (864*4, %a1), %a4, %acc0
+    mac.l %d6, %a4, (896*4, %a1), %a4, %acc0
+    mac.l %d7, %a4, (992*4, %a1), %a4, %acc0
+    mac.l %a3, %a4, (    4, %a1), %a4, %acc0  /* loads a1 + 4, which is (a1) after the addq below: a4 is ready for the next pass (the load on the last pass is unused) */
+
+    lea.l    (16*4, %a2), %a2             /* advance a2 by 16 longwords */
+    addq.l   #4, %a1                      /* advance a1 by one longword */
+    movclr.l %acc0, %d1                   /* move the accumulated sum to d1 and clear acc0 for the next pass */
+    lsl.l    #1, %d1                      /* shift the result left by one bit */
+    move.l   %d1, (%a0)+                  /* store through a0 and post-increment it */
+    subq.l   #1, %d0                      /* count down from 32 */
+    bne 0b                                /* repeat until d0 reaches zero */
+
+    movem.l  (%sp), %d2-%d7/%a2-%a4       | restore stacked regs
+    lea.l    (9*4, %sp), %sp              /* release the 9 longwords reserved on entry */
+    rts
+
-- 
cgit v1.2.3