From e7cdd6cbc6040c3c6225580ba155edfdfd35efb1 Mon Sep 17 00:00:00 2001
From: Jens Arnold <amiconn@rockbox.org>
Date: Wed, 4 Jul 2007 19:23:18 +0000
Subject: Assemblerised CMUL() for ARM, giving ~20% speedup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@13787 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libwma/wmadeci.c  | 24 +++++++++++++++++++++++-
 apps/codecs/libwma/wmafixed.h | 24 ------------------------
 2 files changed, 23 insertions(+), 25 deletions(-)

(limited to 'apps/codecs')

diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c
index 6647ed4b40..29651382e2 100644
--- a/apps/codecs/libwma/wmadeci.c
+++ b/apps/codecs/libwma/wmadeci.c
@@ -50,7 +50,29 @@ uint32_t bswap_32(uint32_t x)
     return (b1 >> 24) | (b2 >> 8) | (b3 << 8) | (b4 << 24);
 }
 
-#ifdef CPU_COLDFIRE
+#ifdef CPU_ARM
+static inline
+void CMUL(fixed32 *x, fixed32 *y,   /* out: *x = real part, *y = imaginary part */
+          fixed32  a, fixed32  b,   /* in:  first factor,  a + i*b */
+          fixed32  t, fixed32  v)   /* in:  second factor, t + i*v */
+{   
+    /* Complex multiply (a + ib)(t + iv): *x = hi32(a*t - b*v) << 1, *y = hi32(b*t + a*v) << 1.
+     * Bit 0 of each result is always 0, so one bit of precision is lost. Could be solved at the cost of 2 extra cycles if it becomes an issue. */
+    int x1, y1, l;   /* x1/y1: high words of the 64-bit sums; l: shared low-word scratch */
+    asm(
+        "smull    %[l], %[y1], %[b], %[t] \n"   /* y1:l  = b*t   (signed 64-bit product) */
+        "smlal    %[l], %[y1], %[a], %[v] \n"   /* y1:l += a*v */
+        "rsb      %[b], %[b], #0          \n"   /* b = 0 - b, so the next smlal subtracts */
+        "smull    %[l], %[x1], %[a], %[t] \n"   /* x1:l  = a*t   (l is reused, y's low word is dropped) */
+        "smlal    %[l], %[x1], %[b], %[v] \n"   /* x1:l += (-b)*v */
+        : [l] "=&r" (l), [x1]"=&r" (x1), [y1]"=&r" (y1), [b] "+r" (b)   /* early-clobber outputs; b is read-write (negated above) */
+        : [a] "r" (a),   [t] "r" (t),    [v] "r" (v)   /* read-only inputs */
+        : "cc"   /* condition flags declared as clobbered */
+    );
+    *x = x1 << 1;   /* (sum >> 32) << 1: roughly sum >> 31 -- presumably rescaling for 1.31 factors; TODO confirm */
+    *y = y1 << 1;   /* same scaling for the imaginary part */
+}
+#elif defined CPU_COLDFIRE
 static inline
 void CMUL(fixed32 *x, fixed32 *y,
           fixed32  a, fixed32  b,
diff --git a/apps/codecs/libwma/wmafixed.h b/apps/codecs/libwma/wmafixed.h
index 887973a78a..99ddec759e 100644
--- a/apps/codecs/libwma/wmafixed.h
+++ b/apps/codecs/libwma/wmafixed.h
@@ -61,30 +61,6 @@ long fsincos(unsigned long phase, fixed32 *cos);
        __result;  \
     })
 
-/*
-	Special fixmul32 that does a 16.16 x 1.31 multiply that returns a 16.16 value.
-	this is needed because the fft constants are all normalized to be less then 1
-	and can't fit into a 16 bit number without excessive rounding
-
-
-*/
-
-
-#  define fixmul32b(x, y)  \
-    ({ int32_t __hi;  \
-       uint32_t __lo;  \
-       int32_t __result;  \
-       asm ("smull    %0, %1, %3, %4\n\t"  \
-        "movs    %0, %0, lsr %5\n\t"  \
-        "adc    %2, %0, %1, lsl %6"  \
-        : "=&r" (__lo), "=&r" (__hi), "=r" (__result)  \
-        : "%r" (x), "r" (y),  \
-          "M" (31), "M" (1)  \
-        : "cc");  \
-       __result;  \
-    })
-
-
 #elif defined(CPU_COLDFIRE)
 static inline int32_t fixmul32(int32_t x, int32_t y)
 {
-- 
cgit v1.2.3