From 77934cbc961a69e7d18588276f0e64a692854125 Mon Sep 17 00:00:00 2001
From: Jens Arnold <amiconn@rockbox.org>
Date: Wed, 19 Nov 2008 00:34:48 +0000
Subject: Compile-time choice between 16 bit and 32 bit integers for the
 filters. 32 bit filters are faster on ARMv4 (with assembler code), so use
 them there. Nice speedup on PP and Gigabeat F/X.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19140 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/demac/libdemac/decoder.c             |  13 +-
 apps/codecs/demac/libdemac/demac_config.h        |  27 ++-
 apps/codecs/demac/libdemac/filter.c              |  41 ++--
 apps/codecs/demac/libdemac/filter.h              |  12 +-
 apps/codecs/demac/libdemac/vector_math16.h       | 140 -----------
 apps/codecs/demac/libdemac/vector_math16_arm7.h  | 293 -----------------------
 apps/codecs/demac/libdemac/vector_math32_armv4.h | 210 ++++++++++++++++
 apps/codecs/demac/libdemac/vector_math_generic.h | 142 +++++++++++
 8 files changed, 416 insertions(+), 462 deletions(-)
 delete mode 100644 apps/codecs/demac/libdemac/vector_math16.h
 delete mode 100644 apps/codecs/demac/libdemac/vector_math16_arm7.h
 create mode 100644 apps/codecs/demac/libdemac/vector_math32_armv4.h
 create mode 100644 apps/codecs/demac/libdemac/vector_math_generic.h

(limited to 'apps')

diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c
index 540db47636..31bcb28b72 100644
--- a/apps/codecs/demac/libdemac/decoder.c
+++ b/apps/codecs/demac/libdemac/decoder.c
@@ -33,15 +33,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 /* Statically allocate the filter buffers */
 
-static int16_t filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2]   /* 2432 bytes */
-               IBSS_ATTR __attribute__((aligned(16)));
-static int16_t filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2] /* 5120 bytes */
-               IBSS_ATTR __attribute__((aligned(16)));
+static filter_int filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2] IBSS_ATTR
+    __attribute__((aligned(16))); /* 16 bit: 2432 bytes, 32 bit: 4864 bytes */
+static filter_int filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2] IBSS_ATTR
+    __attribute__((aligned(16))); /* 16 bit: 5120 bytes, 32 bit: 10240 bytes */
 
 /* This is only needed for "insane" files, and no current Rockbox targets
    can hope to decode them in realtime, although the Gigabeat S comes close. */
-static int16_t filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] /* 17408 bytes */
-               IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16)));
+static filter_int filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2]
+                  IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16)));
+                  /* 16 bit: 17408 bytes, 32 bit: 34816 bytes */
 
 void init_frame_decoder(struct ape_ctx_t* ape_ctx,
                         unsigned char* inbuffer, int* firstbyte,
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 93fda76e25..86c2d24919 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -39,12 +39,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 #define APE_OUTPUT_DEPTH 29
 
-/* On PP5002 code should go into IRAM. Otherwise put the insane
- * filter buffer into IRAM as long as there is no better use. */
+/* On ARMv4, using 32 bit ints for the filters is faster. */
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#define FILTER_BITS 32
+#endif
+
 #if CONFIG_CPU == PP5002
+/* Code in IRAM for speed, not enough IRAM for the insane filter buffer. */
 #define ICODE_SECTION_DEMAC_ARM   .icode
 #define ICODE_ATTR_DEMAC          ICODE_ATTR
 #define IBSS_ATTR_DEMAC_INSANEBUF
+#elif CONFIG_CPU == PP5020
+/* Not enough IRAM for the insane filter buffer. */
+#define ICODE_SECTION_DEMAC_ARM   .text
+#define ICODE_ATTR_DEMAC
+#define IBSS_ATTR_DEMAC_INSANEBUF
 #else
 #define ICODE_SECTION_DEMAC_ARM   .text
 #define ICODE_ATTR_DEMAC
@@ -75,6 +84,20 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 #ifndef PREDICTOR_HISTORY_SIZE
 #define PREDICTOR_HISTORY_SIZE 512
+#endif     
+
+#ifndef FILTER_BITS
+#define FILTER_BITS 16   /* default when no CPU specific choice was made */
+#endif
+#ifndef __ASSEMBLER__    /* C declarations, hidden from assembler sources */
+#include <inttypes.h>
+#if FILTER_BITS == 32
+typedef int32_t filter_int;   /* element type of the filter buffers */
+#elif FILTER_BITS == 16
+typedef int16_t filter_int;   /* element type of the filter buffers */
+#else                    /* fail early instead of leaving filter_int undefined */
+#error "FILTER_BITS must be 16 or 32"
+#endif
 #endif
 
 #endif /* _DEMAC_CONFIG_H */
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index b47a37a041..5601fffcd4 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -28,27 +28,38 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #include "demac.h"
 #include "filter.h"
 #include "demac_config.h"
+
+#if FILTER_BITS == 32
+/* 32 bit filter data: inline assembler on ARMv4, portable C elsewhere. */
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#include "vector_math32_armv4.h"
+#else
+#include "vector_math_generic.h"
+#endif
+
+#else /* FILTER_BITS == 16 */
 
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
-#elif ARM_ARCH >= 6
+#elif defined(CPU_ARM) && (ARM_ARCH >= 6)
 #include "vector_math16_armv6.h"
-#elif ARM_ARCH >= 5 /* Assume all our ARMv5 targets are ARMv5te(j) */
+#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
+/* Assume all our ARMv5 targets are ARMv5te(j) */
 #include "vector_math16_armv5te.h"
-#elif defined CPU_ARM7TDMI
-#include "vector_math16_arm7.h"
 #else
-#include "vector_math16.h"
+#include "vector_math_generic.h" /* portable C, typed via filter_int */
 #endif
 
+#endif /* FILTER_BITS */
+
 struct filter_t {
-    int16_t* coeffs; /* ORDER entries */
+    filter_int* coeffs; /* ORDER entries; start of this filter's buffer */
 
     /* We store all the filter delays in a single buffer */
-    int16_t* history_end;
+    filter_int* history_end; /* buffer start + ORDER*3 + FILTER_HISTORY_SIZE */
 
-    int16_t* delay;
-    int16_t* adaptcoeffs;
+    filter_int* delay; /* starts at coeffs + ORDER*3, rewound at history_end */
+    filter_int* adaptcoeffs; /* reset to coeffs + ORDER*2 on rewind */
 
     int avg;
 };
@@ -89,7 +100,7 @@ struct filter_t {
 #if defined(CPU_ARM) && (ARM_ARCH >= 6)
 #define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })
 #else
-#define SATURATE(x) (int16_t)(((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF);
+#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : (((x) >> 31) ^ 0x7FFF)) /* clamp to int16_t range */
 #endif
 
 /* Apply the filter with state f to count entries in data[] */
@@ -145,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
         /* Have we filled the history buffer? */
         if (f->delay == f->history_end) {
             memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
-                    (ORDER*2) * sizeof(int16_t));
+                    (ORDER*2) * sizeof(filter_int));
             f->adaptcoeffs = f->coeffs + ORDER*2;
             f->delay = f->coeffs + ORDER*3;
         }
@@ -190,7 +201,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
         /* Have we filled the history buffer? */
         if (f->delay == f->history_end) {
             memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
-                    (ORDER*2) * sizeof(int16_t));
+                    (ORDER*2) * sizeof(filter_int));
             f->adaptcoeffs = f->coeffs + ORDER*2;
             f->delay = f->coeffs + ORDER*3;
         }
@@ -200,7 +211,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
 static struct filter_t filter0 IBSS_ATTR;
 static struct filter_t filter1 IBSS_ATTR;
 
-static void do_init_filter(struct filter_t* f, int16_t* buf)
+static void do_init_filter(struct filter_t* f, filter_int* buf)
 {
     f->coeffs = buf;
     f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE;
@@ -210,13 +221,13 @@ static void do_init_filter(struct filter_t* f, int16_t* buf)
     f->delay = f->coeffs + ORDER*3;
 
     /* Zero coefficients and history buffer */
-    memset(f->coeffs, 0, ORDER*3 * sizeof(int16_t));
+    memset(f->coeffs, 0, ORDER*3 * sizeof(filter_int));
 
     /* Zero the running average */
     f->avg = 0;
 }
 
-void INIT_FILTER(int16_t* buf)
+void INIT_FILTER(filter_int* buf)
 {
     do_init_filter(&filter0, buf);
     do_init_filter(&filter1, buf + ORDER*3 + FILTER_HISTORY_SIZE);
diff --git a/apps/codecs/demac/libdemac/filter.h b/apps/codecs/demac/libdemac/filter.h
index acbb155b29..bbe51d4572 100644
--- a/apps/codecs/demac/libdemac/filter.h
+++ b/apps/codecs/demac/libdemac/filter.h
@@ -25,21 +25,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifndef _APE_FILTER_H
 #define _APE_FILTER_H
 
-#include <inttypes.h>
+#include "demac_config.h"
 
-void init_filter_16_11(int16_t* buf);
+void init_filter_16_11(filter_int* buf);
 int apply_filter_16_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
 
-void init_filter_64_11(int16_t* buf);
+void init_filter_64_11(filter_int* buf);
 int apply_filter_64_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
 
-void init_filter_32_10(int16_t* buf);
+void init_filter_32_10(filter_int* buf);
 int apply_filter_32_10(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
 
-void init_filter_256_13(int16_t* buf);
+void init_filter_256_13(filter_int* buf);
 int apply_filter_256_13(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
 
-void init_filter_1280_15(int16_t* buf);
+void init_filter_1280_15(filter_int* buf);
 int apply_filter_1280_15(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
 
 #endif
diff --git a/apps/codecs/demac/libdemac/vector_math16.h b/apps/codecs/demac/libdemac/vector_math16.h
deleted file mode 100644
index 5d82abe930..0000000000
--- a/apps/codecs/demac/libdemac/vector_math16.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
-
-libdemac - A Monkey's Audio decoder
-
-$Id:$
-
-Copyright (C) Dave Chapman 2007
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
-
-*/
-
-static inline void vector_add(int16_t* v1, int16_t* v2)
-{
-#if ORDER > 32
-    int order = (ORDER >> 5);
-    while (order--)
-#endif
-    {
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-#if ORDER > 16
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-#endif
-    }
-}
-
-static inline void vector_sub(int16_t* v1, int16_t* v2)
-{
-#if ORDER > 32
-    int order = (ORDER >> 5);
-    while (order--)
-#endif
-    {
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-#if ORDER > 16
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-#endif
-    }
-}
-
-static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
-{
-    int res = 0;
-
-#if ORDER > 16
-    int order = (ORDER >> 4);
-    while (order--)
-#endif
-    {
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-        res += *v1++ * *v2++;
-    }
-    return res;
-}
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
deleted file mode 100644
index 653bb1f53f..0000000000
--- a/apps/codecs/demac/libdemac/vector_math16_arm7.h
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
-
-libdemac - A Monkey's Audio decoder
-
-$Id$
-
-Copyright (C) Dave Chapman 2007
-
-ARM7 vector math copyright (C) 2007 Jens Arnold
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
-
-*/
-
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). */
-static inline void vector_add(int16_t* v1, int16_t* v2)
-{
-#if ORDER > 16
-    int cnt = ORDER>>4;
-#endif
-
-#define ADDHALFREGS(sum, s1)                             /* Adds register */    \
-        "mov   " #s1  ", " #s1  ",   ror #16         \n" /* halves straight. */ \
-        "add     r8    , " #s1  ", " #sum ", lsl #16 \n" /* Clobbers 's1' */    \
-        "add   " #sum ", " #s1  ", " #sum ", lsr #16 \n" /* and r8. */          \
-        "mov   " #sum ", " #sum ",   lsl #16         \n" \
-        "orr   " #sum ", " #sum ",   r8    , lsr #16 \n"
-
-#define ADDHALFXREGS(sum, s1, s2)                        /* Adds register */    \
-        "add   " #s1  ", " #s1  ", " #sum ", lsl #16 \n" /* halves across. */ \
-        "add   " #sum ", " #s2  ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
-        "mov   " #sum ", " #sum ",   lsl #16         \n" \
-        "orr   " #sum ", " #sum ", " #s1  ", lsr #16 \n"
-
-    asm volatile (
-        "tst     %[v2], #2           \n"
-        "beq     20f                 \n"
-
-    "10:                             \n"
-        "ldrh    r4, [%[v2]], #2     \n"
-        "mov     r4, r4, lsl #16     \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        ADDHALFXREGS(r0, r4, r5)
-        ADDHALFXREGS(r1, r5, r6)
-        ADDHALFXREGS(r2, r6, r7)
-        ADDHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "mov     r4, r8              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        ADDHALFXREGS(r0, r4, r5)
-        ADDHALFXREGS(r1, r5, r6)
-        ADDHALFXREGS(r2, r6, r7)
-        ADDHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "mov     r4, r8              \n"
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-        "b       99f                 \n"
-
-    "20:                             \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        ADDHALFREGS(r0, r4)
-        ADDHALFREGS(r1, r5)
-        ADDHALFREGS(r2, r6)
-        ADDHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        ADDHALFREGS(r0, r4)
-        ADDHALFREGS(r1, r5)
-        ADDHALFREGS(r2, r6)
-        ADDHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-
-    "99:                             \n"
-        : /* outputs */
-#if ORDER > 16
-        [cnt]"+r"(cnt),
-#endif
-        [v1] "+r"(v1),
-        [v2] "+r"(v2)
-        : /* inputs */
-        : /* clobbers */
-        "r0", "r1", "r2", "r3", "r4",
-        "r5", "r6", "r7", "r8", "memory"
-    );
-}
-
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). */
-static inline void vector_sub(int16_t* v1, int16_t* v2)
-{
-#if ORDER > 16
-    int cnt = ORDER>>4;
-#endif
-
-#define SUBHALFREGS(dif, s1)                             /* Subtracts register */ \
-        "sub     r8    , " #dif ", " #s1            "\n" /* halves straight. */   \
-        "and     r8    ,   r8    ,   r9              \n" /* Needs r9 = 0x0000ffff, */ \
-        "mov   " #dif ", " #dif ",   lsr #16         \n" /* clobbers r8. */      \
-        "sub   " #dif ", " #dif ", " #s1  ", lsr #16 \n"  \
-        "orr   " #dif ",   r8    , " #dif ", lsl #16 \n"
-
-#define SUBHALFXREGS(dif, s1, s2)                        /* Subtracts register */ \
-        "sub   " #s1  ", " #dif ", " #s1  ", lsr #16 \n" /* halves across. */     \
-        "and   " #s1  ", " #s1  ",   r9              \n" /* Needs r9 = 0x0000ffff, */ \
-        "rsb   " #dif ", " #s2  ", " #dif ", lsr #16 \n" /* clobbers 's1'. */     \
-        "orr   " #dif ", " #s1  ", " #dif ", lsl #16 \n"
-        
-    asm volatile (
-        "mov     r9, #0xff           \n"
-        "orr     r9, r9, #0xff00     \n"
-        "tst     %[v2], #2           \n"
-        "beq     20f                 \n"
-
-    "10:                             \n"
-        "ldrh    r4, [%[v2]], #2     \n"
-        "mov     r4, r4, lsl #16     \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        SUBHALFXREGS(r0, r4, r5)
-        SUBHALFXREGS(r1, r5, r6)
-        SUBHALFXREGS(r2, r6, r7)
-        SUBHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "mov     r4, r8              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        SUBHALFXREGS(r0, r4, r5)
-        SUBHALFXREGS(r1, r5, r6)
-        SUBHALFXREGS(r2, r6, r7)
-        SUBHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "mov     r4, r8              \n"
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-        "b       99f                 \n"
-
-    "20:                             \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        SUBHALFREGS(r0, r4)
-        SUBHALFREGS(r1, r5)
-        SUBHALFREGS(r2, r6)
-        SUBHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        SUBHALFREGS(r0, r4)
-        SUBHALFREGS(r1, r5)
-        SUBHALFREGS(r2, r6)
-        SUBHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-
-    "99:                             \n"
-        : /* outputs */
-#if ORDER > 16
-        [cnt]"+r"(cnt),
-#endif
-        [v1] "+r"(v1),
-        [v2] "+r"(v2)
-        : /* inputs */
-        : /* clobbers */
-        "r0", "r1", "r2", "r3", "r4", "r5", 
-        "r6", "r7", "r8", "r9", "memory"
-    );
-}
-
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). It is optimised
- * for ARM7TDMI. Using it for ARM9 or higher results in worse performance
- * than the C version. */
-static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
-{
-    int res = 0;
-#if ORDER > 16
-    int cnt = ORDER>>4;
-#endif
-
-#define MLABLOCK2(f1, f2)                   \
-        "mov     r8, " #f1 ", lsl #16   \n" \
-        "mov     r8,   r8   , asr #16   \n" \
-        "mov     r9, " #f2 ", lsl #16   \n" \
-        "mov     r9,   r9   , asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n" \
-        "mov     r8, " #f1 ", asr #16   \n" \
-        "mov     r9, " #f2 ", asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n"
-
-#define MLABLOCK2_U2(f1, f2)                \
-        "mov     r8, " #f1 ", lsl #16   \n" \
-        "mov     r8,   r8   , asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n" \
-        "mov     r8, " #f1 ", asr #16   \n" \
-        "mov     r9, " #f2 ", lsl #16   \n" \
-        "mov     r9,   r9   , asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n" \
-        "mov     r9, " #f2 ", asr #16   \n"
-
-    asm volatile (
-        "tst     %[v2], #2           \n"
-        "beq     20f                 \n"
-
-    "10:                             \n"
-        "ldrsh   r9, [%[v2]], #2     \n"
-    "1:                              \n"
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2_U2(r0, r4)
-        MLABLOCK2_U2(r1, r5)
-        MLABLOCK2_U2(r2, r6)
-        MLABLOCK2_U2(r3, r7)
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2_U2(r0, r4)
-        MLABLOCK2_U2(r1, r5)
-        MLABLOCK2_U2(r2, r6)
-        MLABLOCK2_U2(r3, r7)
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-        "b       99f                 \n"
-
-    "20:                             \n"
-    "1:                              \n"
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2(r0, r4)
-        MLABLOCK2(r1, r5)
-        MLABLOCK2(r2, r6)
-        MLABLOCK2(r3, r7)
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2(r0, r4)
-        MLABLOCK2(r1, r5)
-        MLABLOCK2(r2, r6)
-        MLABLOCK2(r3, r7)
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-
-    "99:                             \n"
-        : /* outputs */
-#if ORDER > 16
-        [cnt]"+r"(cnt),
-#endif
-        [v1] "+r"(v1),
-        [v2] "+r"(v2),
-        [res]"+r"(res)
-        : /* inputs */
-        : /* clobbers */
-        "r0", "r1", "r2", "r3", "r4",
-        "r5", "r6", "r7", "r8", "r9"
-    );
-    return res;
-}
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
new file mode 100644
index 0000000000..b729bd3a0a
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -0,0 +1,210 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+ARMv4 vector math copyright (C) 2008 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+static inline void vector_add(int32_t* v1, int32_t* v2)
+{
+#if ORDER > 32
+    int cnt = ORDER>>5;               /* number of 32 element passes */
+#endif
+    /* In-place v1[i] += v2[i]. ADDBLOCK4 adds four words and advances both. */
+#define ADDBLOCK4                        \
+        "ldmia   %[v1],  {r0-r3}     \n" \
+        "ldmia   %[v2]!, {r4-r7}     \n" \
+        "add     r0, r0, r4          \n" \
+        "add     r1, r1, r5          \n" \
+        "add     r2, r2, r6          \n" \
+        "add     r3, r3, r7          \n" \
+        "stmia   %[v1]!, {r0-r3}     \n"
+    /* Covers 16 elements if ORDER <= 16, 32 if ORDER <= 32, else cnt * 32. */
+    asm volatile (
+    "1:                              \n"
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+#if ORDER > 16
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+#endif
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1  \n" /* updates the flags: needs "cc" */
+        "bne     1b                  \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "memory", "cc"
+    );
+}
+
+static inline void vector_sub(int32_t* v1, int32_t* v2)
+{
+#if ORDER > 32
+    int cnt = ORDER>>5;               /* number of 32 element passes */
+#endif
+    /* In-place v1[i] -= v2[i]. SUBBLOCK4 handles four words and advances both. */
+#define SUBBLOCK4                        \
+        "ldmia   %[v1],  {r0-r3}     \n" \
+        "ldmia   %[v2]!, {r4-r7}     \n" \
+        "sub     r0, r0, r4          \n" \
+        "sub     r1, r1, r5          \n" \
+        "sub     r2, r2, r6          \n" \
+        "sub     r3, r3, r7          \n" \
+        "stmia   %[v1]!, {r0-r3}     \n"
+    /* Covers 16 elements if ORDER <= 16, 32 if ORDER <= 32, else cnt * 32. */
+    asm volatile (
+    "1:                              \n"
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+#if ORDER > 16
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+#endif
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1  \n" /* updates the flags: needs "cc" */
+        "bne     1b                  \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "memory", "cc"
+    );
+}
+
+static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
+{
+    int res = 0;
+#if ORDER > 32
+    int cnt = ORDER>>5;               /* number of 32 element passes */
+#endif
+    /* Returns the sum of v1[i] * v2[i], accumulated in 32 bits with mla. */
+    asm volatile (
+#if ORDER > 16
+        "ldmia   %[v2]!, {r6-r7}         \n" /* preload v2[0], v2[1] */
+    "1:                                  \n" /* one pass = 32 elements */
+        "ldmia   %[v1]!, {r0,r1,r3-r5}   \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "ldmia   %[v2]!, {r0-r2,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r4}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+        "ldmia   %[v2]!, {r0,r1,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r5}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+        "ldmia   %[v2]!, {r0-r2,r6,r7}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+        "ldmia   %[v1]!, {r0,r1,r3-r5}   \n" /* second 16 elements */
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "ldmia   %[v2]!, {r0-r2,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r4}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+        "ldmia   %[v2]!, {r0,r1,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r5}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+#if ORDER > 32 /* NOTE(review): last pass also loads v2[ORDER], v2[ORDER+1] */
+        "ldmia   %[v2]!, {r0-r2,r6,r7}   \n" /* r6, r7: preload for next pass */
+#else
+        "ldmia   %[v2]!, {r0-r2}         \n"
+#endif
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n" /* updates the flags: needs "cc" */
+        "bne     1b                      \n"
+#endif
+
+#else /* ORDER <= 16 */
+        /* Four blocks of four multiply-accumulates, no loop. */
+#define MLABLOCK4                            \
+        "ldmia   %[v1]!, {r0-r3}         \n" \
+        "ldmia   %[v2]!, {r4-r7}         \n" \
+        "mla     %[res], r4, r0, %[res]  \n" \
+        "mla     %[res], r5, r1, %[res]  \n" \
+        "mla     %[res], r6, r2, %[res]  \n" \
+        "mla     %[res], r7, r3, %[res]  \n"
+
+        MLABLOCK4
+        MLABLOCK4
+        MLABLOCK4
+        MLABLOCK4
+#endif /* ORDER <= 16 */
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"+r"(res)
+        : /* inputs */
+        : /* clobbers; "memory" because the asm reads through v1 and v2 */
+        "r0", "r1", "r2", "r3",
+        "r4", "r5", "r6", "r7", "cc", "memory"
+#if ORDER > 16
+        ,"r8"
+#endif
+    );
+    return res;
+}
diff --git a/apps/codecs/demac/libdemac/vector_math_generic.h b/apps/codecs/demac/libdemac/vector_math_generic.h
new file mode 100644
index 0000000000..7b61db77be
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math_generic.h
@@ -0,0 +1,142 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+#include "demac_config.h"
+
+static inline void vector_add(filter_int* v1, filter_int* v2)
+{   /* In-place v1[i] += v2[i], manually unrolled. */
+#if ORDER > 32   /* larger orders: (ORDER >> 5) passes of 32 elements */
+    int order = (ORDER >> 5);
+    while (order--)
+#endif
+    {
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+#if ORDER > 16   /* elements 16..31 of the pass; ORDER <= 16 stops at 16 */
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+        *v1++ += *v2++;
+#endif
+    }
+}
+
+static inline void vector_sub(filter_int* v1, filter_int* v2)
+{   /* In-place v1[i] -= v2[i], manually unrolled. */
+#if ORDER > 32   /* larger orders: (ORDER >> 5) passes of 32 elements */
+    int order = (ORDER >> 5);
+    while (order--)
+#endif
+    {
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+#if ORDER > 16   /* elements 16..31 of the pass; ORDER <= 16 stops at 16 */
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+        *v1++ -= *v2++;
+#endif
+    }
+}
+
+static inline int32_t scalarproduct(filter_int* v1, filter_int* v2)
+{
+    int res = 0;
+    /* Sum of v1[i] * v2[i]: 16 elements, or (ORDER >> 4) * 16 if ORDER > 16. */
+#if ORDER > 16
+    int order = (ORDER >> 4);
+    while (order--)
+#endif
+    {
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+    }
+    return res;   /* int accumulator returned as int32_t */
+}
-- 
cgit v1.2.3