7 files changed, 280 insertions, 326 deletions
diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c
index 540db47636..31bcb28b72 100644
--- a/apps/codecs/demac/libdemac/decoder.c
+++ b/apps/codecs/demac/libdemac/decoder.c
@@ -33,15 +33,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 /* Statically allocate the filter buffers */
-static int16_t filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2]   /* 2432 bytes */
+static filter_int filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2]   
-               IBSS_ATTR __attribute__((aligned(16)));
+                  IBSS_ATTR __attribute__((aligned(16))); /* 2432 bytes with 16 bit filter_int, 4864 bytes with 32 bit */
-static int16_t filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2] /* 5120 bytes */
+static filter_int filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2]
-               IBSS_ATTR __attribute__((aligned(16)));
+                  IBSS_ATTR __attribute__((aligned(16))); /* 5120 bytes with 16 bit filter_int, 10240 bytes with 32 bit */
 /* This is only needed for "insane" files, and no current Rockbox targets
   can hope to decode them in realtime, although the Gigabeat S comes close. */
-static int16_t filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] /* 17408 bytes */
+static filter_int filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] 
-               IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16)));
+                  IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16)));
+                  /* 17408 bytes with 16 bit filter_int, 34816 bytes with 32 bit */
 void init_frame_decoder(struct ape_ctx_t* ape_ctx,
                        unsigned char* inbuffer, int* firstbyte,
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 93fda76e25..86c2d24919 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -39,12 +39,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #define APE_OUTPUT_DEPTH 29
-/* On PP5002 code should go into IRAM. Otherwise put the insane
+/* On ARMv4, using 32 bit ints for the filters is faster. */
- * filter buffer into IRAM as long as there is no better use. */
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#define FILTER_BITS 32
+#endif
 #if CONFIG_CPU == PP5002
+/* Code in IRAM for speed, not enough IRAM for the insane filter buffer. */
 #define ICODE_SECTION_DEMAC_ARM   .icode
 #define ICODE_ATTR_DEMAC          ICODE_ATTR
 #define IBSS_ATTR_DEMAC_INSANEBUF
+#elif CONFIG_CPU == PP5020
+/* Not enough IRAM for the insane filter buffer. */
+#define ICODE_SECTION_DEMAC_ARM   .text
+#define ICODE_ATTR_DEMAC
+#define IBSS_ATTR_DEMAC_INSANEBUF
 #else
 #define ICODE_SECTION_DEMAC_ARM   .text
 #define ICODE_ATTR_DEMAC
@@ -75,6 +84,20 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifndef PREDICTOR_HISTORY_SIZE
 #define PREDICTOR_HISTORY_SIZE 512
+#endif     
+#ifndef FILTER_BITS
+#define FILTER_BITS 16
+#endif
+#ifndef __ASSEMBLER__
+#include <inttypes.h>
+#if FILTER_BITS == 32
+typedef int32_t filter_int;
+#elif FILTER_BITS == 16
+typedef int16_t filter_int;
+#endif
 #endif
 #endif /* _DEMAC_CONFIG_H */
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index b47a37a041..5601fffcd4 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -28,27 +28,38 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #include "demac.h"
 #include "filter.h"
 #include "demac_config.h"
+     
+#if FILTER_BITS == 32
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#include "vector_math32_armv4.h"
+#else
+#include "vector_math_generic.h"
+#endif
+#else /* FILTER_BITS == 16 */
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
-#elif ARM_ARCH >= 6
+#elif defined(CPU_ARM) && (ARM_ARCH >= 6)
 #include "vector_math16_armv6.h"
-#elif ARM_ARCH >= 5 /* Assume all our ARMv5 targets are ARMv5te(j) */
+#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
+/* Assume all our ARMv5 targets are ARMv5te(j) */
 #include "vector_math16_armv5te.h"
-#elif defined CPU_ARM7TDMI
-#include "vector_math16_arm7.h"
 #else
-#include "vector_math16.h"
+#include "vector_math_generic.h"
 #endif
+#endif /* FILTER_BITS */
 struct filter_t {
-    int16_t* coeffs; /* ORDER entries */
+    filter_int* coeffs; /* ORDER entries; do_init_filter() points this at the start of the filter's buffer */
     /* We store all the filter delays in a single buffer */
-    int16_t* history_end;
+    filter_int* history_end; /* coeffs + ORDER*3 + FILTER_HISTORY_SIZE; the history is moved back when delay reaches it */
-    int16_t* delay;
+    filter_int* delay; /* starts at coeffs + ORDER*3 and is reset there after the history is moved back */
-    int16_t* adaptcoeffs;
+    filter_int* adaptcoeffs; /* reset to coeffs + ORDER*2 together with delay */
     int avg;
 };
@@ -89,7 +100,7 @@ struct filter_t {
 #if defined(CPU_ARM) && (ARM_ARCH >= 6)
 #define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })
 #else
-#define SATURATE(x) (int16_t)(((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF);
+#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF);
 #endif
 /* Apply the filter with state f to count entries in data[] */
@@ -145,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
        /* Have we filled the history buffer? */
        if (f->delay == f->history_end) {
            memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
-                    (ORDER*2) * sizeof(int16_t));
+                    (ORDER*2) * sizeof(filter_int));
            f->adaptcoeffs = f->coeffs + ORDER*2;
            f->delay = f->coeffs + ORDER*3;
        }
@@ -190,7 +201,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
        /* Have we filled the history buffer? */
        if (f->delay == f->history_end) {
            memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
-                    (ORDER*2) * sizeof(int16_t));
+                    (ORDER*2) * sizeof(filter_int));
            f->adaptcoeffs = f->coeffs + ORDER*2;
            f->delay = f->coeffs + ORDER*3;
        }
@@ -200,7 +211,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
 static struct filter_t filter0 IBSS_ATTR;
 static struct filter_t filter1 IBSS_ATTR;
-static void do_init_filter(struct filter_t* f, int16_t* buf)
+static void do_init_filter(struct filter_t* f, filter_int* buf)
 {
    f->coeffs = buf;
    f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE;
@@ -210,13 +221,13 @@ static void do_init_filter(struct filter_t* f, int16_t* buf)
    f->delay = f->coeffs + ORDER*3;
    /* Zero coefficients and history buffer */
-    memset(f->coeffs, 0, ORDER*3 * sizeof(int16_t));
+    memset(f->coeffs, 0, ORDER*3 * sizeof(filter_int));
    /* Zero the running average */
    f->avg = 0;
 }
-void INIT_FILTER(int16_t* buf)
+void INIT_FILTER(filter_int* buf)
 {
    do_init_filter(&filter0, buf);
    do_init_filter(&filter1, buf + ORDER*3 + FILTER_HISTORY_SIZE);
diff --git a/apps/codecs/demac/libdemac/filter.h b/apps/codecs/demac/libdemac/filter.h
index acbb155b29..bbe51d4572 100644
--- a/apps/codecs/demac/libdemac/filter.h
+++ b/apps/codecs/demac/libdemac/filter.h
@@ -25,21 +25,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifndef _APE_FILTER_H
 #define _APE_FILTER_H
-#include <inttypes.h>
+#include "demac_config.h"
-void init_filter_16_11(int16_t* buf);
+void init_filter_16_11(filter_int* buf);
 int apply_filter_16_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
-void init_filter_64_11(int16_t* buf);
+void init_filter_64_11(filter_int* buf);
 int apply_filter_64_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
-void init_filter_32_10(int16_t* buf);
+void init_filter_32_10(filter_int* buf);
 int apply_filter_32_10(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
-void init_filter_256_13(int16_t* buf);
+void init_filter_256_13(filter_int* buf);
 int apply_filter_256_13(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
-void init_filter_1280_15(int16_t* buf);
+void init_filter_1280_15(filter_int* buf);
 int apply_filter_1280_15(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
 #endif
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
deleted file mode 100644
index 653bb1f53f..0000000000
--- a/apps/codecs/demac/libdemac/vector_math16_arm7.h
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
-libdemac - A Monkey's Audio decoder
-$Id$
-Copyright (C) Dave Chapman 2007
-ARM7 vector math copyright (C) 2007 Jens Arnold
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
-*/
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). */
-static inline void vector_add(int16_t* v1, int16_t* v2)
-{
-#if ORDER > 16
-    int cnt = ORDER>>4;
-#endif
-#define ADDHALFREGS(sum, s1)                             /* Adds register */    \
-        "mov   " #s1  ", " #s1  ",   ror #16         \n" /* halves straight. */ \
-        "add     r8    , " #s1  ", " #sum ", lsl #16 \n" /* Clobbers 's1' */    \
-        "add   " #sum ", " #s1  ", " #sum ", lsr #16 \n" /* and r8. */          \
-        "mov   " #sum ", " #sum ",   lsl #16         \n" \
-        "orr   " #sum ", " #sum ",   r8    , lsr #16 \n"
-#define ADDHALFXREGS(sum, s1, s2)                        /* Adds register */    \
-        "add   " #s1  ", " #s1  ", " #sum ", lsl #16 \n" /* halves across. */ \
-        "add   " #sum ", " #s2  ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
-        "mov   " #sum ", " #sum ",   lsl #16         \n" \
-        "orr   " #sum ", " #sum ", " #s1  ", lsr #16 \n"
-    asm volatile (
-        "tst     %[v2], #2           \n"
-        "beq     20f                 \n"
-    "10:                             \n"
-        "ldrh    r4, [%[v2]], #2     \n"
-        "mov     r4, r4, lsl #16     \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        ADDHALFXREGS(r0, r4, r5)
-        ADDHALFXREGS(r1, r5, r6)
-        ADDHALFXREGS(r2, r6, r7)
-        ADDHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "mov     r4, r8              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        ADDHALFXREGS(r0, r4, r5)
-        ADDHALFXREGS(r1, r5, r6)
-        ADDHALFXREGS(r2, r6, r7)
-        ADDHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "mov     r4, r8              \n"
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-        "b       99f                 \n"
-    "20:                             \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        ADDHALFREGS(r0, r4)
-        ADDHALFREGS(r1, r5)
-        ADDHALFREGS(r2, r6)
-        ADDHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        ADDHALFREGS(r0, r4)
-        ADDHALFREGS(r1, r5)
-        ADDHALFREGS(r2, r6)
-        ADDHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-    "99:                             \n"
-        : /* outputs */
-#if ORDER > 16
-        [cnt]"+r"(cnt),
-#endif
-        [v1] "+r"(v1),
-        [v2] "+r"(v2)
-        : /* inputs */
-        : /* clobbers */
-        "r0", "r1", "r2", "r3", "r4",
-        "r5", "r6", "r7", "r8", "memory"
-    );
-}
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). */
-static inline void vector_sub(int16_t* v1, int16_t* v2)
-{
-#if ORDER > 16
-    int cnt = ORDER>>4;
-#endif
-#define SUBHALFREGS(dif, s1)                             /* Subtracts register */ \
-        "sub     r8    , " #dif ", " #s1            "\n" /* halves straight. */   \
-        "and     r8    ,   r8    ,   r9              \n" /* Needs r9 = 0x0000ffff, */ \
-        "mov   " #dif ", " #dif ",   lsr #16         \n" /* clobbers r8. */      \
-        "sub   " #dif ", " #dif ", " #s1  ", lsr #16 \n"  \
-        "orr   " #dif ",   r8    , " #dif ", lsl #16 \n"
-#define SUBHALFXREGS(dif, s1, s2)                        /* Subtracts register */ \
-        "sub   " #s1  ", " #dif ", " #s1  ", lsr #16 \n" /* halves across. */     \
-        "and   " #s1  ", " #s1  ",   r9              \n" /* Needs r9 = 0x0000ffff, */ \
-        "rsb   " #dif ", " #s2  ", " #dif ", lsr #16 \n" /* clobbers 's1'. */     \
-        "orr   " #dif ", " #s1  ", " #dif ", lsl #16 \n"
-        
-    asm volatile (
-        "mov     r9, #0xff           \n"
-        "orr     r9, r9, #0xff00     \n"
-        "tst     %[v2], #2           \n"
-        "beq     20f                 \n"
-    "10:                             \n"
-        "ldrh    r4, [%[v2]], #2     \n"
-        "mov     r4, r4, lsl #16     \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        SUBHALFXREGS(r0, r4, r5)
-        SUBHALFXREGS(r1, r5, r6)
-        SUBHALFXREGS(r2, r6, r7)
-        SUBHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "mov     r4, r8              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r5-r8}     \n"
-        SUBHALFXREGS(r0, r4, r5)
-        SUBHALFXREGS(r1, r5, r6)
-        SUBHALFXREGS(r2, r6, r7)
-        SUBHALFXREGS(r3, r7, r8)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "mov     r4, r8              \n"
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-        "b       99f                 \n"
-    "20:                             \n"
-    "1:                              \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        SUBHALFREGS(r0, r4)
-        SUBHALFREGS(r1, r5)
-        SUBHALFREGS(r2, r6)
-        SUBHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v1],  {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        SUBHALFREGS(r0, r4)
-        SUBHALFREGS(r1, r5)
-        SUBHALFREGS(r2, r6)
-        SUBHALFREGS(r3, r7)
-        "stmia   %[v1]!, {r0-r3}     \n"
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-    "99:                             \n"
-        : /* outputs */
-#if ORDER > 16
-        [cnt]"+r"(cnt),
-#endif
-        [v1] "+r"(v1),
-        [v2] "+r"(v2)
-        : /* inputs */
-        : /* clobbers */
-        "r0", "r1", "r2", "r3", "r4", "r5", 
-        "r6", "r7", "r8", "r9", "memory"
-    );
-}
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). It is optimised
- * for ARM7TDMI. Using it for ARM9 or higher results in worse performance
- * than the C version. */
-static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
-{
-    int res = 0;
-#if ORDER > 16
-    int cnt = ORDER>>4;
-#endif
-#define MLABLOCK2(f1, f2)                   \
-        "mov     r8, " #f1 ", lsl #16   \n" \
-        "mov     r8,   r8   , asr #16   \n" \
-        "mov     r9, " #f2 ", lsl #16   \n" \
-        "mov     r9,   r9   , asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n" \
-        "mov     r8, " #f1 ", asr #16   \n" \
-        "mov     r9, " #f2 ", asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n"
-#define MLABLOCK2_U2(f1, f2)                \
-        "mov     r8, " #f1 ", lsl #16   \n" \
-        "mov     r8,   r8   , asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n" \
-        "mov     r8, " #f1 ", asr #16   \n" \
-        "mov     r9, " #f2 ", lsl #16   \n" \
-        "mov     r9,   r9   , asr #16   \n" \
-        "mla     %[res], r9, r8, %[res] \n" \
-        "mov     r9, " #f2 ", asr #16   \n"
-    asm volatile (
-        "tst     %[v2], #2           \n"
-        "beq     20f                 \n"
-    "10:                             \n"
-        "ldrsh   r9, [%[v2]], #2     \n"
-    "1:                              \n"
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2_U2(r0, r4)
-        MLABLOCK2_U2(r1, r5)
-        MLABLOCK2_U2(r2, r6)
-        MLABLOCK2_U2(r3, r7)
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2_U2(r0, r4)
-        MLABLOCK2_U2(r1, r5)
-        MLABLOCK2_U2(r2, r6)
-        MLABLOCK2_U2(r3, r7)
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-        "b       99f                 \n"
-    "20:                             \n"
-    "1:                              \n"
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2(r0, r4)
-        MLABLOCK2(r1, r5)
-        MLABLOCK2(r2, r6)
-        MLABLOCK2(r3, r7)
-        "ldmia   %[v1]!, {r0-r3}     \n"
-        "ldmia   %[v2]!, {r4-r7}     \n"
-        MLABLOCK2(r0, r4)
-        MLABLOCK2(r1, r5)
-        MLABLOCK2(r2, r6)
-        MLABLOCK2(r3, r7)
-#if ORDER > 16
-        "subs    %[cnt], %[cnt], #1  \n"
-        "bne     1b                  \n"
-#endif
-    "99:                             \n"
-        : /* outputs */
-#if ORDER > 16
-        [cnt]"+r"(cnt),
-#endif
-        [v1] "+r"(v1),
-        [v2] "+r"(v2),
-        [res]"+r"(res)
-        : /* inputs */
-        : /* clobbers */
-        "r0", "r1", "r2", "r3", "r4",
-        "r5", "r6", "r7", "r8", "r9"
-    );
-    return res;
-}
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
new file mode 100644
index 0000000000..b729bd3a0a
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -0,0 +1,210 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+ARMv4 vector math copyright (C) 2008 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+static inline void vector_add(int32_t* v1, int32_t* v2) /* In place: v1[i] += v2[i] on 32 bit elements */
+{ /* Handles 16 elements, 32 if ORDER > 16, (ORDER>>5)*32 if ORDER > 32 */
+#if ORDER > 32
+    int cnt = ORDER>>5; /* number of 32-element passes through the asm loop */
+#endif
+#define ADDBLOCK4                        \
+        "ldmia   %[v1],  {r0-r3}     \n" \
+        "ldmia   %[v2]!, {r4-r7}     \n" \
+        "add     r0, r0, r4          \n" \
+        "add     r1, r1, r5          \n" \
+        "add     r2, r2, r6          \n" \
+        "add     r3, r3, r7          \n" \
+        "stmia   %[v1]!, {r0-r3}     \n" /* ADDBLOCK4: add 4 words of v2 into v1, then advance both pointers by 4 words */
+    asm volatile (
+    "1:                              \n"
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+#if ORDER > 16 /* second group of 16 elements */
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+        ADDBLOCK4
+#endif
+#if ORDER > 32 /* repeat the 32-element body until cnt reaches zero */
+        "subs    %[cnt], %[cnt], #1  \n"
+        "bne     1b                  \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "memory" /* "memory": the asm stores to v1[] */
+    );
+}
+static inline void vector_sub(int32_t* v1, int32_t* v2) /* In place: v1[i] -= v2[i] on 32 bit elements */
+{ /* Handles 16 elements, 32 if ORDER > 16, (ORDER>>5)*32 if ORDER > 32 */
+#if ORDER > 32
+    int cnt = ORDER>>5; /* number of 32-element passes through the asm loop */
+#endif
+#define SUBBLOCK4                        \
+        "ldmia   %[v1],  {r0-r3}     \n" \
+        "ldmia   %[v2]!, {r4-r7}     \n" \
+        "sub     r0, r0, r4          \n" \
+        "sub     r1, r1, r5          \n" \
+        "sub     r2, r2, r6          \n" \
+        "sub     r3, r3, r7          \n" \
+        "stmia   %[v1]!, {r0-r3}     \n" /* SUBBLOCK4: subtract 4 words of v2 from v1, then advance both pointers by 4 words */
+    asm volatile (
+    "1:                              \n"
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+#if ORDER > 16 /* second group of 16 elements */
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+        SUBBLOCK4
+#endif
+#if ORDER > 32 /* repeat the 32-element body until cnt reaches zero */
+        "subs    %[cnt], %[cnt], #1  \n"
+        "bne     1b                  \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "memory" /* "memory": the asm stores to v1[] */
+    );
+}
+static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) /* Returns the sum of v1[i] * v2[i], accumulated in 32 bits */
+{ /* Covers 16 products, 32 if ORDER > 16, (ORDER>>5)*32 if ORDER > 32 */
+    int res = 0;
+#if ORDER > 32
+    int cnt = ORDER>>5; /* number of 32-product passes through the asm loop */
+#endif
+    asm volatile (
+#if ORDER > 16 /* 32 products per pass, loads grouped into 5- and 6-word ldmia */
+        "ldmia   %[v2]!, {r6-r7}         \n" /* preload the first two v2 words before entering the loop */
+    "1:                                  \n"
+        "ldmia   %[v1]!, {r0,r1,r3-r5}   \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "ldmia   %[v2]!, {r0-r2,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r4}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+        "ldmia   %[v2]!, {r0,r1,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r5}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+        "ldmia   %[v2]!, {r0-r2,r6,r7}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+        "ldmia   %[v1]!, {r0,r1,r3-r5}   \n" /* second group of 16 products starts here */
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "ldmia   %[v2]!, {r0-r2,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r4}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+        "ldmia   %[v2]!, {r0,r1,r6-r8}   \n"
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "ldmia   %[v1]!, {r0-r5}         \n"
+        "mla     %[res], r6, r0, %[res]  \n"
+        "mla     %[res], r7, r1, %[res]  \n"
+        "mla     %[res], r8, r2, %[res]  \n"
+#if ORDER > 32 /* NOTE(review): on the final pass this load also reads two words past the last v2 element used (values unused) - confirm that is in bounds for all callers */
+        "ldmia   %[v2]!, {r0-r2,r6,r7}   \n" /* r6/r7 are the preload for the next pass */
+#else
+        "ldmia   %[v2]!, {r0-r2}         \n" /* single pass: only the last three v2 words are needed */
+#endif
+        "mla     %[res], r0, r3, %[res]  \n"
+        "mla     %[res], r1, r4, %[res]  \n"
+        "mla     %[res], r2, r5, %[res]  \n"
+#if ORDER > 32 /* repeat the 32-product body until cnt reaches zero */
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+#else /* ORDER <= 16 */
+#define MLABLOCK4                            \
+        "ldmia   %[v1]!, {r0-r3}         \n" \
+        "ldmia   %[v2]!, {r4-r7}         \n" \
+        "mla     %[res], r4, r0, %[res]  \n" \
+        "mla     %[res], r5, r1, %[res]  \n" \
+        "mla     %[res], r6, r2, %[res]  \n" \
+        "mla     %[res], r7, r3, %[res]  \n" /* MLABLOCK4: accumulate 4 products, advancing both pointers by 4 words */
+        MLABLOCK4
+        MLABLOCK4
+        MLABLOCK4
+        MLABLOCK4
+#endif /* ORDER <= 16 */
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"+r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3",
+        "r4", "r5", "r6", "r7" /* NOTE(review): no "memory" clobber although the asm loads from v1[] and v2[] - confirm earlier stores cannot be deferred past it */
+#if ORDER > 16
+        ,"r8"
+#endif
+    );
+    return res;
+}
diff --git a/apps/codecs/demac/libdemac/vector_math16.h b/apps/codecs/demac/libdemac/vector_math_generic.h
index 5d82abe930..7b61db77be 100644
--- a/apps/codecs/demac/libdemac/vector_math16.h
+++ b/apps/codecs/demac/libdemac/vector_math_generic.h
@@ -2,7 +2,7 @@
 libdemac - A Monkey's Audio decoder
-$Id:$
+$Id$
 Copyright (C) Dave Chapman 2007
@@ -22,7 +22,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 */
-static inline void vector_add(int16_t* v1, int16_t* v2)
+#include "demac_config.h"
+static inline void vector_add(filter_int* v1, filter_int* v2)
 {
 #if ORDER > 32
    int order = (ORDER >> 5);
@@ -66,7 +68,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
    }
 }
-static inline void vector_sub(int16_t* v1, int16_t* v2)
+static inline void vector_sub(filter_int* v1, filter_int* v2)
 {
 #if ORDER > 32
    int order = (ORDER >> 5);
@@ -110,7 +112,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
    }
 }
-static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+static inline int32_t scalarproduct(filter_int* v1, filter_int* v2)
 {
    int res = 0;