30 files changed, 6196 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/SOURCES b/lib/rbcodec/codecs/demac/libdemac/SOURCES
new file mode 100644
index 0000000000..018f35a73c
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/SOURCES
@@ -0,0 +1,15 @@
+predictor.c
+#ifdef CPU_ARM
+predictor-arm.S
+udiv32_arm.S
+#elif defined CPU_COLDFIRE
+predictor-cf.S
+#endif
+entropy.c
+decoder.c
+parser.c
+filter_1280_15.c
+filter_16_11.c
+filter_256_13.c
+filter_32_10.c
+filter_64_11.c
diff --git a/lib/rbcodec/codecs/demac/libdemac/crc.c b/lib/rbcodec/codecs/demac/libdemac/crc.c
new file mode 100644
index 0000000000..fa3ea89d7e
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/crc.c
@@ -0,0 +1,120 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include <inttypes.h>
+#include "demac.h"
+/* Byte-indexed lookup table used by ape_updatecrc() below: 256 entries, one
+   for each possible value of (low byte of the running CRC) XOR (input byte).
+   Entry 128 is 0xEDB88320. */
+static const uint32_t crctab32[] =
+{
+  0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA,
+  0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+  0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+  0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+  0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
+  0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+  0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
+  0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+  0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+  0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+  0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940,
+  0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+  0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116,
+  0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+  0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+  0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+  0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A,
+  0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+  0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818,
+  0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+  0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+  0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+  0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C,
+  0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+  0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
+  0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+  0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+  0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+  0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086,
+  0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+  0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4,
+  0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+  0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+  0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+  0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
+  0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+  0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE,
+  0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+  0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+  0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+  0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252,
+  0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+  0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60,
+  0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+  0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+  0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+  0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04,
+  0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+  0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
+  0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+  0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+  0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+  0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E,
+  0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+  0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
+  0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+  0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+  0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+  0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0,
+  0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+  0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6,
+  0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+  0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+  0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+};
+/* Seed value for a new CRC computation: all 32 bits set. */
+uint32_t ape_initcrc(void)
+{
+    const uint32_t all_ones = 0xffffffff;
+    return all_ones;
+}
+/* Update the CRC from a block of WAV-format audio data.
+ *
+ * block: bytes to fold into the CRC, consumed in order
+ * count: number of bytes available at block; zero or a negative value leaves
+ *        the CRC unchanged instead of reading past the buffer
+ * crc:   running value, starting from ape_initcrc()
+ *
+ * Returns the updated running CRC. */
+uint32_t ape_updatecrc(unsigned char *block, int count, uint32_t crc)
+{
+    for (; count > 0; count--)
+        crc = (crc >> 8) ^ crctab32[(crc & 0xff) ^ *block++];
+    return crc;
+}
+/* Finalise a running CRC: invert all 32 bits, then shift the result right by
+   one, so the returned value always has its top bit clear. */
+uint32_t ape_finishcrc(uint32_t crc)
+{
+    return (crc ^ 0xffffffff) >> 1;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/decoder.c b/lib/rbcodec/codecs/demac/libdemac/decoder.c
new file mode 100644
index 0000000000..b0339a75d9
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/decoder.c
@@ -0,0 +1,216 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include <inttypes.h>
+#include <string.h>
+#include "demac.h"
+#include "predictor.h"
+#include "entropy.h"
+#include "filter.h"
+#include "demac_config.h"
+/* Statically allocate the filter buffers.
+   Each buffer holds (taps*3 + FILTER_HISTORY_SIZE) * 2 filter_int elements.
+   The byte sizes quoted below assume FILTER_HISTORY_SIZE is 512 and give the
+   16-bit and 32-bit filter_int cases respectively.
+   FILTERBUF16/32/64 may alias one another or filterbuf256: judging by
+   init_frame_decoder(), no compression level uses two aliased filters at
+   once (2000: 16; 3000: 64; 4000: 256+32; 5000: 1280+256+16). */
+#ifdef FILTER256_IRAM
+static filter_int filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2]   
+                  IBSS_ATTR_DEMAC MEM_ALIGN_ATTR; 
+                  /* 2432 or 4864 bytes */
+static filter_int filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2]
+                  IBSS_ATTR_DEMAC MEM_ALIGN_ATTR; 
+                  /* 5120 or 10240 bytes */
+#define FILTERBUF64 filterbuf256
+#define FILTERBUF32 filterbuf32
+#define FILTERBUF16 filterbuf32
+#else
+static filter_int filterbuf64[(64*3 + FILTER_HISTORY_SIZE) * 2]   
+                  IBSS_ATTR_DEMAC MEM_ALIGN_ATTR; 
+                  /* 2816 or 5632 bytes */
+static filter_int filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2]
+                  MEM_ALIGN_ATTR; /* 5120 or 10240 bytes */
+#define FILTERBUF64 filterbuf64
+#define FILTERBUF32 filterbuf64
+#define FILTERBUF16 filterbuf64
+#endif /* FILTER256_IRAM */
+/* This is only needed for "insane" files, and no current Rockbox targets
+   can hope to decode them in realtime, except the Gigabeat S (at 528MHz). */
+static filter_int filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] 
+                  IBSS_ATTR_DEMAC_INSANEBUF MEM_ALIGN_ATTR;
+                  /* 17408 or 34816 bytes */
+/* Prepare all decoder state for a new frame.
+ *
+ * ape_ctx:       decoder context; its compressiontype selects the filters
+ * inbuffer, firstbyte, bytesconsumed: forwarded unchanged to
+ *                init_entropy_decoder(), which reads the frame's CRC and
+ *                flags and reports how far it advanced
+ *
+ * Compression types other than 2000/3000/4000/5000 initialise no filter. */
+void init_frame_decoder(struct ape_ctx_t* ape_ctx,
+                        unsigned char* inbuffer, int* firstbyte,
+                        int* bytesconsumed)
+{
+    init_entropy_decoder(ape_ctx, inbuffer, firstbyte, bytesconsumed);
+    init_predictor_decoder(&ape_ctx->predictor);
+    switch (ape_ctx->compressiontype)
+    {
+        case 2000:
+            init_filter_16_11(FILTERBUF16);
+            break;
+        case 3000:
+            init_filter_64_11(FILTERBUF64);
+            break;
+        case 4000:
+            init_filter_256_13(filterbuf256);
+            init_filter_32_10(FILTERBUF32);
+            break;
+        case 5000:
+            init_filter_1280_15(filterbuf1280);
+            init_filter_256_13(filterbuf256);
+            /* FILTERBUF16 names the same buffer as FILTERBUF32 in both
+               build configurations; use the macro that matches the filter. */
+            init_filter_16_11(FILTERBUF16);
+            break;
+        default:
+            /* No filters to initialise for this compression type. */
+            break;
+    }
+}
+/* Decode the next `count` samples of the current frame.
+ *
+ * Pipeline: entropy decode -> filters selected by compressiontype ->
+ * predictor -> channel handling and scaling.
+ *
+ * ape_ctx:   decoder context (channels, frameflags, compressiontype,
+ *            fileversion, bps and predictor state are read here)
+ * inbuffer, firstbyte, bytesconsumed: forwarded unchanged to
+ *            entropy_decode()
+ * decoded0:  receives the first output channel (`count` samples)
+ * decoded1:  receives the second output channel; only written when the
+ *            stream has two channels
+ * count:     number of samples to decode per channel
+ *
+ * Always returns 0. */
+int ICODE_ATTR_DEMAC decode_chunk(struct ape_ctx_t* ape_ctx,
+                                  unsigned char* inbuffer, int* firstbyte,
+                                  int* bytesconsumed,
+                                  int32_t* decoded0, int32_t* decoded1,
+                                  int count)
+{
+    int32_t left, right;
+#ifdef ROCKBOX
+    /* Rockbox build: shift samples up by the difference between
+       APE_OUTPUT_DEPTH and ape_ctx->bps (presumably the stream's bits per
+       sample - confirm against parser.h). */
+    int scale = (APE_OUTPUT_DEPTH - ape_ctx->bps);
+    #define SCALE(x) ((x) << scale)
+#else
+    /* Standalone build: samples are passed through unscaled. */
+    #define SCALE(x) (x)
+#endif
+         
+    /* Single coded channel: either a one-channel stream, or a frame flagged
+       pseudo-stereo with no APE_FRAMECODE_STEREO_SILENCE bit set. */
+    if ((ape_ctx->channels==1) || ((ape_ctx->frameflags
+        & (APE_FRAMECODE_PSEUDO_STEREO|APE_FRAMECODE_STEREO_SILENCE))
+        == APE_FRAMECODE_PSEUDO_STEREO)) {
+        entropy_decode(ape_ctx, inbuffer, firstbyte, bytesconsumed,
+                       decoded0, NULL, count);
+        if (ape_ctx->frameflags & APE_FRAMECODE_MONO_SILENCE) {
+            /* We are pure silence, so we're done. */
+            return 0;
+        }
+        /* Apply filters - compression types not listed have none */
+        switch (ape_ctx->compressiontype)
+        {
+            case 2000:
+                apply_filter_16_11(ape_ctx->fileversion,0,decoded0,count);
+                break;
+    
+            case 3000:
+                apply_filter_64_11(ape_ctx->fileversion,0,decoded0,count);
+                break;
+    
+            case 4000:
+                apply_filter_32_10(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_256_13(ape_ctx->fileversion,0,decoded0,count);
+                break;
+    
+            case 5000:
+                apply_filter_16_11(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_256_13(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_1280_15(ape_ctx->fileversion,0,decoded0,count);
+        }
+        /* Now apply the predictor decoding */
+        predictor_decode_mono(&ape_ctx->predictor,decoded0,count);
+        if (ape_ctx->channels==2) {
+            /* Pseudo-stereo - copy left channel to right channel */
+            while (count--)
+            {
+                left = *decoded0;
+                *(decoded1++) = *(decoded0++) = SCALE(left);
+            }
+        }
+#ifdef ROCKBOX
+         else {
+            /* Scale to output depth */
+            while (count--)
+            {
+                left = *decoded0;
+                *(decoded0++) = SCALE(left);
+            }
+        }
+#endif
+    } else { /* Stereo */
+        entropy_decode(ape_ctx, inbuffer, firstbyte, bytesconsumed,
+                       decoded0, decoded1, count);
+        /* Every APE_FRAMECODE_STEREO_SILENCE bit must be set to skip the
+           rest of the pipeline. */
+        if ((ape_ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE)
+            == APE_FRAMECODE_STEREO_SILENCE) {
+            /* We are pure silence, so we're done. */
+            return 0;
+        }
+        /* Apply filters - compression type 1000 doesn't have any */
+        switch (ape_ctx->compressiontype)
+        {
+            case 2000:
+                apply_filter_16_11(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_16_11(ape_ctx->fileversion,1,decoded1,count);
+                break;
+    
+            case 3000:
+                apply_filter_64_11(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_64_11(ape_ctx->fileversion,1,decoded1,count);
+                break;
+    
+            case 4000:
+                apply_filter_32_10(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_32_10(ape_ctx->fileversion,1,decoded1,count);
+                apply_filter_256_13(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_256_13(ape_ctx->fileversion,1,decoded1,count);
+                break;
+    
+            case 5000:
+                apply_filter_16_11(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_16_11(ape_ctx->fileversion,1,decoded1,count);
+                apply_filter_256_13(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_256_13(ape_ctx->fileversion,1,decoded1,count);
+                apply_filter_1280_15(ape_ctx->fileversion,0,decoded0,count);
+                apply_filter_1280_15(ape_ctx->fileversion,1,decoded1,count);
+        }
+        /* Now apply the predictor decoding */
+        predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count);
+        /* Decorrelate and scale to output depth: at this point decoded0
+           holds the channel difference and decoded1 the value from which
+           left is recovered; right is left plus the difference. */
+        while (count--)
+        {
+            left = *decoded1 - (*decoded0 / 2);
+            right = left + *decoded0;
+            *(decoded0++) = SCALE(left);
+            *(decoded1++) = SCALE(right);
+        }
+    }
+    return 0;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/decoder.h b/lib/rbcodec/codecs/demac/libdemac/decoder.h
new file mode 100644
index 0000000000..aeac569509
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/decoder.h
@@ -0,0 +1,40 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#ifndef _APE_DECODER_H
+#define _APE_DECODER_H
+#include <inttypes.h>
+#include "parser.h"
+/* Set up entropy decoder, predictor and filters for a new frame.
+   inbuffer/firstbyte describe the input position; *bytesconsumed is written
+   by the entropy decoder. */
+void init_frame_decoder(struct ape_ctx_t* ape_ctx,
+                        unsigned char* inbuffer, int* firstbyte,
+                        int* bytesconsumed);
+/* Decode `count` samples per channel into decoded0 (and decoded1 for
+   two-channel streams). */
+int decode_chunk(struct ape_ctx_t* ape_ctx,
+                 unsigned char* inbuffer, int* firstbyte,
+                 int* bytesconsumed,
+                 int32_t* decoded0, int32_t* decoded1, 
+                 int count);
+#endif /* _APE_DECODER_H */
diff --git a/lib/rbcodec/codecs/demac/libdemac/demac.h b/lib/rbcodec/codecs/demac/libdemac/demac.h
new file mode 100644
index 0000000000..696b2aba73
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/demac.h
@@ -0,0 +1,45 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+/* Public libdemac interface: frame decoding plus the CRC helpers.
+   The include guard is deliberately distinct from decoder.h's
+   _APE_DECODER_H, so including decoder.h first can no longer hide the CRC
+   prototypes declared here. */
+#ifndef _APE_DEMAC_H
+#define _APE_DEMAC_H
+#include <inttypes.h>
+#include "parser.h"
+/* Set up entropy decoder, predictor and filters for a new frame. */
+void init_frame_decoder(struct ape_ctx_t* ape_ctx,
+                        unsigned char* inbuffer, int* firstbyte,
+                        int* bytesconsumed);
+/* Decode `count` samples per channel into decoded0 (and decoded1 for
+   two-channel streams). */
+int decode_chunk(struct ape_ctx_t* ape_ctx,
+                 unsigned char* inbuffer, int* firstbyte,
+                 int* bytesconsumed,
+                 int32_t* decoded0, int32_t* decoded1, 
+                 int count);
+/* CRC helpers: start with ape_initcrc(), feed data through ape_updatecrc(),
+   then pass the running value to ape_finishcrc(). */
+uint32_t ape_initcrc(void);
+uint32_t ape_updatecrc(unsigned char *block, int count, uint32_t crc);
+uint32_t ape_finishcrc(uint32_t crc);
+#endif /* _APE_DEMAC_H */
diff --git a/lib/rbcodec/codecs/demac/libdemac/demac_config.h b/lib/rbcodec/codecs/demac/libdemac/demac_config.h
new file mode 100644
index 0000000000..fa4f008036
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/demac_config.h
@@ -0,0 +1,145 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#ifndef _DEMAC_CONFIG_H
+#define _DEMAC_CONFIG_H
+/* Build-time choices for libdemac.
+ * Note that this file is included by both .c and .S files. */
+#ifdef ROCKBOX
+#include "config.h"
+#ifndef __ASSEMBLER__
+#include "codeclib.h"
+#include <codecs.h>
+#endif
+/* Bit depth that decoded samples are scaled to in Rockbox builds. */
+#define APE_OUTPUT_DEPTH 29
+/* On ARMv4, using 32 bit ints for the filters is faster. */
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#define FILTER_BITS 32
+#endif
+/* Selects the filter buffer layout in decoder.c. */
+#if !defined(CPU_PP) && !defined(CPU_S5L870X)
+#define FILTER256_IRAM
+#endif
+#if CONFIG_CPU == PP5002 || defined(CPU_S5L870X)
+/* Code and data in IRAM for speed (PP5002 has a broken cache), not enough
+ * IRAM for the insane filter buffer. Reciprocal table for division in IRAM. */
+#define ICODE_SECTION_DEMAC_ARM   .icode
+#define ICODE_ATTR_DEMAC          ICODE_ATTR
+#define ICONST_ATTR_DEMAC         ICONST_ATTR
+#define IBSS_ATTR_DEMAC           IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF
+#elif CONFIG_CPU == PP5020
+/* Code and small data in DRAM for speed (PP5020 IRAM isn't completely single
+ * cycle). Insane filter buffer not in IRAM in favour of reciprocal table for
+ * division. Decoded data buffers should be in IRAM (defined by the caller). */
+#define ICODE_SECTION_DEMAC_ARM   .text
+#define ICODE_ATTR_DEMAC
+#define ICONST_ATTR_DEMAC
+#define IBSS_ATTR_DEMAC
+#define IBSS_ATTR_DEMAC_INSANEBUF
+#elif CONFIG_CPU == PP5022
+/* Code in DRAM, data in IRAM. Insane filter buffer not in IRAM in favour of
+ * reciprocal table for division */
+#define ICODE_SECTION_DEMAC_ARM   .text
+#define ICODE_ATTR_DEMAC
+#define ICONST_ATTR_DEMAC         ICONST_ATTR
+#define IBSS_ATTR_DEMAC           IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF
+#else
+/* Code in DRAM, data in IRAM, including insane filter buffer. */
+#define ICODE_SECTION_DEMAC_ARM   .text
+#define ICODE_ATTR_DEMAC
+#define ICONST_ATTR_DEMAC         ICONST_ATTR
+#define IBSS_ATTR_DEMAC           IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR
+#endif /* CONFIG_CPU */
+#else /* !ROCKBOX */
+/* Standalone build: output keeps the stream's own depth.  Note that this
+   macro expands to an expression using a variable named ape_ctx, so it only
+   works where such a variable is in scope. */
+#define APE_OUTPUT_DEPTH (ape_ctx->bps)
+#define MEM_ALIGN_ATTR __attribute__((aligned(16)))
+        /* adjust to target architecture for best performance */
+/* No special memory sections outside Rockbox: all placement attributes
+   expand to nothing. */
+#define ICODE_ATTR_DEMAC
+#define ICONST_ATTR_DEMAC
+#define IBSS_ATTR_DEMAC
+#define IBSS_ATTR_DEMAC_INSANEBUF
+/* Use to give gcc hints on which branch is most likely taken */
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define LIKELY(x)   __builtin_expect(!!(x), 1)
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define LIKELY(x)   (x)
+#define UNLIKELY(x) (x)
+#endif
+#endif /* !ROCKBOX */
+/* Defaults, used when nothing above (or the build) has set a value */
+#ifndef FILTER_HISTORY_SIZE
+#define FILTER_HISTORY_SIZE 512
+#endif
+#ifndef PREDICTOR_HISTORY_SIZE
+#define PREDICTOR_HISTORY_SIZE 512
+#endif     
+#ifndef FILTER_BITS
+#define FILTER_BITS 16
+#endif
+/* C-only declarations (this header is also included from .S files). */
+#ifndef __ASSEMBLER__
+/* UDIV32(a, b): unsigned 32-bit division a / b. */
+#if defined(CPU_ARM) && (ARM_ARCH < 5 || defined(USE_IRAM))
+/* optimised unsigned integer division for ARMv4, in IRAM */
+unsigned udiv32_arm(unsigned a, unsigned b);
+#define UDIV32(a, b) udiv32_arm(a, b)
+#else
+/* default; arguments are parenthesised so that expression arguments such as
+   UDIV32(x + 1, y + 1) divide the whole operands */
+#define UDIV32(a, b) ((a) / (b))
+#endif
+#include <inttypes.h>
+/* Element type of the filter buffers, selected by FILTER_BITS. */
+#if FILTER_BITS == 32
+typedef int32_t filter_int;
+#elif FILTER_BITS == 16
+typedef int16_t filter_int;
+#endif
+#endif /* !__ASSEMBLER__ */
+#endif /* _DEMAC_CONFIG_H */
diff --git a/lib/rbcodec/codecs/demac/libdemac/entropy.c b/lib/rbcodec/codecs/demac/libdemac/entropy.c
new file mode 100644
index 0000000000..1cef979808
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/entropy.c
@@ -0,0 +1,464 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include <inttypes.h>
+#include <string.h>
+#include "parser.h"
+#include "entropy.h"
+#include "demac_config.h"
+#define MODEL_ELEMENTS 64
+/*
+  The following counts arrays for use with the range decoder are
+  hard-coded in the Monkey's Audio decoder.
+*/
+/* Cumulative frequency boundaries searched by range_get_symbol_3970():
+   65 ascending values from 0 to 65536 that delimit MODEL_ELEMENTS (64)
+   symbols.  Symbol i covers [counts_3970[i], counts_3970[i+1]). */
+static const int counts_3970[65] ICONST_ATTR_DEMAC =
+{
+        0,14824,28224,39348,47855,53994,58171,60926,
+    62682,63786,64463,64878,65126,65276,65365,65419,
+    65450,65469,65480,65487,65491,65493,65494,65495,
+    65496,65497,65498,65499,65500,65501,65502,65503,
+    65504,65505,65506,65507,65508,65509,65510,65511,
+    65512,65513,65514,65515,65516,65517,65518,65519,
+    65520,65521,65522,65523,65524,65525,65526,65527,
+    65528,65529,65530,65531,65532,65533,65534,65535,
+    65536
+};
+/* counts_diff_3970[i] = counts_3970[i+1] - counts_3970[i]
+   i.e. the width (frequency) of symbol i, precomputed so that
+   range_get_symbol_3970() can pass it straight to range_decode_update(). */
+static const int counts_diff_3970[64] ICONST_ATTR_DEMAC =
+{
+    14824,13400,11124,8507,6139,4177,2755,1756,
+    1104,677,415,248,150,89,54,31,
+    19,11,7,4,2,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1
+};
+/* Cumulative frequency boundaries searched by range_get_symbol_3980():
+   65 ascending values from 0 to 65536 that delimit MODEL_ELEMENTS (64)
+   symbols.  Symbol i covers [counts_3980[i], counts_3980[i+1]). */
+static const int counts_3980[65] ICONST_ATTR_DEMAC =
+{
+        0,19578,36160,48417,56323,60899,63265,64435,
+    64971,65232,65351,65416,65447,65466,65476,65482,
+    65485,65488,65490,65491,65492,65493,65494,65495,
+    65496,65497,65498,65499,65500,65501,65502,65503,
+    65504,65505,65506,65507,65508,65509,65510,65511,
+    65512,65513,65514,65515,65516,65517,65518,65519,
+    65520,65521,65522,65523,65524,65525,65526,65527,
+    65528,65529,65530,65531,65532,65533,65534,65535,
+    65536
+};
+/* counts_diff_3980[i] = counts_3980[i+1] - counts_3980[i]
+   i.e. the width (frequency) of symbol i, precomputed so that
+   range_get_symbol_3980() can pass it straight to range_decode_update(). */
+static const int counts_diff_3980[64] ICONST_ATTR_DEMAC =
+{
+    19578,16582,12257,7906,4576,2366,1170,536,
+    261,119,65,31,19,10,6,3,
+    3,2,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1
+};
+/*
+Range decoder adapted from rangecod.c included in:
+  http://www.compressconsult.com/rangecoder/rngcod13.zip
+  rangecod.c     range encoding
+  (c) Michael Schindler
+  1997, 1998, 1999, 2000
+  http://www.compressconsult.com/
+  michael@compressconsult.com
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+The encoding functions were removed, and functions turned into "static
+inline" functions. Some minor cosmetic changes were made (e.g. turning
+pre-processor symbols into upper-case, removing the rc parameter from
+each function (and the RNGC macro)).
+*/
+/* BITSTREAM READING FUNCTIONS */
+/* We deal with the input data one byte at a time - to ensure
+   functionality on CPUs of any endianness regardless of any requirements
+   for aligned reads.
+*/
+/* Read cursor shared by the bitstream helpers: the next input byte is
+   bytebuffer[bytebufferoffset]. */
+static unsigned char* bytebuffer IBSS_ATTR_DEMAC;
+static int bytebufferoffset IBSS_ATTR_DEMAC;
+/* Advance the cursor by one input byte.
+   The offset counts DOWN within a 4-byte group; when it steps below zero the
+   base pointer moves on by four bytes and the offset restarts at 3.
+   Assumes the offset is in 0..3 on entry - TODO confirm with callers that
+   supply *firstbyte. */
+static inline void skip_byte(void)
+{
+    bytebufferoffset--;
+    /* Bit 2 is set only when the offset has just become -1, so this adds 4
+       exactly on wrap-around and 0 otherwise. */
+    bytebuffer += bytebufferoffset & 4;
+    /* -1 becomes 3; 0..2 are left unchanged. */
+    bytebufferoffset &= 3;
+}
+/* Fetch the byte under the read cursor, then step the cursor past it. */
+static inline int read_byte(void)
+{
+    const int value = bytebuffer[bytebufferoffset];
+
+    skip_byte();
+    return value;
+}
+/* RANGE DECODING FUNCTIONS */
+/* SIZE OF RANGE ENCODING CODE VALUES. */
+#define CODE_BITS 32
+#define TOP_VALUE ((unsigned int)1 << (CODE_BITS-1))  /* 0x80000000 */
+#define SHIFT_BITS (CODE_BITS - 9)                    /* 23 */
+#define EXTRA_BITS ((CODE_BITS-2) % 8 + 1)            /* 7 */
+#define BOTTOM_VALUE (TOP_VALUE >> 8)                 /* 0x00800000 */
+/* State of the range decoder. */
+struct rangecoder_t
+{
+    uint32_t low;        /* low end of interval */
+    uint32_t range;      /* length of interval */
+    uint32_t help;       /* bytes_to_follow resp. intermediate value */
+    unsigned int buffer; /* buffer for input/output */
+};
+/* The single decoder instance used by every function in this file. */
+static struct rangecoder_t rc IBSS_ATTR_DEMAC;
+/* Start the decoder: read the first input byte and seed the interval from
+   it.  With EXTRA_BITS == 7 this sets low to the top seven bits of that byte
+   and range to 128; the byte's remaining bit stays in rc.buffer for
+   range_dec_normalize() to pick up. */
+static inline void range_start_decoding(void)
+{
+    rc.buffer = read_byte();
+    rc.low = rc.buffer >> (8 - EXTRA_BITS);
+    rc.range = (uint32_t) 1 << EXTRA_BITS;
+}
+/* Refill the interval: while range is no larger than BOTTOM_VALUE, pull in
+   one more input byte and scale low and range up by 256.  On return
+   rc.range > BOTTOM_VALUE. */
+static inline void range_dec_normalize(void)
+{
+    while (rc.range <= BOTTOM_VALUE)
+    {   
+        rc.buffer = (rc.buffer << 8) | read_byte();
+        /* The eight bits appended to low straddle two input bytes: they are
+           bits 8..1 of the 16 most recently read bits. */
+        rc.low = (rc.low << 8) | ((rc.buffer >> 1) & 0xff);
+        rc.range <<= 8;
+    }
+}
+/* Calculate the cumulative frequency for the next symbol. Does NO update!
+ * tot_f is the total frequency (see range_decode_culshift below for totals
+ * of the form 1 << shift).
+ * Stores range / tot_f in rc.help for the range_decode_update() call that
+ * must follow, and returns low / help, the cumulative frequency. */
+static inline int range_decode_culfreq(int tot_f)
+{
+    range_dec_normalize();
+    rc.help = UDIV32(rc.range, tot_f);
+    return UDIV32(rc.low, rc.help);
+}
+/* As range_decode_culfreq, for a total frequency of 1 << shift: the first
+   division becomes a right shift.  Does NO update; rc.help is left set for
+   the range_decode_update() call that must follow. */
+static inline int range_decode_culshift(int shift)
+{
+    range_dec_normalize();
+    rc.help = rc.range >> shift;
+    return UDIV32(rc.low, rc.help);
+}
+/* Update decoding state after a symbol has been identified.
+ * sy_f is the interval length (frequency of the symbol)
+ * lt_f is the lower end (frequency sum of < symbols)
+ * Uses rc.help as left by the preceding range_decode_culfreq() or
+ * range_decode_culshift() call. */
+static inline void range_decode_update(int sy_f, int lt_f)
+{
+    rc.low -= rc.help * lt_f;
+    rc.range = rc.help * sy_f;
+}
+/* Decode one byte without modelling (all 256 values equally weighted). */
+static inline unsigned char decode_byte(void)
+{
+    const int value = range_decode_culshift(8);
+
+    range_decode_update(1, value);
+    return value;
+}
+/* Decode one 16-bit value without modelling. */
+static inline unsigned short range_decode_short(void)
+{
+    const int value = range_decode_culshift(16);
+
+    range_decode_update(1, value);
+    return value;
+}
+/* Decode n bits (n <= 16) without modelling - based on range_decode_short */
+static inline int range_decode_bits(int n)
+{
+    const int value = range_decode_culshift(n);
+
+    range_decode_update(1, value);
+    return value;
+}
+/* Finish decoding: one last normalisation so that every input byte belonging
+   to the coded data has been consumed. */
+static inline void range_done_decoding(void)
+{
+    range_dec_normalize();
+}
+/*
+  range_get_symbol_* functions based on main decoding loop in simple_d.c from
+  http://www.compressconsult.com/rangecoder/rngcod13.zip
+  (c) Michael Schindler
+*/
+/* Decode the next symbol (0 .. MODEL_ELEMENTS-1) using the counts_3980
+   model and update the range decoder accordingly. */
+static inline int range_get_symbol_3980(void)
+{
+    int symbol = 0;
+    int cf = range_decode_culshift(16);
+    /* Figure out the symbol inefficiently; a binary search would be much
+       better.  The scan stops at the last symbol so that an out-of-range cf
+       (>= 65536, e.g. from a corrupt stream) cannot index past the end of
+       counts_3980. */
+    while (symbol < MODEL_ELEMENTS - 1 && counts_3980[symbol+1] <= cf)
+        symbol++;
+    range_decode_update(counts_diff_3980[symbol],counts_3980[symbol]);
+    return symbol;
+}
+/* Decode the next symbol (0 .. MODEL_ELEMENTS-1) using the counts_3970
+   model and update the range decoder accordingly. */
+static inline int range_get_symbol_3970(void)
+{
+    int symbol = 0;
+    int cf = range_decode_culshift(16);
+    /* Figure out the symbol inefficiently; a binary search would be much
+       better.  The scan stops at the last symbol so that an out-of-range cf
+       (>= 65536, e.g. from a corrupt stream) cannot index past the end of
+       counts_3970. */
+    while (symbol < MODEL_ELEMENTS - 1 && counts_3970[symbol+1] <= cf)
+        symbol++;
+    range_decode_update(counts_diff_3970[symbol],counts_3970[symbol]);
+    return symbol;
+}
+/* MAIN DECODING FUNCTIONS */
+/* Adaptive state for one channel's entropy decoder; maintained by
+   update_rice(). */
+struct rice_t
+{
+  uint32_t k;     /* current size parameter; adjusted by at most 1 per sample */
+  uint32_t ksum;  /* running, decaying sum of recent decoded magnitudes */
+};
+/* entropy_decode() uses riceY for decoded0 and riceX for decoded1. */
+static struct rice_t riceX IBSS_ATTR_DEMAC;
+static struct rice_t riceY IBSS_ATTR_DEMAC;
+/* Fold the freshly decoded (still unsigned-coded) value x into the adaptive
+   state: ksum gains (x + 1) / 2 and loses about 1/32 of its previous value,
+   then k moves by at most one step towards the band
+   16 * 2^k <= ksum < 32 * 2^k. */
+static inline void update_rice(struct rice_t* rice, int x)
+{
+    /* The subtraction is carried out in uint32_t arithmetic because ksum is
+       unsigned. */
+    rice->ksum += ((x + 1) / 2) - ((rice->ksum + 16) >> 5);
+    if (UNLIKELY(rice->k == 0)) {
+        /* k == 0 is always bumped to 1, whatever ksum is. */
+        rice->k = 1;
+    } else {
+        uint32_t lim = 1 << (rice->k + 4);
+        if (UNLIKELY(rice->ksum < lim)) {
+            rice->k--;
+        } else if (UNLIKELY(rice->ksum >= 2 * lim)) {
+            rice->k++;
+        }
+    }
+}
+/* Decode one sample with the scheme that entropy_decode() selects for file
+   versions above 3970.
+   The coded value is x = base + overflow * pivot, where pivot derives from
+   the adaptive state, overflow is a modelled symbol (with a raw 32-bit
+   escape) and base is decoded with total frequency pivot.  x is then mapped
+   to a signed sample: odd x -> (x >> 1) + 1, even x -> -(x >> 1). */
+static inline int entropy_decode3980(struct rice_t* rice)
+{
+    int base, x, pivot, overflow;
+    /* pivot is ksum / 32, but never less than 1 */
+    pivot = rice->ksum >> 5;
+    if (UNLIKELY(pivot == 0))
+        pivot=1;
+    overflow = range_get_symbol_3980();
+    if (UNLIKELY(overflow == (MODEL_ELEMENTS-1))) {
+        /* Escape symbol: the real overflow follows as two raw 16-bit
+           values, high half first. */
+        overflow = range_decode_short() << 16;
+        overflow |= range_decode_short();
+    }
+    if (pivot >= 0x10000) {
+        /* Codepath for 24-bit streams */
+        int nbits, lo_bits, base_hi, base_lo;
+        /* Count the number of bits in pivot */
+        nbits = 17; /* We know there must be at least 17 bits */
+        while ((pivot >> nbits) > 0) { nbits++; }
+        /* base_lo is the low (nbits-16) bits of base
+           base_hi is the high 16 bits of base
+        */
+        lo_bits = (nbits - 16);
+        base_hi = range_decode_culfreq((pivot >> lo_bits) + 1);
+        range_decode_update(1, base_hi);
+        base_lo = range_decode_culshift(lo_bits);
+        range_decode_update(1, base_lo);
+        base = (base_hi << lo_bits) + base_lo;
+    } else {
+        /* Codepath for 16-bit streams */
+        base = range_decode_culfreq(pivot);
+        range_decode_update(1, base);
+    }
+    x = base + (overflow * pivot);
+    update_rice(rice, x);
+    /* Convert to signed */
+    if (x & 1)
+        return (x >> 1) + 1;
+    else
+        return -(x >> 1);
+}
+/* Decode one sample with the scheme that entropy_decode() selects for file
+   versions up to and including 3970.
+   The coded value is x = (overflow << tmpk) + (tmpk raw low bits), where
+   tmpk normally comes from the adaptive k.  x is then mapped to a signed
+   sample: odd x -> (x >> 1) + 1, even x -> -(x >> 1). */
+static inline int entropy_decode3970(struct rice_t* rice)
+{
+    int x, tmpk;
+    int overflow = range_get_symbol_3970();
+    if (UNLIKELY(overflow == (MODEL_ELEMENTS - 1))) {
+        /* Escape symbol: the bit count is sent explicitly in 5 bits and
+           there is no overflow part. */
+        tmpk = range_decode_bits(5);
+        overflow = 0;
+    } else {
+        tmpk = (rice->k < 1) ? 0 : rice->k - 1;
+    }
+    /* range_decode_bits handles at most 16 bits, so wider values are read
+       as a 16-bit low part followed by the remaining high bits. */
+    if (tmpk <= 16) {
+        x = range_decode_bits(tmpk);
+    } else {
+        x = range_decode_short();
+        x |= (range_decode_bits(tmpk - 16) << 16);
+    }
+    x += (overflow << tmpk);
+    update_rice(rice, x);
+    /* Convert to signed */
+    if (x & 1)
+        return (x >> 1) + 1;
+    else
+        return -(x >> 1);
+}
+/* Begin entropy decoding of a new frame.
+ *
+ * ape_ctx:       receives CRC, frameflags and a zeroed blocksdecoded;
+ *                fileversion is read
+ * inbuffer:      start of the frame's input data
+ * firstbyte:     in: initial byte offset for the read cursor;
+ *                out: the cursor's offset after the frame header
+ * bytesconsumed: out: how far (in bytes) the cursor's base pointer has
+ *                advanced from inbuffer
+ */
+void init_entropy_decoder(struct ape_ctx_t* ape_ctx,
+                          unsigned char* inbuffer, int* firstbyte,
+                          int* bytesconsumed)
+{
+    bytebuffer = inbuffer;
+    bytebufferoffset = *firstbyte;
+    /* Read the CRC: four bytes, the first one read ends up most significant */
+    ape_ctx->CRC = read_byte();
+    ape_ctx->CRC = (ape_ctx->CRC << 8) | read_byte();
+    ape_ctx->CRC = (ape_ctx->CRC << 8) | read_byte();
+    ape_ctx->CRC = (ape_ctx->CRC << 8) | read_byte();
+    /* Read the frame flags if they exist: the top bit of the CRC word marks
+       their presence (file versions above 3820 only) and is then cleared */
+    ape_ctx->frameflags = 0;
+    if ((ape_ctx->fileversion > 3820) && (ape_ctx->CRC & 0x80000000)) {
+        ape_ctx->CRC &= ~0x80000000;
+        ape_ctx->frameflags = read_byte();
+        ape_ctx->frameflags = (ape_ctx->frameflags << 8) | read_byte();
+        ape_ctx->frameflags = (ape_ctx->frameflags << 8) | read_byte();
+        ape_ctx->frameflags = (ape_ctx->frameflags << 8) | read_byte();
+    }
+    /* Keep a count of the blocks decoded in this frame */
+    ape_ctx->blocksdecoded = 0;
+    /* Initialise the rice structs: k = 10, ksum = 16 * 2^10 = 16384 */
+    riceX.k = 10;
+    riceX.ksum = (1 << riceX.k) * 16;
+    riceY.k = 10;
+    riceY.ksum = (1 << riceY.k) * 16;
+    /* The first 8 bits of input are ignored. */
+    skip_byte();
+    range_start_decoding();
+    /* Return the new state of the buffer */
+    *bytesconsumed = (intptr_t)bytebuffer - (intptr_t)inbuffer;
+    *firstbyte = bytebufferoffset;
+}
+/* Entropy-decode `blockstodecode` samples per channel.
+ *
+ * ape_ctx:        frameflags, fileversion and currentframeblocks are read;
+ *                 blocksdecoded is advanced by blockstodecode
+ * inbuffer:       current input position
+ * firstbyte:      in/out: byte offset of the read cursor
+ * bytesconsumed:  out: how far (in bytes) the cursor's base pointer has
+ *                 advanced from inbuffer
+ * decoded0:       output for the first channel
+ * decoded1:       output for the second channel, or NULL to decode a single
+ *                 channel
+ * blockstodecode: number of samples to produce per channel
+ */
+void ICODE_ATTR_DEMAC entropy_decode(struct ape_ctx_t* ape_ctx,
+                                     unsigned char* inbuffer, int* firstbyte,
+                                     int* bytesconsumed,
+                                     int32_t* decoded0, int32_t* decoded1,
+                                     int blockstodecode)
+{
+    bytebuffer = inbuffer;
+    bytebufferoffset = *firstbyte;
+    ape_ctx->blocksdecoded += blockstodecode;
+    /* Silent frame: the left-silence flag is set and either the
+       right-silence flag is set too or only one channel was requested. */
+    if ((ape_ctx->frameflags & APE_FRAMECODE_LEFT_SILENCE)
+        && ((ape_ctx->frameflags & APE_FRAMECODE_RIGHT_SILENCE)
+            || (decoded1 == NULL))) {
+        /* We are pure silence, just memset the output buffer. */
+        memset(decoded0, 0, blockstodecode * sizeof(int32_t));
+        if (decoded1 != NULL)
+            memset(decoded1, 0, blockstodecode * sizeof(int32_t));
+    } else {
+        /* Samples are interleaved in the stream: decoded0 (riceY state)
+           first, then decoded1 (riceX state) when present. */
+        if (ape_ctx->fileversion > 3970) {
+            while (LIKELY(blockstodecode--)) {
+                *(decoded0++) = entropy_decode3980(&riceY);
+                if (decoded1 != NULL)
+                    *(decoded1++) = entropy_decode3980(&riceX);
+            }
+        } else {
+            while (LIKELY(blockstodecode--)) {
+                *(decoded0++) = entropy_decode3970(&riceY);
+                if (decoded1 != NULL)
+                    *(decoded1++) = entropy_decode3970(&riceX);
+            }
+        }
+    }
+    /* Once the whole frame has been decoded, let the range decoder consume
+       its remaining input bytes. */
+    if (ape_ctx->blocksdecoded == ape_ctx->currentframeblocks)
+    {
+        range_done_decoding();
+    }
+    /* Return the new state of the buffer */
+    *bytesconsumed = bytebuffer - inbuffer;
+    *firstbyte = bytebufferoffset;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/entropy.h b/lib/rbcodec/codecs/demac/libdemac/entropy.h
new file mode 100644
index 0000000000..fac2a44d99
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/entropy.h
@@ -0,0 +1,40 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#ifndef _APE_ENTROPY_H
+#define _APE_ENTROPY_H
+#include <inttypes.h>
+struct ape_ctx_t; /* forward declaration: keeps the prototypes below independent of include order */
+void init_entropy_decoder(struct ape_ctx_t* ape_ctx, unsigned char* inbuffer,
+                          int* firstbyte, int* bytesconsumed);
+/* Decodes blockstodecode values into decoded0 (and decoded1 unless it is NULL); updates *firstbyte and *bytesconsumed. */
+void entropy_decode(struct ape_ctx_t* ape_ctx,
+                    unsigned char* inbuffer, int* firstbyte, int* bytesconsumed,
+                    int32_t* decoded0, int32_t* decoded1,
+                    int blockstodecode);
+#endif
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter.c b/lib/rbcodec/codecs/demac/libdemac/filter.c
new file mode 100644
index 0000000000..903885cf00
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter.c
@@ -0,0 +1,296 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include <string.h>
+#include <inttypes.h>
+#include "demac.h"
+#include "filter.h"
+#include "demac_config.h"
+     
+#if FILTER_BITS == 32
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#include "vector_math32_armv4.h"
+#else
+#include "vector_math_generic.h"
+#endif
+#else /* FILTER_BITS == 16 */
+#ifdef CPU_COLDFIRE
+#include "vector_math16_cf.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
+#include "vector_math16_armv7.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 6)
+#include "vector_math16_armv6.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
+/* Assume all our ARMv5 targets are ARMv5te(j) */
+#include "vector_math16_armv5te.h"
+#elif (defined(__i386__) || defined(__i486__))  && defined(__MMX__) \
+    || defined(__x86_64__)
+#include "vector_math16_mmx.h"
+#else
+#include "vector_math_generic.h"
+#endif
+#endif /* FILTER_BITS */
+struct filter_t {
+    filter_int* coeffs; /* ORDER entries */
+    /* We store all the filter delays in a single buffer */
+    filter_int* history_end; /* once delay reaches this address the history is moved back to the start */
+    filter_int* delay;       /* next slot for a saturated output value */
+    filter_int* adaptcoeffs; /* next slot for an adaption value; starts ORDER entries below delay */
+    int avg;                 /* running average of |output|; only the 3980 filter updates it */
+};
+/* Each filter_<ORDER>_<FRACBITS>.c wrapper defines ORDER and FRACBITS and
+   then #includes this file, so every filter size gets its own pair of
+   functions named after those two pre-processor symbols.
+   This increases code-size but gives the compiler more scope for
+   optimising the individual functions, as well as replacing a lot of
+   variables with constants. */
+#if FRACBITS == 11 /* two orders share FRACBITS 11, so ORDER is checked as well */
+  #if ORDER == 16
+     #define INIT_FILTER   init_filter_16_11
+     #define APPLY_FILTER apply_filter_16_11
+  #elif ORDER == 64
+     #define INIT_FILTER  init_filter_64_11
+     #define APPLY_FILTER apply_filter_64_11
+  #endif
+#elif FRACBITS == 13 /* from here on the name is chosen by FRACBITS alone; ORDER is not checked */
+  #define INIT_FILTER  init_filter_256_13
+  #define APPLY_FILTER apply_filter_256_13
+#elif FRACBITS == 10
+  #define INIT_FILTER  init_filter_32_10
+  #define APPLY_FILTER apply_filter_32_10
+#elif FRACBITS == 15
+  #define INIT_FILTER  init_filter_1280_15
+  #define APPLY_FILTER apply_filter_1280_15
+#endif
+/* Some macros to handle the fixed-point stuff */
+/* Convert from (32-FRACBITS).FRACBITS fixed-point format to an
+   integer (rounding to nearest).  The argument is parenthesised so that
+   operators of lower precedence than '+' are safe; it is evaluated once. */
+#define FP_HALF (1 << (FRACBITS - 1)) /* 0.5 in fixed-point */
+#define FP_TO_INT(x) (((x) + FP_HALF) >> FRACBITS)  /* round(x) */
+#ifdef CPU_ARM /* SATURATE(x) clamps x to the signed 16-bit range */
+#if ARM_ARCH >= 6 /* a single ssat instruction performs the clamp */
+#define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })
+#else /* ARM_ARCH < 6 */
+/* Keeping the asr #31 outside of the asm allows loads to be scheduled between
+   it and the rest of the block on ARM9E, with the load's result latency filled
+   by the other calculations. */
+#define SATURATE(x) ({ \
+    int __res = (x) >> 31; \
+    asm volatile ( \
+        "teq %0, %1, asr #15\n\t" \
+        "moveq %0, %1\n\t" \
+        "eorne %0, %0, #0xff\n\t" \
+        "eorne %0, %0, #0x7f00" \
+        : "+r" (__res) : "r" (x) : "cc" \
+    ); \
+    __res; \
+})
+#endif /* ARM_ARCH */
+#else /* !CPU_ARM: portable C version; note that x appears more than once in the expansion */
+#define SATURATE(x) (LIKELY((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF)
+#endif /* CPU_ARM */
+/* Apply the 3980+ filter with state f to count entries in data[], in place */
+static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
+                                                  int32_t* data, int count)
+{
+    int res;
+    int absres; /* |res|, compared with f->avg to choose the adaption magnitude */
+#ifdef PREPARE_SCALARPRODUCT
+    PREPARE_SCALARPRODUCT
+#endif
+    while(LIKELY(count--))
+    {
+#ifdef FUSED_VECTOR_MATH
+        if (LIKELY(*data != 0)) { /* presumably scalar product and coefficient update in one pass */
+            if (*data < 0)
+                res = vector_sp_add(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+            else
+                res = vector_sp_sub(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+        } else {
+            res = scalarproduct(f->coeffs, f->delay - ORDER);
+        }
+        res = FP_TO_INT(res);
+#else
+        res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
+        if (LIKELY(*data != 0)) { /* the sign of the input selects add vs. sub */
+            if (*data < 0)
+                vector_add(f->coeffs, f->adaptcoeffs - ORDER);
+            else
+                vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
+        }
+#endif
+        res += *data;   /* rounded filter output plus the input value */
+        *data++ = res;  /* the result replaces the input */
+        /* Update the output history */
+        *f->delay++ = SATURATE(res);
+        /* Version 3.98 and later files */
+        /* New adaption value: magnitude 32, 16 or 8 (or 0), positive when res < 0 */
+        absres = (res < 0 ? -res : res);
+        if (UNLIKELY(absres > 3 * f->avg))
+            *f->adaptcoeffs = ((res >> 25) & 64) - 32;
+        else if (3 * absres > 4 * f->avg)
+            *f->adaptcoeffs = ((res >> 26) & 32) - 16;
+        else if (LIKELY(absres > 0))
+            *f->adaptcoeffs = ((res >> 27) & 16) - 8;
+        else
+            *f->adaptcoeffs = 0;
+        f->avg += (absres - f->avg) / 16; /* move the average 1/16 of the way towards |res| */
+        f->adaptcoeffs[-1] >>= 1; /* halve three of the older adaption values */
+        f->adaptcoeffs[-2] >>= 1;
+        f->adaptcoeffs[-8] >>= 1;
+        f->adaptcoeffs++;
+        /* Have we filled the history buffer? */
+        if (UNLIKELY(f->delay == f->history_end)) {
+            memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
+                    (ORDER*2) * sizeof(filter_int)); /* keep the newest ORDER*2 entries */
+            f->adaptcoeffs = f->coeffs + ORDER*2;
+            f->delay = f->coeffs + ORDER*3;
+        }
+    }
+}
+static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
+                                                  int32_t* data, int count)
+{
+    int res;
+    /* Variant for file versions below 3980: fixed adaption magnitude, f->avg is not used. */
+#ifdef PREPARE_SCALARPRODUCT
+    PREPARE_SCALARPRODUCT
+#endif
+    while(LIKELY(count--))
+    {
+#ifdef FUSED_VECTOR_MATH
+        if (LIKELY(*data != 0)) { /* presumably scalar product and coefficient update in one pass */
+            if (*data < 0)
+                res = vector_sp_add(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+            else
+                res = vector_sp_sub(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+        } else {
+            res = scalarproduct(f->coeffs, f->delay - ORDER);
+        }
+        res = FP_TO_INT(res);
+#else
+        res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
+        if (LIKELY(*data != 0)) { /* the sign of the input selects add vs. sub */
+            if (*data < 0)
+                vector_add(f->coeffs, f->adaptcoeffs - ORDER);
+            else
+                vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
+        }
+#endif
+        /* res was already converted from fixed-point to a rounded integer
+           by FP_TO_INT above; add the input value to it and store the sum
+           back over the input */
+        res += *data;
+        *data++ = res;
+        /* Update the output history */
+        *f->delay++ = SATURATE(res);
+        /* Version ??? to < 3.98 files (untested) */
+        f->adaptcoeffs[0] = (res == 0) ? 0 : ((res >> 28) & 8) - 4; /* 0, +4 when res < 0, else -4 */
+        f->adaptcoeffs[-4] >>= 1; /* halve two of the older adaption values */
+        f->adaptcoeffs[-8] >>= 1;
+        f->adaptcoeffs++;
+        /* Have we filled the history buffer? */
+        if (UNLIKELY(f->delay == f->history_end)) {
+            memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
+                    (ORDER*2) * sizeof(filter_int)); /* keep the newest ORDER*2 entries */
+            f->adaptcoeffs = f->coeffs + ORDER*2;
+            f->delay = f->coeffs + ORDER*3;
+        }
+    }
+}
+static struct filter_t filter[2] IBSS_ATTR_DEMAC; /* indexed by channel */
+static void do_init_filter(struct filter_t* f, filter_int* buf)
+{
+    /* buf starts with ORDER coefficients; the history area follows them.
+       Clear the coefficients and the first ORDER*2 history entries. */
+    memset(buf, 0, ORDER*3 * sizeof(filter_int));
+    f->coeffs = buf;
+    /* adaptcoeffs starts ORDER entries below delay inside the history. */
+    f->adaptcoeffs = buf + ORDER*2;
+    f->delay = buf + ORDER*3;
+    f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE;
+    f->avg = 0;
+}
+void INIT_FILTER(filter_int* buf)
+{   /* Each channel owns ORDER*3 + FILTER_HISTORY_SIZE entries of buf. */
+    do_init_filter(filter, buf);
+    do_init_filter(filter + 1, buf + (ORDER*3 + FILTER_HISTORY_SIZE));
+}
+void ICODE_ATTR_DEMAC APPLY_FILTER(int fileversion, int channel,
+                                   int32_t* data, int count)
+{   /* Pick the filter variant that matches the file version. */
+    if (fileversion < 3980)
+        do_apply_filter_3970(&filter[channel], data, count);
+    else
+        do_apply_filter_3980(&filter[channel], data, count);
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter.h b/lib/rbcodec/codecs/demac/libdemac/filter.h
new file mode 100644
index 0000000000..609ea12496
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter.h
@@ -0,0 +1,50 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#ifndef _APE_FILTER_H
+#define _APE_FILTER_H
+#include "demac_config.h" /* NOTE(review): presumably where filter_int comes from - confirm */
+void init_filter_16_11(filter_int* buf); /* init_*: set up the filter state of both channels inside buf */
+void apply_filter_16_11(int fileversion, int channel,
+                        int32_t* decoded, int count); /* apply_*: filter count entries of one channel in place */
+void init_filter_64_11(filter_int* buf);
+void apply_filter_64_11(int fileversion, int channel,
+                        int32_t* decoded, int count);
+void init_filter_32_10(filter_int* buf);
+void apply_filter_32_10(int fileversion, int channel,
+                        int32_t* decoded, int count);
+void init_filter_256_13(filter_int* buf);
+void apply_filter_256_13(int fileversion, int channel,
+                         int32_t* decoded, int count);
+void init_filter_1280_15(filter_int* buf);
+void apply_filter_1280_15(int fileversion, int channel,
+                          int32_t* decoded, int count);
+#endif /* _APE_FILTER_H */
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter_1280_15.c b/lib/rbcodec/codecs/demac/libdemac/filter_1280_15.c
new file mode 100644
index 0000000000..f2301fb02a
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter_1280_15.c
@@ -0,0 +1,32 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include "demac_config.h"
+#ifndef FILTER256_IRAM
+#undef ICODE_ATTR_DEMAC
+#define ICODE_ATTR_DEMAC
+#endif
+#define ORDER 1280
+#define FRACBITS 15
+#include "filter.c"
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter_16_11.c b/lib/rbcodec/codecs/demac/libdemac/filter_16_11.c
new file mode 100644
index 0000000000..94c56e247f
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter_16_11.c
@@ -0,0 +1,27 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define ORDER 16
+#define FRACBITS 11
+#include "filter.c"
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter_256_13.c b/lib/rbcodec/codecs/demac/libdemac/filter_256_13.c
new file mode 100644
index 0000000000..9e4b9fcb13
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter_256_13.c
@@ -0,0 +1,32 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include "demac_config.h"
+#ifndef FILTER256_IRAM
+#undef ICODE_ATTR_DEMAC
+#define ICODE_ATTR_DEMAC
+#endif
+#define ORDER 256
+#define FRACBITS 13
+#include "filter.c"
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter_32_10.c b/lib/rbcodec/codecs/demac/libdemac/filter_32_10.c
new file mode 100644
index 0000000000..5ec85089db
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter_32_10.c
@@ -0,0 +1,27 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define ORDER 32
+#define FRACBITS 10
+#include "filter.c"
diff --git a/lib/rbcodec/codecs/demac/libdemac/filter_64_11.c b/lib/rbcodec/codecs/demac/libdemac/filter_64_11.c
new file mode 100644
index 0000000000..cd74fa5f6b
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/filter_64_11.c
@@ -0,0 +1,27 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define ORDER 64
+#define FRACBITS 11
+#include "filter.c"
diff --git a/lib/rbcodec/codecs/demac/libdemac/parser.c b/lib/rbcodec/codecs/demac/libdemac/parser.c
new file mode 100644
index 0000000000..2af4a292b8
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/parser.c
@@ -0,0 +1,402 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include <inttypes.h>
+#include <string.h>
+#ifndef ROCKBOX
+#include <stdio.h>
+#include <stdlib.h>
+#include "inttypes.h"
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+#include "parser.h"
+#ifdef APE_MAX
+#undef APE_MAX
+#endif
+#define APE_MAX(a,b) ((a)>(b)?(a):(b)) /* larger of a and b; the chosen argument is evaluated twice */
+static inline int16_t get_int16(unsigned char* buf)
+{   /* Two little-endian bytes, returned as a signed 16-bit value. */
+    return (buf[1] << 8) | buf[0];
+}
+static inline uint16_t get_uint16(unsigned char* buf)
+{   /* Two little-endian bytes, returned as an unsigned 16-bit value. */
+    return (buf[1] << 8) | buf[0];
+}
+static inline uint32_t get_uint32(unsigned char* buf)
+{   /* Little-endian 32-bit load; the cast keeps the top byte's shift out of signed int. */
+    return(buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((uint32_t)buf[3] << 24));
+}
+int ape_parseheaderbuf(unsigned char* buf, struct ape_ctx_t* ape_ctx)
+{ /* Fills *ape_ctx from an APE header held in buf; returns 0, or -1 if the magic is not "MAC ". */
+    unsigned char* header; /* start of the header block that follows the descriptor */
+    memset(ape_ctx,0,sizeof(struct ape_ctx_t)); /* every field not set below stays 0 */
+    /* TODO: Skip any leading junk such as id3v2 tags */
+    ape_ctx->junklength = 0;
+    memcpy(ape_ctx->magic, buf, 4);
+    if (memcmp(ape_ctx->magic,"MAC ",4)!=0)
+    {
+        return -1;
+    }
+    ape_ctx->fileversion = get_int16(buf + 4);
+    if (ape_ctx->fileversion >= 3980) /* layout: descriptor block, then a separate header block */
+    {
+        ape_ctx->padding1 = get_int16(buf + 6);
+        ape_ctx->descriptorlength = get_uint32(buf + 8);
+        ape_ctx->headerlength = get_uint32(buf + 12);
+        ape_ctx->seektablelength = get_uint32(buf + 16);
+        ape_ctx->wavheaderlength = get_uint32(buf + 20);
+        ape_ctx->audiodatalength = get_uint32(buf + 24);
+        ape_ctx->audiodatalength_high = get_uint32(buf + 28);
+        ape_ctx->wavtaillength = get_uint32(buf + 32);
+        memcpy(ape_ctx->md5, buf + 36, 16);
+        header = buf + ape_ctx->descriptorlength; /* NOTE(review): taken from the file, not range-checked here */
+        /* Read header data */
+        ape_ctx->compressiontype = get_uint16(header + 0);
+        ape_ctx->formatflags = get_uint16(header + 2);
+        ape_ctx->blocksperframe = get_uint32(header + 4);
+        ape_ctx->finalframeblocks = get_uint32(header + 8);
+        ape_ctx->totalframes = get_uint32(header + 12);
+        ape_ctx->bps = get_uint16(header + 16);
+        ape_ctx->channels = get_uint16(header + 18);
+        ape_ctx->samplerate = get_uint32(header + 20);
+        ape_ctx->seektablefilepos = ape_ctx->junklength + 
+                                    ape_ctx->descriptorlength +
+                                    ape_ctx->headerlength;
+        ape_ctx->firstframe = ape_ctx->junklength + ape_ctx->descriptorlength +
+                              ape_ctx->headerlength + ape_ctx->seektablelength +
+                              ape_ctx->wavheaderlength;
+    } else { /* older layout: one header, 32 bytes plus optional 4-byte fields */
+        ape_ctx->headerlength = 32;
+        ape_ctx->compressiontype = get_uint16(buf + 6);
+        ape_ctx->formatflags = get_uint16(buf + 8);
+        ape_ctx->channels = get_uint16(buf + 10);
+        ape_ctx->samplerate = get_uint32(buf + 12);
+        ape_ctx->wavheaderlength = get_uint32(buf + 16);
+        ape_ctx->totalframes = get_uint32(buf + 24);
+        ape_ctx->finalframeblocks = get_uint32(buf + 28);
+        if (ape_ctx->formatflags & MAC_FORMAT_FLAG_HAS_PEAK_LEVEL)
+        {
+            ape_ctx->headerlength += 4; /* the peak level itself is not stored */
+        }
+        if (ape_ctx->formatflags & MAC_FORMAT_FLAG_HAS_SEEK_ELEMENTS)
+        {
+            ape_ctx->seektablelength = get_uint32(buf + ape_ctx->headerlength);
+            ape_ctx->seektablelength *= sizeof(int32_t); /* element count -> bytes */
+            ape_ctx->headerlength += 4;
+        } else {
+            ape_ctx->seektablelength = ape_ctx->totalframes * sizeof(int32_t); /* one entry per frame */
+        }
+        if (ape_ctx->formatflags & MAC_FORMAT_FLAG_8_BIT)
+            ape_ctx->bps = 8;
+        else if (ape_ctx->formatflags & MAC_FORMAT_FLAG_24_BIT)
+            ape_ctx->bps = 24;
+        else
+            ape_ctx->bps = 16;
+        if (ape_ctx->fileversion >= 3950) /* blocksperframe is derived from the version, not stored */
+            ape_ctx->blocksperframe = 73728 * 4;
+        else if ((ape_ctx->fileversion >= 3900) || (ape_ctx->fileversion >= 3800 && ape_ctx->compressiontype >= 4000))
+            ape_ctx->blocksperframe = 73728;
+        else
+            ape_ctx->blocksperframe = 9216;
+        ape_ctx->seektablefilepos = ape_ctx->junklength + ape_ctx->headerlength +
+                                    ape_ctx->wavheaderlength;
+        ape_ctx->firstframe = ape_ctx->junklength + ape_ctx->headerlength +
+                              ape_ctx->wavheaderlength + ape_ctx->seektablelength;
+    }
+    ape_ctx->totalsamples = ape_ctx->finalframeblocks; /* last frame, plus full frames below */
+    if (ape_ctx->totalframes > 1)
+        ape_ctx->totalsamples += ape_ctx->blocksperframe * (ape_ctx->totalframes-1);
+    ape_ctx->numseekpoints = APE_MAX(ape_ctx->maxseekpoints, /* maxseekpoints is 0 here (memset above) */
+                                     ape_ctx->seektablelength / sizeof(int32_t));
+    return 0;
+}
+#ifndef ROCKBOX
+/* Helper functions */
+static int read_uint16(int fd, uint16_t* x)
+{
+    unsigned char le[2];
+    /* A short read and an I/O error both count as failure. */
+    if (read(fd, le, sizeof(le)) != 2)
+        return -1;
+    /* Assemble the value from little-endian byte order. */
+    *x = (le[1] << 8) | le[0];
+    return 0;
+}
+static int read_int16(int fd, int16_t* x)
+{   /* Same 16 bits as read_uint16, stored through a signed pointer; returns its 0 / -1 result. */
+    return read_uint16(fd, (uint16_t*)x);
+}
+/* Read a little-endian uint32 from fd into *x; returns 0, or -1 on a short read or error. */
+static int read_uint32(int fd, uint32_t* x)
+{
+    unsigned char tmp[4];
+    int n = read(fd,tmp,4);
+    if (n != 4)
+        return -1;
+    /* The cast keeps the top byte's shift out of signed int. */
+    *x = tmp[0] | (tmp[1] << 8) | (tmp[2] << 16) | ((uint32_t)tmp[3] << 24);
+    return 0;
+}
+/* Parse the APE header read from fd into ape_ctx; returns 0 on success, -1 on error. */
+int ape_parseheader(int fd, struct ape_ctx_t* ape_ctx)
+{
+    int i,n;
+    /* TODO: Skip any leading junk such as id3v2 tags */
+    ape_ctx->junklength = 0;
+    ape_ctx->seektable = NULL; /* stays NULL unless a table is loaded below */
+    lseek(fd,ape_ctx->junklength,SEEK_SET);
+    n = read(fd,&ape_ctx->magic,4);
+    if (n != 4) return -1;
+    if (memcmp(ape_ctx->magic,"MAC ",4)!=0)
+        return -1;
+    if (read_int16(fd,&ape_ctx->fileversion) < 0)
+        return -1;
+    if (ape_ctx->fileversion >= 3980)
+    {
+        if (read_int16(fd,&ape_ctx->padding1) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->descriptorlength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->headerlength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->seektablelength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->wavheaderlength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->audiodatalength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->audiodatalength_high) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->wavtaillength) < 0)
+            return -1;
+        if (read(fd,&ape_ctx->md5,16) != 16)
+            return -1;
+        /* Skip any unknown bytes at the end of the descriptor.  This is for future
+           compatibility */
+        if (ape_ctx->descriptorlength > 52)
+            lseek(fd,ape_ctx->descriptorlength - 52, SEEK_CUR);
+        /* Read header data */
+        if (read_uint16(fd,&ape_ctx->compressiontype) < 0)
+            return -1;
+        if (read_uint16(fd,&ape_ctx->formatflags) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->blocksperframe) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->finalframeblocks) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->totalframes) < 0)
+            return -1;
+        if (read_uint16(fd,&ape_ctx->bps) < 0)
+            return -1;
+        if (read_uint16(fd,&ape_ctx->channels) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->samplerate) < 0)
+            return -1;
+    } else {
+        ape_ctx->descriptorlength = 0;
+        ape_ctx->headerlength = 32;
+        if (read_uint16(fd,&ape_ctx->compressiontype) < 0)
+            return -1;
+        if (read_uint16(fd,&ape_ctx->formatflags) < 0)
+            return -1;
+        if (read_uint16(fd,&ape_ctx->channels) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->samplerate) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->wavheaderlength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->wavtaillength) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->totalframes) < 0)
+            return -1;
+        if (read_uint32(fd,&ape_ctx->finalframeblocks) < 0)
+            return -1;
+        if (ape_ctx->formatflags & MAC_FORMAT_FLAG_HAS_PEAK_LEVEL)
+        {
+            lseek(fd, 4, SEEK_CUR);   /* Skip the peak level */
+            ape_ctx->headerlength += 4;
+        }
+        if (ape_ctx->formatflags & MAC_FORMAT_FLAG_HAS_SEEK_ELEMENTS)
+        {
+            if (read_uint32(fd,&ape_ctx->seektablelength) < 0)
+                return -1;
+            ape_ctx->headerlength += 4;
+            ape_ctx->seektablelength *= sizeof(int32_t);
+        } else {
+            ape_ctx->seektablelength = ape_ctx->totalframes * sizeof(int32_t);
+        }
+        if (ape_ctx->formatflags & MAC_FORMAT_FLAG_8_BIT)
+            ape_ctx->bps = 8;
+        else if (ape_ctx->formatflags & MAC_FORMAT_FLAG_24_BIT)
+            ape_ctx->bps = 24;
+        else
+            ape_ctx->bps = 16;
+        if (ape_ctx->fileversion >= 3950)
+            ape_ctx->blocksperframe = 73728 * 4;
+        else if ((ape_ctx->fileversion >= 3900) || (ape_ctx->fileversion >= 3800 && ape_ctx->compressiontype >= 4000))
+            ape_ctx->blocksperframe = 73728;
+        else
+            ape_ctx->blocksperframe = 9216;
+        /* Skip any stored wav header */
+        if (!(ape_ctx->formatflags & MAC_FORMAT_FLAG_CREATE_WAV_HEADER))
+            lseek(fd, ape_ctx->wavheaderlength, SEEK_CUR);
+    }
+    ape_ctx->totalsamples = ape_ctx->finalframeblocks;
+    if (ape_ctx->totalframes > 1)
+        ape_ctx->totalsamples += ape_ctx->blocksperframe * (ape_ctx->totalframes-1);
+    if (ape_ctx->seektablelength > 0)
+    {
+        ape_ctx->seektable = malloc(ape_ctx->seektablelength); /* left in ape_ctx on success; not freed here */
+        if (ape_ctx->seektable == NULL)
+            return -1;
+        for (i=0; i < ape_ctx->seektablelength / sizeof(uint32_t); i++)
+        {
+            if (read_uint32(fd,&ape_ctx->seektable[i]) < 0)
+            {
+                 /* Do not leave a dangling pointer behind for the caller. */
+                 free(ape_ctx->seektable);
+                 ape_ctx->seektable = NULL;
+                 return -1;
+            }
+        }
+    }
+    ape_ctx->firstframe = ape_ctx->junklength + ape_ctx->descriptorlength +
+                           ape_ctx->headerlength + ape_ctx->seektablelength +
+                           ape_ctx->wavheaderlength;
+    return 0;
+}
+/* Print every parsed header field of ape_ctx, and its seek table if complete, to stdout. */
+void ape_dumpinfo(struct ape_ctx_t* ape_ctx)
+{
+    int i;
+    printf("Descriptor Block:\n\n");
+    printf("magic                = \"%c%c%c%c\"\n",
+            ape_ctx->magic[0],ape_ctx->magic[1],
+            ape_ctx->magic[2],ape_ctx->magic[3]);
+    printf("fileversion          = %d\n",ape_ctx->fileversion);
+    printf("descriptorlength     = %" PRIu32 "\n",ape_ctx->descriptorlength);
+    printf("headerlength         = %" PRIu32 "\n",ape_ctx->headerlength);
+    printf("seektablelength      = %" PRIu32 "\n",ape_ctx->seektablelength);
+    printf("wavheaderlength      = %" PRIu32 "\n",ape_ctx->wavheaderlength);
+    printf("audiodatalength      = %" PRIu32 "\n",ape_ctx->audiodatalength);
+    printf("audiodatalength_high = %" PRIu32 "\n",ape_ctx->audiodatalength_high);
+    printf("wavtaillength        = %" PRIu32 "\n",ape_ctx->wavtaillength);
+    printf("md5                  = ");
+    for (i = 0; i < 16; i++)
+        printf("%02x",ape_ctx->md5[i]);
+    printf("\n\nHeader Block:\n\n");
+    printf("compressiontype      = %d\n",ape_ctx->compressiontype);
+    printf("formatflags          = %d\n",ape_ctx->formatflags);
+    printf("blocksperframe       = %" PRIu32 "\n",ape_ctx->blocksperframe);
+    printf("finalframeblocks     = %" PRIu32 "\n",ape_ctx->finalframeblocks);
+    printf("totalframes          = %" PRIu32 "\n",ape_ctx->totalframes);
+    printf("bps                  = %d\n",ape_ctx->bps);
+    printf("channels             = %d\n",ape_ctx->channels);
+    printf("samplerate           = %" PRIu32 "\n",ape_ctx->samplerate);
+    printf("\nSeektable\n\n");
+    if ((ape_ctx->seektablelength / sizeof(uint32_t)) != ape_ctx->totalframes)
+    {
+        printf("No seektable\n");
+    }
+    else
+    {
+        for ( i = 0; i < ape_ctx->seektablelength / sizeof(uint32_t) ; i++)
+        {
+            if (i < ape_ctx->totalframes-1) { /* every entry but the last also shows the distance to the next */
+                printf("%8d   %" PRIu32 " (%" PRIu32 " bytes)\n",i,ape_ctx->seektable[i],(uint32_t)(ape_ctx->seektable[i+1]-ape_ctx->seektable[i]));
+            } else {
+                printf("%8d   %" PRIu32 "\n",i,ape_ctx->seektable[i]);
+            }
+        }
+    }
+    printf("\nCalculated information:\n\n");
+    printf("junklength           = %d\n",ape_ctx->junklength); /* NOTE(review): these three field types are not visible here; %d kept */
+    printf("firstframe           = %d\n",ape_ctx->firstframe);
+    printf("totalsamples         = %d\n",ape_ctx->totalsamples);
+}
+#endif /* !ROCKBOX */
diff --git a/lib/rbcodec/codecs/demac/libdemac/parser.h b/lib/rbcodec/codecs/demac/libdemac/parser.h
new file mode 100644
index 0000000000..6f07deac12
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/parser.h
@@ -0,0 +1,137 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#ifndef _APE_PARSER_H
+#define _APE_PARSER_H
+#include <inttypes.h>
+#include "demac_config.h"
+/* The earliest and latest file format versions supported by this library */
+#define APE_MIN_VERSION 3970
+#define APE_MAX_VERSION 3990
+#define MAC_FORMAT_FLAG_8_BIT                 1    // is 8-bit [OBSOLETE]
+#define MAC_FORMAT_FLAG_CRC                   2    // uses the new CRC32 error detection [OBSOLETE]
+#define MAC_FORMAT_FLAG_HAS_PEAK_LEVEL        4    // uint32 nPeakLevel after the header [OBSOLETE]
+#define MAC_FORMAT_FLAG_24_BIT                8    // is 24-bit [OBSOLETE]
+#define MAC_FORMAT_FLAG_HAS_SEEK_ELEMENTS    16    // has the number of seek elements after the peak level
+#define MAC_FORMAT_FLAG_CREATE_WAV_HEADER    32    // create the wave header on decompression (not stored)
+/* Special frame codes:
+   MONO_SILENCE - All PCM samples in frame are zero (mono streams only)
+   LEFT_SILENCE - All PCM samples for left channel in frame are zero (stereo streams)
+   RIGHT_SILENCE - All PCM samples for right channel in frame are zero (stereo streams)
+   PSEUDO_STEREO - Left and Right channels are identical
+*/
+#define APE_FRAMECODE_MONO_SILENCE    1
+#define APE_FRAMECODE_LEFT_SILENCE    1 /* same as mono */
+#define APE_FRAMECODE_RIGHT_SILENCE   2
+#define APE_FRAMECODE_STEREO_SILENCE  3 /* combined: LEFT_SILENCE | RIGHT_SILENCE */
+#define APE_FRAMECODE_PSEUDO_STEREO   4
+#define PREDICTOR_ORDER 8
+/* Total size of all predictor histories, in words - 50 * sizeof(int32_t) = 200 bytes */
+#define PREDICTOR_SIZE 50
+/* NOTE: The byte offsets of these members are hard-coded in predictor-arm.S
+   and predictor-cf.S (assuming 4-byte pointers) - mirror any change there. */
+struct predictor_t
+{
+    /* Filter histories */
+    int32_t* buf;       /* asm offset 0; current position inside historybuffer */
+    int32_t YlastA;     /* asm offset 4 */
+    int32_t XlastA;     /* asm offset 8 */
+    /* NOTE: The order of the next four fields is important for
+       predictor-arm.S, which loads them as two adjacent pairs (LDR2OFS) */
+    int32_t YfilterB;   /* asm offset 12; loaded as a pair with XfilterA */
+    int32_t XfilterA;   /* asm offset 16 */
+    int32_t XfilterB;   /* asm offset 20; loaded as a pair with YfilterA */
+    int32_t YfilterA;   /* asm offset 24 */
+    /* Adaption co-efficients */
+    int32_t YcoeffsA[4];    /* asm offset 28 */
+    int32_t XcoeffsA[4];    /* asm offset 44 */
+    int32_t YcoeffsB[5];    /* asm offset 60 */
+    int32_t XcoeffsB[5];    /* asm offset 80 */
+    int32_t historybuffer[PREDICTOR_HISTORY_SIZE + PREDICTOR_SIZE]; /* asm offset 100 */
+};
+struct ape_ctx_t    /* Parsed .ape header fields plus decoder state for one stream */
+{
+    /* Derived fields */
+    uint32_t      junklength;
+    uint32_t      firstframe;
+    uint32_t      totalsamples;
+    /* Info from Descriptor Block */
+    char          magic[4];
+    int16_t       fileversion;
+    int16_t       padding1;
+    uint32_t      descriptorlength;
+    uint32_t      headerlength;
+    uint32_t      seektablelength;      /* in bytes; entry count is seektablelength / sizeof(uint32_t) */
+    uint32_t      wavheaderlength;
+    uint32_t      audiodatalength;
+    uint32_t      audiodatalength_high; /* presumably the upper 32 bits of the audio data length - TODO confirm */
+    uint32_t      wavtaillength;
+    uint8_t       md5[16];              /* 16 raw bytes; ape_dumpinfo prints them as hex */
+    /* Info from Header Block */
+    uint16_t      compressiontype;
+    uint16_t      formatflags;
+    uint32_t      blocksperframe;
+    uint32_t      finalframeblocks;
+    uint32_t      totalframes;
+    uint16_t      bps;
+    uint16_t      channels;
+    uint32_t      samplerate;
+    /* Seektable */
+    uint32_t*     seektable;        /* Seektable buffer; ape_dumpinfo reports the difference of adjacent entries in bytes */
+    uint32_t      maxseekpoints;    /* Max seekpoints we can store (size of seektable buffer) */
+    uint32_t      numseekpoints;    /* Number of seekpoints */
+    int           seektablefilepos; /* Location in .ape file of seektable */
+    /* Decoder state */
+    uint32_t      CRC;
+    int           frameflags;
+    int           currentframeblocks;
+    int           blocksdecoded;
+    struct predictor_t predictor;   /* layout is shared with the assembly predictors */
+};
+int ape_parseheader(int fd, struct ape_ctx_t* ape_ctx);
+int ape_parseheaderbuf(unsigned char* buf, struct ape_ctx_t* ape_ctx);
+void ape_dumpinfo(struct ape_ctx_t* ape_ctx);
+#endif
diff --git a/lib/rbcodec/codecs/demac/libdemac/predictor-arm.S b/lib/rbcodec/codecs/demac/libdemac/predictor-arm.S
new file mode 100644
index 0000000000..92a78ed9b4
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/predictor-arm.S
@@ -0,0 +1,702 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include "demac_config.h"
+    .section    ICODE_SECTION_DEMAC_ARM,"ax",%progbits
+    .align      2
+/* NOTE: Byte offsets relative to p->buf (word index * 4). Keep in sync with the C predictor; parser.h itself does not define these - TODO confirm where the C values live */
+#define YDELAYA        200
+#define YDELAYB        168
+#define XDELAYA        136
+#define XDELAYB        104
+#define YADAPTCOEFFSA   72
+#define XADAPTCOEFFSA   56
+#define YADAPTCOEFFSB   40
+#define XADAPTCOEFFSB   20
+/* struct predictor_t members (byte offsets; keep in sync with parser.h): */
+#define buf              0    /* int32_t* buf */
+#define YlastA           4    /* int32_t YlastA; */
+#define XlastA           8    /* int32_t XlastA; */
+#define YfilterB        12    /* int32_t YfilterB; */
+#define XfilterA        16    /* int32_t XfilterA; */
+#define XfilterB        20    /* int32_t XfilterB; */
+#define YfilterA        24    /* int32_t YfilterA; */
+    
+#define YcoeffsA        28    /* int32_t YcoeffsA[4]; */
+#define XcoeffsA        44    /* int32_t XcoeffsA[4]; */
+#define YcoeffsB        60    /* int32_t YcoeffsB[5]; */
+#define XcoeffsB        80    /* int32_t XcoeffsB[5]; */
+#define historybuffer  100    /* int32_t historybuffer[] */
+@ Macro for loading the 2 consecutive words at [base + offset] into reg1/reg2, for various ARM versions.
+@ Registers must start with an even register, and must be consecutive (required by the ldrd variant).
+.macro LDR2OFS reg1, reg2, base, offset
+#if ARM_ARCH >= 6
+    ldrd    \reg1, [\base, \offset]
+#else /* ARM_ARCH < 6 */
+#ifdef CPU_ARM7TDMI
+    add     \reg1, \base, \offset      @ reg1 doubles as the address temporary
+    ldmia   \reg1, {\reg1, \reg2}
+#else /* ARM9 (v4 and v5) is faster this way */
+    ldr     \reg1, [\base, \offset]
+    ldr     \reg2, [\base, \offset+4]  @ offset must be an immediate expression so that +4 can be appended
+#endif
+#endif /* ARM_ARCH */
+.endm
+@ Macro for storing reg1/reg2 to the 2 consecutive words at [base + offset], for various ARM versions.
+@ Registers must start with an even register, and must be consecutive (required by the strd variant).
+.macro STR2OFS reg1, reg2, base, offset
+#if ARM_ARCH >= 6
+    strd    \reg1, [\base, \offset]
+#else
+    str     \reg1, [\base, \offset]
+    str     \reg2, [\base, \offset+4]  @ offset must be an immediate expression so that +4 can be appended
+#endif
+.endm
+    .global     predictor_decode_stereo
+    .type       predictor_decode_stereo,%function
+@ Register usage:
+@
+@ r0-r11 - scratch
+@ r12 - struct predictor_t* p
+@ r14 - int32_t* p->buf (kept in the register; written back to p->buf at done)
+@ void predictor_decode_stereo(struct predictor_t* p,
+@                              int32_t* decoded0,
+@                              int32_t* decoded1,
+@                              int count)   NOTE(review): count is only tested after an iteration, so count == 0 is not handled
+predictor_decode_stereo:
+    stmdb   sp!, {r1-r11, lr}       @ r1-r3 are pushed too, so the arguments live on the stack
+    @ r1 (decoded0) is [sp]
+    @ r2 (decoded1) is [sp, #4]
+    @ r3 (count)    is [sp, #8]
+    mov     r12, r0       @ r12 := p
+    ldr     r14, [r0]     @ r14 := p->buf
+loop:
+@@@@@@@@@@@@@@@@@@@@@@@@@@@ PREDICTOR Y
+@ Predictor Y, Filter A
+    ldr     r11, [r12, #YlastA]     @ r11 := p->YlastA
+    add     r2, r14, #YDELAYA-12    @ r2 := &p->buf[YDELAYA-3]
+    ldmia   r2, {r2, r3, r10}       @ r2 := p->buf[YDELAYA-3]
+                                    @ r3 := p->buf[YDELAYA-2]
+                                    @ r10 := p->buf[YDELAYA-1]
+    add     r6, r12, #YcoeffsA
+    ldmia   r6, {r6 - r9}           @ r6 := p->YcoeffsA[0]
+                                    @ r7 := p->YcoeffsA[1]
+                                    @ r8 := p->YcoeffsA[2]
+                                    @ r9 := p->YcoeffsA[3]
+    subs    r10, r11, r10           @ r10 := r11 - r10
+    STR2OFS r10, r11, r14, #YDELAYA-4
+                                    @ p->buf[YDELAYA-1] = r10
+                                    @ p->buf[YDELAYA] = r11
+    mul     r0, r11, r6             @ r0 := p->buf[YDELAYA] * p->YcoeffsA[0]
+    mla     r0, r10, r7, r0         @ r0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
+    mla     r0, r3, r8, r0          @ r0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
+    mla     r0, r2, r9, r0          @ r0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
+    @ flags were set above, in the subs instruction
+    mvngt   r10, #0                 @ GT -> -1, LT -> 1, EQ keeps the zero result
+    movlt   r10, #1                 @ r10 := SIGN(r10) (see .c for SIGN macro)
+    cmp     r11, #0
+    mvngt   r11, #0
+    movlt   r11, #1                 @ r11 := SIGN(r11) (see .c for SIGN macro)
+    STR2OFS r10, r11, r14, #YADAPTCOEFFSA-4
+                                    @ p->buf[YADAPTCOEFFSA-1] := r10
+                                    @ p->buf[YADAPTCOEFFSA] := r11
+    @ NOTE: r0 now contains predictionA - don't overwrite.
+@ Predictor Y, Filter B
+    LDR2OFS r6, r7, r12, #YfilterB  @ r6 := p->YfilterB
+                                    @ r7 := p->XfilterA
+    add     r2, r14, #YDELAYB-16    @ r2 := &p->buf[YDELAYB-4]
+    ldmia   r2, {r2 - r4, r10}      @ r2 := p->buf[YDELAYB-4]
+                                    @ r3 := p->buf[YDELAYB-3]
+                                    @ r4 := p->buf[YDELAYB-2]
+                                    @ r10 := p->buf[YDELAYB-1]
+    rsb     r6, r6, r6, lsl #5      @ r6 := r6 * 32 - r6 ( == r6*31)
+    sub     r11, r7, r6, asr #5     @ r11 (p->buf[YDELAYB]) := r7 - (r6 >> 5)
+    str     r7, [r12, #YfilterB]    @ p->YfilterB := r7 (p->XfilterA)
+    add     r5, r12, #YcoeffsB
+    ldmia   r5, {r5 - r9}           @ r5 := p->YcoeffsB[0]
+                                    @ r6 := p->YcoeffsB[1]
+                                    @ r7 := p->YcoeffsB[2]
+                                    @ r8 := p->YcoeffsB[3]
+                                    @ r9 := p->YcoeffsB[4]
+    subs    r10, r11, r10           @ r10 := r11 - r10
+    STR2OFS r10, r11, r14, #YDELAYB-4
+                                    @ p->buf[YDELAYB-1] = r10
+                                    @ p->buf[YDELAYB] = r11
+    mul     r1, r11, r5             @ r1 := p->buf[YDELAYB] * p->YcoeffsB[0]
+    mla     r1, r10, r6, r1         @ r1 += p->buf[YDELAYB-1] * p->YcoeffsB[1]
+    mla     r1, r4, r7, r1          @ r1 += p->buf[YDELAYB-2] * p->YcoeffsB[2]
+    mla     r1, r3, r8, r1          @ r1 += p->buf[YDELAYB-3] * p->YcoeffsB[3]
+    mla     r1, r2, r9, r1          @ r1 += p->buf[YDELAYB-4] * p->YcoeffsB[4]
+    @ flags were set above, in the subs instruction
+    mvngt   r10, #0
+    movlt   r10, #1                 @ r10 := SIGN(r10) (see .c for SIGN macro)
+    cmp     r11, #0
+    mvngt   r11, #0
+    movlt   r11, #1                 @ r11 := SIGN(r11) (see .c for SIGN macro)
+    STR2OFS r10, r11, r14, #YADAPTCOEFFSB-4
+                                    @ p->buf[YADAPTCOEFFSB-1] := r10
+                                    @ p->buf[YADAPTCOEFFSB] := r11
+    @ r0 still contains predictionA
+    @ r1 contains predictionB
+    @ Finish Predictor Y
+    ldr     r2, [sp]                @ r2 := decoded0
+    add     r0, r0, r1, asr #1      @ r0 := r0 + (r1 >> 1)
+    ldr     r4, [r12, #YfilterA]    @ r4 := p->YfilterA
+    ldr     r3, [r2]                @ r3 := *decoded0
+    rsb     r4, r4, r4, lsl #5      @ r4 := r4 * 32 - r4 ( == r4*31)
+    add     r1, r3, r0, asr #10     @ r1 := r3 + (r0 >> 10)
+    str     r1, [r12, #YlastA]      @ p->YlastA := r1
+    add     r1, r1, r4, asr #5      @ r1 := r1 + (r4 >> 5)
+    str     r1, [r12, #YfilterA]    @ p->YfilterA := r1
+    @ r1 contains p->YfilterA
+    @ r2 contains decoded0
+    @ r3 contains *decoded0
+    @ r5, r6, r7, r8, r9 contain p->YcoeffsB[0..4]
+    @ r10, r11 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB]
+    str     r1, [r2], #4            @ *(decoded0++) := r1  (p->YfilterA)
+    str     r2, [sp]                @ save decoded0
+    cmp     r3, #0
+    beq     3f                      @ original *decoded0 == 0: skip the coefficient update
+    add     r2, r14, #YADAPTCOEFFSB-16
+    ldmia   r2, {r2 - r4}           @ r2 := p->buf[YADAPTCOEFFSB-4]
+                                    @ r3 := p->buf[YADAPTCOEFFSB-3]
+                                    @ r4 := p->buf[YADAPTCOEFFSB-2]
+    blt     1f                      @ flags are still those of cmp r3, #0 (add/ldmia leave them alone)
+    @ *decoded0 > 0
+    sub     r5, r5, r11       @ r5 := p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB]
+    sub     r6, r6, r10       @ r6 := p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1]
+    sub     r9, r9, r2        @ r9 := p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4]
+    sub     r8, r8, r3        @ r8 := p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
+    sub     r7, r7, r4        @ r7 := p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
+    add     r0, r12, #YcoeffsB      
+    stmia   r0, {r5 - r9}           @ Save p->YcoeffsB[]
+    add     r1, r12, #YcoeffsA
+    ldmia   r1, {r2 - r5}           @ r2 := p->YcoeffsA[0]
+                                    @ r3 := p->YcoeffsA[1]
+                                    @ r4 := p->YcoeffsA[2]
+                                    @ r5 := p->YcoeffsA[3]
+    add     r6, r14, #YADAPTCOEFFSA-12
+    ldmia   r6, {r6 - r9}           @ r6 := p->buf[YADAPTCOEFFSA-3]
+                                    @ r7 := p->buf[YADAPTCOEFFSA-2]
+                                    @ r8 := p->buf[YADAPTCOEFFSA-1]
+                                    @ r9 := p->buf[YADAPTCOEFFSA]
+    sub     r5, r5, r6        @ r5 := p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+    sub     r4, r4, r7        @ r4 := p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+    sub     r3, r3, r8        @ r3 := p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+    sub     r2, r2, r9        @ r2 := p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
+    
+    b       2f
+1:  @ *decoded0 < 0
+    add     r5, r5, r11       @ r5 := p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB]
+    add     r6, r6, r10       @ r6 := p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1]
+    add     r9, r9, r2        @ r9 := p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4]
+    add     r8, r8, r3        @ r8 := p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
+    add     r7, r7, r4        @ r7 := p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
+    add     r0, r12, #YcoeffsB      
+    stmia   r0, {r5 - r9}           @ Save p->YcoeffsB[]
+    add     r1, r12, #YcoeffsA
+    ldmia   r1, {r2 - r5}           @ r2 := p->YcoeffsA[0]
+                                    @ r3 := p->YcoeffsA[1]
+                                    @ r4 := p->YcoeffsA[2]
+                                    @ r5 := p->YcoeffsA[3]
+    add     r6, r14, #YADAPTCOEFFSA-12
+    ldmia   r6, {r6 - r9}           @ r6 := p->buf[YADAPTCOEFFSA-3]
+                                    @ r7 := p->buf[YADAPTCOEFFSA-2]
+                                    @ r8 := p->buf[YADAPTCOEFFSA-1]
+                                    @ r9 := p->buf[YADAPTCOEFFSA]
+    add     r5, r5, r6        @ r5 := p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+    add     r4, r4, r7        @ r4 := p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+    add     r3, r3, r8        @ r3 := p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+    add     r2, r2, r9        @ r2 := p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+    
+2:
+    stmia   r1, {r2 - r5}     @ Save p->YcoeffsA
+3:
+@@@@@@@@@@@@@@@@@@@@@@@@@@@ PREDICTOR X
+@ Predictor X, Filter A
+    ldr     r11, [r12, #XlastA]     @ r11 := p->XlastA
+    add     r2, r14, #XDELAYA-12    @ r2 := &p->buf[XDELAYA-3]
+    ldmia   r2, {r2, r3, r10}       @ r2 := p->buf[XDELAYA-3]
+                                    @ r3 := p->buf[XDELAYA-2]
+                                    @ r10 := p->buf[XDELAYA-1]
+    add     r6, r12, #XcoeffsA
+    ldmia   r6, {r6 - r9}           @ r6 := p->XcoeffsA[0]
+                                    @ r7 := p->XcoeffsA[1]
+                                    @ r8 := p->XcoeffsA[2]
+                                    @ r9 := p->XcoeffsA[3]
+    subs    r10, r11, r10           @ r10 := r11 - r10
+    STR2OFS r10, r11, r14, #XDELAYA-4
+                                    @ p->buf[XDELAYA-1] = r10
+                                    @ p->buf[XDELAYA] = r11
+    mul     r0, r11, r6             @ r0 := p->buf[XDELAYA] * p->XcoeffsA[0]
+    mla     r0, r10, r7, r0         @ r0 += p->buf[XDELAYA-1] * p->XcoeffsA[1]
+    mla     r0, r3, r8, r0          @ r0 += p->buf[XDELAYA-2] * p->XcoeffsA[2]
+    mla     r0, r2, r9, r0          @ r0 += p->buf[XDELAYA-3] * p->XcoeffsA[3]
+    @ flags were set above, in the subs instruction
+    mvngt   r10, #0
+    movlt   r10, #1                 @ r10 := SIGN(r10) (see .c for SIGN macro)
+    cmp     r11, #0
+    mvngt   r11, #0
+    movlt   r11, #1                 @ r11 := SIGN(r11) (see .c for SIGN macro)
+    STR2OFS r10, r11, r14, #XADAPTCOEFFSA-4
+                                    @ p->buf[XADAPTCOEFFSA-1] := r10
+                                    @ p->buf[XADAPTCOEFFSA] := r11
+    @ NOTE: r0 now contains predictionA - don't overwrite.
+@ Predictor X, Filter B
+    LDR2OFS r6, r7, r12, #XfilterB  @ r6 := p->XfilterB
+                                    @ r7 := p->YfilterA
+    add     r2, r14, #XDELAYB-16    @ r2 := &p->buf[XDELAYB-4]
+    ldmia   r2, {r2 - r4, r10}      @ r2 := p->buf[XDELAYB-4]
+                                    @ r3 := p->buf[XDELAYB-3]
+                                    @ r4 := p->buf[XDELAYB-2]
+                                    @ r10 := p->buf[XDELAYB-1]
+    rsb     r6, r6, r6, lsl #5      @ r6 := r6 * 32 - r6 ( == r6*31)
+    sub     r11, r7, r6, asr #5     @ r11 (p->buf[XDELAYB]) := r7 - (r6 >> 5)
+    str     r7, [r12, #XfilterB]    @ p->XfilterB := r7 (p->YfilterA)
+    add     r5, r12, #XcoeffsB
+    ldmia   r5, {r5 - r9}           @ r5 := p->XcoeffsB[0]
+                                    @ r6 := p->XcoeffsB[1]
+                                    @ r7 := p->XcoeffsB[2]
+                                    @ r8 := p->XcoeffsB[3]
+                                    @ r9 := p->XcoeffsB[4]
+    subs    r10, r11, r10           @ r10 := r11 - r10
+    STR2OFS r10, r11, r14, #XDELAYB-4
+                                    @ p->buf[XDELAYB-1] = r10
+                                    @ p->buf[XDELAYB] = r11
+    mul     r1, r11, r5             @ r1 := p->buf[XDELAYB] * p->XcoeffsB[0]
+    mla     r1, r10, r6, r1         @ r1 += p->buf[XDELAYB-1] * p->XcoeffsB[1]
+    mla     r1, r4, r7, r1          @ r1 += p->buf[XDELAYB-2] * p->XcoeffsB[2]
+    mla     r1, r3, r8, r1          @ r1 += p->buf[XDELAYB-3] * p->XcoeffsB[3]
+    mla     r1, r2, r9, r1          @ r1 += p->buf[XDELAYB-4] * p->XcoeffsB[4]
+    @ flags were set above, in the subs instruction
+    mvngt   r10, #0
+    movlt   r10, #1                 @ r10 := SIGN(r10) (see .c for SIGN macro)
+    cmp     r11, #0
+    mvngt   r11, #0
+    movlt   r11, #1                 @ r11 := SIGN(r11) (see .c for SIGN macro)
+    STR2OFS r10, r11, r14, #XADAPTCOEFFSB-4
+                                    @ p->buf[XADAPTCOEFFSB-1] := r10
+                                    @ p->buf[XADAPTCOEFFSB] := r11
+    @ r0 still contains predictionA
+    @ r1 contains predictionB
+    @ Finish Predictor X
+    ldr     r2, [sp, #4]            @ r2 := decoded1
+    add     r0, r0, r1, asr #1      @ r0 := r0 + (r1 >> 1)
+    ldr     r4, [r12, #XfilterA]    @ r4 := p->XfilterA
+    ldr     r3, [r2]                @ r3 := *decoded1
+    rsb     r4, r4, r4, lsl #5      @ r4 := r4 * 32 - r4 ( == r4*31)
+    add     r1, r3, r0, asr #10     @ r1 := r3 + (r0 >> 10)
+    str     r1, [r12, #XlastA]      @ p->XlastA := r1
+    add     r1, r1, r4, asr #5      @ r1 := r1 + (r4 >> 5)
+    str     r1, [r12, #XfilterA]    @ p->XfilterA := r1
+    @ r1 contains p->XfilterA
+    @ r2 contains decoded1
+    @ r3 contains *decoded1
+    @ r5, r6, r7, r8, r9 contain p->XcoeffsB[0..4]
+    @ r10, r11 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB]
+    str     r1, [r2], #4            @ *(decoded1++) := r1  (p->XfilterA)
+    str     r2, [sp, #4]            @ save decoded1
+    cmp     r3, #0
+    beq     3f                      @ original *decoded1 == 0: skip the coefficient update
+    add     r2, r14, #XADAPTCOEFFSB-16
+    ldmia   r2, {r2 - r4}           @ r2 := p->buf[XADAPTCOEFFSB-4]
+                                    @ r3 := p->buf[XADAPTCOEFFSB-3]
+                                    @ r4 := p->buf[XADAPTCOEFFSB-2]
+    blt     1f                      @ flags are still those of cmp r3, #0 (add/ldmia leave them alone)
+    @ *decoded1 > 0
+    sub     r5, r5, r11       @ r5 := p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB]
+    sub     r6, r6, r10       @ r6 := p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1]
+    sub     r9, r9, r2        @ r9 := p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
+    sub     r8, r8, r3        @ r8 := p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3]
+    sub     r7, r7, r4        @ r7 := p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
+    add     r0, r12, #XcoeffsB      
+    stmia   r0, {r5 - r9}           @ Save p->XcoeffsB[]
+    add     r1, r12, #XcoeffsA
+    ldmia   r1, {r2 - r5}           @ r2 := p->XcoeffsA[0]
+                                    @ r3 := p->XcoeffsA[1]
+                                    @ r4 := p->XcoeffsA[2]
+                                    @ r5 := p->XcoeffsA[3]
+    add     r6, r14, #XADAPTCOEFFSA-12
+    ldmia   r6, {r6 - r9}           @ r6 := p->buf[XADAPTCOEFFSA-3]
+                                    @ r7 := p->buf[XADAPTCOEFFSA-2]
+                                    @ r8 := p->buf[XADAPTCOEFFSA-1]
+                                    @ r9 := p->buf[XADAPTCOEFFSA]
+    sub     r5, r5, r6        @ r5 := p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3]
+    sub     r4, r4, r7        @ r4 := p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
+    sub     r3, r3, r8        @ r3 := p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
+    sub     r2, r2, r9        @ r2 := p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
+    
+    b       2f
+1:  @ *decoded1 < 0
+    add     r5, r5, r11       @ r5 := p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB]
+    add     r6, r6, r10       @ r6 := p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1]
+    add     r9, r9, r2        @ r9 := p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4]
+    add     r8, r8, r3        @ r8 := p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
+    add     r7, r7, r4        @ r7 := p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
+    add     r0, r12, #XcoeffsB      
+    stmia   r0, {r5 - r9}           @ Save p->XcoeffsB[]
+    add     r1, r12, #XcoeffsA
+    ldmia   r1, {r2 - r5}           @ r2 := p->XcoeffsA[0]
+                                    @ r3 := p->XcoeffsA[1]
+                                    @ r4 := p->XcoeffsA[2]
+                                    @ r5 := p->XcoeffsA[3]
+    add     r6, r14, #XADAPTCOEFFSA-12
+    ldmia   r6, {r6 - r9}           @ r6 := p->buf[XADAPTCOEFFSA-3]
+                                    @ r7 := p->buf[XADAPTCOEFFSA-2]
+                                    @ r8 := p->buf[XADAPTCOEFFSA-1]
+                                    @ r9 := p->buf[XADAPTCOEFFSA]
+    add     r5, r5, r6        @ r5 := p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3]
+    add     r4, r4, r7        @ r4 := p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
+    add     r3, r3, r8        @ r3 := p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
+    add     r2, r2, r9        @ r2 := p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
+    
+2:
+    stmia   r1, {r2 - r5}           @ Save p->XcoeffsA
+3:
+    
+@@@@@@@@@@@@@@@@@@@@@@@@@@@ COMMON
+    add     r14, r14, #4                @ p->buf++
+    add     r11, r12, #historybuffer    @ r11 := &p->historybuffer[0]
+    sub     r10, r14, #PREDICTOR_HISTORY_SIZE*4
+                                       @ r10 := p->buf - PREDICTOR_HISTORY_SIZE
+    ldr     r0, [sp, #8]                @ r0 := remaining count
+    cmp     r10, r11
+    beq     move_hist     @ The history buffer is full, we need to do a memmove
+    @ Check loop count
+    subs    r0, r0, #1
+    strne   r0, [sp, #8]                @ store the decremented count only when iterating again
+    bne     loop
+done:
+    str     r14, [r12]              @ Save value of p->buf
+    add     sp, sp, #12             @ Don't bother restoring r1-r3 
+#ifdef ROCKBOX
+    ldmpc   regs=r4-r11
+#else
+    ldmia   sp!, {r4 - r11, pc}
+#endif
+move_hist:
+    @ dest = r11 (p->historybuffer)
+    @ src = r14 (p->buf)
+    @ n = 200 bytes (5 x 40), i.e. 50 words
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldr     r0, [sp, #8]                @ reload count (r0 was clobbered by the copy)
+    add     r14, r12, #historybuffer    @ p->buf = &p->historybuffer[0]
+    @ Check loop count
+    subs    r0, r0, #1
+    strne   r0, [sp, #8]
+    bne     loop
+    
+    b       done
+    .size   predictor_decode_stereo, .-predictor_decode_stereo
+    .global     predictor_decode_mono
+    .type       predictor_decode_mono,%function
+@ Register usage:
+@
+@ r0-r11 - scratch
+@ r12 - struct predictor_t* p
+@ r14 - int32_t* p->buf (kept in the register; written back to p->buf at donem)
+@ void predictor_decode_mono(struct predictor_t* p,
+@                            int32_t* decoded0,
+@                            int count)   NOTE(review): count is only tested after an iteration, so count == 0 is not handled
+predictor_decode_mono:
+    stmdb   sp!, {r1, r2, r4-r11, lr}   @ r1/r2 are pushed too, so the arguments live on the stack
+    @ r1 (decoded0) is [sp]
+    @ r2 (count)    is [sp, #4]
+    mov     r12, r0         @ r12 := p
+    ldr     r14, [r0]       @ r14 := p->buf
+    
+loopm:
+@@@@@@@@@@@@@@@@@@@@@@@@@@@ PREDICTOR
+    ldr     r11, [r12, #YlastA]     @ r11 := p->YlastA
+    add     r2, r14, #YDELAYA-12    @ r2 := &p->buf[YDELAYA-3]
+    ldmia   r2, {r2, r3, r10}       @ r2 := p->buf[YDELAYA-3]
+                                    @ r3 := p->buf[YDELAYA-2]
+                                    @ r10 := p->buf[YDELAYA-1]
+    add     r5, r12, #YcoeffsA      @ r5 := &p->YcoeffsA[0]
+    ldmia   r5, {r6 - r9}           @ r6 := p->YcoeffsA[0]
+                                    @ r7 := p->YcoeffsA[1]
+                                    @ r8 := p->YcoeffsA[2]
+                                    @ r9 := p->YcoeffsA[3]
+    subs    r10, r11, r10           @ r10 := r11 - r10
+    STR2OFS r10, r11, r14, #YDELAYA-4
+                                    @ p->buf[YDELAYA-1] = r10
+                                    @ p->buf[YDELAYA] = r11
+    mul     r0, r11, r6             @ r0 := p->buf[YDELAYA] * p->YcoeffsA[0]
+    mla     r0, r10, r7, r0         @ r0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
+    mla     r0, r3, r8, r0          @ r0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
+    mla     r0, r2, r9, r0          @ r0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
+    @ flags were set above, in the subs instruction
+    mvngt   r10, #0                 @ GT -> -1, LT -> 1, EQ keeps the zero result
+    movlt   r10, #1                 @ r10 := SIGN(r10) (see .c for SIGN macro)
+    cmp     r11, #0
+    mvngt   r11, #0
+    movlt   r11, #1                 @ r11 := SIGN(r11) (see .c for SIGN macro)
+    STR2OFS r10, r11, r14, #YADAPTCOEFFSA-4
+                                    @ p->buf[YADAPTCOEFFSA-1] := r10
+                                    @ p->buf[YADAPTCOEFFSA] := r11
+    ldr     r2, [sp]                @ r2 := decoded0
+    ldr     r4, [r12, #YfilterA]    @ r4 := p->YfilterA
+    ldr     r3, [r2]                @ r3 := *decoded0
+    rsb     r4, r4, r4, lsl #5      @ r4 := r4 * 32 - r4 ( == r4*31)
+    add     r1, r3, r0, asr #10     @ r1 := r3 + (r0 >> 10)
+    str     r1, [r12, #YlastA]      @ p->YlastA := r1
+    add     r1, r1, r4, asr #5      @ r1 := r1 + (r4 >> 5)
+    str     r1, [r12, #YfilterA]    @ p->YfilterA := r1
+    @ r1 contains p->YfilterA
+    @ r2 contains decoded0
+    @ r3 contains *decoded0
+    @ r6, r7, r8, r9 contain p->YcoeffsA[0..3]
+    @ r10, r11 contain p->buf[YADAPTCOEFFSA-1] and p->buf[YADAPTCOEFFSA]
+    str     r1, [r2], #4            @ *(decoded0++) := r1  (p->YfilterA)
+    str     r2, [sp]                @ save decoded0
+    cmp     r3, #0
+    beq     3f                      @ original *decoded0 == 0: skip the coefficient update
+    LDR2OFS r2, r3, r14, #YADAPTCOEFFSA-12
+                                    @ r2 := p->buf[YADAPTCOEFFSA-3]
+                                    @ r3 := p->buf[YADAPTCOEFFSA-2]
+    blt     1f                      @ flags are still those of cmp r3, #0 (no LDR2OFS variant alters them)
+    @ *decoded0 > 0
+    sub     r6, r6, r11     @ r6 := p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
+    sub     r7, r7, r10     @ r7 := p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+    sub     r9, r9, r2      @ r9 := p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+    sub     r8, r8, r3      @ r8 := p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+    b       2f
+1:  @ *decoded0 < 0
+    add     r6, r6, r11     @ r6 := p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+    add     r7, r7, r10     @ r7 := p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+    add     r9, r9, r2      @ r9 := p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+    add     r8, r8, r3      @ r8 := p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+    
+2:
+    stmia   r5, {r6 - r9}           @ Save p->YcoeffsA
+3:
+@@@@@@@@@@@@@@@@@@@@@@@@@@@ COMMON
+    add     r14, r14, #4                @ p->buf++
+    add     r11, r12, #historybuffer    @ r11 := &p->historybuffer[0]
+    sub     r10, r14, #PREDICTOR_HISTORY_SIZE*4
+                                       @ r10 := p->buf - PREDICTOR_HISTORY_SIZE
+    ldr     r0, [sp, #4]                @ r0 := remaining count
+    cmp     r10, r11
+    beq     move_histm    @ The history buffer is full, we need to do a memmove
+    @ Check loop count
+    subs    r0, r0, #1
+    strne   r0, [sp, #4]                @ store the decremented count only when iterating again
+    bne     loopm
+donem:
+    str     r14, [r12]              @ Save value of p->buf
+    add     sp, sp, #8              @ Don't bother restoring r1, r2
+#ifdef ROCKBOX
+    ldmpc   regs=r4-r11
+#else
+    ldmia   sp!, {r4 - r11, pc}
+#endif
+move_histm:
+    @ dest = r11 (p->historybuffer)
+    @ src = r14 (p->buf)
+    @ n = 200 bytes (5 x 40), i.e. 50 words
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldmia   r14!, {r0-r9}    @ 40 bytes
+    stmia   r11!, {r0-r9}
+    ldr     r0, [sp, #4]                @ reload count (r0 was clobbered by the copy)
+    add     r14, r12, #historybuffer    @ p->buf = &p->historybuffer[0]
+    @ Check loop count
+    subs    r0, r0, #1
+    strne   r0, [sp, #4]
+    bne     loopm
+    
+    b       donem
+    .size   predictor_decode_mono, .-predictor_decode_mono
diff --git a/lib/rbcodec/codecs/demac/libdemac/predictor-cf.S b/lib/rbcodec/codecs/demac/libdemac/predictor-cf.S
new file mode 100644
index 0000000000..fc1d901a59
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/predictor-cf.S
@@ -0,0 +1,660 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+Coldfire predictor copyright (C) 2007 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include "demac_config.h"
+/* NOTE: The following need to be kept in sync with parser.h */
+/* The first group are BYTE offsets relative to p->buf, i.e. 4 x the int32_t
+   index macros of the same names in predictor.c (e.g. YADAPTCOEFFSA is 18
+   there and 18*4 = 72 here).  They are substituted textually into
+   addressing modes such as (YDELAYA-12,%a5), so keep them plain numbers. */
+#define YDELAYA        200
+#define YDELAYB        168
+#define XDELAYA        136
+#define XDELAYB        104
+#define YADAPTCOEFFSA   72
+#define XADAPTCOEFFSA   56
+#define YADAPTCOEFFSB   40
+#define XADAPTCOEFFSB   20
+/* struct predictor_t members: */
+/* Byte offsets from the struct base (used below as (member,%a6) with
+   %a6 = p).  YfilterB/XfilterA and XfilterB/YfilterA are adjacent so that
+   each pair can be fetched with a single movem.l. */
+#define buf              0    /* int32_t* buf */
+#define YlastA           4    /* int32_t YlastA; */
+#define XlastA           8    /* int32_t XlastA; */
+#define YfilterB        12    /* int32_t YfilterB; */
+#define XfilterA        16    /* int32_t XfilterA; */
+#define XfilterB        20    /* int32_t XfilterB; */
+#define YfilterA        24    /* int32_t YfilterA; */
+#define YcoeffsA        28    /* int32_t YcoeffsA[4]; */
+#define XcoeffsA        44    /* int32_t XcoeffsA[4]; */
+#define YcoeffsB        60    /* int32_t YcoeffsB[5]; */
+#define XcoeffsB        80    /* int32_t XcoeffsB[5]; */
+#define historybuffer  100    /* int32_t historybuffer[] */
+    .text
+    .align  2
+    .global predictor_decode_stereo
+    .type   predictor_decode_stereo,@function
+| void predictor_decode_stereo(struct predictor_t* p,
+|                              int32_t* decoded0,
+|                              int32_t* decoded1,
+|                              int count)
+|
+| Processes `count` sample pairs in place.  Each iteration reads the values
+| at *decoded0 and *decoded1, runs predictor Y and predictor X (filters A
+| and B each), adapts the coefficients according to the sign of the value
+| read, and overwrites the inputs with p->YfilterA and p->XfilterA.
+|
+| Register usage inside the loop:
+|   %a3 = decoded0, %a4 = decoded1, %a5 = p->buf, %a6 = p,
+|   (%sp) = remaining loop count, %acc0/%acc1 = predictionA/predictionB.
+|
+| NOTE(review): the loop count is only tested after the loop body has run,
+| so count == 0 is not handled here - confirm callers never pass 0.
+predictor_decode_stereo:
+    lea.l   (-12*4,%sp), %sp            | 12 longwords: count + 11 saved regs
+    movem.l %d2-%d7/%a2-%a6, (4,%sp)    | save regs (restored at .done)
+    movem.l (12*4+8,%sp), %a3-%a5       | %a3 = decoded0
+                                        | %a4 = decoded1
+                                        | %a5 = count (moved to (%sp) below)
+    move.l  %a5, (%sp)                  | (%sp) = count
+    move.l  #0, %macsr                  | signed integer mode
+    move.l  (12*4+4,%sp), %a6           | %a6 = p
+    move.l  (%a6), %a5                  | %a5 = p->buf
+    
+.loop:
+    
+    | ***** PREDICTOR Y *****
+    
+    | Predictor Y, Filter A
+    
+    move.l  (YlastA,%a6), %d3           | %d3  = p->YlastA
+    movem.l (YDELAYA-12,%a5), %d0-%d2   | %d0  = p->buf[YDELAYA-3]
+                                        | %d1  = p->buf[YDELAYA-2]
+                                        | %d2  = p->buf[YDELAYA-1]
+    move.l  %d3, (YDELAYA,%a5)          | p->buf[YDELAYA]  = %d3
+    sub.l   %d3, %d2
+    neg.l   %d2                         | %d2 = %d3 - %d2
+    move.l  %d2, (YDELAYA-4,%a5)        | p->buf[YDELAYA-1]  = %d2
+    movem.l (YcoeffsA,%a6), %d4-%d7     | %d4  = p->YcoeffsA[0]
+                                        | %d5  = p->YcoeffsA[1]
+                                        | %d6  = p->YcoeffsA[2]
+                                        | %d7  = p->YcoeffsA[3]
+    mac.l   %d3, %d4, %acc0     | %acc0  = p->buf[YDELAYA] * p->YcoeffsA[0]
+    mac.l   %d2, %d5, %acc0     | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
+    mac.l   %d1, %d6, %acc0     | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
+    mac.l   %d0, %d7, %acc0     | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
+    tst.l   %d2
+    beq.s   1f                          | zero stays zero
+    spl.b   %d2                         | pos: 0x??????ff, neg: 0x??????00
+    extb.l  %d2                         | pos: 0xffffffff, neg: 0x00000000
+    or.l    #1, %d2                     | pos: 0xffffffff, neg: 0x00000001
+1:                                      | %d2 = SIGN(%d2)
+    move.l  %d2, (YADAPTCOEFFSA-4,%a5)  | p->buf[YADAPTCOEFFSA-1]  = %d2
+    tst.l   %d3
+    beq.s   1f
+    spl.b   %d3
+    extb.l  %d3
+    or.l    #1, %d3
+1:                                      | %d3 = SIGN(%d3)
+    move.l  %d3, (YADAPTCOEFFSA,%a5)    | p->buf[YADAPTCOEFFSA]  = %d3
+    | Predictor Y, Filter B
+    
+    movem.l (YfilterB,%a6), %d2-%d3     | %d2  = p->YfilterB
+                                        | %d3  = p->XfilterA
+    move.l  %d3, (YfilterB,%a6)         | p->YfilterB = %d3
+    move.l  %d2, %d1                    | %d1  = %d2
+    lsl.l   #5, %d2                     | %d2  = %d2 * 32
+    sub.l   %d1, %d2                    | %d2 -= %d1 (== 31 * old_d2)
+    asr.l   #5, %d2                     | %d2 >>= 5
+    sub.l   %d2, %d3                    | %d3 -= %d2
+    
+    movem.l (YDELAYB-16,%a5), %d4-%d7   | %d4  = p->buf[YDELAYB-4]
+                                        | %d5  = p->buf[YDELAYB-3]
+                                        | %d6  = p->buf[YDELAYB-2]
+                                        | %d7  = p->buf[YDELAYB-1]
+    sub.l   %d3, %d7
+    neg.l   %d7                         | %d7  = %d3 - %d7
+    move.l  %d7, (YDELAYB-4,%a5)        | p->buf[YDELAYB-1]  = %d7
+    movem.l (YcoeffsB,%a6), %d1-%d2/%a0-%a2 | %d1  = p->YcoeffsB[0]
+                                        | %d2  = p->YcoeffsB[1]
+                                        | %a0  = p->YcoeffsB[2]
+                                        | %a1  = p->YcoeffsB[3]
+                                        | %a2  = p->YcoeffsB[4]
+    mac.l   %d3, %d1, %acc1     | %acc1  = p->buf[YDELAYB] * p->YcoeffsB[0]
+    mac.l   %d7, %d2, %acc1     | %acc1 += p->buf[YDELAYB-1] * p->YcoeffsB[1]
+    mac.l   %d6, %a0, %acc1     | %acc1 += p->buf[YDELAYB-2] * p->YcoeffsB[2]
+    mac.l   %d5, %a1, %acc1     | %acc1 += p->buf[YDELAYB-3] * p->YcoeffsB[3]
+    mac.l   %d4, %a2, %acc1     | %acc1 += p->buf[YDELAYB-4] * p->YcoeffsB[4]
+    
+    move.l  %d3, (YDELAYB, %a5)         | p->buf[YDELAYB]  = %d3
+    
+    tst.l   %d7
+    beq.s   1f
+    spl.b   %d7
+    extb.l  %d7
+    or.l    #1, %d7
+1:                                      | %d7 = SIGN(%d7)
+    move.l  %d7, (YADAPTCOEFFSB-4,%a5)  | p->buf[YADAPTCOEFFSB-1]  = %d7
+    tst.l   %d3
+    beq.s   1f
+    spl.b   %d3
+    extb.l  %d3
+    or.l    #1, %d3
+1:                                      | %d3 = SIGN(%d3)
+    move.l  %d3, (YADAPTCOEFFSB, %a5)   | p->buf[YADAPTCOEFFSB]  = %d3
+    | %d1, %d2, %a0, %a1, %a2 contain p->YcoeffsB[0..4]
+    | %d7, %d3 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB]
+    move.l  (%a3), %d0                  | %d0  = *decoded0
+    beq.s   3f                          | *decoded0 == 0: no adaptation
+    movem.l (YADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4  = p->buf[YADAPTCOEFFSB-4]
+                                            | %d5  = p->buf[YADAPTCOEFFSB-3]
+                                            | %d6  = p->buf[YADAPTCOEFFSB-2]
+    bmi.s   1f                          | flags still valid here
+    | *decoded0 > 0
+    sub.l   %d3, %d1        | %d1  = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB]
+    sub.l   %d7, %d2        | %d2  = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1]
+    sub.l   %d6, %a0        | %a0  = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
+    sub.l   %d5, %a1        | %a1  = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
+    sub.l   %d4, %a2        | %a2  = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4]
+    movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[]
+    
+    movem.l (YcoeffsA,%a6), %d4-%d7     | %d4  = p->YcoeffsA[0]
+                                        | %d5  = p->YcoeffsA[1]
+                                        | %d6  = p->YcoeffsA[2]
+                                        | %d7  = p->YcoeffsA[3]
+                                        
+    movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 
+                                        | %d2  = p->buf[YADAPTCOEFFSA-3]
+                                        | %a0  = p->buf[YADAPTCOEFFSA-2]
+                                        | %a1  = p->buf[YADAPTCOEFFSA-1]
+                                        | %a2  = p->buf[YADAPTCOEFFSA]
+                                        
+    sub.l   %a2, %d4        | %d4  = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
+    sub.l   %a1, %d5        | %d5  = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+    sub.l   %a0, %d6        | %d6  = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+    sub.l   %d2, %d7        | %d7  = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+    
+    bra.s   2f
+1:  | *decoded0 < 0
+    add.l   %d3, %d1        | %d1  = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB]
+    add.l   %d7, %d2        | %d2  = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1]
+    add.l   %d6, %a0        | %a0  = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
+    add.l   %d5, %a1        | %a1  = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
+    add.l   %d4, %a2        | %a2  = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4]
+    movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[]
+    movem.l (YcoeffsA,%a6), %d4-%d7     | %d4  = p->YcoeffsA[0]
+                                        | %d5  = p->YcoeffsA[1]
+                                        | %d6  = p->YcoeffsA[2]
+                                        | %d7  = p->YcoeffsA[3]
+    movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 
+                                        | %d2  = p->buf[YADAPTCOEFFSA-3]
+                                        | %a0  = p->buf[YADAPTCOEFFSA-2]
+                                        | %a1  = p->buf[YADAPTCOEFFSA-1]
+                                        | %a2  = p->buf[YADAPTCOEFFSA]
+    add.l   %a2, %d4        | %d4  = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+    add.l   %a1, %d5        | %d5  = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+    add.l   %a0, %d6        | %d6  = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+    add.l   %d2, %d7        | %d7  = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+2:
+    movem.l %d4-%d7, (YcoeffsA,%a6)     | Save p->YcoeffsA[]
+3:
+    | Finish Predictor Y
+    movclr.l %acc0, %d1                 | %d1 = predictionA
+    movclr.l %acc1, %d2                 | %d2 = predictionB
+    asr.l   #1, %d2
+    add.l   %d2, %d1                    | %d1 += (%d2 >> 1)
+    asr.l   #8, %d1
+    asr.l   #2, %d1                     | %d1 >>= 10
+    add.l   %d0, %d1                    | %d1 += %d0
+    move.l  %d1, (YlastA,%a6)           | p->YlastA  = %d1
+    
+    move.l  (YfilterA,%a6), %d2         | %d2  = p->YfilterA
+    move.l  %d2, %d0
+    lsl.l   #5, %d2
+    sub.l   %d0, %d2                    | %d2 = 31 * %d2
+    asr.l   #5, %d2                     | %d2 >>= 5
+    add.l   %d1, %d2
+    move.l  %d2, (YfilterA,%a6)         | p->YfilterA  = %d2
+    | *decoded0 stored 2 instructions down, avoiding pipeline stall
+    | ***** PREDICTOR X *****
+    
+    | Predictor X, Filter A
+    
+    move.l  (XlastA,%a6), %d3           | %d3  = p->XlastA
+    move.l  %d2, (%a3)+                 | *(decoded0++)  = %d2 (p->YfilterA)
+    movem.l (XDELAYA-12,%a5), %d0-%d2   | %d0  = p->buf[XDELAYA-3]
+                                        | %d1  = p->buf[XDELAYA-2]
+                                        | %d2  = p->buf[XDELAYA-1]
+    move.l  %d3, (XDELAYA,%a5)          | p->buf[XDELAYA]  = %d3
+    sub.l   %d3, %d2
+    neg.l   %d2                         | %d2  = %d3 -%d2
+    move.l  %d2, (XDELAYA-4,%a5)        | p->buf[XDELAYA-1]  = %d2
+    movem.l (XcoeffsA,%a6), %d4-%d7     | %d4  = p->XcoeffsA[0]
+                                        | %d5  = p->XcoeffsA[1]
+                                        | %d6  = p->XcoeffsA[2]
+                                        | %d7  = p->XcoeffsA[3]
+    mac.l   %d3, %d4, %acc0     | %acc0  = p->buf[XDELAYA] * p->XcoeffsA[0]
+    mac.l   %d2, %d5, %acc0     | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1]
+    mac.l   %d1, %d6, %acc0     | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2]
+    mac.l   %d0, %d7, %acc0     | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3]
+    tst.l   %d2
+    beq.s   1f
+    spl.b   %d2                         | pos: 0x??????ff, neg: 0x??????00
+    extb.l  %d2                         | pos: 0xffffffff, neg: 0x00000000
+    or.l    #1, %d2                     | pos: 0xffffffff, neg: 0x00000001
+1:                                      | %d2 = SIGN(%d2)
+    move.l  %d2, (XADAPTCOEFFSA-4,%a5)  | p->buf[XADAPTCOEFFSA-1]  = %d2
+    tst.l   %d3
+    beq.s   1f
+    spl.b   %d3
+    extb.l  %d3
+    or.l    #1, %d3
+1:                                      | %d3 = SIGN(%d3)
+    move.l  %d3, (XADAPTCOEFFSA,%a5)    | p->buf[XADAPTCOEFFSA]  = %d3
+    | Predictor X, Filter B
+    
+    movem.l (XfilterB,%a6), %d2-%d3     | %d2  = p->XfilterB
+                                        | %d3  = p->YfilterA
+    move.l  %d3, (XfilterB,%a6)         | p->XfilterB = %d3
+    move.l  %d2, %d1                    | %d1  = %d2
+    lsl.l   #5, %d2                     | %d2  = %d2 * 32
+    sub.l   %d1, %d2                    | %d2 -= %d1 (== 31 * old_d2)
+    asr.l   #5, %d2                     | %d2 >>= 5
+    sub.l   %d2, %d3                    | %d3 -= %d2 
+    
+    movem.l (XDELAYB-16,%a5), %d4-%d7   | %d4  = p->buf[XDELAYB-4]
+                                        | %d5  = p->buf[XDELAYB-3]
+                                        | %d6  = p->buf[XDELAYB-2]
+                                        | %d7  = p->buf[XDELAYB-1]
+    sub.l   %d3, %d7
+    neg.l   %d7                         | %d7  = %d3 - %d7
+    move.l  %d7, (XDELAYB-4,%a5)        | p->buf[XDELAYB-1]  = %d7
+    movem.l (XcoeffsB,%a6), %d1-%d2/%a0-%a2 | %d1  = p->XcoeffsB[0]
+                                        | %d2  = p->XcoeffsB[1]
+                                        | %a0  = p->XcoeffsB[2]
+                                        | %a1  = p->XcoeffsB[3]
+                                        | %a2  = p->XcoeffsB[4]
+    mac.l   %d3, %d1, %acc1     | %acc1  = p->buf[XDELAYB] * p->XcoeffsB[0]
+    mac.l   %d7, %d2, %acc1     | %acc1 += p->buf[XDELAYB-1] * p->XcoeffsB[1]
+    mac.l   %d6, %a0, %acc1     | %acc1 += p->buf[XDELAYB-2] * p->XcoeffsB[2]
+    mac.l   %d5, %a1, %acc1     | %acc1 += p->buf[XDELAYB-3] * p->XcoeffsB[3]
+    mac.l   %d4, %a2, %acc1     | %acc1 += p->buf[XDELAYB-4] * p->XcoeffsB[4]
+    
+    move.l  %d3, (XDELAYB, %a5)         | p->buf[XDELAYB]  = %d3
+    
+    tst.l   %d7
+    beq.s   1f
+    spl.b   %d7
+    extb.l  %d7
+    or.l    #1, %d7
+1:                                      | %d7 = SIGN(%d7)
+    move.l  %d7, (XADAPTCOEFFSB-4,%a5)  | p->buf[XADAPTCOEFFSB-1]  = %d7
+    tst.l   %d3
+    beq.s   1f
+    spl.b   %d3
+    extb.l  %d3
+    or.l    #1, %d3
+1:                                      | %d3 = SIGN(%d3)
+    move.l  %d3, (XADAPTCOEFFSB, %a5)   | p->buf[XADAPTCOEFFSB]  = %d3
+    | %d1, %d2, %a0, %a1, %a2 contain p->XcoeffsB[0..4]
+    | %d7, %d3 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB]
+    move.l  (%a4), %d0                  | %d0  = *decoded1
+    beq.s   3f                          | *decoded1 == 0: no adaptation
+    
+    movem.l (XADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4  = p->buf[XADAPTCOEFFSB-4]
+                                            | %d5  = p->buf[XADAPTCOEFFSB-3]
+                                            | %d6  = p->buf[XADAPTCOEFFSB-2]
+    bmi.s   1f                          | flags still valid here
+    
+    | *decoded1 > 0
+    sub.l   %d3, %d1        | %d1  = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB]
+    sub.l   %d7, %d2        | %d2  = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1]
+    sub.l   %d6, %a0        | %a0  = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
+    sub.l   %d5, %a1        | %a1  = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3]
+    sub.l   %d4, %a2        | %a2  = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
+    movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[]
+    movem.l (XcoeffsA,%a6), %d4-%d7     | %d4  = p->XcoeffsA[0]
+                                        | %d5  = p->XcoeffsA[1]
+                                        | %d6  = p->XcoeffsA[2]
+                                        | %d7  = p->XcoeffsA[3]
+                                        
+    movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 
+                                        | %d2  = p->buf[XADAPTCOEFFSA-3]
+                                        | %a0  = p->buf[XADAPTCOEFFSA-2]
+                                        | %a1  = p->buf[XADAPTCOEFFSA-1]
+                                        | %a2  = p->buf[XADAPTCOEFFSA]
+                                        
+    sub.l   %a2, %d4        | %d4  = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
+    sub.l   %a1, %d5        | %d5  = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
+    sub.l   %a0, %d6        | %d6  = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
+    sub.l   %d2, %d7        | %d7  = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3]
+    bra.s   2f
+1:  | *decoded1 < 0
+    add.l   %d3, %d1        | %d1  = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB]
+    add.l   %d7, %d2        | %d2  = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1]
+    add.l   %d6, %a0        | %a0  = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
+    add.l   %d5, %a1        | %a1  = p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
+    add.l   %d4, %a2        | %a2  = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4]
+    
+    movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[]
+    
+    movem.l (XcoeffsA,%a6), %d4-%d7     | %d4  = p->XcoeffsA[0]
+                                        | %d5  = p->XcoeffsA[1]
+                                        | %d6  = p->XcoeffsA[2]
+                                        | %d7  = p->XcoeffsA[3]
+    movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 
+                                        | %d2  = p->buf[XADAPTCOEFFSA-3]
+                                        | %a0  = p->buf[XADAPTCOEFFSA-2]
+                                        | %a1  = p->buf[XADAPTCOEFFSA-1]
+                                        | %a2  = p->buf[XADAPTCOEFFSA]
+                                        
+    add.l   %a2, %d4        | %d4  = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
+    add.l   %a1, %d5        | %d5  = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
+    add.l   %a0, %d6        | %d6  = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
+    add.l   %d2, %d7        | %d7  = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3]
+2:
+    movem.l %d4-%d7, (XcoeffsA,%a6)     | Save p->XcoeffsA[]
+3:
+    | Finish Predictor X
+    movclr.l %acc0, %d1                 | %d1 = predictionA
+    movclr.l %acc1, %d2                 | %d2 = predictionB
+    asr.l   #1, %d2
+    add.l   %d2, %d1                    | %d1 += (%d2 >> 1)
+    asr.l   #8, %d1
+    asr.l   #2, %d1                     | %d1 >>= 10
+    add.l   %d0, %d1                    | %d1 += %d0
+    move.l  %d1, (XlastA,%a6)           | p->XlastA  = %d1
+    
+    move.l  (XfilterA,%a6), %d2         | %d2  = p->XfilterA
+    move.l  %d2, %d0
+    lsl.l   #5, %d2
+    sub.l   %d0, %d2                    | %d2 = 31 * %d2
+    asr.l   #5, %d2                     | %d2 >>= 5
+    add.l   %d1, %d2
+    move.l  %d2, (XfilterA,%a6)         | p->XfilterA  = %d2
+    | *decoded1 stored 3 instructions down, avoiding pipeline stall
+    | ***** COMMON *****
+    
+    addq.l  #4, %a5                     | p->buf++
+    lea.l   (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a2
+                            | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
+    
+    move.l  %d2, (%a4)+                 | *(decoded1++)  = %d2 (p->XfilterA)
+    cmp.l   %a2, %a5
+    beq.s   .move_hist      | History buffer is full, we need to do a memmove
+    subq.l  #1, (%sp)                   | decrease loop count
+    bne.w   .loop
+.done:
+    move.l  %a5, (%a6)                  | Save value of p->buf
+    movem.l (4,%sp), %d2-%d7/%a2-%a6    | restore the registers saved on entry
+    lea.l   (12*4,%sp), %sp             | release the stack frame
+    rts
+    
+.move_hist:
+    lea.l   (historybuffer,%a6), %a2
+    | dest = %a2 (p->historybuffer)
+    | src = %a5 (p->buf)
+    | n = 200 bytes, copied as 5 blocks of 10 registers
+    
+    movem.l (%a5), %d0-%d7/%a0-%a1      | 40 bytes
+    movem.l %d0-%d7/%a0-%a1, (%a2)
+    movem.l (40,%a5), %d0-%d7/%a0-%a1   | 40 bytes
+    movem.l %d0-%d7/%a0-%a1, (40,%a2)
+    movem.l (80,%a5), %d0-%d7/%a0-%a1   | 40 bytes
+    movem.l %d0-%d7/%a0-%a1, (80,%a2)
+    movem.l (120,%a5), %d0-%d7/%a0-%a1  | 40 bytes
+    movem.l %d0-%d7/%a0-%a1, (120,%a2)
+    movem.l (160,%a5), %d0-%d7/%a0-%a1  | 40 bytes
+    movem.l %d0-%d7/%a0-%a1, (160,%a2)
+    move.l  %a2, %a5                    | p->buf = &p->historybuffer[0]
+    subq.l  #1, (%sp)                   | decrease loop count
+    bne.w   .loop
+    bra.s   .done
+    .size   predictor_decode_stereo, .-predictor_decode_stereo
+    .global predictor_decode_mono
+    .type   predictor_decode_mono,@function
+| void predictor_decode_mono(struct predictor_t* p,
+|                            int32_t* decoded0,
+|                            int count)
+|
+| Processes `count` samples in place using only the Y predictor's filter A:
+| each iteration reads *decoded0, adapts p->YcoeffsA according to its sign,
+| and overwrites it with p->YfilterA.
+|
+| Register usage inside the loop:
+|   %a4 = decoded0, %a5 = p->buf, %a6 = p, %d7 = remaining loop count,
+|   %d3 = p->YlastA (kept in the register between iterations and written
+|         back to the struct only on exit or before a history move).
+|
+| NOTE(review): the loop count is only tested after the loop body has run,
+| so count == 0 is not handled here - confirm callers never pass 0.
+predictor_decode_mono:
+    lea.l   (-11*4,%sp), %sp            | room for the 11 saved registers
+    movem.l %d2-%d7/%a2-%a6, (%sp)      | save regs (restored at .donem)
+    move.l  #0, %macsr                  | signed integer mode
+    move.l  (11*4+4,%sp), %a6           | %a6 = p
+    move.l  (11*4+8,%sp), %a4           | %a4 = decoded0
+    move.l  (11*4+12,%sp), %d7          | %d7 = count
+    move.l  (%a6), %a5                  | %a5 = p->buf
+    move.l  (YlastA,%a6), %d3           | %d3  = p->YlastA
+    
+.loopm:
+    | ***** PREDICTOR *****
+    movem.l (YDELAYA-12,%a5), %d0-%d2   | %d0  = p->buf[YDELAYA-3]
+                                        | %d1  = p->buf[YDELAYA-2]
+                                        | %d2  = p->buf[YDELAYA-1]
+    move.l  %d3, (YDELAYA,%a5)          | p->buf[YDELAYA]  = %d3
+    sub.l   %d3, %d2
+    neg.l   %d2                         | %d2 = %d3 - %d2
+    move.l  %d2, (YDELAYA-4,%a5)        | p->buf[YDELAYA-1]  = %d2
+    movem.l (YcoeffsA,%a6), %a0-%a3     | %a0  = p->YcoeffsA[0]
+                                        | %a1  = p->YcoeffsA[1]
+                                        | %a2  = p->YcoeffsA[2]
+                                        | %a3  = p->YcoeffsA[3]
+    mac.l   %d3, %a0, %acc0     | %acc0  = p->buf[YDELAYA] * p->YcoeffsA[0]
+    mac.l   %d2, %a1, %acc0     | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
+    mac.l   %d1, %a2, %acc0     | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
+    mac.l   %d0, %a3, %acc0     | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
+    tst.l   %d2
+    beq.s   1f                          | zero stays zero
+    spl.b   %d2                         | pos: 0x??????ff, neg: 0x??????00
+    extb.l  %d2                         | pos: 0xffffffff, neg: 0x00000000
+    or.l    #1, %d2                     | pos: 0xffffffff, neg: 0x00000001
+1:                                      | %d2 = SIGN(%d2)
+    move.l  %d2, (YADAPTCOEFFSA-4,%a5)  | p->buf[YADAPTCOEFFSA-1]  = %d2
+    tst.l   %d3
+    beq.s   1f
+    spl.b   %d3
+    extb.l  %d3
+    or.l    #1, %d3
+1:                                      | %d3 = SIGN(%d3)
+    move.l  %d3, (YADAPTCOEFFSA,%a5)    | p->buf[YADAPTCOEFFSA]  = %d3
+    move.l  (%a4), %d0                  | %d0 = *decoded0
+    beq.s   3f                          | *decoded0 == 0: no adaptation
+    movem.l (YADAPTCOEFFSA-12,%a5),%d4-%d5  | %d4  = p->buf[YADAPTCOEFFSA-3]
+                                            | %d5  = p->buf[YADAPTCOEFFSA-2]
+                                            
+    bmi.s   1f                          | flags still valid here
+    
+    | *decoded0 > 0
+    
+    sub.l   %d3, %a0        | %a0  = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
+    sub.l   %d2, %a1        | %a1  = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+    sub.l   %d5, %a2        | %a2  = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+    sub.l   %d4, %a3        | %a3  = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+    bra.s   2f
+1:  | *decoded0 < 0
+    add.l   %d3, %a0        | %a0  = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+    add.l   %d2, %a1        | %a1  = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+    add.l   %d5, %a2        | %a2  = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+    add.l   %d4, %a3        | %a3  = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+2:
+    movem.l %a0-%a3, (YcoeffsA,%a6)     | save p->YcoeffsA[]
+3:
+    | Finish Predictor
+    movclr.l %acc0, %d3                 | %d3 = predictionA
+    asr.l   #8, %d3
+    asr.l   #2, %d3                     | %d3 >>= 10
+    add.l   %d0, %d3                    | %d3 += %d0 (new YlastA, kept in %d3)
+    move.l  (YfilterA,%a6), %d2         | %d2  = p->YfilterA
+    move.l  %d2, %d0
+    lsl.l   #5, %d2
+    sub.l   %d0, %d2                    | %d2 = 31 * %d2
+    asr.l   #5, %d2                     | %d2 >>= 5
+    add.l   %d3, %d2
+    move.l  %d2, (YfilterA,%a6)         | p->YfilterA  = %d2
+    | *decoded0 stored 3 instructions down, avoiding pipeline stall
+    | ***** COMMON *****
+    addq.l  #4, %a5                     | p->buf++
+    lea.l   (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a3
+                            | %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
+    
+    move.l  %d2, (%a4)+                 | *(decoded0++)  = %d2 (p->YfilterA)
+    cmp.l   %a3, %a5
+    beq.s   .move_histm     | History buffer is full, we need to do a memmove
+                                              
+    subq.l  #1, %d7                     | decrease loop count
+    bne.w   .loopm
+    move.l  %d3, (YlastA,%a6)           | p->YlastA  = %d3 (write back on exit)
+.donem:
+    move.l  %a5, (%a6)                  | Save value of p->buf
+    movem.l (%sp), %d2-%d7/%a2-%a6      | restore the registers saved on entry
+    lea.l   (11*4,%sp), %sp             | release the stack frame
+    rts
+    
+.move_histm:
+    move.l  %d3, (YlastA,%a6)           | p->YlastA  = %d3 (the copy clobbers %d3)
+    lea.l   (historybuffer,%a6), %a3
+    | dest = %a3 (p->historybuffer)
+    | src = %a5 (p->buf)
+    | n = 200 bytes, copied as 5 blocks of 10 registers
+    
+    movem.l (%a5), %d0-%d6/%a0-%a2      | 40 bytes
+    movem.l %d0-%d6/%a0-%a2, (%a3)
+    movem.l (40,%a5), %d0-%d6/%a0-%a2   | 40 bytes
+    movem.l %d0-%d6/%a0-%a2, (40,%a3)
+    movem.l (80,%a5), %d0-%d6/%a0-%a2   | 40 bytes
+    movem.l %d0-%d6/%a0-%a2, (80,%a3)
+    movem.l (120,%a5), %d0-%d6/%a0-%a2  | 40 bytes
+    movem.l %d0-%d6/%a0-%a2, (120,%a3)
+    movem.l (160,%a5), %d0-%d6/%a0-%a2  | 40 bytes
+    movem.l %d0-%d6/%a0-%a2, (160,%a3)
+    move.l  %a3, %a5                    | p->buf = &p->historybuffer[0]
+    move.l  (YlastA,%a6), %d3           | %d3  = p->YlastA
+    subq.l  #1, %d7                     | decrease loop count
+    bne.w   .loopm
+    bra.s   .donem
+    .size   predictor_decode_mono, .-predictor_decode_mono
diff --git a/lib/rbcodec/codecs/demac/libdemac/predictor.c b/lib/rbcodec/codecs/demac/libdemac/predictor.c
new file mode 100644
index 0000000000..45912dddbd
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/predictor.c
@@ -0,0 +1,271 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include <inttypes.h>
+#include <string.h>
+#include "parser.h"
+#include "predictor.h"
+#include "demac_config.h"
+/* Return 0 if x is zero, -1 if x is positive, 1 if x is negative.
+   Note the result is the negated conventional sign.  The whole expansion is
+   parenthesised so the macro is safe inside larger expressions (e.g.
+   1 + SIGN(v)); x may be evaluated twice, so it must be free of side
+   effects. */
+#define SIGN(x) ((x) ? (((x) > 0) ? -1 : 1) : 0)
+/* Values copied into YcoeffsA and XcoeffsA by init_predictor_decoder() */
+static const int32_t initial_coeffs[4] = {
+    360,
+    317,
+    -109,
+    98,
+};
+/* Positions within p->buf[], expressed as int32_t indices and listed in
+   ascending order.  Each macro is the highest index of its group; the
+   decode loops also read the entries just below it (e.g. YDELAYA-3,
+   YADAPTCOEFFSB-4). */
+/* Sign histories used to adapt the coefficients */
+#define XADAPTCOEFFSB 5
+#define YADAPTCOEFFSB 10
+#define XADAPTCOEFFSA 14
+#define YADAPTCOEFFSA 18
+/* Delay lines, spaced PREDICTOR_ORDER entries apart */
+#define XDELAYB (PREDICTOR_ORDER + 18)
+#define XDELAYA (PREDICTOR_ORDER*2 + 18)
+#define YDELAYB (PREDICTOR_ORDER*3 + 18)
+#define YDELAYA (PREDICTOR_ORDER*4 + 18)
+/* Put predictor *p into its initial state: filter values zeroed, the A
+   coefficient sets loaded from initial_coeffs, the B sets zeroed, and
+   p->buf pointing at the start of a cleared history window. */
+void init_predictor_decoder(struct predictor_t* p)
+{
+    /* Scalar filter state for both channels starts at zero */
+    p->YlastA = p->YfilterA = p->YfilterB = 0;
+    p->XlastA = p->XfilterA = p->XfilterB = 0;
+    /* A coefficients take the fixed initial values, B coefficients are zero */
+    memcpy(p->XcoeffsA, initial_coeffs, sizeof(initial_coeffs));
+    memcpy(p->YcoeffsA, initial_coeffs, sizeof(initial_coeffs));
+    memset(p->XcoeffsB, 0, sizeof(p->XcoeffsB));
+    memset(p->YcoeffsB, 0, sizeof(p->YcoeffsB));
+    /* Rewind buf to the start of the history buffer and clear the first
+       PREDICTOR_SIZE entries */
+    p->buf = p->historybuffer;
+    memset(p->buf, 0, PREDICTOR_SIZE * sizeof(int32_t));
+}
+#if !defined(CPU_ARM) && !defined(CPU_COLDFIRE)
+void ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
+                                              int32_t* decoded0,
+                                              int32_t* decoded1,
+                                              int count)
+{
+    int32_t predictionA, predictionB; 
+    while (LIKELY(count--))
+    {
+        /* Predictor Y */
+        p->buf[YDELAYA] = p->YlastA;
+        p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]);
+        p->buf[YDELAYA-1] = p->buf[YDELAYA] - p->buf[YDELAYA-1];
+        p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]);
+        predictionA = (p->buf[YDELAYA] * p->YcoeffsA[0]) + 
+                      (p->buf[YDELAYA-1] * p->YcoeffsA[1]) + 
+                      (p->buf[YDELAYA-2] * p->YcoeffsA[2]) + 
+                      (p->buf[YDELAYA-3] * p->YcoeffsA[3]);
+        /*  Apply a scaled first-order filter compression */
+        p->buf[YDELAYB] = p->XfilterA - ((p->YfilterB * 31) >> 5);
+        p->buf[YADAPTCOEFFSB] = SIGN(p->buf[YDELAYB]);
+        p->YfilterB = p->XfilterA;
+        p->buf[YDELAYB-1] = p->buf[YDELAYB] - p->buf[YDELAYB-1];
+        p->buf[YADAPTCOEFFSB-1] = SIGN(p->buf[YDELAYB-1]);
+        predictionB = (p->buf[YDELAYB] * p->YcoeffsB[0]) + 
+                      (p->buf[YDELAYB-1] * p->YcoeffsB[1]) + 
+                      (p->buf[YDELAYB-2] * p->YcoeffsB[2]) + 
+                      (p->buf[YDELAYB-3] * p->YcoeffsB[3]) + 
+                      (p->buf[YDELAYB-4] * p->YcoeffsB[4]);
+        p->YlastA = *decoded0 + ((predictionA + (predictionB >> 1)) >> 10);
+        p->YfilterA =  p->YlastA + ((p->YfilterA * 31) >> 5);
+        /* Predictor X */
+        p->buf[XDELAYA] = p->XlastA;
+        p->buf[XADAPTCOEFFSA] = SIGN(p->buf[XDELAYA]);
+        p->buf[XDELAYA-1] = p->buf[XDELAYA] - p->buf[XDELAYA-1];
+        p->buf[XADAPTCOEFFSA-1] = SIGN(p->buf[XDELAYA-1]);
+        predictionA = (p->buf[XDELAYA] * p->XcoeffsA[0]) + 
+                      (p->buf[XDELAYA-1] * p->XcoeffsA[1]) + 
+                      (p->buf[XDELAYA-2] * p->XcoeffsA[2]) + 
+                      (p->buf[XDELAYA-3] * p->XcoeffsA[3]);
+        /*  Apply a scaled first-order filter compression */
+        p->buf[XDELAYB] = p->YfilterA - ((p->XfilterB * 31) >> 5);
+        p->buf[XADAPTCOEFFSB] = SIGN(p->buf[XDELAYB]);
+        p->XfilterB = p->YfilterA;
+        p->buf[XDELAYB-1] = p->buf[XDELAYB] - p->buf[XDELAYB-1];
+        p->buf[XADAPTCOEFFSB-1] = SIGN(p->buf[XDELAYB-1]);
+        predictionB = (p->buf[XDELAYB] * p->XcoeffsB[0]) + 
+                      (p->buf[XDELAYB-1] * p->XcoeffsB[1]) + 
+                      (p->buf[XDELAYB-2] * p->XcoeffsB[2]) + 
+                      (p->buf[XDELAYB-3] * p->XcoeffsB[3]) + 
+                      (p->buf[XDELAYB-4] * p->XcoeffsB[4]);
+        p->XlastA = *decoded1 + ((predictionA + (predictionB >> 1)) >> 10); 
+        p->XfilterA =  p->XlastA + ((p->XfilterA * 31) >> 5);
+        if (LIKELY(*decoded0 != 0))
+        {
+            if (*decoded0 > 0)
+            {
+                p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
+                p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
+                p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
+                p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
+                p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB];
+                p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1];
+                p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2];
+                p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3];
+                p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4];
+            }
+            else
+            {
+                p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
+                p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
+                p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
+                p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
+                p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB];
+                p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1];
+                p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2];
+                p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3];
+                p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4];
+            }
+        }
+        *(decoded0++) = p->YfilterA;
+        if (LIKELY(*decoded1 != 0))
+        {
+            if (*decoded1 > 0)
+            {
+                p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA];
+                p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1];
+                p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2];
+                p->XcoeffsA[3] -= p->buf[XADAPTCOEFFSA-3];
+                p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB];
+                p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1];
+                p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2];
+                p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3];
+                p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4];
+            }
+            else
+            {
+                p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA];
+                p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1];
+                p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2];
+                p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3];
+                p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB];
+                p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1];
+                p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2];
+                p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3];
+                p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4];
+            }
+        }
+        *(decoded1++) = p->XfilterA;
+        /* Combined */
+        p->buf++;
+        /* Have we filled the history buffer? */
+        if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) {
+            memmove(p->historybuffer, p->buf, 
+                    PREDICTOR_SIZE * sizeof(int32_t));
+            p->buf = p->historybuffer;
+        }
+    }
+}
+void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
+                                            int32_t* decoded0,
+                                            int count)
+{
+    /* Mono predictor: rewrites each of the count values in decoded0[] in place
+       and adapts the four YcoeffsA taps using the stored SIGN() values. */
+    int32_t residual, sum, sample = p->YlastA;
+    int tap;
+    for (; LIKELY(count--); decoded0++)
+    {
+        residual = *decoded0;
+        /* History: newest sample, then its difference to the previous entry */
+        p->buf[YDELAYA] = sample;
+        p->buf[YDELAYA-1] = p->buf[YDELAYA] - p->buf[YDELAYA-1];
+        /* 4-tap prediction from the delay line */
+        sum = 0;
+        for (tap = 0; tap < 4; tap++)
+            sum += p->buf[YDELAYA-tap] * p->YcoeffsA[tap];
+        sample = residual + (sum >> 10);
+        /* Record SIGN() of the two newest history entries */
+        p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]);
+        p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]);
+        /* Nudge every tap by its stored value; direction set by the input */
+        if (LIKELY(residual != 0))
+        {
+            for (tap = 0; tap < 4; tap++)
+            {
+                if (residual > 0)
+                    p->YcoeffsA[tap] -= p->buf[YADAPTCOEFFSA-tap];
+                else
+                    p->YcoeffsA[tap] += p->buf[YADAPTCOEFFSA-tap];
+            }
+        }
+        /* Advance the window; once it reaches the end of the history buffer,
+           copy PREDICTOR_SIZE entries from there back to the start. */
+        p->buf++;
+        if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) {
+            memmove(p->historybuffer, p->buf,
+                    PREDICTOR_SIZE * sizeof(int32_t));
+            p->buf = p->historybuffer;
+        }
+        /* Output filter: y = sample + ((31 * y_prev) >> 5) */
+        p->YfilterA = sample + ((p->YfilterA * 31) >> 5);
+        *decoded0 = p->YfilterA;
+    }
+    /* Keep the last unfiltered sample for the next call */
+    p->YlastA = sample;
+}
+#endif
diff --git a/lib/rbcodec/codecs/demac/libdemac/predictor.h b/lib/rbcodec/codecs/demac/libdemac/predictor.h
new file mode 100644
index 0000000000..6a0a81983b
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/predictor.h
@@ -0,0 +1,38 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#ifndef _APE_PREDICTOR_H
+#define _APE_PREDICTOR_H
+#include <inttypes.h>
+#include "parser.h"
+#include "filter.h"
+void init_predictor_decoder(struct predictor_t* p);  /* NOTE(review): presumably resets p before decoding - definition not shown here, confirm */
+void predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0,  /* two-channel variant: writes its output */
+                             int32_t* decoded1, int count);             /* back into decoded0[] and decoded1[] */
+void predictor_decode_mono(struct predictor_t* p, int32_t* decoded0,    /* rewrites count values of decoded0[] in */
+                           int count);                                  /* place with the decoded output          */
+#endif
diff --git a/lib/rbcodec/codecs/demac/libdemac/udiv32_arm-pre.S b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm-pre.S
new file mode 100644
index 0000000000..459cab8240
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm-pre.S
@@ -0,0 +1,25 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2010 by Andrew Mahone
+ *
+ * Wrapper for udiv32_arm.S to test available IRAM by pre-linking the codec.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#define APE_PRE
+#include "udiv32_arm.S"
diff --git a/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S
new file mode 100644
index 0000000000..7b851659bd
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S
@@ -0,0 +1,318 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ * Copyright (C) 2009 by Andrew Mahone
+ *
+ * Optimised unsigned integer division for ARMv4
+ *
+ * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
+ *           Developer's Guide
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+/* On targets with codec iram, a header file will be generated after an initial
+   link of the APE codec, stating the amount of IRAM remaining for use by the
+   reciprocal lookup table. */
+#if !defined(APE_PRE) && defined(USE_IRAM) && ARM_ARCH < 5
+#include "lib/rbcodec/codecs/ape_free_iram.h"
+#endif
+/* Codecs should not normally do this, but we need to check a macro, and
+ * codecs.h would confuse the assembler. */
+#ifdef USE_IRAM
+#define DIV_RECIP
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .align
+    .global udiv32_arm
+    .type   udiv32_arm,%function
+#if ARM_ARCH < 5
+/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
+   for dividing a 30-bit value by a 15-bit value, with two operations per
+   iteration by storing quotient and remainder together and adding the previous
+   quotient bit during trial subtraction. Modified to work with any dividend
+   and divisor both less than 1 << 30, and skipping trials by calculating bits
+   in output. */
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
+    mov     \bits, #1
+    /* Shift the divisor left until it aligns with the numerator. If it already
+       has the high bit set, this is fine, everything inside .rept will be
+       skipped, and the add before and adcs after will set the one-bit result
+       to zero. */
+    cmn     \divisor, \dividend, lsr #16    /* divisor arrives negated: carry set when (dividend >> 16) >= original divisor */
+    movcs   \divisor, \divisor, lsl #16     /* ...then pre-shift the divisor by 16 */
+    addcs   \bits, \bits, #16               /* ...and account for 16 more result bits */
+    cmn     \divisor, \dividend, lsr #8     /* same test and shift for 8, 4, 2 and 1 */
+    movcs   \divisor, \divisor, lsl #8
+    addcs   \bits, \bits, #8
+    cmn     \divisor, \dividend, lsr #4
+    movcs   \divisor, \divisor, lsl #4
+    addcs   \bits, \bits, #4
+    cmn     \divisor, \dividend, lsr #2
+    movcs   \divisor, \divisor, lsl #2
+    addcs   \bits, \bits, #2
+    cmn     \divisor, \dividend, lsr #1
+    movcs   \divisor, \divisor, lsl #1
+    addcs   \bits, \bits, #1
+    adds    \result, \dividend, \divisor    /* first trial subtraction (adds the negated divisor) */
+    subcc   \result, \result, \divisor      /* undo it when there was no carry */
+    rsb     \curbit, \bits, #31             /* number of two-instruction steps to skip */
+    add     pc, pc, \curbit, lsl #3         /* computed jump into the .rept body, 8 bytes per step */
+    nop                                     /* filler: pc reads two instructions ahead of the add */
+    .rept   30
+    adcs    \result, \divisor, \result, lsl #1  /* shift in previous quotient bit and trial-subtract */
+    /* Fix the remainder portion of the result. This must be done because the
+       handler for 32-bit numerators needs the remainder. */
+    subcc   \result, \result, \divisor
+    .endr
+    /* Shift remainder/quotient left one, add final quotient bit */
+    adc     \result, \result, \result
+    mov     \remainder, \result, lsr \bits  /* remainder is stored above the low 'bits' bits */
+    eor     \quotient, \result, \remainder, lsl \bits   /* strip the remainder part, leaving the quotient */
+.endm
+#ifndef FREE_IRAM
+.set recip_max, 2                  /* no FREE_IRAM figure: the table below gets recip_max - 2 = 0 entries */
+#else
+/* Each table entry is one word. Since a compare is done against the maximum
+   entry as an immediate, the maximum entry must be a valid ARM immediate,
+   which means a byte shifted by an even number of places. */
+.set recip_max, 2 + FREE_IRAM / 4  /* largest divisor the free space could hold an entry for */
+.set recip_max_tmp, recip_max >> 8
+.set recip_mask_shift, 0
+.set tmp_shift, 16
+.rept 5                            /* binary search (16, 8, 4, 2, 1) for the top set bit of recip_max >> 8 */
+    .if recip_max_tmp >> tmp_shift
+        .set recip_max_tmp, recip_max_tmp >> tmp_shift
+        .set recip_mask_shift, recip_mask_shift + tmp_shift
+    .endif
+    .set tmp_shift, tmp_shift >> 1
+.endr
+.if recip_max_tmp                  /* count the remaining bit: shift = bit length of recip_max >> 8 */
+    .set recip_mask_shift, recip_mask_shift + 1
+.endif
+.set recip_mask_shift, (recip_mask_shift + 1) & 62     /* round the shift up to an even value */
+.set recip_max, recip_max & (255 << recip_mask_shift)  /* keep only eight bits at that shift */
+//.set recip_max, 2
+#endif
+udiv32_arm:                         /* unsigned divide: r0 = numerator, r1 = divisor, quotient returned in r0 */
+#ifdef DIV_RECIP
+    cmp     r1, #3
+    bcc     .L_udiv_tiny            /* divisors 0, 1 and 2 are handled separately */
+    cmp     r1, #recip_max
+    bhi     .L_udiv                 /* divisor beyond the table: use the bitwise division */
+    adr     r3, .L_udiv_recip_table-12  /* table starts at divisor 3, hence the 3 * 4 byte bias */
+    ldr     r2, [r3, r1, lsl #2]
+    mov     r3, r0                  /* keep the numerator for the check below */
+    umull   ip, r0, r2, r0          /* r0 = high word of (table entry * numerator) */
+    mul     r2, r0, r1
+    cmp     r3, r2                  /* accept the estimate if estimate * divisor <= numerator */
+    bxcs    lr
+    sub     r0, r0, #1              /* otherwise it was one too high */
+    bx      lr
+.L_udiv_tiny:
+    cmp     r1, #1
+    movhi   r0, r0, lsr #1          /* divisor 2: halve */
+    bxcs    lr                      /* divisor 1 or 2: return (r0 untouched for 1) */
+    b       .L_div0                 /* divisor 0 */
+#endif
+.L_udiv:
+    /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
+       and add the next bit of the result. The correction code at .L_udiv32
+       does not need the divisor inverted, but can be modified to work with it,
+       and this allows the zero divisor test to be done early and without an
+       explicit comparison. */
+    rsbs    r1, r1, #0
+#ifndef DIV_RECIP
+    beq .L_div0
+#endif
+    tst     r0, r0
+    /* High bit must be unset, otherwise shift numerator right, calculate,
+       and correct results. As this case is very uncommon we want to avoid
+       any other delays on the main path in handling it, so the long divide
+       calls the short divide as a function. */
+    bmi     .L_udiv32
+.L_udiv31:
+    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
+    bx      lr
+.L_udiv32:
+    /* Store the original numerator, the (negated) divisor and lr below sp
+       (no writeback); they are needed to correct the result. */
+    stmdb   sp, { r0, r1, lr }
+    /* Halve the numerator so that its high bit is clear, then run the 31-bit
+       divide as a subroutine. A zero divisor never reaches this point. */
+    mov     r0, r0, lsr #1
+    bl      .L_udiv31
+    ldmdb   sp, { r2, r3, lr }
+    /* Move the low bit of the original numerator to the carry bit */
+    movs    r2, r2, lsr #1
+    /* Shift the remainder left one and add in the carry bit */
+    adc     r1, r1, r1
+    /* Subtract the original divisor from the remainder, setting carry if the
+       result is non-negative */
+    adds    r1, r1, r3
+    /* Shift quotient left one and add carry bit */
+    adc     r0, r0, r0
+    bx      lr
+.L_div0:
+    /* __div0 expects the calling address on the top of the stack */
+    stmdb sp!, { lr }
+    mov     r0, #0
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+    bl      __div0
+#else
+    ldr     pc, [pc, #-4]           /* jump through the literal word that follows */
+    .word   __div0
+#endif
+#ifdef DIV_RECIP
+.L_udiv_recip_table:                /* one word per divisor, for divisors 3 .. recip_max */
+    .set div, 3
+    .rept recip_max - 2
+        .if (div - 1) & div         /* not a power of two: entry = floor(2^32 / div) + 1 */
+            .set q, 0x40000000 / div
+            .set r, (0x40000000 - (q * div))<<1
+            .set q, q << 1          /* long-division step: q, r now describe 2^31 / div */
+            .if r >= div
+                .set q, q + 1
+                .set r, r - div
+            .endif
+            .set r, r << 1
+            .set q, q << 1          /* second step: q, r now describe 2^32 / div */
+            .if r >= div
+                .set q, q + 1
+                .set r, r - div
+            .endif
+            .set q, q + 1
+        .else                       /* power of two: entry = 2^32 / div exactly */
+            .set q, 0x40000000 / div * 4
+        .endif
+        .word q
+        .set div, div+1
+    .endr
+#endif
+    .size udiv32_arm, . - udiv32_arm
+#else
+.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
+    cmp     \numerator, \divisor
+    clz     \bits, \divisor                 /* leading zeros of the divisor */
+    bcc     30f                             /* numerator < divisor: quotient is 0 */
+    mov     \inv, \divisor, lsl \bits       /* normalise the divisor (top bit set unless it is 0) */
+    add     \neg, pc, \inv, lsr #25         /* base for the table lookup, indexed by the top 7 bits */
+    cmp     \inv, #1<<31
+    ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64]  /* initial reciprocal estimate (index is 64..127) */
+    bls     20f                             /* exactly 1<<31: power of two; lower: divisor is 0 */
+    subs    \bits, \bits, #7
+    rsb     \neg, \divisor, #0              /* neg = -divisor, used by the correction steps */
+    movpl   \divisor, \inv, lsl \bits
+    bmi     10f                             /* divisor wider than 25 bits: use the path at 10 */
+    mul     \inv, \divisor, \neg
+    smlawt  \divisor, \divisor, \inv, \divisor
+    mul     \inv, \divisor, \neg
+    /* This will save a cycle on ARMv6, but requires that the numerator sign
+       bit is not set (that of inv is guaranteed unset). The branch should
+       predict very well, making it typically 1 cycle, and thus both the branch
+       and test fill delay cycles for the multiplies. Based on logging of
+       numerator sizes in the APE codec, the branch is taken about 1/10^7 of
+       the time. */
+#if ARM_ARCH >= 6
+    tst     \numerator, \numerator
+    smmla   \divisor, \divisor, \inv, \divisor
+    bmi     40f
+    smmul   \inv, \numerator, \divisor
+#else
+    mov     \bits, #0
+    smlal   \bits, \divisor, \inv, \divisor
+    umull   \bits, \inv, \numerator, \divisor
+#endif
+    add     \numerator, \numerator, \neg
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv                 /* quotient estimate, corrected by up to two below */
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+    bx      lr
+10:
+    rsb     \bits, \bits, #0
+    sub     \inv, \inv, #4
+    mov     \divisor, \inv, lsr \bits
+    umull   \bits, \inv, \numerator, \divisor
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv                 /* quotient estimate, corrected by up to three below */
+    cmn     \neg, \divisor, lsr #1
+    addcs   \divisor, \divisor, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \divisor
+    addcs   \quotient, \quotient, #1
+    bx      lr
+20:
+    rsb     \bits, \bits, #31               /* shift count 31 - clz; needed with or without a div0 label */
+.ifnc "", "\div0label"
+    bne     \div0label                      /* flags still from the cmp above: NE here means divisor == 0 */
+.endif
+    mov     \quotient, \numerator, lsr \bits
+    bx      lr
+30:
+    mov     \quotient, #0
+    bx      lr
+#if ARM_ARCH >= 6
+40:
+    umull   \bits, \inv, \numerator, \divisor   /* numerator has its top bit set: unsigned multiply instead of smmul */
+    add     \numerator, \numerator, \neg
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+    bx      lr
+#endif
+.endm
+udiv32_arm:                         /* unsigned divide: r0 = numerator, r1 = divisor, quotient returned in r0 */
+    ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
+.L_div0:
+    /* __div0 expects the calling address on the top of the stack */
+    stmdb sp!, { lr }
+    mov     r0, #0
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+    bl      __div0
+#else
+    ldr     pc, [pc, #-4]           /* jump through the literal word that follows */
+    .word   __div0
+#endif
+.L_udiv_est_table:                  /* 64 byte-sized reciprocal estimates, indexed by (normalised divisor >> 25) - 64 */
+    .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
+    .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
+    .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
+    .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
+    .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
+    .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
+    .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
+    .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
+#endif
+    .size udiv32_arm, . - udiv32_arm
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h
new file mode 100644
index 0000000000..ae7427c137
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv5te.h
@@ -0,0 +1,404 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+ARMv5te vector math copyright (C) 2008 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH
+#define REPEAT_3(x) x x x
+#if ORDER > 16
+#define REPEAT_MLA(x) x x x x x x x
+#else
+#define REPEAT_MLA(x) x x x
+#endif
+/* Fused kernel: returns the scalar product of v1[] and f2[] (each v1 element is
+ * multiplied before it is updated) and adds s2[] element-wise into v1[]. Data
+ * is fetched as 32 bit words, so v1 *must* be 32 bit aligned, and f2 and s2
+ * must be either both 32 bit aligned or both unaligned; otherwise the result is
+ * a data abort or incorrect results. 16 elements per pass, ORDER>>4 passes if ORDER > 16. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 16
+    int cnt = ORDER>>4;                 /* loop passes of 16 elements each */
+#endif
+#define ADDHALFREGS(sum, s1, s2)                         /* Adds register   */ \
+        "mov   " #s1  ", " #s1  ",   ror #16         \n" /* halves straight */ \
+        "add   " #sum ", " #s1  ", " #s2  ", lsl #16 \n" /* Clobbers 's1'   */ \
+        "add   " #s1  ", " #s1  ", " #s2  ", lsr #16 \n" \
+        "mov   " #s1  ", " #s1  ",   lsl #16         \n" \
+        "orr   " #sum ", " #s1  ", " #sum ", lsr #16 \n"
+#define ADDHALFXREGS(sum, s1, s2)                        /* Adds register  */ \
+        "add   " #s1  ", " #s1  ", " #sum ", lsl #16 \n" /* halves across. */ \
+        "add   " #sum ", " #s2  ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
+        "mov   " #sum ", " #sum ",   lsl #16         \n" \
+        "orr   " #sum ", " #sum ", " #s1  ", lsr #16 \n"
+    asm volatile (
+#if ORDER > 16
+        "mov     %[res], #0              \n"
+#endif
+        "tst     %[f2], #2               \n" /* is f2 only 16 bit aligned? */
+        "beq     20f                     \n" /* no: take the aligned path at 20 */
+    "10:                                 \n" /* unaligned f2/s2: read one halfword of each first */
+        "ldrh    r4, [%[s2]], #2         \n"
+        "mov     r4, r4, lsl #16         \n" /* keep the pending s2 element in the top half */
+        "ldrh    r3, [%[f2]], #2         \n"
+#if ORDER > 16
+        "mov     r3, r3, lsl #16         \n"
+    "1:                                  \n"
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+#else
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smulbb  %[res], r0, r3          \n" /* first product initialises res */
+#endif
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r5}         \n"
+        ADDHALFXREGS(r0, r4, r2)
+        ADDHALFXREGS(r1, r2, r5)
+        "stmia   %[v1]!, {r0,r1}         \n" /* write the four updated v1 elements back */
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r4}         \n"
+        ADDHALFXREGS(r0, r5, r2)
+        ADDHALFXREGS(r1, r2, r4)
+        "stmia   %[v1]!, {r0,r1}         \n"
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r5}         \n"
+        ADDHALFXREGS(r0, r4, r2)
+        ADDHALFXREGS(r1, r2, r5)
+        "stmia   %[v1]!, {r0,r1}         \n"
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r4}         \n"
+        ADDHALFXREGS(r0, r5, r2)
+        ADDHALFXREGS(r1, r2, r4)
+        "stmia   %[v1]!, {r0,r1}         \n"
+#if ORDER > 16
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+        "b       99f                     \n"
+    "20:                                 \n" /* aligned f2/s2 */
+    "1:                                  \n"
+        "ldmia   %[v1],  {r1,r2}         \n"
+        "ldmia   %[f2]!, {r3,r4}         \n"
+#if ORDER > 16
+        "smlabb  %[res], r1, r3, %[res]  \n"
+#else
+        "smulbb  %[res], r1, r3          \n" /* first product initialises res */
+#endif
+        "smlatt  %[res], r1, r3, %[res]  \n"
+        "smlabb  %[res], r2, r4, %[res]  \n"
+        "smlatt  %[res], r2, r4, %[res]  \n"
+        "ldmia   %[s2]!, {r3,r4}         \n"
+        ADDHALFREGS(r0, r1, r3)
+        ADDHALFREGS(r1, r2, r4)
+        "stmia   %[v1]!, {r0,r1}         \n" /* write the four updated v1 elements back */
+        REPEAT_3(
+        "ldmia   %[v1],  {r1,r2}         \n"
+        "ldmia   %[f2]!, {r3,r4}         \n"
+        "smlabb  %[res], r1, r3, %[res]  \n"
+        "smlatt  %[res], r1, r3, %[res]  \n"
+        "smlabb  %[res], r2, r4, %[res]  \n"
+        "smlatt  %[res], r2, r4, %[res]  \n"
+        "ldmia   %[s2]!, {r3,r4}         \n"
+        ADDHALFREGS(r0, r1, r3)
+        ADDHALFREGS(r1, r2, r4)
+        "stmia   %[v1]!, {r0,r1}         \n"
+        )
+#if ORDER > 16
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+    "99:                                 \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory"
+    );
+    return res;
+}
+/* Fused kernel: returns the scalar product of v1[] and f2[] (each v1 element is
+ * multiplied before it is updated) and subtracts s2[] element-wise from v1[].
+ * Data is fetched as 32 bit words, so v1 *must* be 32 bit aligned, and f2 and
+ * s2 must be either both 32 bit aligned or both unaligned; otherwise the result
+ * is a data abort or incorrect results. 16 elements per pass, ORDER>>4 passes if ORDER > 16. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 16
+    int cnt = ORDER>>4;                 /* loop passes of 16 elements each */
+#endif
+#define SUBHALFREGS(dif, s1, s2)                         /* Subtracts reg.  */ \
+        "mov   " #s1  ", " #s1  ",   ror #16         \n" /* halves straight */ \
+        "sub   " #dif ", " #s1  ", " #s2  ", lsl #16 \n" /* Clobbers 's1'   */ \
+        "sub   " #s1  ", " #s1  ", " #s2  ", lsr #16 \n" \
+        "mov   " #s1  ", " #s1  ",   lsl #16         \n" \
+        "orr   " #dif ", " #s1  ", " #dif ", lsr #16 \n"
+#define SUBHALFXREGS(dif, s1, s2, msk)                   /* Subtracts reg. */  \
+        "sub   " #s1  ", " #dif ", " #s1  ", lsr #16 \n" /* halves across. */  \
+        "and   " #s1  ", " #s1  ", " #msk "          \n" /* Needs msk =    */  \
+        "rsb   " #dif ", " #s2  ", " #dif ", lsr #16 \n" /*    0x0000ffff, */  \
+        "orr   " #dif ", " #s1  ", " #dif ", lsl #16 \n" /* clobbers 's1'. */
+    asm volatile (
+#if ORDER > 16
+        "mov     %[res], #0              \n"
+#endif
+        "tst     %[f2], #2               \n" /* is f2 only 16 bit aligned? */
+        "beq     20f                     \n" /* no: take the aligned path at 20 */
+    "10:                                 \n" /* unaligned f2/s2: read one halfword of each first */
+        "mov     r6, #0xff               \n"
+        "orr     r6, r6, #0xff00         \n" /* r6 = 0x0000ffff, the mask SUBHALFXREGS needs */
+        "ldrh    r4, [%[s2]], #2         \n"
+        "mov     r4, r4, lsl #16         \n" /* keep the pending s2 element in the top half */
+        "ldrh    r3, [%[f2]], #2         \n"
+#if ORDER > 16
+        "mov     r3, r3, lsl #16         \n"
+    "1:                                  \n"
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+#else
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smulbb  %[res], r0, r3          \n" /* first product initialises res */
+#endif
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r5}         \n"
+        SUBHALFXREGS(r0, r4, r2, r6)
+        SUBHALFXREGS(r1, r2, r5, r6)
+        "stmia   %[v1]!, {r0,r1}         \n" /* write the four updated v1 elements back */
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r4}         \n"
+        SUBHALFXREGS(r0, r5, r2, r6)
+        SUBHALFXREGS(r1, r2, r4, r6)
+        "stmia   %[v1]!, {r0,r1}         \n"
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r5}         \n"
+        SUBHALFXREGS(r0, r4, r2, r6)
+        SUBHALFXREGS(r1, r2, r5, r6)
+        "stmia   %[v1]!, {r0,r1}         \n"
+        "ldmia   %[v1],  {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r2,r4}         \n"
+        SUBHALFXREGS(r0, r5, r2, r6)
+        SUBHALFXREGS(r1, r2, r4, r6)
+        "stmia   %[v1]!, {r0,r1}         \n"
+#if ORDER > 16
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+        "b       99f                     \n"
+    "20:                                 \n" /* aligned f2/s2 */
+    "1:                                  \n"
+        "ldmia   %[v1],  {r1,r2}         \n"
+        "ldmia   %[f2]!, {r3,r4}         \n"
+#if ORDER > 16
+        "smlabb  %[res], r1, r3, %[res]  \n"
+#else
+        "smulbb  %[res], r1, r3          \n" /* first product initialises res */
+#endif
+        "smlatt  %[res], r1, r3, %[res]  \n"
+        "smlabb  %[res], r2, r4, %[res]  \n"
+        "smlatt  %[res], r2, r4, %[res]  \n"
+        "ldmia   %[s2]!, {r3,r4}         \n"
+        SUBHALFREGS(r0, r1, r3)
+        SUBHALFREGS(r1, r2, r4)
+        "stmia   %[v1]!, {r0,r1}         \n" /* write the four updated v1 elements back */
+        REPEAT_3(
+        "ldmia   %[v1],  {r1,r2}         \n"
+        "ldmia   %[f2]!, {r3,r4}         \n"
+        "smlabb  %[res], r1, r3, %[res]  \n"
+        "smlatt  %[res], r1, r3, %[res]  \n"
+        "smlabb  %[res], r2, r4, %[res]  \n"
+        "smlatt  %[res], r2, r4, %[res]  \n"
+        "ldmia   %[s2]!, {r3,r4}         \n"
+        SUBHALFREGS(r0, r1, r3)
+        SUBHALFREGS(r1, r2, r4)
+        "stmia   %[v1]!, {r0,r1}         \n"
+        )
+#if ORDER > 16
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+    "99:                                 \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4", "r5", "r6", "cc", "memory"
+    );
+    return res;
+}
+/* Returns the scalar product of v1[] and v2[]; neither vector is written.
+ * Data is fetched as 32 bit words and v1 *must* be 32 bit aligned, otherwise
+ * a data abort or incorrect results (if ARM aligncheck is disabled) follow. */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;                 /* loop passes of 32 elements each */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n"
+#endif
+        "tst     %[v2], #2               \n" /* is v2 only 16 bit aligned? */
+        "beq     20f                     \n" /* no: take the aligned path at 20 */
+    "10:                                 \n" /* unaligned v2: read its first halfword separately */
+        "ldrh    r3, [%[v2]], #2         \n"
+#if ORDER > 32
+        "mov     r3, r3, lsl #16         \n"
+    "1:                                  \n"
+        "ldmia   %[v1]!, {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+#else
+        "ldmia   %[v1]!, {r0,r1}         \n"
+        "smulbb  %[res], r0, r3          \n" /* first product initialises res */
+#endif
+        "ldmia   %[v2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        REPEAT_MLA(
+        "ldmia   %[v1]!, {r0,r1}         \n"
+        "smlabt  %[res], r0, r3, %[res]  \n"
+        "ldmia   %[v2]!, {r2,r3}         \n"
+        "smlatb  %[res], r0, r2, %[res]  \n"
+        "smlabt  %[res], r1, r2, %[res]  \n"
+        "smlatb  %[res], r1, r3, %[res]  \n"
+        )
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1  \n"
+        "bne     1b                  \n"
+#endif
+        "b       99f                 \n"
+    "20:                                 \n" /* aligned v2 */
+    "1:                                  \n"
+        "ldmia   %[v1]!, {r0,r1}         \n"
+        "ldmia   %[v2]!, {r2,r3}         \n"
+#if ORDER > 32
+        "smlabb  %[res], r0, r2, %[res]  \n"
+#else
+        "smulbb  %[res], r0, r2          \n" /* first product initialises res */
+#endif
+        "smlatt  %[res], r0, r2, %[res]  \n"
+        "smlabb  %[res], r1, r3, %[res]  \n"
+        "smlatt  %[res], r1, r3, %[res]  \n"
+        REPEAT_MLA(
+        "ldmia   %[v1]!, {r0,r1}         \n"
+        "ldmia   %[v2]!, {r2,r3}         \n"
+        "smlabb  %[res], r0, r2, %[res]  \n"
+        "smlatt  %[res], r0, r2, %[res]  \n"
+        "smlabb  %[res], r1, r3, %[res]  \n"
+        "smlatt  %[res], r1, r3, %[res]  \n"
+        )
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"  
+#endif
+    "99:                                 \n"
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "cc", "memory"
+    );
+    return res;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h
new file mode 100644
index 0000000000..8d27331b62
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv6.h
@@ -0,0 +1,490 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+ARMv6 vector math copyright (C) 2008 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH /* presumably signals that the fused vector_sp_add/vector_sp_sub exist - TODO confirm in the includer */
+#if ORDER > 16 /* ORDER is not defined in this header; it is expected from the including file */
+#define REPEAT_BLOCK(x) x x x /* paste the asm fragment three times */
+#else
+#define REPEAT_BLOCK(x) x /* paste the asm fragment once */
+#endif
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. If either condition isn't met, it will either
+ * result in a data abort or incorrect results. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
+{   /* Returns the sum of v1[i]*f2[i] (using v1 as loaded) and stores v1[i] + s2[i] back into v1. */
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;   /* loop passes; one pass of the unrolled code covers 32 halfwords */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n" /* looped build accumulates from zero */
+#endif
+        "tst     %[f2], #2               \n" /* test bit 1 of the f2 address */
+        "beq     20f                     \n" /* clear: take the word-aligned path at 20 */
+    "10:                                 \n" /* path for f2/s2 starting on an odd halfword */
+        "ldrh    r3, [%[f2]], #2         \n" /* peel one halfword so f2 becomes word aligned */
+        "ldrh    r6, [%[s2]], #2         \n" /* same for s2 (must share f2's misalignment) */
+        "ldmia   %[f2]!, {r2,r4}         \n"
+        "mov     r3, r3, lsl #16         \n" /* move the peeled halfwords to the top half */
+        "mov     r6, r6, lsl #16         \n"
+    "1:                                  \n"
+        "ldmia   %[s2]!, {r5,r7}         \n"
+        "pkhtb   r3, r3, r2              \n" /* stitch f2 pairs across word borders (halves end up swapped) */
+        "pkhtb   r2, r2, r4              \n"
+        "ldrd    r0, [%[v1]]             \n" /* r0,r1 = four halfwords of v1 */
+        "mov     r5, r5, ror #16         \n" /* rotate/pack s2 so its halves line up with v1's */
+        "pkhtb   r6, r5, r6, asr #16     \n"
+        "pkhbt   r5, r5, r7, lsl #16     \n"
+#if ORDER > 32
+        "smladx  %[res], r0, r3, %[res]  \n" /* dual MAC, x = halves of 2nd operand exchanged */
+#else
+        "smuadx  %[res], r0, r3          \n" /* first product initialises res (no loop, no zeroing) */
+#endif
+        "smladx  %[res], r1, r2, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "sadd16  r0, r0, r6              \n" /* halfword-wise v1 + s2 */
+        "sadd16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n" /* write the sums back and advance v1 */
+        REPEAT_BLOCK(
+        "ldmia   %[s2]!, {r5,r6}         \n" /* same step again; r3/r4 and r6/r7 alternate as carry registers */
+        "pkhtb   r4, r4, r2              \n"
+        "pkhtb   r2, r2, r3              \n"
+        "ldrd    r0, [%[v1]]             \n"
+        "mov     r5, r5, ror #16         \n"
+        "pkhtb   r7, r5, r7, asr #16     \n"
+        "pkhbt   r5, r5, r6, lsl #16     \n"
+        "smladx  %[res], r0, r4, %[res]  \n"
+        "smladx  %[res], r1, r2, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r4}         \n"
+        "sadd16  r0, r0, r7              \n"
+        "sadd16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        "ldmia   %[s2]!, {r5,r7}         \n"
+        "pkhtb   r3, r3, r2              \n"
+        "pkhtb   r2, r2, r4              \n"
+        "ldrd    r0, [%[v1]]             \n"
+        "mov     r5, r5, ror #16         \n"
+        "pkhtb   r6, r5, r6, asr #16     \n"
+        "pkhbt   r5, r5, r7, lsl #16     \n"
+        "smladx  %[res], r0, r3, %[res]  \n"
+        "smladx  %[res], r1, r2, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "sadd16  r0, r0, r6              \n"
+        "sadd16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        )
+        "ldmia   %[s2]!, {r5,r6}         \n" /* last four halfwords of this pass */
+        "pkhtb   r4, r4, r2              \n"
+        "pkhtb   r2, r2, r3              \n"
+        "ldrd    r0, [%[v1]]             \n"
+        "mov     r5, r5, ror #16         \n"
+        "pkhtb   r7, r5, r7, asr #16     \n"
+        "pkhbt   r5, r5, r6, lsl #16     \n"
+        "smladx  %[res], r0, r4, %[res]  \n"
+        "smladx  %[res], r1, r2, %[res]  \n"
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n" /* sets the flags used by the two ne-conditional instructions below */
+        "ldmneia %[f2]!, {r2,r4}         \n" /* fetch further f2 data only if another pass follows */
+        "sadd16  r0, r0, r7              \n"
+        "sadd16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        "bne     1b                      \n"
+#else
+        "sadd16  r0, r0, r7              \n"
+        "sadd16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+#endif
+        "b       99f                     \n" /* skip the aligned variant */
+    "20:                                 \n" /* path for word-aligned f2/s2 */
+        "ldrd    r4, [%[f2]], #8         \n" /* preload the first f2 and v1 word pairs */
+        "ldrd    r0, [%[v1]]             \n"
+#if ORDER > 32
+    "1:                                  \n"
+        "smlad   %[res], r0, r4, %[res]  \n" /* dual MAC, halves used straight */
+#else
+        "smuad   %[res], r0, r4          \n" /* first product initialises res */
+#endif
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r1, r5, %[res]  \n"
+        "ldrd    r4, [%[f2]], #8         \n" /* next f2 pair, loaded ahead of use */
+        "ldrd    r2, [%[v1], #8]         \n" /* next v1 pair; v1 itself advances at the strd */
+        "sadd16  r0, r0, r6              \n" /* halfword-wise v1 + s2 */
+        "sadd16  r1, r1, r7              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        REPEAT_BLOCK(
+        "smlad   %[res], r2, r4, %[res]  \n" /* v1 data alternates between r2,r3 and r0,r1 */
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r3, r5, %[res]  \n"
+        "ldrd    r4, [%[f2]], #8         \n"
+        "ldrd    r0, [%[v1], #8]         \n"
+        "sadd16  r2, r2, r6              \n"
+        "sadd16  r3, r3, r7              \n"
+        "strd    r2, [%[v1]], #8         \n"
+        "smlad   %[res], r0, r4, %[res]  \n"
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r1, r5, %[res]  \n"
+        "ldrd    r4, [%[f2]], #8         \n"
+        "ldrd    r2, [%[v1], #8]         \n"
+        "sadd16  r0, r0, r6              \n"
+        "sadd16  r1, r1, r7              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        )
+        "smlad   %[res], r2, r4, %[res]  \n" /* last four halfwords of this pass */
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r3, r5, %[res]  \n"
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n"
+        "ldrned  r4, [%[f2]], #8         \n" /* preload for the next pass only if there is one */
+        "ldrned  r0, [%[v1], #8]         \n"
+        "sadd16  r2, r2, r6              \n"
+        "sadd16  r3, r3, r7              \n"
+        "strd    r2, [%[v1]], #8         \n"
+        "bne     1b                      \n"
+#else
+        "sadd16  r2, r2, r6              \n"
+        "sadd16  r3, r3, r7              \n"
+        "strd    r2, [%[v1]], #8         \n"
+#endif
+    "99:                                 \n" /* common exit of both paths */
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "cc", "memory"
+    );
+    return res;
+}
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. If either condition isn't met, it will either
+ * result in a data abort or incorrect results. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
+{   /* Returns the sum of v1[i]*f2[i] (using v1 as loaded) and stores v1[i] - s2[i] back into v1. */
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;   /* loop passes; one pass of the unrolled code covers 32 halfwords */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n" /* looped build accumulates from zero */
+#endif
+        "tst     %[f2], #2               \n" /* test bit 1 of the f2 address */
+        "beq     20f                     \n" /* clear: take the word-aligned path at 20 */
+    "10:                                 \n" /* path for f2/s2 starting on an odd halfword */
+        "ldrh    r3, [%[f2]], #2         \n" /* peel one halfword so f2 becomes word aligned */
+        "ldrh    r6, [%[s2]], #2         \n" /* same for s2 (must share f2's misalignment) */
+        "ldmia   %[f2]!, {r2,r4}         \n"
+        "mov     r3, r3, lsl #16         \n" /* move the peeled halfwords to the top half */
+        "mov     r6, r6, lsl #16         \n"
+    "1:                                  \n"
+        "ldmia   %[s2]!, {r5,r7}         \n"
+        "pkhtb   r3, r3, r2              \n" /* stitch f2 pairs across word borders (halves end up swapped) */
+        "pkhtb   r2, r2, r4              \n"
+        "ldrd    r0, [%[v1]]             \n" /* r0,r1 = four halfwords of v1 */
+        "mov     r5, r5, ror #16         \n" /* rotate/pack s2 so its halves line up with v1's */
+        "pkhtb   r6, r5, r6, asr #16     \n"
+        "pkhbt   r5, r5, r7, lsl #16     \n"
+#if ORDER > 32
+        "smladx  %[res], r0, r3, %[res]  \n" /* dual MAC, x = halves of 2nd operand exchanged */
+#else
+        "smuadx  %[res], r0, r3          \n" /* first product initialises res (no loop, no zeroing) */
+#endif
+        "smladx  %[res], r1, r2, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "ssub16  r0, r0, r6              \n" /* halfword-wise v1 - s2 */
+        "ssub16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n" /* write the differences back and advance v1 */
+        REPEAT_BLOCK(
+        "ldmia   %[s2]!, {r5,r6}         \n" /* same step again; r3/r4 and r6/r7 alternate as carry registers */
+        "pkhtb   r4, r4, r2              \n"
+        "pkhtb   r2, r2, r3              \n"
+        "ldrd    r0, [%[v1]]             \n"
+        "mov     r5, r5, ror #16         \n"
+        "pkhtb   r7, r5, r7, asr #16     \n"
+        "pkhbt   r5, r5, r6, lsl #16     \n"
+        "smladx  %[res], r0, r4, %[res]  \n"
+        "smladx  %[res], r1, r2, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r4}         \n"
+        "ssub16  r0, r0, r7              \n"
+        "ssub16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        "ldmia   %[s2]!, {r5,r7}         \n"
+        "pkhtb   r3, r3, r2              \n"
+        "pkhtb   r2, r2, r4              \n"
+        "ldrd    r0, [%[v1]]             \n"
+        "mov     r5, r5, ror #16         \n"
+        "pkhtb   r6, r5, r6, asr #16     \n"
+        "pkhbt   r5, r5, r7, lsl #16     \n"
+        "smladx  %[res], r0, r3, %[res]  \n"
+        "smladx  %[res], r1, r2, %[res]  \n"
+        "ldmia   %[f2]!, {r2,r3}         \n"
+        "ssub16  r0, r0, r6              \n"
+        "ssub16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        )
+        "ldmia   %[s2]!, {r5,r6}         \n" /* last four halfwords of this pass */
+        "pkhtb   r4, r4, r2              \n"
+        "pkhtb   r2, r2, r3              \n"
+        "ldrd    r0, [%[v1]]             \n"
+        "mov     r5, r5, ror #16         \n"
+        "pkhtb   r7, r5, r7, asr #16     \n"
+        "pkhbt   r5, r5, r6, lsl #16     \n"
+        "smladx  %[res], r0, r4, %[res]  \n"
+        "smladx  %[res], r1, r2, %[res]  \n"
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n" /* sets the flags used by the two ne-conditional instructions below */
+        "ldmneia %[f2]!, {r2,r4}         \n" /* fetch further f2 data only if another pass follows */
+        "ssub16  r0, r0, r7              \n"
+        "ssub16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        "bne     1b                      \n"
+#else
+        "ssub16  r0, r0, r7              \n"
+        "ssub16  r1, r1, r5              \n"
+        "strd    r0, [%[v1]], #8         \n"
+#endif
+        "b       99f                     \n" /* skip the aligned variant */
+    "20:                                 \n" /* path for word-aligned f2/s2 */
+        "ldrd    r4, [%[f2]], #8         \n" /* preload the first f2 and v1 word pairs */
+        "ldrd    r0, [%[v1]]             \n"
+#if ORDER > 32
+    "1:                                  \n"
+        "smlad   %[res], r0, r4, %[res]  \n" /* dual MAC, halves used straight */
+#else
+        "smuad   %[res], r0, r4          \n" /* first product initialises res */
+#endif
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r1, r5, %[res]  \n"
+        "ldrd    r4, [%[f2]], #8         \n" /* next f2 pair, loaded ahead of use */
+        "ldrd    r2, [%[v1], #8]         \n" /* next v1 pair; v1 itself advances at the strd */
+        "ssub16  r0, r0, r6              \n" /* halfword-wise v1 - s2 */
+        "ssub16  r1, r1, r7              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        REPEAT_BLOCK(
+        "smlad   %[res], r2, r4, %[res]  \n" /* v1 data alternates between r2,r3 and r0,r1 */
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r3, r5, %[res]  \n"
+        "ldrd    r4, [%[f2]], #8         \n"
+        "ldrd    r0, [%[v1], #8]         \n"
+        "ssub16  r2, r2, r6              \n"
+        "ssub16  r3, r3, r7              \n"
+        "strd    r2, [%[v1]], #8         \n"
+        "smlad   %[res], r0, r4, %[res]  \n"
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r1, r5, %[res]  \n"
+        "ldrd    r4, [%[f2]], #8         \n"
+        "ldrd    r2, [%[v1], #8]         \n"
+        "ssub16  r0, r0, r6              \n"
+        "ssub16  r1, r1, r7              \n"
+        "strd    r0, [%[v1]], #8         \n"
+        )
+        "smlad   %[res], r2, r4, %[res]  \n" /* last four halfwords of this pass */
+        "ldrd    r6, [%[s2]], #8         \n"
+        "smlad   %[res], r3, r5, %[res]  \n"
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n"
+        "ldrned  r4, [%[f2]], #8         \n" /* preload for the next pass only if there is one */
+        "ldrned  r0, [%[v1], #8]         \n"
+        "ssub16  r2, r2, r6              \n"
+        "ssub16  r3, r3, r7              \n"
+        "strd    r2, [%[v1]], #8         \n"
+        "bne     1b                      \n"
+#else
+        "ssub16  r2, r2, r6              \n"
+        "ssub16  r3, r3, r7              \n"
+        "strd    r2, [%[v1]], #8         \n"
+#endif
+    "99:                                 \n" /* common exit of both paths */
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "cc", "memory"
+    );
+    return res;
+}
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{   /* Returns the sum of v1[i]*v2[i]; neither vector is written. */
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;   /* loop passes; one pass of the unrolled code covers 32 halfwords */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n" /* looped build accumulates from zero */
+#endif
+        "tst     %[v2], #2               \n" /* test bit 1 of the v2 address */
+        "beq     20f                     \n" /* clear: take the word-aligned path at 20 */
+    "10:                                 \n" /* path for v2 starting on an odd halfword */
+        "bic     %[v2], %[v2], #2        \n" /* round v2 down to the enclosing word */
+        "ldmia   %[v2]!, {r5-r7}         \n" /* three words; the low half of r5 is never fed to a multiply */
+        "ldrd    r0, [%[v1]], #8         \n"
+    "1:                                  \n"
+        "pkhtb   r3, r5, r6              \n" /* pair top of one word with bottom of the next (halves swapped) */
+        "ldrd    r4, [%[v2]], #8         \n"
+#if ORDER > 32
+        "smladx  %[res], r0, r3, %[res]  \n" /* dual MAC, x = halves of 2nd operand exchanged */
+#else
+        "smuadx  %[res], r0, r3          \n" /* first product initialises res (no loop, no zeroing) */
+#endif
+        REPEAT_BLOCK(
+        "pkhtb   r0, r6, r7              \n" /* v1 registers are reused as pack targets once consumed */
+        "ldrd    r2, [%[v1]], #8         \n"
+        "smladx  %[res], r1, r0, %[res]  \n"
+        "pkhtb   r1, r7, r4              \n"
+        "ldrd    r6, [%[v2]], #8         \n"
+        "smladx  %[res], r2, r1, %[res]  \n"
+        "pkhtb   r2, r4, r5              \n"
+        "ldrd    r0, [%[v1]], #8         \n"
+        "smladx  %[res], r3, r2, %[res]  \n"
+        "pkhtb   r3, r5, r6              \n"
+        "ldrd    r4, [%[v2]], #8         \n"
+        "smladx  %[res], r0, r3, %[res]  \n"
+        )
+        "pkhtb   r0, r6, r7              \n"
+        "ldrd    r2, [%[v1]], #8         \n"
+        "smladx  %[res], r1, r0, %[res]  \n"
+        "pkhtb   r1, r7, r4              \n"
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n" /* sets the flags used by the ne-conditional loads and bne */
+        "ldrned  r6, [%[v2]], #8         \n" /* preload for the next pass only if there is one */
+        "smladx  %[res], r2, r1, %[res]  \n"
+        "pkhtb   r2, r4, r5              \n"
+        "ldrned  r0, [%[v1]], #8         \n"
+        "smladx  %[res], r3, r2, %[res]  \n"
+        "bne     1b                      \n"
+#else
+        "pkhtb   r4, r4, r5              \n"
+        "smladx  %[res], r2, r1, %[res]  \n"
+        "smladx  %[res], r3, r4, %[res]  \n"
+#endif
+        "b       99f                     \n" /* skip the aligned variant */
+    "20:                                 \n" /* path for word-aligned v2 */
+        "ldrd    r0, [%[v1]], #8         \n"
+        "ldmia   %[v2]!, {r5-r7}         \n" /* three words of v2 up front */
+    "1:                                  \n"
+        "ldrd    r2, [%[v1]], #8         \n"
+#if ORDER > 32
+        "smlad   %[res], r0, r5, %[res]  \n" /* dual MAC, halves used straight */
+#else
+        "smuad   %[res], r0, r5          \n" /* first product initialises res */
+#endif
+        REPEAT_BLOCK(
+        "ldrd    r4, [%[v2]], #8         \n" /* loads are interleaved with the MACs that use earlier data */
+        "smlad   %[res], r1, r6, %[res]  \n"
+        "ldrd    r0, [%[v1]], #8         \n"
+        "smlad   %[res], r2, r7, %[res]  \n"
+        "ldrd    r6, [%[v2]], #8         \n"
+        "smlad   %[res], r3, r4, %[res]  \n"
+        "ldrd    r2, [%[v1]], #8         \n"
+        "smlad   %[res], r0, r5, %[res]  \n"
+        )
+#if ORDER > 32
+        "ldrd    r4, [%[v2]], #8         \n"
+        "smlad   %[res], r1, r6, %[res]  \n"
+        "subs    %[cnt], %[cnt], #1      \n"
+        "ldrned  r0, [%[v1]], #8         \n" /* preload for the next pass only if there is one */
+        "smlad   %[res], r2, r7, %[res]  \n"
+        "ldrned  r6, [%[v2]], #8         \n"
+        "smlad   %[res], r3, r4, %[res]  \n"
+        "bne     1b                      \n"
+#else
+        "ldr     r4, [%[v2]], #4         \n" /* only one more v2 word is needed here, so a single-word load */
+        "smlad   %[res], r1, r6, %[res]  \n"
+        "smlad   %[res], r2, r7, %[res]  \n"
+        "smlad   %[res], r3, r4, %[res]  \n"
+#endif
+    "99:                                 \n" /* common exit of both paths */
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [v2] "+r"(v2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3",
+        "r4", "r5", "r6", "r7", "cc", "memory"
+    );
+    return res;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h
new file mode 100644
index 0000000000..84afda3e5d
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+ARMv7 neon vector math copyright (C) 2010 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH /* presumably signals that the fused vector_sp_add/vector_sp_sub exist - TODO confirm in the includer */
+#if ORDER > 32 /* ORDER is not defined in this header; it is expected from the including file */
+#define REPEAT_BLOCK(x) x x x /* paste the asm fragment three times */
+#elif ORDER > 16
+#define REPEAT_BLOCK(x) x /* paste the asm fragment once */
+#else
+#define REPEAT_BLOCK(x) /* drop the fragment: the leading 16-element block is all there is */
+#endif
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
+{   /* Returns the sum of v1[i]*f2[i] (using v1 as loaded) and stores v1[i] + s2[i] back into v1. */
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;   /* loop passes; one pass of the unrolled code covers 64 halfwords */
+#endif
+    asm volatile (
+#if ORDER > 64
+        "vmov.i16    q0, #0              \n" /* looped build accumulates from zero */
+    "1:                                  \n"
+        "subs        %[cnt], %[cnt], #1  \n" /* flags are consumed by the bne at the end of the pass */
+#endif
+        "vld1.16     {d6-d9}, [%[f2]]!   \n" /* 16 halfwords of f2 */
+        "vld1.16     {d2-d5}, [%[v1]]    \n" /* 16 halfwords of v1, pointer kept for the store below */
+        "vld1.16     {d10-d13}, [%[s2]]! \n" /* 16 halfwords of s2 */
+#if ORDER > 64
+        "vmlal.s16   q0, d2, d6          \n" /* widening MAC into four 32-bit lanes of q0 */
+#else
+        "vmull.s16   q0, d2, d6          \n" /* first product initialises q0 (no loop, no zeroing) */
+#endif
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        "vadd.i16    q1, q1, q5          \n" /* v1 + s2, lane by lane */
+        "vadd.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n" /* write the sums back and advance v1 */
+        REPEAT_BLOCK(
+        "vld1.16     {d6-d9}, [%[f2]]!   \n" /* same 16-element step again */
+        "vld1.16     {d2-d5}, [%[v1]]    \n"
+        "vld1.16     {d10-d13}, [%[s2]]! \n"
+        "vmlal.s16   q0, d2, d6          \n"
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        "vadd.i16    q1, q1, q5          \n"
+        "vadd.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n"
+        )
+#if ORDER > 64
+        "bne         1b                  \n"
+#endif
+        "vpadd.i32   d0, d0, d1          \n" /* fold the four lane sums into two */
+        "vpaddl.s32  d0, d0              \n" /* add the remaining pair */
+        "vmov.32     %[res], d0[0]       \n" /* hand the low 32 bits back as the result */
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers - "cc" is required because subs rewrites the condition flags */
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "cc", "memory"
+    );
+    return res;
+}
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
+{   /* Returns the sum of v1[i]*f2[i] (using v1 as loaded) and stores v1[i] - s2[i] back into v1. */
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;   /* loop passes; one pass of the unrolled code covers 64 halfwords */
+#endif
+    asm volatile (
+#if ORDER > 64
+        "vmov.i16    q0, #0              \n" /* looped build accumulates from zero */
+    "1:                                  \n"
+        "subs        %[cnt], %[cnt], #1  \n" /* flags are consumed by the bne at the end of the pass */
+#endif
+        "vld1.16     {d6-d9}, [%[f2]]!   \n" /* 16 halfwords of f2 */
+        "vld1.16     {d2-d5}, [%[v1]]    \n" /* 16 halfwords of v1, pointer kept for the store below */
+        "vld1.16     {d10-d13}, [%[s2]]! \n" /* 16 halfwords of s2 */
+#if ORDER > 64
+        "vmlal.s16   q0, d2, d6          \n" /* widening MAC into four 32-bit lanes of q0 */
+#else
+        "vmull.s16   q0, d2, d6          \n" /* first product initialises q0 (no loop, no zeroing) */
+#endif
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        "vsub.i16    q1, q1, q5          \n" /* v1 - s2, lane by lane */
+        "vsub.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n" /* write the differences back and advance v1 */
+        REPEAT_BLOCK(
+        "vld1.16     {d6-d9}, [%[f2]]!   \n" /* same 16-element step again */
+        "vld1.16     {d2-d5}, [%[v1]]    \n"
+        "vld1.16     {d10-d13}, [%[s2]]! \n"
+        "vmlal.s16   q0, d2, d6          \n"
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        "vsub.i16    q1, q1, q5          \n"
+        "vsub.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n"
+        )
+#if ORDER > 64
+        "bne         1b                  \n"
+#endif
+        "vpadd.i32   d0, d0, d1          \n" /* fold the four lane sums into two */
+        "vpaddl.s32  d0, d0              \n" /* add the remaining pair */
+        "vmov.32     %[res], d0[0]       \n" /* hand the low 32 bits back as the result */
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers - "cc" is required because subs rewrites the condition flags */
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "cc", "memory"
+    );
+    return res;
+}
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{   /* Returns the sum of v1[i]*v2[i]; neither vector is written. */
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;   /* loop passes; one pass of the unrolled code covers 64 halfwords */
+#endif
+    asm volatile (
+#if ORDER > 64
+        "vmov.i16    q0, #0              \n" /* looped build accumulates from zero */
+    "1:                                  \n"
+        "subs        %[cnt], %[cnt], #1  \n" /* flags are consumed by the bne at the end of the pass */
+#endif
+        "vld1.16     {d2-d5}, [%[v1]]!   \n" /* 16 halfwords of v1 */
+        "vld1.16     {d6-d9}, [%[v2]]!   \n" /* 16 halfwords of v2 */
+#if ORDER > 64
+        "vmlal.s16   q0, d2, d6          \n" /* widening MAC into four 32-bit lanes of q0 */
+#else
+        "vmull.s16   q0, d2, d6          \n" /* first product initialises q0 (no loop, no zeroing) */
+#endif
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        REPEAT_BLOCK(
+        "vld1.16     {d2-d5}, [%[v1]]!   \n" /* same 16-element step again */
+        "vld1.16     {d6-d9}, [%[v2]]!   \n"
+        "vmlal.s16   q0, d2, d6          \n"
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        )
+#if ORDER > 64
+        "bne         1b                  \n"
+#endif
+        "vpadd.i32   d0, d0, d1          \n" /* fold the four lane sums into two */
+        "vpaddl.s32  d0, d0              \n" /* add the remaining pair */
+        "vmov.32     %[res], d0[0]       \n" /* hand the low 32 bits back as the result */
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [v2] "+r"(v2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers - "cc": subs rewrites the flags; "memory": *v1 and *v2 are read without memory operands */
+        "d0", "d1", "d2", "d3", "d4",
+        "d5", "d6", "d7", "d8", "d9", "cc", "memory"
+    );
+    return res;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h
new file mode 100644
index 0000000000..4d77d3be31
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_cf.h
@@ -0,0 +1,364 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+Coldfire vector math copyright (C) 2007 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH /* presumably signals that the fused vector_sp_add/vector_sp_sub exist - TODO confirm in the includer */
+#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
+#define REPEAT_2(x) x x /* paste the asm fragment twice */
+#define REPEAT_3(x) x x x /* paste the asm fragment three times */
+#define REPEAT_7(x) x x x x x x x /* paste the asm fragment seven times */
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. Performance will suffer if either condition
+ * isn't met. It also needs EMAC in signed integer mode. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
+{   /* Returns the sum of v1[i]*f2[i] (using v1 as loaded) and stores v1[i] + s2[i] back into v1. */
+    int res;
+#if ORDER > 16
+    int cnt = ORDER>>4;   /* loop passes; one pass of the unrolled code covers 16 halfwords */
+#endif
+#define ADDHALFREGS(s1, s2, sum)       /* Add register halves straight. */  \
+        "move.l " #s1  ", " #sum "\n"  /* 's1' and 's2' can be A or D */    \
+        "add.l  " #s2  ", " #s1  "\n"  /* regs, 'sum' must be a D reg. */   \
+        "clr.w  " #sum "          \n"  /* 's1' is clobbered! */             \
+        "add.l  " #s2  ", " #sum "\n"  \
+        "move.w " #s1  ", " #sum "\n"
+        
+#define ADDHALFXREGS(s1, s2, sum)      /* Add register halves across. */    \
+        "clr.w  " #sum "          \n"  /* Needs 'sum' pre-swapped, swaps */ \
+        "add.l  " #s1  ", " #sum "\n"  /* 's2', and clobbers 's1'. */       \
+        "swap   " #s2  "          \n"  /* 's1' can be an A or D reg. */     \
+        "add.l  " #s2  ", " #s1  "\n"  /* 'sum' and 's2' must be D regs. */ \
+        "move.w " #s1  ", " #sum "\n"
+    asm volatile (
+        "move.l  %[f2], %%d0                         \n" /* copy the f2 address to test its alignment */
+        "and.l   #2, %%d0                            \n" /* isolate bit 1 */
+        "jeq     20f                                 \n" /* clear: take the longword-aligned path at 20 */
+    "10:                                             \n" /* path for f2/s2 starting on an odd halfword */
+        "move.w  (%[f2])+, %%d0                      \n" /* peel one halfword so later reads are longwords */
+        "move.w  (%[s2])+, %%d1                      \n"
+        "swap    %%d1                                \n" /* pre-swap, as ADDHALFXREGS expects of its 'sum' */
+    "1:                                              \n"
+        REPEAT_2(
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1        \n" /* four longwords (8 halfwords) of v1 */
+        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" /* MAC into acc0 with a parallel load of the next f2 longword */
+        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n" /* MAC into acc0 with a parallel load of the next s2 longword */
+        ADDHALFXREGS(%%d6, %%d2, %%d1)
+        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
+        "move.l  %%d1, (%[v1])+                      \n" /* store one finished v1 + s2 longword */
+        ADDHALFXREGS(%%d7, %%d6, %%d2)
+        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        ADDHALFXREGS(%%a0, %%d7, %%d6)
+        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
+        "move.l  %%d6, (%[v1])+                      \n"
+        ADDHALFXREGS(%%a1, %%d1, %%d7)
+        "move.l  %%d7, (%[v1])+                      \n"
+        )
+#if ORDER > 16
+        "subq.l  #1, %[res]                          \n" /* res's register carries the loop counter until the end */
+        "bne.w   1b                                  \n"
+#endif
+        "jra     99f                                 \n" /* skip the aligned variant */
+    "20:                                             \n" /* path for longword-aligned f2/s2 */
+        "move.l  (%[f2])+, %%d0                      \n" /* preload the first f2 longword */
+    "1:                                              \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1        \n" /* four longwords (8 halfwords) of v1 */
+        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" /* MAC into acc0 with a parallel load of s2 */
+        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" /* MAC into acc0 with a parallel load of the next f2 longword */
+        ADDHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n" /* store one finished v1 + s2 longword */
+        ADDHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        ADDHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        ADDHALFREGS(%%a1, %%d1, %%d2)
+        "move.l  %%d2, (%[v1])+                      \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1        \n" /* second group of 8 halfwords, same pattern */
+        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+        ADDHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        ADDHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        ADDHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+#if ORDER > 16
+        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+#else
+        "mac.w   %%d0l, %%a1l, %%acc0                \n" /* no loop here, so the last MAC carries no f2 load */
+#endif
+        "move.l  %%d2, (%[v1])+                      \n"
+        ADDHALFREGS(%%a1, %%d1, %%d2)
+        "move.l  %%d2, (%[v1])+                      \n"
+#if ORDER > 16
+        "subq.l  #1, %[res]                          \n" /* res's register carries the loop counter until the end */
+        "bne.w   1b                                  \n"
+#endif
+    "99:                                             \n" /* common exit of both paths */
+        "movclr.l %%acc0, %[res]                     \n" /* fetch the accumulated sum and clear acc0 */
+        : /* outputs */
+        [v1]"+a"(v1),   /* the pointers are advanced by the asm, hence read-write */
+        [f2]"+a"(f2),
+        [s2]"+a"(s2),
+        [res]"=d"(res)
+        : /* inputs */
+#if ORDER > 16
+        [cnt]"[res]"(cnt)   /* matching constraint: cnt enters in the register that res leaves in */
+#endif
+        : /* clobbers */
+        "d0", "d1", "d2", "d6", "d7", 
+        "a0", "a1", "memory"
+    );
+    return res;
+}
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. Performance will suffer if either condition
+ * isn't met. It also needs EMAC in signed integer mode.
+ *
+ * v1: vector that is multiplied with f2 and then updated in place with
+ *     v1[i] -= s2[i]. Every product uses the value v1[i] had when it was
+ *     loaded, i.e. before the subtraction.
+ * f2: vector multiplied with v1 (only read).
+ * s2: vector subtracted from v1 (only read).
+ * Returns the contents of EMAC accumulator acc0 after all v1[i] * f2[i]
+ * products were accumulated. acc0 is not cleared before the first mac.w,
+ * so it is expected to be zero on entry; the final movclr.l clears it again.
+ *
+ * One pass of the code handles 16 elements. For ORDER > 16 the pass is
+ * repeated ORDER>>4 times, counting down in the register that finally
+ * receives the result.
+ * Bit 1 of the f2 address selects the code path: label 20 is taken when it
+ * is clear, label 10 (which pairs the register halves crosswise) when it
+ * is set. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 16
+    int cnt = ORDER>>4; /* number of 16 element passes */
+#endif
+#define SUBHALFREGS(min, sub, dif)    /* Subtract register halves straight. */ \
+        "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */         \
+        "sub.l  " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */     \
+        "clr.w  " #sub           "\n" /* 'min' and 'sub' are clobbered! */     \
+        "sub.l  " #sub ", " #dif "\n" \
+        "move.w " #min ", " #dif "\n" 
+        /* Both macros treat the upper and lower 16 bit halves independently. */
+#define SUBHALFXREGS(min, s2, s1d)    /* Subtract register halves across. */ \
+        "clr.w  " #s1d           "\n" /* Needs 's1d' pre-swapped, swaps */   \
+        "sub.l  " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */         \
+        "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */      \
+        "swap   " #s2            "\n" /* 's2' and 's1d' must be D regs. */   \
+        "sub.l  " #s2  ", " #min "\n" \
+        "move.w " #min ", " #s1d "\n"
+    asm volatile (
+        "move.l  %[f2], %%d0                         \n"
+        "and.l   #2, %%d0                            \n" /* isolate bit 1 of the f2 address */
+        "jeq     20f                                 \n" /* clear: take the straight path */
+    "10:                                             \n" /* f2 (and s2) are 2 bytes off a 32 bit boundary */
+        "move.w  (%[f2])+, %%d0                      \n" /* first f2 element -> d0 lower half */
+        "move.w  (%[s2])+, %%d1                      \n" /* first s2 element ... */
+        "swap    %%d1                                \n" /* ... moved to the upper half, as SUBHALFXREGS expects */
+    "1:                                              \n"
+        REPEAT_2( /* REPEAT_2 is defined outside this block; presumably expands its argument twice (2 x 8 elements) */
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1        \n" /* 4 longs = 8 elements of v1, pointer unchanged */
+        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" /* multiply, and load the next f2 long in parallel */
+        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n" /* multiply, and load the next s2 long in parallel */
+        SUBHALFXREGS(%%d6, %%d2, %%d1) /* difference pair ends up in d1; d2 is left swapped for the next use */
+        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n" /* d6 is free again and takes the s2 data */
+        "move.l  %%d1, (%[v1])+                      \n" /* write back the first difference pair */
+        SUBHALFXREGS(%%d7, %%d6, %%d2)
+        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        SUBHALFXREGS(%%a0, %%d7, %%d6)
+        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n" /* d1 carries over into the next repetition */
+        "move.l  %%d6, (%[v1])+                      \n"
+        SUBHALFXREGS(%%a1, %%d1, %%d7)
+        "move.l  %%d7, (%[v1])+                      \n"
+        )
+#if ORDER > 16
+        "subq.l  #1, %[res]                          \n" /* %[res] still holds the pass counter here */
+        "bne.w   1b                                  \n"
+#endif
+        "jra     99f                                 \n"
+    "20:                                             \n" /* bit 1 of the f2 address is clear */
+        "move.l  (%[f2])+, %%d0                      \n" /* first two f2 elements */
+    "1:                                              \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1        \n" /* 4 longs = 8 elements of v1, pointer unchanged */
+        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" /* upper halves; load 2 s2 elements in parallel */
+        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" /* lower halves; load the next 2 f2 elements */
+        SUBHALFREGS(%%d6, %%d1, %%d2) /* d2 = d6 - d1, half by half */
+        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n" /* write back the difference pair computed above */
+        SUBHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        SUBHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        SUBHALFREGS(%%a1, %%d1, %%d2)
+        "move.l  %%d2, (%[v1])+                      \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1        \n" /* second group of 8 elements */
+        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+        SUBHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        SUBHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l  %%d2, (%[v1])+                      \n"
+        SUBHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+#if ORDER > 16 /* looping: the last mac.w also fetches f2 data for the next pass */
+        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+#else /* single pass: the last mac.w loads nothing */
+        "mac.w   %%d0l, %%a1l, %%acc0                \n"
+#endif
+        "move.l  %%d2, (%[v1])+                      \n"
+        SUBHALFREGS(%%a1, %%d1, %%d2)
+        "move.l  %%d2, (%[v1])+                      \n"
+#if ORDER > 16
+        "subq.l  #1, %[res]                          \n" /* %[res] still holds the pass counter here */
+        "bne.w   1b                                  \n"
+#endif
+    "99:                                             \n" /* common exit of both paths */
+        "movclr.l %%acc0, %[res]                     \n" /* fetch the sum and clear acc0 */
+        : /* outputs */
+        [v1]"+a"(v1),
+        [f2]"+a"(f2),
+        [s2]"+a"(s2),
+        [res]"=d"(res)
+        : /* inputs */
+#if ORDER > 16
+        [cnt]"[res]"(cnt) /* cnt is passed in the register that is also used for res */
+#endif
+        : /* clobbers */
+        "d0", "d1", "d2", "d6", "d7", 
+        "a0", "a1", "memory"
+    );
+    return res;
+}
+/* Calculate the scalarproduct of v1 and v2.
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
+ * in signed integer mode.
+ *
+ * v1, v2: vectors of 16 bit elements; both are only read.
+ * Returns the contents of EMAC accumulator acc0 after all v1[i] * v2[i]
+ * products were accumulated. acc0 is not cleared before the first mac.w,
+ * so it is expected to be zero on entry; movclr.l clears it again on exit.
+ *
+ * One pass handles 16 elements; for ORDER > 16 it is repeated ORDER>>4
+ * times, counting down in the register that finally receives the result.
+ * Bit 1 of the v2 address selects the code path: label 20 is taken when it
+ * is clear, label 10 (which pairs the register halves crosswise) when it
+ * is set. */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res;
+#if ORDER > 16
+    int cnt = ORDER>>4; /* number of 16 element passes */
+#endif
+    asm volatile (
+        "move.l  %[v2], %%d0                         \n"
+        "and.l   #2, %%d0                            \n" /* isolate bit 1 of the v2 address */
+        "jeq     20f                                 \n"
+    "10:                                             \n" /* v2 is 2 bytes off a 32 bit boundary */
+        "move.l  (%[v1])+, %%d0                      \n" /* first two v1 elements */
+        "move.w  (%[v2])+, %%d1                      \n" /* first v2 element -> d1 lower half */
+    "1:                                              \n"
+        REPEAT_7(
+        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" /* multiply, load next v2 long in parallel */
+        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" /* multiply, load next v1 long in parallel */
+        )
+        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+#if ORDER > 16 /* looping: also fetch v1 data for the next pass */
+        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "subq.l  #1, %[res]                          \n" /* %[res] still holds the pass counter */
+        "bne.b   1b                                  \n"
+#else
+        "mac.w   %%d0l, %%d1u, %%acc0                \n"
+#endif
+        "jra     99f                                  \n"
+
+    "20:                                             \n" /* bit 1 of the v2 address is clear */
+        "move.l  (%[v1])+, %%d0                      \n"
+        "move.l  (%[v2])+, %%d1                      \n"
+    "1:                                              \n"
+        REPEAT_3(
+        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" /* v1 data alternates between d0 and d2 */
+        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        )
+        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
+        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+#if ORDER > 16 /* looping: also fetch data for the next pass */
+        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "subq.l  #1, %[res]                          \n" /* %[res] still holds the pass counter */
+        "bne.b   1b                                  \n"
+#else
+        "mac.w   %%d2u, %%d1u, %%acc0                \n"
+        "mac.w   %%d2l, %%d1l, %%acc0                \n"
+#endif
+    "99:                                             \n" /* common exit of both paths */
+        "movclr.l %%acc0, %[res]                     \n" /* fetch the sum and clear acc0 */
+        : /* outputs */
+        [v1]"+a"(v1),
+        [v2]"+a"(v2),
+        [res]"=d"(res)
+        : /* inputs */
+#if ORDER > 16
+        [cnt]"[res]"(cnt) /* cnt is passed in the register that is also used for res */
+#endif
+        : /* clobbers */
+        /* "memory": the code reads *v1 and *v2, which are not listed as
+         * operands, so the compiler has to complete pending stores to the
+         * vectors before the asm runs. */
+        "d0", "d1", "d2", "memory"
+    );
+    return res;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_mmx.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_mmx.h
new file mode 100644
index 0000000000..2177fe88ea
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_mmx.h
@@ -0,0 +1,234 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+MMX vector math copyright (C) 2010 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH /* fused vector_sp_add() / vector_sp_sub() are defined below */
+#define REPEAT_MB3(x, n) x(n) x(n+8) x(n+16) /* x() at 3 byte offsets, 8 bytes apart */
+#define REPEAT_MB7(x, n) x(n) x(n+8) x(n+16) x(n+24) x(n+32) x(n+40) x(n+48) /* 7 offsets */
+#define REPEAT_MB8(x, n) REPEAT_MB7(x, n) x(n+56) /* 8 offsets */
+#if ORDER == 16     /* 3 times, offsets 8..24; offset 0 is handled by the code preceding REPEAT_MB */
+#define REPEAT_MB(x) REPEAT_MB3(x, 8) 
+#elif ORDER == 32   /* 7 times, offsets 8..56; offset 0 is handled by the code preceding REPEAT_MB */
+#define REPEAT_MB(x) REPEAT_MB7(x, 8) 
+#elif ORDER == 64   /* 5*3 == 15 times, offsets 8..120 */
+#define REPEAT_MB(x) REPEAT_MB3(x,  8) REPEAT_MB3(x, 32) REPEAT_MB3(x, 56) \
+                     REPEAT_MB3(x, 80) REPEAT_MB3(x, 104)
+#elif ORDER == 256  /* 9*7 == 63 times, offsets 8..504 */
+#define REPEAT_MB(x) REPEAT_MB7(x,   8) REPEAT_MB7(x,  64) REPEAT_MB7(x, 120) \
+                     REPEAT_MB7(x, 176) REPEAT_MB7(x, 232) REPEAT_MB7(x, 288) \
+                     REPEAT_MB7(x, 344) REPEAT_MB7(x, 400) REPEAT_MB7(x, 456)
+#elif ORDER == 1280 /* 8*8 == 64 times, offsets 0..504: the 512 bytes of one loop iteration */
+#define REPEAT_MB(x) REPEAT_MB8(x,   0) REPEAT_MB8(x,  64) REPEAT_MB8(x, 128) \
+                     REPEAT_MB8(x, 192) REPEAT_MB8(x, 256) REPEAT_MB8(x, 320) \
+                     REPEAT_MB8(x, 384) REPEAT_MB8(x, 448)
+#else /* the offset tables above exist only for these five orders */
+#error unsupported order
+#endif
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance).
+ *
+ * v1: vector multiplied with f2, then updated in place: v1[i] += s2[i]
+ *     (paddw on 16 bit lanes). The products use the values v1 held
+ *     before the update.
+ * f2: vector multiplied with v1 (only read).
+ * s2: vector added to v1 (only read).
+ * Returns the 32 bit sum of all v1[i] * f2[i].
+ *
+ * Data is processed 8 bytes (four elements) at a time. For ORDER <= 256
+ * the whole vector is handled by straight line code; for larger orders a
+ * loop handles 512 bytes per iteration and runs ORDER>>8 times.
+ * NOTE(review): no emms is executed here; confirm that callers do not rely
+ * on x87 floating point state afterwards. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8; /* loop iterations, 256 elements each */
+#endif
+    asm volatile (
+#if ORDER > 256
+        "pxor    %%mm2, %%mm2        \n" /* clear the two partial sums */
+    "1:                              \n"
+#else
+        /* The block at offset 0 initialises the partial sums in mm2. */
+        "movq    (%[v1]), %%mm2      \n"
+        "movq    %%mm2, %%mm0        \n"
+        "pmaddwd (%[f2]), %%mm2      \n"
+        "paddw   (%[s2]), %%mm0      \n"
+        "movq    %%mm0, (%[v1])      \n"
+#endif
+/* One block: multiply four elements of v1 with f2 and add the two 32 bit
+ * results to mm2, then store v1 + s2 back to v1. */
+#define SP_ADD_BLOCK(n)                      \
+        "movq    " #n "(%[v1]), %%mm1    \n" \
+        "movq    %%mm1, %%mm0            \n" \
+        "pmaddwd " #n "(%[f2]), %%mm1    \n" \
+        "paddw   " #n "(%[s2]), %%mm0    \n" \
+        "movq    %%mm0, " #n "(%[v1])    \n" \
+        "paddd   %%mm1, %%mm2            \n"
+
+REPEAT_MB(SP_ADD_BLOCK)
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[s2]         \n"
+        "add     $512, %[f2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+        /* Add the two 32 bit partial sums held in mm2. */
+        "movd    %%mm2, %[t]         \n"
+        "psrlq   $32, %%mm2          \n"
+        "movd    %%mm2, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [s2] "+r"(s2),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and f2 are advanced inside the loop, so they are tied to the
+         * registers of res and t, which are only written after the loop. */
+        [v1]"2"(v1),
+        [f2]"3"(f2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [f2]"r"(f2),
+        [s2]"r"(s2)
+#endif
+        : /* clobbers */
+        /* "memory": the code stores to *v1 and reads *f2 and *s2, none of
+         * which are listed as operands. */
+        "mm0", "mm1", "mm2", "memory"
+    );
+    return res;
+}
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance).
+ *
+ * v1: vector multiplied with f2, then updated in place: v1[i] -= s2[i]
+ *     (psubw on 16 bit lanes). The products use the values v1 held
+ *     before the update.
+ * f2: vector multiplied with v1 (only read).
+ * s2: vector subtracted from v1 (only read).
+ * Returns the 32 bit sum of all v1[i] * f2[i].
+ *
+ * Data is processed 8 bytes (four elements) at a time. For ORDER <= 256
+ * the whole vector is handled by straight line code; for larger orders a
+ * loop handles 512 bytes per iteration and runs ORDER>>8 times.
+ * NOTE(review): no emms is executed here; confirm that callers do not rely
+ * on x87 floating point state afterwards. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8; /* loop iterations, 256 elements each */
+#endif
+    asm volatile (
+#if ORDER > 256
+        "pxor    %%mm2, %%mm2        \n" /* clear the two partial sums */
+    "1:                              \n"
+#else
+        /* The block at offset 0 initialises the partial sums in mm2. */
+        "movq    (%[v1]), %%mm2      \n"
+        "movq    %%mm2, %%mm0        \n"
+        "pmaddwd (%[f2]), %%mm2      \n"
+        "psubw   (%[s2]), %%mm0      \n"
+        "movq    %%mm0, (%[v1])      \n"
+#endif
+/* One block: multiply four elements of v1 with f2 and add the two 32 bit
+ * results to mm2, then store v1 - s2 back to v1. */
+#define SP_SUB_BLOCK(n)                      \
+        "movq    " #n "(%[v1]), %%mm1    \n" \
+        "movq    %%mm1, %%mm0            \n" \
+        "pmaddwd " #n "(%[f2]), %%mm1    \n" \
+        "psubw   " #n "(%[s2]), %%mm0    \n" \
+        "movq    %%mm0, " #n "(%[v1])    \n" \
+        "paddd   %%mm1, %%mm2            \n"
+REPEAT_MB(SP_SUB_BLOCK)
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[s2]         \n"
+        "add     $512, %[f2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+        /* Add the two 32 bit partial sums held in mm2. */
+        "movd    %%mm2, %[t]         \n"
+        "psrlq   $32, %%mm2          \n"
+        "movd    %%mm2, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [s2] "+r"(s2),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and f2 are advanced inside the loop, so they are tied to the
+         * registers of res and t, which are only written after the loop. */
+        [v1]"2"(v1),
+        [f2]"3"(f2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [f2]"r"(f2),
+        [s2]"r"(s2)
+#endif
+        : /* clobbers */
+        /* "memory": the code stores to *v1 and reads *f2 and *s2, none of
+         * which are listed as operands. */
+        "mm0", "mm1", "mm2", "memory"
+    );
+    return res;
+}
+/* Calculate the scalarproduct of v1 and v2.
+ *
+ * v1, v2: vectors of 16 bit elements; both are only read.
+ * Returns the 32 bit sum of all v1[i] * v2[i].
+ *
+ * Data is processed 8 bytes (four elements) at a time. For ORDER <= 256
+ * the whole vector is handled by straight line code; for larger orders a
+ * loop handles 512 bytes per iteration and runs ORDER>>8 times.
+ * NOTE(review): no emms is executed here; confirm that callers do not rely
+ * on x87 floating point state afterwards. */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8; /* loop iterations, 256 elements each */
+#endif
+
+    asm volatile (
+#if ORDER > 256
+        "pxor    %%mm1, %%mm1        \n" /* clear the two partial sums */
+    "1:                              \n"
+#else
+        /* The block at offset 0 initialises the partial sums in mm1. */
+        "movq    (%[v1]), %%mm1      \n"
+        "pmaddwd (%[v2]), %%mm1      \n"
+#endif
+/* One block: multiply four elements and add the two 32 bit results to mm1. */
+#define SP_BLOCK(n)                          \
+        "movq    " #n "(%[v1]), %%mm0    \n" \
+        "pmaddwd " #n "(%[v2]), %%mm0    \n" \
+        "paddd   %%mm0, %%mm1            \n"
+REPEAT_MB(SP_BLOCK)
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[v2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+        /* Add the two 32 bit partial sums held in mm1. */
+        "movd    %%mm1, %[t]         \n"
+        "psrlq   $32, %%mm1          \n"
+        "movd    %%mm1, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and v2 are advanced inside the loop, so they are tied to the
+         * registers of res and t, which are only written after the loop. */
+        [v1]"1"(v1),
+        [v2]"2"(v2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [v2]"r"(v2)
+#endif
+        : /* clobbers */
+        /* "memory": the code reads *v1 and *v2, which are not listed as
+         * operands, so the compiler has to complete pending stores to the
+         * vectors before the asm runs. */
+        "mm0", "mm1", "memory"
+    );
+    return res;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math32_armv4.h b/lib/rbcodec/codecs/demac/libdemac/vector_math32_armv4.h
new file mode 100644
index 0000000000..d6bb9b0d9c
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math32_armv4.h
@@ -0,0 +1,201 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+ARMv4 vector math copyright (C) 2008 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH /* fused vector_sp_add() / vector_sp_sub() are defined below */
+#if ORDER > 32 /* looped code: 8 four-word groups (32 elements) per iteration */
+#define REPEAT_BLOCK(x) x x x x x x x x
+#elif ORDER > 16 /* 7 groups follow the separately coded first group: 32 elements */
+#define REPEAT_BLOCK(x) x x x x x x x
+#else /* 3 groups follow the separately coded first group: 16 elements */
+#define REPEAT_BLOCK(x) x x x
+#endif
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
+ *
+ * v1: vector multiplied with f2, then updated in place: v1[i] += s2[i].
+ *     The products use the values v1 held before the update.
+ * f2: vector multiplied with v1 (only read).
+ * s2: vector added to v1 (only read).
+ * Returns the sum of all v1[i] * f2[i], accumulated in one 32 bit register.
+ *
+ * Elements are handled in groups of four words. ORDER > 32 runs a loop of
+ * ORDER>>5 iterations with 8 groups (32 elements) each; otherwise straight
+ * line code handles 1 + 7 groups (ORDER > 16) or 1 + 3 groups. */
+static inline int32_t vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2)
+{
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5; /* loop iterations */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n" /* start the sum at zero */
+    "1:                                  \n"
+#else /* no loop: the first group starts the sum with mul instead of mla */
+        "ldmia   %[v1],  {r0-r3}         \n"
+        "ldmia   %[f2]!, {r4-r7}         \n"
+        "mul     %[res], r4, r0          \n"
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r4-r7}         \n"
+        "add     r0, r0, r4              \n"
+        "add     r1, r1, r5              \n"
+        "add     r2, r2, r6              \n"
+        "add     r3, r3, r7              \n"
+        "stmia   %[v1]!, {r0-r3}         \n"
+#endif
+        REPEAT_BLOCK(
+        "ldmia   %[v1],  {r0-r3}         \n" /* 4 elements of v1; no writeback, v1 is advanced by the stmia */
+        "ldmia   %[f2]!, {r4-r7}         \n" /* 4 elements of f2 */
+        "mla     %[res], r4, r0, %[res]  \n" /* res += f2[i] * v1[i] */
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r4-r7}         \n" /* r4-r7 are reused for 4 elements of s2 */
+        "add     r0, r0, r4              \n" /* v1[i] + s2[i] */
+        "add     r1, r1, r5              \n"
+        "add     r2, r2, r6              \n"
+        "add     r3, r3, r7              \n"
+        "stmia   %[v1]!, {r0-r3}         \n" /* write the sums back and advance v1 */
+        )
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "cc", "memory"
+    );
+    return res;
+}
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
+ *
+ * v1: vector multiplied with f2, then updated in place: v1[i] -= s2[i].
+ *     The products use the values v1 held before the update.
+ * f2: vector multiplied with v1 (only read).
+ * s2: vector subtracted from v1 (only read).
+ * Returns the sum of all v1[i] * f2[i], accumulated in one 32 bit register.
+ *
+ * Elements are handled in groups of four words. ORDER > 32 runs a loop of
+ * ORDER>>5 iterations with 8 groups (32 elements) each; otherwise straight
+ * line code handles 1 + 7 groups (ORDER > 16) or 1 + 3 groups. */
+static inline int32_t vector_sp_sub(int32_t* v1, int32_t* f2, int32_t* s2)
+{
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5; /* loop iterations */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n" /* start the sum at zero */
+    "1:                                  \n"
+#else /* no loop: the first group starts the sum with mul instead of mla */
+        "ldmia   %[v1],  {r0-r3}         \n"
+        "ldmia   %[f2]!, {r4-r7}         \n"
+        "mul     %[res], r4, r0          \n"
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r4-r7}         \n"
+        "sub     r0, r0, r4              \n"
+        "sub     r1, r1, r5              \n"
+        "sub     r2, r2, r6              \n"
+        "sub     r3, r3, r7              \n"
+        "stmia   %[v1]!, {r0-r3}         \n"
+#endif
+        REPEAT_BLOCK(
+        "ldmia   %[v1],  {r0-r3}         \n" /* 4 elements of v1; no writeback, v1 is advanced by the stmia */
+        "ldmia   %[f2]!, {r4-r7}         \n" /* 4 elements of f2 */
+        "mla     %[res], r4, r0, %[res]  \n" /* res += f2[i] * v1[i] */
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+        "ldmia   %[s2]!, {r4-r7}         \n" /* r4-r7 are reused for 4 elements of s2 */
+        "sub     r0, r0, r4              \n" /* v1[i] - s2[i] */
+        "sub     r1, r1, r5              \n"
+        "sub     r2, r2, r6              \n"
+        "sub     r3, r3, r7              \n"
+        "stmia   %[v1]!, {r0-r3}         \n" /* write the differences back and advance v1 */
+        )
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "cc", "memory"
+    );
+    return res;
+}
+static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
+{   /* Returns the sum of all v1[i] * v2[i], accumulated in one 32 bit
+     * register. Both vectors are only read.
+     * Elements are handled in groups of four words. ORDER > 32 runs a
+     * loop of ORDER>>5 iterations with 8 groups (32 elements) each;
+     * otherwise straight line code handles 1 + 7 groups (ORDER > 16) or
+     * 1 + 3 groups. */
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5; /* loop iterations */
+#endif
+    asm volatile (
+#if ORDER > 32
+        "mov     %[res], #0              \n" /* start the sum at zero */
+    "1:                                  \n"
+#else /* no loop: the first group starts the sum with mul instead of mla */
+        "ldmia   %[v1]!, {r0-r3}         \n"
+        "ldmia   %[v2]!, {r4-r7}         \n"
+        "mul     %[res], r4, r0          \n"
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+#endif
+        REPEAT_BLOCK(
+        "ldmia   %[v1]!, {r0-r3}         \n" /* 4 elements of v1 */
+        "ldmia   %[v2]!, {r4-r7}         \n" /* 4 elements of v2 */
+        "mla     %[res], r4, r0, %[res]  \n" /* res += v2[i] * v1[i] */
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+        )
+#if ORDER > 32
+        "subs    %[cnt], %[cnt], #1      \n"
+        "bne     1b                      \n"
+#endif
+        : /* outputs */
+#if ORDER > 32
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3",
+        "r4", "r5", "r6", "r7", "cc", "memory"
+    );
+    return res;
+}
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math_generic.h b/lib/rbcodec/codecs/demac/libdemac/vector_math_generic.h
new file mode 100644
index 0000000000..00bf07a007
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math_generic.h
@@ -0,0 +1,160 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#include "demac_config.h"
+/* Element-wise in-place addition: v1[i] += v2[i].
+ * Handles 16 elements for ORDER <= 16, 32 elements for ORDER <= 32 and
+ * (ORDER >> 5) * 32 elements for larger orders. The additions stay fully
+ * unrolled; the repetition is only spelled out through local macros. */
+static inline void vector_add(filter_int* v1, filter_int* v2)
+{
+#define VADD_X1  *v1++ += *v2++;
+#define VADD_X4  VADD_X1 VADD_X1 VADD_X1 VADD_X1
+#define VADD_X16 VADD_X4 VADD_X4 VADD_X4 VADD_X4
+#if ORDER > 32
+    int blocks_left;
+    /* One iteration per block of 32 elements. */
+    for (blocks_left = ORDER >> 5; blocks_left > 0; blocks_left--)
+#endif
+    {
+        VADD_X16
+#if ORDER > 16
+        VADD_X16
+#endif
+    }
+#undef VADD_X16
+#undef VADD_X4
+#undef VADD_X1
+}
+/* Element-wise in-place subtraction: v1[i] -= v2[i].
+ * Handles 16 elements for ORDER <= 16, 32 elements for ORDER <= 32 and
+ * (ORDER >> 5) * 32 elements for larger orders. The subtractions stay fully
+ * unrolled; the repetition is only spelled out through local macros. */
+static inline void vector_sub(filter_int* v1, filter_int* v2)
+{
+#define VSUB_X1  *v1++ -= *v2++;
+#define VSUB_X4  VSUB_X1 VSUB_X1 VSUB_X1 VSUB_X1
+#define VSUB_X16 VSUB_X4 VSUB_X4 VSUB_X4 VSUB_X4
+#if ORDER > 32
+    int blocks_left;
+    /* One iteration per block of 32 elements. */
+    for (blocks_left = ORDER >> 5; blocks_left > 0; blocks_left--)
+#endif
+    {
+        VSUB_X16
+#if ORDER > 16
+        VSUB_X16
+#endif
+    }
+#undef VSUB_X16
+#undef VSUB_X4
+#undef VSUB_X1
+}
+/* Scalar product: returns the sum of v1[i] * v2[i], accumulated in an int.
+ * Covers 16 elements for ORDER <= 16, 32 elements for ORDER <= 32 and
+ * (ORDER >> 5) * 32 elements for larger orders. The multiply-adds stay
+ * fully unrolled; the repetition is only spelled out through local macros. */
+static inline int32_t scalarproduct(filter_int* v1, filter_int* v2)
+{
+#define SPROD_X1  sum += *v1++ * *v2++;
+#define SPROD_X4  SPROD_X1 SPROD_X1 SPROD_X1 SPROD_X1
+#define SPROD_X16 SPROD_X4 SPROD_X4 SPROD_X4 SPROD_X4
+    int sum = 0;
+#if ORDER > 32
+    int blocks_left;
+    /* One iteration per block of 32 elements. */
+    for (blocks_left = ORDER >> 5; blocks_left > 0; blocks_left--)
+#endif
+    {
+        SPROD_X16
+#if ORDER > 16
+        SPROD_X16
+#endif
+    }
+#undef SPROD_X16
+#undef SPROD_X4
+#undef SPROD_X1
+    return sum;
+}