From b5716df4cb2837bbbc42195cf1aefcf03e21d6a6 Mon Sep 17 00:00:00 2001
From: Sean Bartell <wingedtachikoma@gmail.com>
Date: Fri, 24 Jun 2011 01:25:21 -0400
Subject: Build librbcodec with DSP and metadata.

All associated files are moved to /lib/rbcodec.

Change-Id: I572ddd2b8a996aae1e98c081d06b1ed356dce222
---
 lib/rbcodec/dsp/compressor.c       |  363 +++++++++
 lib/rbcodec/dsp/compressor.h       |   29 +
 lib/rbcodec/dsp/dsp.c              | 1573 ++++++++++++++++++++++++++++++++++++
 lib/rbcodec/dsp/dsp.h              |  125 +++
 lib/rbcodec/dsp/dsp_arm.S          |  561 +++++++++++++
 lib/rbcodec/dsp/dsp_arm_v6.S       |  127 +++
 lib/rbcodec/dsp/dsp_asm.h          |   86 ++
 lib/rbcodec/dsp/dsp_cf.S           |  611 ++++++++++++++
 lib/rbcodec/dsp/eq.c               |  268 ++++++
 lib/rbcodec/dsp/eq.h               |   50 ++
 lib/rbcodec/dsp/eq_arm.S           |   89 ++
 lib/rbcodec/dsp/eq_cf.S            |   91 +++
 lib/rbcodec/dsp/eqs/Acoustic.cfg   |   17 +
 lib/rbcodec/dsp/eqs/Bass.cfg       |   17 +
 lib/rbcodec/dsp/eqs/Classical.cfg  |   17 +
 lib/rbcodec/dsp/eqs/Default.cfg    |   17 +
 lib/rbcodec/dsp/eqs/Disco.cfg      |   17 +
 lib/rbcodec/dsp/eqs/Electronic.cfg |   17 +
 lib/rbcodec/dsp/eqs/Hip-Hop.cfg    |   17 +
 lib/rbcodec/dsp/eqs/Jazz.cfg       |   17 +
 lib/rbcodec/dsp/eqs/Lounge.cfg     |   17 +
 lib/rbcodec/dsp/eqs/Pop.cfg        |   17 +
 lib/rbcodec/dsp/eqs/R&B.cfg        |   17 +
 lib/rbcodec/dsp/eqs/Rock.cfg       |   17 +
 lib/rbcodec/dsp/eqs/Vocal.cfg      |   17 +
 lib/rbcodec/dsp/tdspeed.c          |  450 +++++++++++
 lib/rbcodec/dsp/tdspeed.h          |   49 ++
 27 files changed, 4693 insertions(+)
 create mode 100644 lib/rbcodec/dsp/compressor.c
 create mode 100644 lib/rbcodec/dsp/compressor.h
 create mode 100644 lib/rbcodec/dsp/dsp.c
 create mode 100644 lib/rbcodec/dsp/dsp.h
 create mode 100644 lib/rbcodec/dsp/dsp_arm.S
 create mode 100644 lib/rbcodec/dsp/dsp_arm_v6.S
 create mode 100644 lib/rbcodec/dsp/dsp_asm.h
 create mode 100644 lib/rbcodec/dsp/dsp_cf.S
 create mode 100644 lib/rbcodec/dsp/eq.c
 create mode 100644 lib/rbcodec/dsp/eq.h
 create mode 100644 lib/rbcodec/dsp/eq_arm.S
 create mode 100644 lib/rbcodec/dsp/eq_cf.S
 create mode 100644 lib/rbcodec/dsp/eqs/Acoustic.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Bass.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Classical.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Default.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Disco.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Electronic.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Hip-Hop.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Jazz.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Lounge.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Pop.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/R&B.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Rock.cfg
 create mode 100644 lib/rbcodec/dsp/eqs/Vocal.cfg
 create mode 100644 lib/rbcodec/dsp/tdspeed.c
 create mode 100644 lib/rbcodec/dsp/tdspeed.h

(limited to 'lib/rbcodec/dsp')

diff --git a/lib/rbcodec/dsp/compressor.c b/lib/rbcodec/dsp/compressor.c
new file mode 100644
index 0000000000..3a8d52e4da
--- /dev/null
+++ b/lib/rbcodec/dsp/compressor.c
@@ -0,0 +1,363 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2009 Jeffrey Goode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+#include "fixedpoint.h"
+#include "fracmul.h"
+#include "settings.h"
+#include "dsp.h"
+#include "compressor.h"
+
+/* Define LOGF_ENABLE to enable logf output in this file */
+/*#define LOGF_ENABLE*/
+#include "logf.h"
+
+static int32_t comp_rel_slope IBSS_ATTR;   /* S7.24: gain added per sample while releasing */
+static int32_t comp_makeup_gain IBSS_ATTR; /* S7.24: makeup gain, UNITY when auto gain is off */
+static int32_t comp_curve[66] IBSS_ATTR;   /* S7.24: gain table; [0..64] unclipped levels, [65] clipped */
+static int32_t release_gain IBSS_ATTR;     /* S7.24: gain currently held by the release stage */
+
+#define UNITY (1L << 24)                   /* unity gain in S7.24 format */
+
+/** COMPRESSOR UPDATE
+ *  Called via the menu system to configure the compressor from global_settings; returns true when it is active (threshold < 0) */
+bool compressor_update(void)
+{
+    static int curr_set[5];
+    int new_set[5] = {
+        global_settings.compressor_threshold,
+        global_settings.compressor_makeup_gain,
+        global_settings.compressor_ratio,
+        global_settings.compressor_knee,
+        global_settings.compressor_release_time};
+    
+    /* translate the raw menu values into working quantities */
+    int  threshold  =  new_set[0];
+    bool auto_gain  = (new_set[1] == 1);
+    const int comp_ratios[] = {2, 4, 6, 10, 0}; /* 0 selects hard limiting */
+    int  ratio      =  comp_ratios[new_set[2]];
+    bool soft_knee  = (new_set[3] == 1);
+    int  release    =  new_set[4] * NATIVE_FREQUENCY / 1000; /* in samples if the setting is in ms -- TODO confirm units */
+
+    bool changed = false;
+    bool active  = (threshold < 0); /* the compressor only runs with a negative threshold */
+
+    for (int i = 0; i < 5; i++)
+    {
+        if (curr_set[i] != new_set[i])
+        {
+            changed = true;
+            curr_set[i] = new_set[i];
+            
+#if defined(ROCKBOX_HAS_LOGF) && defined(LOGF_ENABLE)
+            switch (i)
+            {
+            case 0:
+                logf("   Compressor Threshold: %d dB\tEnabled: %s",
+                    threshold, active ? "Yes" : "No");
+                break;
+            case 1:
+                logf("   Compressor Makeup Gain: %s",
+                    auto_gain ? "Auto" : "Off");
+                break;
+            case 2:
+                if (ratio)
+                    { logf("   Compressor Ratio: %d:1", ratio); }
+                else
+                    { logf("   Compressor Ratio: Limit"); }
+                break;
+            case 3:
+                logf("   Compressor Knee: %s", soft_knee?"Soft":"Hard");
+                break;
+            case 4:
+                logf("   Compressor Release: %d", release);
+                break;
+            }
+#endif
+        }
+    }
+
+    if (changed && active)
+    {
+        /* configure variables for compressor operation */
+        static const int32_t db[] = {
+            /* positive db equivalents in S15.16 format */
+            0x000000, 0x241FA4, 0x1E1A5E, 0x1A94C8,
+            0x181518, 0x1624EA, 0x148F82, 0x1338BD,
+            0x120FD2, 0x1109EB, 0x101FA4, 0x0F4BB6,
+            0x0E8A3C, 0x0DD840, 0x0D3377, 0x0C9A0E,
+            0x0C0A8C, 0x0B83BE, 0x0B04A5, 0x0A8C6C,
+            0x0A1A5E, 0x09ADE1, 0x094670, 0x08E398,
+            0x0884F6, 0x082A30, 0x07D2FA, 0x077F0F,
+            0x072E31, 0x06E02A, 0x0694C8, 0x064BDF,
+            0x060546, 0x05C0DA, 0x057E78, 0x053E03,
+            0x04FF5F, 0x04C273, 0x048726, 0x044D64,
+            0x041518, 0x03DE30, 0x03A89B, 0x037448,
+            0x03412A, 0x030F32, 0x02DE52, 0x02AE80,
+            0x027FB0, 0x0251D6, 0x0224EA, 0x01F8E2,
+            0x01CDB4, 0x01A359, 0x0179C9, 0x0150FC,
+            0x0128EB, 0x010190, 0x00DAE4, 0x00B4E1,
+            0x008F82, 0x006AC1, 0x004699, 0x002305};
+        
+        struct curve_point
+        {
+            int32_t db;     /* S15.16 format */
+            int32_t offset; /* S15.16 format */
+        } db_curve[5];
+        
+        /** Set up the shape of the compression curve first as decibel
+            values */
+        /* db_curve[0] = bottom of knee
+                   [1] = threshold
+                   [2] = top of knee
+                   [3] = 0 db input
+                   [4] = ~+12db input (2 bits clipping overhead) */
+        
+        db_curve[1].db = threshold << 16;
+        if (soft_knee)
+        {
+            /* bottom of knee is 3dB below the threshold for soft knee */
+            db_curve[0].db = db_curve[1].db - (3 << 16);
+            /* top of knee is 3dB above the threshold for soft knee */
+            db_curve[2].db = db_curve[1].db + (3 << 16);
+            if (ratio)
+                /* offset = -3db * (ratio - 1) / ratio */
+                db_curve[2].offset = (int32_t)((long long)(-3 << 16)
+                    * (ratio - 1) / ratio);
+            else
+                /* offset = -3db for hard limit */
+                db_curve[2].offset = (-3 << 16);
+        }
+        else
+        {
+            /* bottom of knee is at the threshold for hard knee */
+            db_curve[0].db = threshold << 16;
+            /* top of knee is at the threshold for hard knee */
+            db_curve[2].db = threshold << 16;
+            db_curve[2].offset = 0;
+        }
+        
+        /* Calculate 0db and ~+12db offsets */
+        db_curve[4].db = 0xC0A8C; /* db of 2 bits clipping */
+        if (ratio)
+        {
+            /* offset = threshold * (ratio - 1) / ratio */
+            db_curve[3].offset = (int32_t)((long long)(threshold << 16)
+                * (ratio - 1) / ratio);
+            db_curve[4].offset = (int32_t)((long long)-db_curve[4].db
+                * (ratio - 1) / ratio) + db_curve[3].offset;
+        }
+        else
+        {
+            /* offset = threshold for hard limit */
+            db_curve[3].offset = (threshold << 16);
+            db_curve[4].offset = -db_curve[4].db + db_curve[3].offset;
+        }
+        
+        /** Now set up the comp_curve table with compression offsets in the
+            form of gain factors in S7.24 format */
+        /* comp_curve[0] is 0 (-infinity db) input */
+        comp_curve[0] = UNITY;
+        /* comp_curve[1 to 63] are intermediate compression values
+           corresponding to the 6 MSB of the input values of a non-clipped
+           signal */
+        for (int i = 1; i < 64; i++)
+        {
+            /* db constants are stored as positive numbers;
+               make them negative here */
+            int32_t this_db = -db[i];
+            
+            /* no compression below the knee */
+            if (this_db <= db_curve[0].db)
+                comp_curve[i] = UNITY;
+            
+            /* if soft knee and below top of knee,
+               interpolate along soft knee slope */
+            else if (soft_knee && (this_db <= db_curve[2].db))
+                comp_curve[i] = fp_factor(fp_mul(
+                    ((this_db - db_curve[0].db) / 6),
+                    db_curve[2].offset, 16), 16) << 8;
+            
+            /* interpolate along ratio slope above the knee */
+            else
+                comp_curve[i] = fp_factor(fp_mul(
+                    fp_div((db_curve[1].db - this_db), db_curve[1].db, 16),
+                    db_curve[3].offset, 16), 16) << 8;
+        }
+        /* comp_curve[64] is the compression level of a maximum level,
+           non-clipped signal */
+        comp_curve[64] = fp_factor(db_curve[3].offset, 16) << 8;
+        
+        /* comp_curve[65] is the compression level of a maximum level,
+           clipped signal */
+        comp_curve[65] = fp_factor(db_curve[4].offset, 16) << 8;
+        
+#if defined(ROCKBOX_HAS_LOGF) && defined(LOGF_ENABLE)
+        logf("\n   *** Compression Offsets ***");
+        /* some settings for display only, not used in calculations */
+        db_curve[0].offset = 0;
+        db_curve[1].offset = 0;
+        db_curve[3].db = 0;
+        
+        for (int i = 0; i <= 4; i++)
+        {
+            logf("Curve[%d]: db: % 6.2f\toffset: % 6.2f", i,
+                (float)db_curve[i].db / (1 << 16),
+                (float)db_curve[i].offset / (1 << 16));
+        }
+        
+        logf("\nGain factors:");
+        for (int i = 1; i <= 65; i++)
+        {
+            debugf("%02d: %.6f  ", i, (float)comp_curve[i] / UNITY);
+            if (i % 4 == 0) debugf("\n");
+        }
+        debugf("\n");
+#endif
+        
+        /* if using auto peak, then makeup gain is max offset -
+           .1dB headroom */
+        comp_makeup_gain = auto_gain ?
+            fp_factor(-(db_curve[3].offset) - 0x199A, 16) << 8 : UNITY;
+        logf("Makeup gain:\t%.6f", (float)comp_makeup_gain / UNITY);
+
+        /* calculate per-sample gain change at a rate of 10db over release time;
+           NOTE(review): divides by release, so assumes a nonzero release setting -- confirm */
+        comp_rel_slope = 0xAF0BB2 / release;
+        logf("Release slope:\t%.6f", (float)comp_rel_slope / UNITY);
+        
+        release_gain = UNITY;
+    }
+
+    return active;
+}
+
+/** GET COMPRESSION GAIN
+ *  Returns the required gain factor in S7.24 format in order to compress the
+ *  sample in accordance with the compression curve.  Always 1 or less;
+ *  returns -1 when the sample exceeds the 2 bits of clipping overhead. */
+static inline int32_t get_compression_gain(struct dsp_data *data,
+                                           int32_t sample)
+{
+    const int frac_bits_offset = data->frac_bits - 15;
+    
+    /* work on the magnitude; -(sample + 1) cannot overflow for the most
+       negative value */
+    if (sample < 0)
+        sample = -(sample + 1);
+        
+    /* shift sample into 15 frac bit range */
+    if (frac_bits_offset > 0)
+        sample >>= frac_bits_offset;
+    if (frac_bits_offset < 0)
+        sample <<= -frac_bits_offset;
+    
+    /* normal case: sample isn't clipped */
+    if (sample < (1 << 15))
+    {
+        /* index is 6 MSB, rem is 9 LSB moved up to the top 9 fraction bits */
+        int index = sample >> 9;
+        int32_t rem = (sample & 0x1FF) << 22;
+        
+        /* interpolate from the compression curve:
+            higher gain - ((rem / (1 << 31)) * (higher gain - lower gain)) */
+        return comp_curve[index] - (FRACMUL(rem,
+            (comp_curve[index] - comp_curve[index + 1])));
+    }
+    /* sample is somewhat clipped, up to 2 bits of overhead */
+    if (sample < (1 << 17))
+    {
+        /* straight interpolation between comp_curve[64] and comp_curve[65]:
+            higher gain - ((clipped portion of sample * 4/3
+            / (1 << 31)) * (higher gain - lower gain)) */
+        return comp_curve[64] - (FRACMUL(((sample - (1 << 15)) / 3) << 16,
+            (comp_curve[64] - comp_curve[65])));
+    }
+    
+    /* sample is too clipped, return invalid value (callers test for > 0) */
+    return -1;
+}
+
+/** COMPRESSOR PROCESS
+ *  Changes the gain of count samples per channel, in place in buf[], according
+ *  to the compressor curve; every channel of a sample gets the same gain */
+void compressor_process(int count, struct dsp_data *data, int32_t *buf[])
+{
+    const int num_chan = data->num_channels;
+    int32_t *in_buf[2] = {buf[0], buf[1]}; /* local cursors; buf[] itself is not modified */
+    
+    while (count-- > 0)
+    {
+        int ch;
+        /* use lowest (most compressed) gain factor of the output buffer
+           sample pair for both samples (mono is also handled correctly here)
+         */
+        int32_t sample_gain = UNITY;
+        for (ch = 0; ch < num_chan; ch++)
+        {
+            int32_t this_gain = get_compression_gain(data, *in_buf[ch]);
+            if (this_gain < sample_gain)
+                sample_gain = this_gain;
+        }
+        
+        /* perform release slope; skip if no compression and no release slope
+         */
+        if ((sample_gain != UNITY) || (release_gain != UNITY))
+        {
+            /* if larger offset than previous slope, start new release slope
+             */
+            if ((sample_gain <= release_gain) && (sample_gain > 0))
+            {
+                release_gain = sample_gain;
+            }
+            else
+            /* keep sloping towards unity gain (and ignore invalid value) */
+            {
+                release_gain += comp_rel_slope;
+                if (release_gain > UNITY)
+                {
+                    release_gain = UNITY;
+                }
+            }
+        }
+        
+        /* total gain factor is the product of release gain and makeup gain,
+           but avoid computation if possible */
+        int32_t total_gain = ((release_gain == UNITY) ? comp_makeup_gain :
+            (comp_makeup_gain == UNITY) ? release_gain :
+                FRACMUL_SHL(release_gain, comp_makeup_gain, 7));
+        
+        /* Implement the compressor: apply total gain factor (if any) to the
+           output buffer sample pair/mono sample */
+        if (total_gain != UNITY)
+        {
+            for (ch = 0; ch < num_chan; ch++)
+            {
+                *in_buf[ch] = FRACMUL_SHL(total_gain, *in_buf[ch], 7);
+            }
+        }
+        in_buf[0]++;
+        in_buf[1]++; /* advanced unconditionally; only dereferenced when ch < num_chan */
+    }
+}
+
+void compressor_reset(void)
+{   /* Abandon any release slope in progress: the release stage restarts at unity gain */
+    release_gain = UNITY;
+}
diff --git a/lib/rbcodec/dsp/compressor.h b/lib/rbcodec/dsp/compressor.h
new file mode 100644
index 0000000000..6154372e05
--- /dev/null
+++ b/lib/rbcodec/dsp/compressor.h
@@ -0,0 +1,29 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2009 Jeffrey Goode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef COMPRESSOR_H
+#define COMPRESSOR_H
+
+void compressor_process(int count, struct dsp_data *data, int32_t *buf[]); /* compress count samples per channel in place */
+bool compressor_update(void); /* re-read the compressor settings; true if the compressor is active */
+void compressor_reset(void);  /* set the release gain back to unity */
+
+#endif /* COMPRESSOR_H */
diff --git a/lib/rbcodec/dsp/dsp.c b/lib/rbcodec/dsp/dsp.c
new file mode 100644
index 0000000000..4da555747b
--- /dev/null
+++ b/lib/rbcodec/dsp/dsp.c
@@ -0,0 +1,1573 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 Miika Pekkarinen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+#include "system.h"
+#include <sound.h>
+#include "dsp.h"
+#include "dsp-util.h"
+#include "eq.h"
+#include "compressor.h"
+#include "kernel.h"
+#include "settings.h"
+#include "replaygain.h"
+#include "tdspeed.h"
+#include "core_alloc.h"
+#include "fixedpoint.h"
+#include "fracmul.h"
+
+/* Define LOGF_ENABLE to enable logf output in this file */
+/*#define LOGF_ENABLE*/
+#include "logf.h"
+
+/* 16-bit samples are scaled based on these constants. The shift should be
+ * no more than 15.
+ */
+#define WORD_SHIFT              12  /* left shift applied to 16-bit input samples */
+#define WORD_FRACBITS           27  /* equals 15 + WORD_SHIFT */
+
+#define NATIVE_DEPTH            16  /* sample depths above this use the 32-bit input paths */
+#define SMALL_SAMPLE_BUF_COUNT  128 /* Per channel */
+#define DEFAULT_GAIN            0x01000000 /* 1 << 24 */
+
+/* enums to index the conversion functions by stereo mode and sample depth */
+enum
+{
+    SAMPLE_INPUT_LE_NATIVE_I_STEREO  = STEREO_INTERLEAVED,
+    SAMPLE_INPUT_LE_NATIVE_NI_STEREO = STEREO_NONINTERLEAVED,
+    SAMPLE_INPUT_LE_NATIVE_MONO      = STEREO_MONO,
+    SAMPLE_INPUT_GT_NATIVE_I_STEREO  = STEREO_INTERLEAVED + STEREO_NUM_MODES,
+    SAMPLE_INPUT_GT_NATIVE_NI_STEREO = STEREO_NONINTERLEAVED + STEREO_NUM_MODES,
+    SAMPLE_INPUT_GT_NATIVE_MONO      = STEREO_MONO + STEREO_NUM_MODES,
+    SAMPLE_INPUT_GT_NATIVE_1ST_INDEX = STEREO_NUM_MODES /* added to stereo_mode when sample_depth > NATIVE_DEPTH */
+};
+
+enum /* NOTE(review): presumably indexes the sample output functions -- usage not visible here */
+{
+    SAMPLE_OUTPUT_MONO = 0,
+    SAMPLE_OUTPUT_STEREO,
+    SAMPLE_OUTPUT_DITHERED_MONO,
+    SAMPLE_OUTPUT_DITHERED_STEREO
+};
+
+/* No asm...yet; trailing comments are member byte offsets (with 32-bit long) */
+struct dither_data
+{
+    long error[3];  /* 00h */
+    long random;    /* 0ch */
+                    /* 10h - total size */
+};
+
+struct crossfeed_data /* leading hex in each comment is the member's byte offset */
+{
+    int32_t gain;           /* 00h - Direct path gain */
+    int32_t coefs[3];       /* 04h - Coefficients for the shelving filter */
+    int32_t history[4];     /* 10h - Format is x[n - 1], y[n - 1] for both channels */
+    int32_t delay[13][2];   /* 20h - Delay line storage */
+    int32_t *index;         /* 88h - Current pointer into the delay line */
+                            /* 8ch - total size with a 32-bit pointer */
+};
+
+/* Current setup is one lowshelf filter, three peaking filters and one
+ *  highshelf filter. Varying the number of shelving filters makes no sense,
+ *  but adding peaking filters is possible.
+ */
+struct eq_state
+{
+    char enabled[5];            /* 00h - Flags for active filters */
+    struct eqfilter filters[5]; /* 08h - packing is 4? */
+                                /* 10ch */
+};
+
+/* Include header with defines which functions are implemented in assembly
+   code for the target */
+#include <dsp_asm.h>
+
+/* Signatures of the pipeline stages stored as function pointers in struct dsp_config */
+typedef void (*sample_input_fn_type)(int count, const char *src[],
+                                     int32_t *dst[]); /* advances src[], points dst[] at the samples */
+typedef int (*resample_fn_type)(int count, struct dsp_data *data,
+                                const int32_t *src[], int32_t *dst[]);
+typedef void (*sample_output_fn_type)(int count, struct dsp_data *data,
+                                      const int32_t *src[], int16_t *dst);
+
+/* Single-DSP channel processing in place */
+typedef void (*channels_process_fn_type)(int count, int32_t *buf[]);
+/* DSP local channel processing in place */
+typedef void (*channels_process_dsp_fn_type)(int count, struct dsp_data *data,
+                                             int32_t *buf[]); /* e.g. compressor_process */
+
+/*
+ ***************************************************************************/
+
+struct dsp_config
+{
+    struct dsp_data data; /* Config members for use in external routines */
+    long codec_frequency; /* Sample rate of data coming from the codec */
+    long frequency;       /* Effective sample rate after pitch shift (if any) */
+    int  sample_depth;    /* compared against NATIVE_DEPTH to pick the input converter */
+    int  sample_bytes;
+    int  stereo_mode;     /* one of the STEREO_* modes */
+    int32_t  tdspeed_percent; /* Speed% * PITCH_SPEED_PRECISION */
+#ifdef HAVE_PITCHSCREEN
+    bool tdspeed_active;  /* Timestretch is in use */
+#endif
+#ifdef HAVE_SW_TONE_CONTROLS
+    /* Filter struct for software bass/treble controls */
+    struct eqfilter tone_filter;
+#endif
+    /* Functions that change depending upon settings - NULL if stage is
+       disabled */
+    sample_input_fn_type         input_samples;
+    resample_fn_type             resample;
+    sample_output_fn_type        output_samples;
+    /* These are NULL for the voice codec, which is more economical that
+       way */
+    channels_process_dsp_fn_type apply_gain;
+    channels_process_fn_type     apply_crossfeed;
+    channels_process_fn_type     eq_process;
+    channels_process_fn_type     channels_process;
+    channels_process_dsp_fn_type compressor_process;
+};
+
+/* General DSP config (in the legends below A = audio DSP, V = voice DSP) */
+static struct dsp_config dsp_conf[2] IBSS_ATTR;     /* 0=A, 1=V */
+/* Dithering */
+static struct dither_data dither_data[2] IBSS_ATTR; /* 0=left, 1=right */
+static long   dither_mask IBSS_ATTR;
+static long   dither_bias IBSS_ATTR;
+/* Crossfeed */
+struct crossfeed_data crossfeed_data IDATA_ATTR =    /* A */
+{
+    .index = (int32_t *)crossfeed_data.delay /* delay-line pointer starts at the first entry */
+};
+
+/* Equalizer */
+static struct eq_state eq_data;                     /* A */
+
+/* Software tone controls */
+#ifdef HAVE_SW_TONE_CONTROLS
+static int prescale;                                /* A/V */
+static int bass;                                    /* A/V */
+static int treble;                                  /* A/V */
+#endif
+
+/* Settings applicable to audio codec only */
+#ifdef HAVE_PITCHSCREEN
+static int32_t  pitch_ratio = PITCH_SPEED_100;
+static int  big_sample_locks; /* while > 0, move_callback refuses to relocate the big buffer */
+#endif
+static int  channels_mode;
+       long dsp_sw_gain;  /* not static; NOTE(review): presumably referenced from other files -- confirm */
+       long dsp_sw_cross; /* not static; same caveat as dsp_sw_gain */
+static bool dither_enabled;
+static long eq_precut;
+static long track_gain;
+static bool new_gain;
+static long album_gain;
+static long track_peak;
+static long album_peak;
+static long replaygain;
+static bool crossfeed_enabled;
+
+#define AUDIO_DSP (dsp_conf[CODEC_IDX_AUDIO]) /* the audio codec's DSP config */
+#define VOICE_DSP (dsp_conf[CODEC_IDX_VOICE]) /* the voice codec's DSP config */
+
+/* The internal format is 32-bit samples, non-interleaved, stereo. This
+ * format is similar to the raw output from several codecs, so the amount
+ * of copying needed is minimized for that case.
+ */
+
+#define RESAMPLE_RATIO              4 /* Enough for 11,025 Hz -> 44,100 Hz */
+#define SMALL_RESAMPLE_BUF_COUNT    (SMALL_SAMPLE_BUF_COUNT * RESAMPLE_RATIO)
+#define BIG_SAMPLE_BUF_COUNT        SMALL_RESAMPLE_BUF_COUNT /* same size: big_sample_buf reuses small_resample_buf */
+#define BIG_RESAMPLE_BUF_COUNT      (BIG_SAMPLE_BUF_COUNT * RESAMPLE_RATIO)
+
+static int32_t small_sample_buf[2][SMALL_SAMPLE_BUF_COUNT] IBSS_ATTR;
+static int32_t small_resample_buf[2][SMALL_RESAMPLE_BUF_COUNT] IBSS_ATTR;
+
+#ifdef HAVE_PITCHSCREEN
+static int32_t (* big_sample_buf)[BIG_SAMPLE_BUF_COUNT] = NULL;     /* non-NULL once timestretch buffers are set up */
+static int32_t (* big_resample_buf)[BIG_RESAMPLE_BUF_COUNT] = NULL; /* buflib allocation made in dsp_timestretch_enable */
+#endif
+
+static int sample_buf_count = SMALL_SAMPLE_BUF_COUNT;     /* per-channel capacity of sample_buf[] */
+static int32_t *sample_buf[2] = { small_sample_buf[0], small_sample_buf[1] };
+static int resample_buf_count = SMALL_RESAMPLE_BUF_COUNT; /* per-channel capacity of resample_buf[] */
+static int32_t *resample_buf[2] = { small_resample_buf[0], small_resample_buf[1] };
+
+#ifdef HAVE_PITCHSCREEN
+int32_t sound_get_pitch(void)
+{   /* Returns the value last stored by sound_set_pitch (initially PITCH_SPEED_100) */
+    return pitch_ratio;
+}
+
+void sound_set_pitch(int32_t percent)
+{   /* Stores the new pitch ratio, then reconfigures the audio DSP frequency */
+    pitch_ratio = percent;
+    dsp_configure(&AUDIO_DSP, DSP_SWITCH_FREQUENCY,
+                  AUDIO_DSP.codec_frequency); /* re-run the frequency switch with the unchanged codec rate */
+}
+
+static void tdspeed_set_pointers( bool time_stretch_active )
+{   /* Aim sample_buf/resample_buf (and their capacities) at one buffer set */
+    if( !time_stretch_active )
+    {   /* no timestretch: use the small static buffers */
+        sample_buf_count = SMALL_SAMPLE_BUF_COUNT;
+        resample_buf_count = SMALL_RESAMPLE_BUF_COUNT;
+        for (int ch = 0; ch < 2; ch++)
+        {
+            sample_buf[ch] = small_sample_buf[ch];
+            resample_buf[ch] = small_resample_buf[ch];
+        }
+        return;
+    }
+    sample_buf_count = BIG_SAMPLE_BUF_COUNT;   /* timestretch: big buffers */
+    resample_buf_count = BIG_RESAMPLE_BUF_COUNT;
+    for (int ch = 0; ch < 2; ch++)
+    {
+        sample_buf[ch] = big_sample_buf[ch];
+        resample_buf[ch] = big_resample_buf[ch];
+    }
+}
+ 
+static void tdspeed_setup(struct dsp_config *dspc)
+{
+    /* Assume timestretch will not be used: clear the flag, select small buffers */
+    dspc->tdspeed_active = false;
+
+    tdspeed_set_pointers( false );
+
+    if (!dsp_timestretch_available())
+        return; /* Timestretch not enabled or buffer not allocated */
+
+    if (dspc->tdspeed_percent == 0)
+        dspc->tdspeed_percent = PITCH_SPEED_100; /* 0 is treated as unset */
+
+    if (!tdspeed_config(
+        dspc->codec_frequency == 0 ? NATIVE_FREQUENCY : dspc->codec_frequency, /* NATIVE_FREQUENCY until a codec rate is set */
+        dspc->stereo_mode != STEREO_MONO,
+        dspc->tdspeed_percent))
+        return; /* Timestretch not possible or needed with these parameters */
+
+    /* Timestretch is to be used: mark it active and switch to the big buffers */
+    dspc->tdspeed_active = true;
+
+    tdspeed_set_pointers( true );
+}
+
+
+static int move_callback(int handle, void* current, void* new)
+{
+    (void)handle;(void)current;
+    /* Buflib move callback for the "resample buf" allocation. */
+    if ( big_sample_locks > 0 )
+        return BUFLIB_CB_CANNOT_MOVE; /* buffer is locked: veto the move */
+    /* The handle's data is big_resample_buf (set from core_get_data in
+     * dsp_timestretch_enable); big_sample_buf aliases a static array. */
+    big_resample_buf = new;
+    /* no allocation without timestretch enabled */
+    tdspeed_set_pointers( true );
+    return BUFLIB_CB_OK;
+}
+
+static void lock_sample_buf( bool lock )
+{
+    /* Counting lock: while the count is positive, move_callback refuses
+     * to let the buffer be relocated. */
+    const int step = lock ? 1 : -1;
+    big_sample_locks += step;
+}
+
+static struct buflib_callbacks ops = { /* passed to core_alloc_ex in dsp_timestretch_enable */
+    .move_callback = move_callback,
+    .shrink_callback = NULL,
+};
+
+
+void dsp_timestretch_enable(bool enabled)
+{
+    /* Hook to set up timestretch buffer on first call to settings_apply() */
+    static int handle = -1;
+    if (enabled)
+    {
+        if (big_sample_buf)
+            return; /* already allocated and enabled */
+
+        /* Set up timestretch buffers */
+        big_sample_buf = &small_resample_buf[0]; /* reuse the static resample buffer; BIG_SAMPLE_BUF_COUNT matches its size */
+        handle = core_alloc_ex("resample buf",
+                               2 * BIG_RESAMPLE_BUF_COUNT * sizeof(int32_t),
+                               &ops);
+        big_sample_locks = 0;
+        enabled = handle >= 0; /* on allocation failure, fall into the disable path below */
+
+        if (enabled)
+        {
+            /* success, now setup tdspeed */
+            big_resample_buf = core_get_data(handle);
+
+            tdspeed_init();
+            tdspeed_setup(&AUDIO_DSP);
+        }
+    }
+
+    if (!enabled)
+    {
+        dsp_set_timestretch(PITCH_SPEED_100); /* reset the speed before releasing the buffers */
+        tdspeed_finish();
+
+        if (handle >= 0)
+            core_free(handle);
+
+        handle = -1;
+        big_sample_buf = NULL; /* makes dsp_timestretch_available() report false */
+    }
+}
+
+void dsp_set_timestretch(int32_t percent)
+{   /* Stores the requested speed for the audio DSP, then re-runs the timestretch setup */
+    AUDIO_DSP.tdspeed_percent = percent;
+    tdspeed_setup(&AUDIO_DSP);
+}
+
+int32_t dsp_get_timestretch(void)
+{   /* Returns the audio DSP's stored timestretch speed (Speed% * PITCH_SPEED_PRECISION) */
+    return AUDIO_DSP.tdspeed_percent;
+}
+
+bool dsp_timestretch_available(void)
+{   /* True only when the setting is on AND the timestretch buffers have been set up */
+    return (global_settings.timestretch_enabled && big_sample_buf);
+}
+#endif /* HAVE_PITCHSCREEN */
+
+/* Convert count samples to the internal format, if needed.  Updates src
+ * to point past the samples "consumed" and dst is set to point to the
+ * samples to consume. Note that for mono, dst[0] equals dst[1], as there
+ * is no point in processing the same data twice.
+ */
+
+/* widen count 16-bit mono samples into sample_buf[0], scaled by WORD_SHIFT */
+static void sample_input_lte_native_mono(
+    int count, const char *src[], int32_t *dst[])
+{
+    const int16_t *in = (int16_t *) src[0];
+    const int16_t * const in_end = in + count;
+    int32_t *out = sample_buf[0];
+
+    /* mono: both channel pointers share the one converted buffer */
+    dst[1] = out;
+    dst[0] = out;
+    for (; in < in_end; in++)
+        *out++ = *in << WORD_SHIFT;
+
+    src[0] = (char *)in; /* report how far the source was consumed */
+}
+
+/* convert count 16-bit interleaved stereo (L,R,L,R...) to 32-bit noninterleaved */
+static void sample_input_lte_native_i_stereo(
+    int count, const char *src[], int32_t *dst[])
+{
+    const int32_t *s = (int32_t *) src[0]; /* one 32-bit word = one L/R frame */
+    const int32_t * const send = s + count;
+    int32_t *dl = dst[0] = sample_buf[0];
+    int32_t *dr = dst[1] = sample_buf[1];
+    int scale = WORD_SHIFT;
+
+    while (s < send)
+    {
+        int32_t slr = *s++;
+#ifdef ROCKBOX_LITTLE_ENDIAN
+        *dl++ = (int32_t)(int16_t)slr << scale; /* L is first in memory = low half */
+        *dr++ = (slr >> 16) << scale;           /* R is second = high half */
+#else  /* ROCKBOX_BIG_ENDIAN */
+        *dl++ = (slr >> 16) << scale;           /* L is first in memory = high half */
+        *dr++ = (int32_t)(int16_t)slr << scale; /* R is second = low half */
+#endif
+    }
+
+    src[0] = (char *)s; /* report how far the source was consumed */
+}
+
+/* widen count 16-bit samples per channel from two separate source buffers */
+static void sample_input_lte_native_ni_stereo(
+    int count, const char *src[], int32_t *dst[])
+{
+    const int16_t *in_l = (int16_t *) src[0];
+    const int16_t *in_r = (int16_t *) src[1];
+    const int16_t * const in_l_end = in_l + count;
+    int32_t *out_l = sample_buf[0];
+    int32_t *out_r = sample_buf[1];
+
+    dst[0] = out_l;
+    dst[1] = out_r;
+    for (; in_l < in_l_end; in_l++, in_r++)
+    {
+        *out_l++ = *in_l << WORD_SHIFT;
+        *out_r++ = *in_r << WORD_SHIFT;
+    }
+    src[0] = (char *)in_l; /* both sources advance by the same amount */
+    src[1] = (char *)in_r;
+}
+
+/* 32-bit mono is already in the internal sample format: no copy is made */
+static void sample_input_gt_native_mono(
+    int count, const char *src[], int32_t *dst[])
+{
+    dst[1] = dst[0] = (int32_t *)src[0];   /* both channels alias the input */
+    src[0] = (const char *)&dst[0][count]; /* consume count samples */
+}
+
+/* Split count 32-bit interleaved stereo frames into separate L/R buffers */
+static void sample_input_gt_native_i_stereo(
+    int count, const char *src[], int32_t *dst[])
+{
+    const int32_t *in = (int32_t *)src[0];
+    int32_t *left = dst[0] = sample_buf[0];
+    int32_t *right = dst[1] = sample_buf[1];
+    int frame;
+
+    for (frame = 0; frame < count; frame++)
+    {
+        left[frame] = in[2*frame];
+        right[frame] = in[2*frame + 1];
+    }
+
+    src[0] = (char *)(in + 2*count);
+}
+
+/* 32-bit noninterleaved stereo is used in place: dst[] aliases src[], no copy */
+static void sample_input_gt_native_ni_stereo(
+    int count, const char *src[], int32_t *dst[])
+{
+    dst[0] = (int32_t *)src[0];
+    dst[1] = (int32_t *)src[1];
+    src[0] = (char *)(dst[0] + count);      /* mark count samples as consumed */
+    src[1] = (char *)(dst[1] + count);
+}
+
+/**
+ * sample_input_new_format()
+ *
+ * Select the to-native sample conversion function from the dsp sample
+ * parameters and store it in dsp->input_samples.
+ * !DSPPARAMSYNC
+ * needs syncing with changes to the following dsp parameters:
+ *  * dsp->stereo_mode (A/V)
+ *  * dsp->sample_depth (A/V)
+ */
+static void sample_input_new_format(struct dsp_config *dsp)
+{
+    static const sample_input_fn_type sample_input_functions[] =
+    {
+        [SAMPLE_INPUT_LE_NATIVE_I_STEREO]  = sample_input_lte_native_i_stereo,
+        [SAMPLE_INPUT_LE_NATIVE_NI_STEREO] = sample_input_lte_native_ni_stereo,
+        [SAMPLE_INPUT_LE_NATIVE_MONO]      = sample_input_lte_native_mono,
+        [SAMPLE_INPUT_GT_NATIVE_I_STEREO]  = sample_input_gt_native_i_stereo,
+        [SAMPLE_INPUT_GT_NATIVE_NI_STEREO] = sample_input_gt_native_ni_stereo,
+        [SAMPLE_INPUT_GT_NATIVE_MONO]      = sample_input_gt_native_mono,
+    };
+
+    int convert = dsp->stereo_mode;   /* stereo mode is the base table index */
+
+    if (dsp->sample_depth > NATIVE_DEPTH)   /* deeper input: GT_NATIVE entries */
+        convert += SAMPLE_INPUT_GT_NATIVE_1ST_INDEX;
+
+    dsp->input_samples = sample_input_functions[convert];  /* index unchecked */
+}
+
+
+#ifndef DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+/* Round and clip each mono sample, storing it in both interleaved channels */
+static void sample_output_mono(int count, struct dsp_data *data,
+                               const int32_t *src[], int16_t *dst)
+{
+    const int32_t *in = src[0];
+    const int shift = data->output_scale;
+    const int half = 1 << (shift - 1);
+    int i;
+
+    for (i = 0; i < count; i++, dst += 2)
+    {
+        const int32_t v = clip_sample_16((in[i] + half) >> shift);
+        dst[0] = dst[1] = v;
+    }
+}
+#endif /* DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO */
+
+/* write count stereo internal-format samples to interleaved 16-bit output */
+#ifndef DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+static void sample_output_stereo(int count, struct dsp_data *data,
+                                 const int32_t *src[], int16_t *dst)
+{
+    const int32_t *s0 = src[0];
+    const int32_t *s1 = src[1];
+    const int scale = data->output_scale;
+    const int dc_bias = 1 << (scale - 1);   /* half an output step, for rounding */
+
+    while (count-- > 0)
+    {
+        *dst++ = clip_sample_16((*s0++ + dc_bias) >> scale);   /* src[0] first */
+        *dst++ = clip_sample_16((*s1++ + dc_bias) >> scale);   /* then src[1] */
+    }
+}
+#endif /* DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO */
+
+/**
+ * The "dither" code to convert the 24-bit samples produced by libmad was
+ * taken from the coolplayer project - coolplayer.sourceforge.net
+ *
+ * This function handles mono and stereo outputs.
+ */
+static void sample_output_dithered(int count, struct dsp_data *data,
+                                   const int32_t *src[], int16_t *dst)
+{
+    const int32_t mask = dither_mask;   /* set up by dither_init() */
+    const int32_t bias = dither_bias;   /* set up by dither_init() */
+    const int scale = data->output_scale;
+    const int32_t min = data->clip_min;
+    const int32_t max = data->clip_max;
+    const int32_t range = max - min;
+    int ch;
+    int16_t *d;
+
+    for (ch = 0; ch < data->num_channels; ch++)
+    {
+        struct dither_data * const dither = &dither_data[ch];   /* per-channel state */
+        const int32_t *s = src[ch];
+        int i;
+
+        for (i = 0, d = &dst[ch]; i < count; i++, s++, d += 2)  /* interleaved out */
+        {
+            int32_t output, sample;
+            int32_t random;
+
+            /* Noise shape and bias (for correct rounding later) */
+            sample = *s;
+            sample += dither->error[0] - dither->error[1] + dither->error[2];
+            dither->error[2] = dither->error[1];
+            dither->error[1] = dither->error[0]/2;
+
+            output = sample + bias;
+
+            /* Dither, highpass triangle PDF */
+            random = dither->random*0x0019660dL + 0x3c6ef35fL;
+            output += (random & mask) - (dither->random & mask);
+            dither->random = random;
+
+            /* Round sample to output range */
+            output &= ~mask;
+
+            /* Error feedback */
+            dither->error[0] = sample - output;
+
+            /* Clip: one unsigned compare catches both output < min and > max */
+            if ((uint32_t)(output - min) > (uint32_t)range)
+            {
+                int32_t c = min;
+                if (output > min)   /* overshoot, so clamp to max (min + range) */
+                    c += range;
+                output = c;
+            }
+
+            /* Quantize and store */
+            *d = output >> scale;
+        }
+    }
+
+    if (data->num_channels == 2)
+        return;
+
+    /* Have to duplicate left samples into the right channel since
+       pcm buffer and hardware is interleaved stereo */
+    d = &dst[0];
+
+    while (count-- > 0)
+    {
+        int16_t s = *d++;
+        *d++ = s;
+    }
+}
+
+/**
+ * sample_output_new_format()
+ *
+ * set the from-native to output sample conversion routine
+ *
+ * !DSPPARAMSYNC
+ * needs syncing with changes to the following dsp parameters:
+ *  * dsp->stereo_mode (A/V)
+ *  * dither_enabled (A)
+ */
+static void sample_output_new_format(struct dsp_config *dsp)
+{
+    static const sample_output_fn_type sample_output_functions[] =
+    {
+        sample_output_mono,         /* 0: one channel, no dither  */
+        sample_output_stereo,       /* 1: two channels, no dither */
+        sample_output_dithered,     /* 2: one channel, dithered   */
+        sample_output_dithered      /* 3: two channels, dithered  */
+    };
+
+    int out = dsp->data.num_channels - 1;
+
+    if (dsp == &AUDIO_DSP && dither_enabled)   /* dither is never used for voice */
+        out += 2;
+
+    dsp->output_samples = sample_output_functions[out];
+}
+
+/**
+ * Linear interpolation resampling that introduces a one sample delay because
+ * of our inability to look into the future at the end of a frame.
+ */
+#ifndef DSP_HAVE_ASM_RESAMPLING
+/* Linearly interpolate count input samples per channel from src[] into dst[]
+ * for delta > 1.0 (16.16). Returns the samples written per channel. */
+static int dsp_downsample(int count, struct dsp_data *data,
+                          const int32_t *src[], int32_t *dst[])
+{
+    int ch = data->num_channels - 1;
+    uint32_t delta = data->resample_data.delta;
+    uint32_t phase, pos;
+    int32_t *d;
+
+    /* Rolled channel loop actually showed slightly faster. */
+    do
+    {
+        /* Just initialize things and not worry too much about the relatively
+         * uncommon case of not being able to spit out a sample for the frame. */
+        const int32_t *s = src[ch];
+        int32_t last = data->resample_data.last_sample[ch];
+
+        data->resample_data.last_sample[ch] = s[count - 1];
+        d = dst[ch];
+        phase = data->resample_data.phase;
+        pos = phase >> 16;
+
+        while (pos < (uint32_t)count)
+        {
+            /* Previous frame's last sample is only needed at pos == 0; reading
+             * the neighbour here keeps every access within s[0..count-1]. */
+            if (pos > 0)
+                last = s[pos - 1];
+            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
+            phase += delta;
+            pos = phase >> 16;
+        }
+    }
+    while (--ch >= 0);
+
+    /* Wrap phase accumulator back to start of next frame. */
+    data->resample_data.phase = phase - (count << 16);
+    return d - dst[0];
+}
+
+static int dsp_upsample(int count, struct dsp_data *data,
+                        const int32_t *src[], int32_t *dst[])
+{   /* Returns samples written per channel; interpolation state lives in data */
+    int  ch = data->num_channels - 1;
+    uint32_t delta = data->resample_data.delta;   /* 16.16 input step per output */
+    uint32_t phase, pos;
+    int32_t *d;
+
+    /* Rolled channel loop actually showed slightly faster. */
+    do
+    {
+        /* Should always be able to output a sample for a ratio up to RESAMPLE_RATIO */
+        const int32_t *s = src[ch];
+        int32_t last = data->resample_data.last_sample[ch];
+
+        data->resample_data.last_sample[ch] = s[count - 1];  /* kept for next call */
+        d = dst[ch];
+        phase = data->resample_data.phase;   /* each channel restarts at same phase */
+        pos = phase >> 16;
+
+        while (pos == 0)   /* between the previous frame's last sample and s[0] */
+        {
+            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[0] - last);
+            phase += delta;
+            pos = phase >> 16;
+        }
+
+        while (pos < (uint32_t)count)   /* between s[pos - 1] and s[pos] */
+        {
+            last = s[pos - 1];
+            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
+            phase += delta;
+            pos = phase >> 16;
+        }
+    }
+    while (--ch >= 0);
+
+    /* Wrap phase accumulator back to start of next frame. */
+    data->resample_data.phase = phase & 0xffff;
+    return d - dst[0];   /* d ends on channel 0, the last one processed */
+}
+#endif /* DSP_HAVE_ASM_RESAMPLING */
+
+static void resampler_new_delta(struct dsp_config *dsp)
+{   /* Recompute the 16.16 resampling step and pick the matching resampler */
+    dsp->data.resample_data.delta = (unsigned long)
+        dsp->frequency * 65536LL / NATIVE_FREQUENCY;
+
+    if (dsp->frequency == NATIVE_FREQUENCY)
+    {
+        /* NOTE: If fully glitch-free transitions from no resampling to
+           resampling are desired, last_sample history should be maintained
+           even when not resampling. */
+        dsp->resample = NULL;   /* NULL means the resampling stage is skipped */
+        dsp->data.resample_data.phase = 0;
+        dsp->data.resample_data.last_sample[0] = 0;
+        dsp->data.resample_data.last_sample[1] = 0;
+    }
+    else if (dsp->frequency < NATIVE_FREQUENCY)
+        dsp->resample = dsp_upsample;
+    else
+        dsp->resample = dsp_downsample;
+}
+
+/* Resample count stereo samples into resample_buf. Updates the src array to
+ * refer to the resampled data. Returns number of stereo samples for further
+ * processing. Callers must check dsp->resample != NULL first.
+ */
+static inline int resample(struct dsp_config *dsp, int count, int32_t *src[])
+{
+    int32_t *dst[2] =
+    {
+        resample_buf[0],
+        resample_buf[1]
+    };
+    lock_sample_buf( true );
+    count = dsp->resample(count, &dsp->data, (const int32_t **)src, dst);
+
+    src[0] = dst[0];
+    src[1] = dst[dsp->data.num_channels - 1];   /* mono: both point at dst[0] */
+    lock_sample_buf( false );
+    return count;
+}
+
+static void dither_init(struct dsp_config *dsp)
+{   /* Clear dither history; derive rounding bias and mask from frac_bits */
+    memset(dither_data, 0, sizeof (dither_data));
+    dither_bias = (1L << (dsp->data.frac_bits - NATIVE_DEPTH));
+    dither_mask = (1L << (dsp->data.frac_bits + 1 - NATIVE_DEPTH)) - 1;
+}
+
+void dsp_dither_enable(bool enable)
+{   /* Record the dither setting and reselect the audio DSP's output routine */
+    struct dsp_config *dsp = &AUDIO_DSP;
+    dither_enabled = enable;
+    sample_output_new_format(dsp);
+}
+
+/* Applies crossfeed to the stereo signal in src.
+ * Crossfeed is a process where listening over speakers is simulated. This
+ * is good for old hard panned stereo records, which might be quite fatiguing
+ * to listen to on headphones with no crossfeed.
+ */
+#ifndef DSP_HAVE_ASM_CROSSFEED
+static void apply_crossfeed(int count, int32_t *buf[])
+{
+    int32_t *hist_l = &crossfeed_data.history[0];
+    int32_t *hist_r = &crossfeed_data.history[2];
+    int32_t *delay = &crossfeed_data.delay[0][0];
+    int32_t *coefs = &crossfeed_data.coefs[0];
+    int32_t gain = crossfeed_data.gain;
+    int32_t *di = crossfeed_data.index;   /* current slot in the delay line */
+
+    int32_t acc;
+    int32_t left, right;
+    int i;
+
+    for (i = 0; i < count; i++)
+    {
+        left = buf[0][i];
+        right = buf[1][i];
+
+        /* Filter delayed sample from left speaker */
+        acc = FRACMUL(*di, coefs[0]);
+        acc += FRACMUL(hist_l[0], coefs[1]);
+        acc += FRACMUL(hist_l[1], coefs[2]);
+        /* Save filter history for left speaker */
+        hist_l[1] = acc;
+        hist_l[0] = *di;
+        *di++ = left;   /* overwrite the consumed slot with the new left sample */
+        /* Filter delayed sample from right speaker */
+        acc = FRACMUL(*di, coefs[0]);
+        acc += FRACMUL(hist_r[0], coefs[1]);
+        acc += FRACMUL(hist_r[1], coefs[2]);
+        /* Save filter history for right speaker */
+        hist_r[1] = acc;
+        hist_r[0] = *di;
+        *di++ = right;  /* overwrite the consumed slot with the new right sample */
+        /* Now add the attenuated direct sound and write to outputs */
+        buf[0][i] = FRACMUL(left, gain) + hist_r[1];   /* left gets filtered right */
+        buf[1][i] = FRACMUL(right, gain) + hist_l[1];  /* right gets filtered left */
+
+        /* Wrap delay line index if bigger than delay line size */
+        if (di >= delay + 13*2)   /* 13 left/right pairs stored alternately */
+            di = delay;
+    }
+    /* Write back local copies of data we've modified */
+    crossfeed_data.index = di;
+}
+#endif /* DSP_HAVE_ASM_CROSSFEED */
+
+/**
+ * dsp_set_crossfeed(bool enable)
+ * Remembers the setting; crossfeed only runs when the audio DSP is stereo.
+ * !DSPPARAMSYNC
+ * needs syncing with changes to the following dsp parameters:
+ *  * dsp->stereo_mode (A)
+ */
+void dsp_set_crossfeed(bool enable)
+{
+    crossfeed_enabled = enable;
+    AUDIO_DSP.apply_crossfeed = (enable && AUDIO_DSP.data.num_channels > 1)
+                                    ? apply_crossfeed : NULL;
+}
+
+void dsp_set_crossfeed_direct_gain(int gain)
+{   /* Sets the gain apply_crossfeed() multiplies the direct signal by */
+    crossfeed_data.gain = get_replaygain_int(gain * 10) << 7;
+    /* If gain is negative, the calculation overflowed and we need to clamp */
+    if (crossfeed_data.gain < 0)
+        crossfeed_data.gain = 0x7fffffff;
+}
+
+/* Compute the crossfeed filter coefs. Both gains should be below 0 dB */
+void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain, long cutoff)
+{
+    int32_t *c = crossfeed_data.coefs;
+    long scaler = get_replaygain_int(lf_gain * 10) << 7;
+
+    cutoff = 0xffffffff/NATIVE_FREQUENCY*cutoff;  /* same scaling as the EQ cutoff */
+    hf_gain -= lf_gain;   /* the shelf gain is applied relative to lf_gain */
+    /* Divide cutoff by sqrt(10^(hf_gain/20)) to place cutoff at the -3 dB
+     * point instead of shelf midpoint. This is for compatibility with the old
+     * crossfeed shelf filter and should be removed if crossfeed settings are
+     * ever made incompatible for any other good reason.
+     */
+    cutoff = fp_div(cutoff, get_replaygain_int(hf_gain*5), 24);
+    filter_shelf_coefs(cutoff, hf_gain, false, c);
+    /* Scale coefs by LF gain and shift them to s0.31 format. We have no gains
+     * over 1 and can do this safely
+     */
+    c[0] = FRACMUL_SHL(c[0], scaler, 4);
+    c[1] = FRACMUL_SHL(c[1], scaler, 4);
+    c[2] <<= 4;   /* c[2] is only shifted, not scaled by the LF gain */
+}
+
+/* Apply a constant gain to the samples (e.g., for ReplayGain).
+ * Note that this must be called before the resampler.
+ */
+#ifndef DSP_HAVE_ASM_APPLY_GAIN
+static void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+{
+    const int32_t gain = data->gain;   /* S8.23, as stored by set_gain() */
+    int ch;
+
+    for (ch = 0; ch < data->num_channels; ch++)
+    {
+        int32_t *d = buf[ch];
+        int i;
+
+        for (i = 0; i < count; i++)   /* scaled in place */
+            d[i] = FRACMUL_SHL(d[i], gain, 8);
+    }
+}
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
+/* Combine all gains to a global gain and enable/disable the gain stage. */
+static void set_gain(struct dsp_config *dsp)
+{
+    /* gains are in S7.24 format */
+    dsp->data.gain = DEFAULT_GAIN;
+
+    /* Replay gain not relevant to voice */
+    if (dsp == &AUDIO_DSP && replaygain)
+    {
+        dsp->data.gain = replaygain;
+    }
+
+    if (dsp->eq_process && eq_precut)   /* precut only applies while EQ is on */
+    {
+        dsp->data.gain = fp_mul(dsp->data.gain, eq_precut, 24);
+    }
+
+#ifdef HAVE_SW_VOLUME_CONTROL
+    if (global_settings.volume < SW_VOLUME_MAX ||
+        global_settings.volume > SW_VOLUME_MIN)
+    {   /* NOTE(review): always true if SW_VOLUME_MAX > SW_VOLUME_MIN - confirm */
+        int vol_gain = get_replaygain_int(global_settings.volume * 100);
+        dsp->data.gain = (long) (((int64_t) dsp->data.gain * vol_gain) >> 24);
+    }
+#endif
+
+    if (dsp->data.gain == DEFAULT_GAIN)
+    {
+        dsp->data.gain = 0;     /* unity gain: 0 marks "no gain stage needed" */
+    }
+    else
+    {
+        dsp->data.gain >>= 1;   /* convert gain to S8.23 format */
+    }
+
+    dsp->apply_gain = dsp->data.gain != 0 ? dsp_apply_gain : NULL;
+}
+
+/**
+ * Update the amount to cut the audio before applying the equalizer.
+ * The sign is flipped before conversion; the audio DSP gain is then rebuilt.
+ * @param precut to apply in decibels (multiplied by 10)
+ */
+void dsp_set_eq_precut(int precut)
+{
+    eq_precut = get_replaygain_int(precut * -10);
+    set_gain(&AUDIO_DSP);
+}
+
+/**
+ * Synchronize the equalizer filter coefficients with the global settings.
+ *
+ * @param band the equalizer band to synchronize (not range-checked here)
+ */
+void dsp_set_eq_coefs(int band)
+{
+    /* Adjust setting pointer to the band we actually want to change */
+    struct eq_band_setting *setting = &global_settings.eq_band_settings[band];
+
+    /* Convert user settings to format required by coef generator functions */
+    unsigned long cutoff = 0xffffffff / NATIVE_FREQUENCY * setting->cutoff;
+    unsigned long q = setting->q;
+    int gain = setting->gain;
+
+    if (q == 0)   /* a zero Q is replaced by the smallest non-zero value */
+        q = 1;
+
+    /* NOTE: The coef functions assume the EMAC unit is in fractional mode,
+       which it should be, since we're executed from the main thread. */
+
+    /* Assume a band is disabled if the gain is zero */
+    if (gain == 0)
+    {
+        eq_data.enabled[band] = 0;
+    }
+    else
+    {
+        if (band == 0)          /* first band: low shelf */
+            eq_ls_coefs(cutoff, q, gain, eq_data.filters[band].coefs);
+        else if (band == 4)     /* last band: high shelf */
+            eq_hs_coefs(cutoff, q, gain, eq_data.filters[band].coefs);
+        else                    /* bands 1-3: peaking */
+            eq_pk_coefs(cutoff, q, gain, eq_data.filters[band].coefs);
+
+        eq_data.enabled[band] = 1;
+    }
+}
+
+/* Apply EQ filters, in place and in band order, to the enabled bands. */
+static void eq_process(int count, int32_t *buf[])
+{
+    static const int shifts[] =
+    {
+        EQ_SHELF_SHIFT,  /* low shelf  */
+        EQ_PEAK_SHIFT,   /* peaking    */
+        EQ_PEAK_SHIFT,   /* peaking    */
+        EQ_PEAK_SHIFT,   /* peaking    */
+        EQ_SHELF_SHIFT,  /* high shelf */
+    };
+    unsigned int channels = AUDIO_DSP.data.num_channels;   /* audio DSP only */
+    int i;
+
+    /* filter configuration currently is 1 low shelf filter, 3 band peaking
+       filters and 1 high shelf filter, in that order. we need to know this
+       so we can choose the correct shift factor.
+     */
+    for (i = 0; i < 5; i++)
+    {
+        if (!eq_data.enabled[i])
+            continue;
+        eq_filter(buf, &eq_data.filters[i], count, channels, shifts[i]);
+    }
+}
+
+/**
+ * Use to enable the equalizer. Also recomputes the combined gain, because
+ * the EQ precut is only applied while the equalizer is active.
+ * @param enable true to enable the equalizer
+ */
+void dsp_set_eq(bool enable)
+{
+    AUDIO_DSP.eq_process = enable ? eq_process : NULL;
+    set_gain(&AUDIO_DSP);
+}
+
+static void dsp_set_stereo_width(int value)
+{   /* Derive the straight/cross gains of the custom channel mode from value */
+    long width, straight, cross;
+
+    width = value * 0x7fffff / 100;   /* value is a percentage; 100 -> 0x7fffff */
+
+    if (value <= 100)
+    {
+        straight = (0x7fffff + width) / 2;
+        cross = straight - width;
+    }
+    else
+    {
+        /* straight = (1 + width) / (2 * width) */
+        straight = ((int64_t)(0x7fffff + width) << 22) / width;
+        cross = straight - 0x7fffff;
+    }
+
+    dsp_sw_gain  = straight << 8;   /* read by channels_process_sound_chan_custom */
+    dsp_sw_cross = cross << 8;
+}
+
+/**
+ * Implements the different channel configurations and stereo width.
+ */
+
+/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
+ * completeness. */
+#if 0
+static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
+{
+    /* The channels are each just themselves */
+    (void)count; (void)buf;
+}
+#endif
+
+#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
+static void channels_process_sound_chan_mono(int count, int32_t *buf[])
+{   /* Replace both channels, in place, with the average of left and right */
+    int32_t *sl = buf[0], *sr = buf[1];
+
+    while (count-- > 0)
+    {
+        int32_t lr = *sl/2 + *sr/2;   /* each term is halved before the sum */
+        *sl++ = lr;
+        *sr++ = lr;
+    }
+}
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
+
+#ifndef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+static void channels_process_sound_chan_custom(int count, int32_t *buf[])
+{   /* Mix each channel with the other using the stereo-width gains, in place */
+    const int32_t gain  = dsp_sw_gain;    /* set by dsp_set_stereo_width() */
+    const int32_t cross = dsp_sw_cross;   /* set by dsp_set_stereo_width() */
+    int32_t *sl = buf[0], *sr = buf[1];
+
+    while (count-- > 0)
+    {
+        int32_t l = *sl;   /* read both inputs before either is overwritten */
+        int32_t r = *sr;
+        *sl++ = FRACMUL(l, gain) + FRACMUL(r, cross);
+        *sr++ = FRACMUL(r, gain) + FRACMUL(l, cross);
+    }
+}
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_CUSTOM */
+
+static void channels_process_sound_chan_mono_left(int count, int32_t *buf[])
+{
+    /* Copy left over right; the element size is a sample's, not a pointer's */
+    memcpy(buf[1], buf[0], count * sizeof (**buf));
+}
+
+static void channels_process_sound_chan_mono_right(int count, int32_t *buf[])
+{
+    /* Copy right over left; the element size is a sample's, not a pointer's */
+    memcpy(buf[0], buf[1], count * sizeof (**buf));
+}
+
+#ifndef DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+static void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+{   /* Output half the left-right difference; the right channel is negated */
+    int32_t *sl = buf[0], *sr = buf[1];
+
+    while (count-- > 0)
+    {
+        int32_t ch = *sl/2 - *sr/2;   /* content equal in both channels cancels */
+        *sl++ = ch;
+        *sr++ = -ch;
+    }
+}
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_KARAOKE */
+
+static void dsp_set_channel_config(int value)
+{   /* Select the audio DSP's channel processing routine for mode `value` */
+    static const channels_process_fn_type channels_process_functions[] =
+    {
+        /* SOUND_CHAN_STEREO = All-purpose index for no channel processing */
+        [SOUND_CHAN_STEREO]     = NULL,
+        [SOUND_CHAN_MONO]       = channels_process_sound_chan_mono,
+        [SOUND_CHAN_CUSTOM]     = channels_process_sound_chan_custom,
+        [SOUND_CHAN_MONO_LEFT]  = channels_process_sound_chan_mono_left,
+        [SOUND_CHAN_MONO_RIGHT] = channels_process_sound_chan_mono_right,
+        [SOUND_CHAN_KARAOKE]    = channels_process_sound_chan_karaoke,
+    };
+
+    if ((unsigned)value >= ARRAYLEN(channels_process_functions) ||
+        AUDIO_DSP.stereo_mode == STEREO_MONO)
+    {   /* out-of-range modes and mono sources get no channel processing */
+        value = SOUND_CHAN_STEREO;
+    }
+
+    /* This doesn't apply to voice */
+    channels_mode = value;
+    AUDIO_DSP.channels_process = channels_process_functions[value];
+}
+
+#if CONFIG_CODEC == SWCODEC
+
+#ifdef HAVE_SW_TONE_CONTROLS
+static void set_tone_controls(void)
+{   /* Rebuild the bass/treble filter coefs from bass, treble and prescale */
+    filter_bishelf_coefs(0xffffffff/NATIVE_FREQUENCY*200,    /* 200 Hz  */
+                         0xffffffff/NATIVE_FREQUENCY*3500,   /* 3500 Hz */
+                         bass, treble, -prescale,
+                         AUDIO_DSP.tone_filter.coefs);
+    /* Sync the voice dsp coefficients */
+    memcpy(&VOICE_DSP.tone_filter.coefs, AUDIO_DSP.tone_filter.coefs,
+           sizeof (VOICE_DSP.tone_filter.coefs));
+}
+#endif
+
+/* Hook back from firmware/ part of audio, which can't/shouldn't call apps/
+ * code directly. Always returns 0; unknown messages are ignored.
+ */
+int dsp_callback(int msg, intptr_t param)
+{
+    switch (msg)
+    {
+#ifdef HAVE_SW_TONE_CONTROLS
+    case DSP_CALLBACK_SET_PRESCALE:
+        prescale = param;
+        set_tone_controls();
+        break;
+    /* prescaler is always set after calling any of these, so we wait with
+     * calculating coefs until the above case is hit.
+     */
+    case DSP_CALLBACK_SET_BASS:
+        bass = param;
+        break;
+    case DSP_CALLBACK_SET_TREBLE:
+        treble = param;
+        break;
+#ifdef HAVE_SW_VOLUME_CONTROL   /* NOTE(review): nested in SW_TONE_CONTROLS */
+    case DSP_CALLBACK_SET_SW_VOLUME:
+        set_gain(&AUDIO_DSP);
+        break;
+#endif
+#endif
+    case DSP_CALLBACK_SET_CHANNEL_CONFIG:
+        dsp_set_channel_config(param);
+        break;
+    case DSP_CALLBACK_SET_STEREO_WIDTH:
+        dsp_set_stereo_width(param);
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+#endif
+
+/* Process and convert src audio to dst based on the DSP configuration,
+ * reading count number of audio samples. dst is assumed to be large
+ * enough; use dsp_output_count() to get the required number. src is an
+ * array of pointers; for mono and interleaved stereo, it contains one
+ * pointer to the start of the audio data and the other is ignored; for
+ * non-interleaved stereo, it contains two pointers, one for each audio
+ * channel. Returns the number of stereo sample frames written to dst.
+ */
+int dsp_process(struct dsp_config *dsp, char *dst, const char *src[], int count)
+{
+    static int32_t *tmp[2]; /* tdspeed_doit() needs it static */
+    static long last_yield;
+    long tick;
+    int written = 0;
+
+#if defined(CPU_COLDFIRE)
+    /* set emac unit for dsp processing, and save old macsr, we're running in
+       codec thread context at this point, so can't clobber it */
+    unsigned long old_macsr = coldfire_get_macsr();
+    coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
+#endif
+
+    if (new_gain)
+        dsp_set_replaygain(); /* Gain has changed */
+
+    /* Perform at least one yield before starting */
+    last_yield = current_tick;
+    yield();
+
+    /* Testing function pointers for NULL is preferred since the pointer
+       will be preloaded to be used for the call if not. */
+    while (count > 0)
+    {
+        int samples = MIN(sample_buf_count, count);
+        count -= samples;
+
+        dsp->input_samples(samples, src, tmp);   /* advances src, sets tmp */
+
+#ifdef HAVE_PITCHSCREEN
+        if (dsp->tdspeed_active)
+            samples = tdspeed_doit(tmp, samples);
+#endif
+        /* Work through the samples in chunks of at most sample_buf_count */
+        int chunk_offset = 0;
+        while (samples > 0)
+        {
+            int32_t *t2[2];
+            t2[0] = tmp[0]+chunk_offset;
+            t2[1] = tmp[1]+chunk_offset;
+
+            int chunk = MIN(sample_buf_count, samples);
+            chunk_offset += chunk;
+            samples -= chunk;
+
+            if (dsp->apply_gain)
+                dsp->apply_gain(chunk, &dsp->data, t2);
+
+            if (dsp->resample && (chunk = resample(dsp, chunk, t2)) <= 0)
+                break; /* I'm pretty sure we're downsampling here */
+
+            if (dsp->apply_crossfeed)
+                dsp->apply_crossfeed(chunk, t2);
+
+            if (dsp->eq_process)
+                dsp->eq_process(chunk, t2);
+
+#ifdef HAVE_SW_TONE_CONTROLS
+            if ((bass | treble) != 0)
+                eq_filter(t2, &dsp->tone_filter, chunk,
+                      dsp->data.num_channels, FILTER_BISHELF_SHIFT);
+#endif
+
+            if (dsp->channels_process)
+                dsp->channels_process(chunk, t2);
+            
+            if (dsp->compressor_process)
+                dsp->compressor_process(chunk, &dsp->data, t2);
+
+            dsp->output_samples(chunk, &dsp->data, (const int32_t **)t2, (int16_t *)dst);
+
+            written += chunk;                      /* counted in stereo frames */
+            dst += chunk * sizeof (int16_t) * 2;   /* two 16-bit samples per frame */
+
+            /* yield at least once each tick */
+            tick = current_tick;
+            if (TIME_AFTER(tick, last_yield))
+            {
+                last_yield = tick;
+                yield();
+            }
+        }
+    }
+
+#if defined(CPU_COLDFIRE)
+    /* set old macsr again */
+    coldfire_set_macsr(old_macsr);
+#endif
+    return written;
+}
+
+/* Given count number of input samples, calculate the maximum number of
+ * samples of output data that would be generated (the calculation is not
+ * entirely exact and rounds upwards to be on the safe side; during
+ * resampling, the number of samples generated depends on the current state
+ * of the resampler).
+ */
+/* dsp_input_count() MUST be called afterwards */
+int dsp_output_count(struct dsp_config *dsp, int count)
+{
+#ifdef HAVE_PITCHSCREEN
+    if (dsp->tdspeed_active)
+        count = tdspeed_est_output_size();   /* replaces the caller's count */
+#endif
+    if (dsp->resample)
+    {
+        count = (int)(((unsigned long)count * NATIVE_FREQUENCY
+                    + (dsp->frequency - 1)) / dsp->frequency);   /* rounds up */
+    }
+
+    /* Now we have the resampled sample count which must not exceed
+     * resample_buf_count to avoid resample buffer overflow. One
+     * must call dsp_input_count() to get the correct input sample
+     * count.
+     */
+    if (count > resample_buf_count)
+        count = resample_buf_count;
+        
+    return count;
+}
+
+/* Given count output samples, calculate number of input samples
+ * that would be consumed in order to fill the output buffer.
+ */
+int dsp_input_count(struct dsp_config *dsp, int count)
+{
+    /* count is now the number of resampled input samples. Convert to
+       original input samples. */
+    if (dsp->resample)
+    {
+        /* Use the real resampling delta =
+         * dsp->frequency * 65536 / NATIVE_FREQUENCY, and
+         * round towards zero to avoid buffer overflows. */
+        count = (int)(((unsigned long)count *
+                      dsp->data.resample_data.delta) >> 16);
+    }
+
+#ifdef HAVE_PITCHSCREEN
+    if (dsp->tdspeed_active)   /* undo time-stretching after undoing resampling */
+        count = tdspeed_est_input_size(count);
+#endif
+
+    return count;
+}
+
+static void dsp_set_gain_var(long *var, long value)
+{   /* Store a gain/peak value and flag it for the next dsp_process() call */
+    *var = value;
+    new_gain = true;   /* dsp_process() then calls dsp_set_replaygain() */
+}
+
+static void dsp_update_functions(struct dsp_config *dsp)
+{   /* Reselect the routines that depend on the stereo mode / channel count */
+    sample_input_new_format(dsp);
+    sample_output_new_format(dsp);
+    if (dsp == &AUDIO_DSP)   /* crossfeed exists on the audio DSP only */
+        dsp_set_crossfeed(crossfeed_enabled);
+}
+
+intptr_t dsp_configure(struct dsp_config *dsp, int setting, intptr_t value)
+{   /* Returns 1 for a handled setting, 0 otherwise; DSP_MYDSP returns a pointer */
+    switch (setting)
+    {
+    case DSP_MYDSP:   /* look up a DSP by codec index; the dsp argument is unused */
+        switch (value)
+        {
+        case CODEC_IDX_AUDIO:
+            return (intptr_t)&AUDIO_DSP;
+        case CODEC_IDX_VOICE:
+            return (intptr_t)&VOICE_DSP;
+        default:
+            return (intptr_t)NULL;
+        }
+
+    case DSP_SET_FREQUENCY:
+        memset(&dsp->data.resample_data, 0, sizeof (dsp->data.resample_data));
+        /* Fall through!!! */
+    case DSP_SWITCH_FREQUENCY:
+        dsp->codec_frequency = (value == 0) ? NATIVE_FREQUENCY : value;
+        /* Account for playback speed adjustment when setting dsp->frequency
+           if we're called from the main audio thread. Voice UI thread should
+           not need this feature.
+         */
+#ifdef HAVE_PITCHSCREEN
+        if (dsp == &AUDIO_DSP)
+            dsp->frequency = pitch_ratio * dsp->codec_frequency / PITCH_SPEED_100;
+        else
+#endif
+            dsp->frequency = dsp->codec_frequency;
+
+        resampler_new_delta(dsp);
+#ifdef HAVE_PITCHSCREEN
+        tdspeed_setup(dsp);
+#endif
+        break;
+
+    case DSP_SET_SAMPLE_DEPTH:
+        dsp->sample_depth = value;
+
+        if (dsp->sample_depth <= NATIVE_DEPTH)
+        {
+            dsp->data.frac_bits = WORD_FRACBITS;
+            dsp->sample_bytes = sizeof (int16_t); /* samples are 16 bits */
+            dsp->data.clip_max =  ((1 << WORD_FRACBITS) - 1);
+            dsp->data.clip_min = -((1 << WORD_FRACBITS));
+        }
+        else
+        {
+            dsp->data.frac_bits = value;
+            dsp->sample_bytes = sizeof (int32_t); /* samples are 32 bits */
+            dsp->data.clip_max = (1 << value) - 1;
+            dsp->data.clip_min = -(1 << value);
+        }
+
+        dsp->data.output_scale = dsp->data.frac_bits + 1 - NATIVE_DEPTH;
+        sample_input_new_format(dsp);
+        dither_init(dsp);   /* dither bias and mask depend on frac_bits */
+        break;
+
+    case DSP_SET_STEREO_MODE:
+        dsp->stereo_mode = value;   /* not range-checked; indexes a function table */
+        dsp->data.num_channels = value == STEREO_MONO ? 1 : 2;
+        dsp_update_functions(dsp);
+#ifdef HAVE_PITCHSCREEN
+        tdspeed_setup(dsp);
+#endif
+        break;
+
+    case DSP_RESET:   /* back to 16-bit noninterleaved stereo at NATIVE_FREQUENCY */
+        dsp->stereo_mode = STEREO_NONINTERLEAVED;
+        dsp->data.num_channels = 2;
+        dsp->sample_depth = NATIVE_DEPTH;
+        dsp->data.frac_bits = WORD_FRACBITS;
+        dsp->sample_bytes = sizeof (int16_t);
+        dsp->data.output_scale = dsp->data.frac_bits + 1 - NATIVE_DEPTH;
+        dsp->data.clip_max =  ((1 << WORD_FRACBITS) - 1);
+        dsp->data.clip_min = -((1 << WORD_FRACBITS));
+        dsp->codec_frequency = dsp->frequency = NATIVE_FREQUENCY;
+
+        if (dsp == &AUDIO_DSP)
+        {
+            track_gain = 0;
+            album_gain = 0;
+            track_peak = 0;
+            album_peak = 0;
+            new_gain   = true;   /* gain is recomputed on the next dsp_process() */
+        }
+
+        dsp_update_functions(dsp);
+        resampler_new_delta(dsp);
+#ifdef HAVE_PITCHSCREEN
+        tdspeed_setup(dsp);
+#endif
+        if (dsp == &AUDIO_DSP)
+            compressor_reset();
+        break;
+
+    case DSP_FLUSH:   /* drop resampler and dither history; settings are kept */
+        memset(&dsp->data.resample_data, 0,
+               sizeof (dsp->data.resample_data));
+        resampler_new_delta(dsp);
+        dither_init(dsp);
+#ifdef HAVE_PITCHSCREEN
+        tdspeed_setup(dsp);
+#endif
+        if (dsp == &AUDIO_DSP)
+            compressor_reset();
+        break;
+
+    case DSP_SET_TRACK_GAIN:   /* the four gain/peak settings are audio-DSP only */
+        if (dsp == &AUDIO_DSP)
+            dsp_set_gain_var(&track_gain, value);
+        break;
+
+    case DSP_SET_ALBUM_GAIN:
+        if (dsp == &AUDIO_DSP)
+            dsp_set_gain_var(&album_gain, value);
+        break;
+
+    case DSP_SET_TRACK_PEAK:
+        if (dsp == &AUDIO_DSP)
+            dsp_set_gain_var(&track_peak, value);
+        break;
+
+    case DSP_SET_ALBUM_PEAK:
+        if (dsp == &AUDIO_DSP)
+            dsp_set_gain_var(&album_peak, value);
+        break;
+
+    default:
+        return 0;   /* unknown setting */
+    }
+
+    return 1;
+}
+
+int get_replaygain_mode(bool have_track_gain, bool have_album_gain)
+{   /* Returns REPLAYGAIN_ALBUM, REPLAYGAIN_TRACK, or -1 if no usable gain */
+    int type;
+
+    bool track = ((global_settings.replaygain_type == REPLAYGAIN_TRACK)
+        || ((global_settings.replaygain_type == REPLAYGAIN_SHUFFLE)
+            && global_settings.playlist_shuffle));   /* track mode is preferred */
+
+    type = (!track && have_album_gain) ? REPLAYGAIN_ALBUM 
+        : have_track_gain ? REPLAYGAIN_TRACK : -1;
+    
+    return type;
+}
+
+void dsp_set_replaygain(void)
+{   /* Recompute replaygain from the settings and gain/peak values, then apply */
+    long gain = 0;
+
+    new_gain = false;   /* the pending change is being handled now */
+
+    if ((global_settings.replaygain_type != REPLAYGAIN_OFF) ||
+            global_settings.replaygain_noclip)
+    {
+        bool track_mode = get_replaygain_mode(track_gain != 0,
+            album_gain != 0) == REPLAYGAIN_TRACK;
+        long peak = (track_mode || !album_peak) ? track_peak : album_peak;
+
+        if (global_settings.replaygain_type != REPLAYGAIN_OFF)
+        {
+            gain = (track_mode || !album_gain) ? track_gain : album_gain;
+
+            if (global_settings.replaygain_preamp)
+            {
+                long preamp = get_replaygain_int(
+                    global_settings.replaygain_preamp * 10);
+
+                gain = (long) (((int64_t) gain * preamp) >> 24);
+            }
+        }
+
+        if (gain == 0)
+        {
+            /* So that noclip can work even with no gain information. */
+            gain = DEFAULT_GAIN;
+        }
+
+        if (global_settings.replaygain_noclip && (peak != 0)
+            && ((((int64_t) gain * peak) >> 24) >= DEFAULT_GAIN))
+        {   /* gain * peak would reach full scale: cap the gain at 1 / peak */
+            gain = (((int64_t) DEFAULT_GAIN << 24) / peak);
+        }
+
+        if (gain == DEFAULT_GAIN)
+        {
+            /* Nothing to do, disable processing. */
+            gain = 0;
+        }
+    }
+
+    /* Store in S7.24 format to simplify calculations. */
+    replaygain = gain;
+    set_gain(&AUDIO_DSP);
+}
+
+/** SET COMPRESSOR
+ *  Called by the menu system to configure the compressor process */
+void dsp_set_compressor(void)
+{
+    /* enable/disable the compressor according to compressor_update()'s result */
+    AUDIO_DSP.compressor_process = compressor_update() ?
+                                        compressor_process : NULL;
+}
diff --git a/lib/rbcodec/dsp/dsp.h b/lib/rbcodec/dsp/dsp.h
new file mode 100644
index 0000000000..2a00f649f8
--- /dev/null
+++ b/lib/rbcodec/dsp/dsp.h
@@ -0,0 +1,125 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 Miika Pekkarinen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef _DSP_H
+#define _DSP_H
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define NATIVE_FREQUENCY       44100
+
+enum /* stereo modes; STEREO_NUM_MODES is the number of modes before it */
+{
+    STEREO_INTERLEAVED = 0,
+    STEREO_NONINTERLEAVED,
+    STEREO_MONO,
+    STEREO_NUM_MODES,
+};
+
+enum /* presumably selects the audio or the voice DSP instance - confirm */
+{
+    CODEC_IDX_AUDIO = 0,
+    CODEC_IDX_VOICE,
+};
+
+enum /* presumably the 'setting' ids taken by dsp_configure() - confirm */
+{
+    DSP_MYDSP = 1,
+    DSP_SET_FREQUENCY,
+    DSP_SWITCH_FREQUENCY,
+    DSP_SET_SAMPLE_DEPTH,
+    DSP_SET_STEREO_MODE,
+    DSP_RESET,
+    DSP_FLUSH,
+    DSP_SET_TRACK_GAIN,
+    DSP_SET_ALBUM_GAIN,
+    DSP_SET_TRACK_PEAK,
+    DSP_SET_ALBUM_PEAK,
+    DSP_CROSSFEED
+};
+
+
+/****************************************************************************
+ * NOTE: Any assembly routines that use these structures must be updated
+ * if current data members are moved or changed.
+ */
+struct resample_data
+{
+    uint32_t delta;                     /* 00h - phase step per output sample */
+    uint32_t phase;                     /* 04h - 16.16: input pos.fraction */
+    int32_t last_sample[2];             /* 08h - last input sample, per chan */
+                                        /* 10h */
+};
+
+/* This is for passing needed data to external dsp routines. If another
+ * dsp parameter needs to be passed, add to the end of the structure
+ * and remove from dsp_config.
+ * If another function type becomes assembly/external and requires dsp
+ * config info, add a pointer parameter of type "struct dsp_data *".
+ * If removing something from other than the end, reserve the spot or
+ * else update every implementation for every target.
+ * Be sure to add the offset of the new member for easy viewing as well. :)
+ * It is the first member of dsp_config and all members can be accessed
+ * through the main aggregate but this is intended to make a safe haven
+ * for these items whereas the c part can be rearranged at will. dsp_data
+ * could even be moved within dsp_config without disturbing the order.
+ */
+struct dsp_data
+{
+    int output_scale;                   /* 00h */
+    int num_channels;                   /* 04h */
+    struct resample_data resample_data; /* 08h */
+    int32_t clip_min;                   /* 18h */
+    int32_t clip_max;                   /* 1ch */
+    int32_t gain;                       /* 20h - Note that this is in S8.23 format. */
+    int frac_bits;                      /* 24h */
+                                        /* 28h */
+};
+
+struct dsp_config;
+
+int dsp_process(struct dsp_config *dsp, char *dest,
+                const char *src[], int count);
+int dsp_input_count(struct dsp_config *dsp, int count);
+int dsp_output_count(struct dsp_config *dsp, int count);
+intptr_t dsp_configure(struct dsp_config *dsp, int setting,
+                       intptr_t value);
+int get_replaygain_mode(bool have_track_gain, bool have_album_gain);
+void dsp_set_replaygain(void);
+void dsp_set_crossfeed(bool enable);
+void dsp_set_crossfeed_direct_gain(int gain);
+void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain,
+                                    long cutoff);
+void dsp_set_eq(bool enable);
+void dsp_set_eq_precut(int precut);
+void dsp_set_eq_coefs(int band);
+void dsp_dither_enable(bool enable);
+void dsp_timestretch_enable(bool enable);
+bool dsp_timestretch_available(void);
+void sound_set_pitch(int32_t r);
+int32_t sound_get_pitch(void);
+void dsp_set_timestretch(int32_t percent);
+int32_t dsp_get_timestretch(void);
+int dsp_callback(int msg, intptr_t param);
+void dsp_set_compressor(void);
+
+#endif
diff --git a/lib/rbcodec/dsp/dsp_arm.S b/lib/rbcodec/dsp/dsp_arm.S
new file mode 100644
index 0000000000..7e360749a3
--- /dev/null
+++ b/lib/rbcodec/dsp/dsp_arm.S
@@ -0,0 +1,561 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ #include "config.h"
+
+/****************************************************************************
+ *  void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ */
+
+#include "config.h"
+
+    .section .icode, "ax", %progbits
+    .align  2
+    .global channels_process_sound_chan_mono
+    .type   channels_process_sound_chan_mono, %function
+channels_process_sound_chan_mono:
+    @ input: r0 = count, r1 = buf
+    stmfd   sp!, { r4, lr }            @ save r4 and the return address
+                                       @
+    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .mono_singlesample         @ Zero? Only one sample!
+                                       @
+.monoloop:                             @ two samples per iteration
+    ldmia   r1, { r3, r4 }             @ r3, r4 = Li0, Li1
+    ldmia   r2, { r12, r14 }           @ r12, r14 = Ri0, Ri1
+    mov     r3, r3, asr #1             @ Mo0 = Li0 / 2 + Ri0 / 2
+    mov     r4, r4, asr #1             @ Mo1 = Li1 / 2 + Ri1 / 2
+    add     r12, r3, r12, asr #1       @
+    add     r14, r4, r14, asr #1       @
+    subs    r0, r0, #2                 @ two samples consumed
+    stmia   r1!, { r12, r14 }          @ store Mo0, Mo1 to buf[0]
+    stmia   r2!, { r12, r14 }          @ store Mo0, Mo1 to buf[1]
+    bgt     .monoloop                  @ more than one sample left: loop
+                                       @
+    ldmpc   cond=lt, regs=r4           @ if count was even, we're done
+                                       @
+.mono_singlesample:                    @ last sample of an odd count
+    ldr     r3, [r1]                   @ r3 = Ls
+    ldr     r12, [r2]                  @ r12 = Rs
+    mov     r3, r3, asr #1             @ Mo = Ls / 2 + Rs / 2
+    add     r12, r3, r12, asr #1       @
+    str     r12, [r1]                  @ store Mo
+    str     r12, [r2]                  @ store Mo
+                                       @
+    ldmpc   regs=r4                    @
+    .size   channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
+
+/****************************************************************************
+ * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global channels_process_sound_chan_custom
+    .type   channels_process_sound_chan_custom, %function
+channels_process_sound_chan_custom:
+    stmfd   sp!, { r4-r10, lr }        @ input: r0 = count, r1 = buf
+    
+    ldr     r3, =dsp_sw_gain
+    ldr     r4, =dsp_sw_cross
+
+    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    ldr     r3, [r3]                   @ r3 = dsp_sw_gain
+    ldr     r4, [r4]                   @ r4 = dsp_sw_cross
+    
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .custom_single_sample      @ Zero? Only one sample!
+    
+.custom_loop:
+    ldmia   r1, { r5, r6 }             @ r5 = Li0, r6 = Li1
+    ldmia   r2, { r7, r8 }             @ r7 = Ri0, r8 = Ri1
+
+    subs    r0, r0, #2                 @ flags survive until the bgt below
+
+    smull   r9, r10, r5, r3            @ Lc0 = Li0*gain
+    smull   r12, r14, r7, r3           @ Rc0 = Ri0*gain
+    smlal   r9, r10, r7, r4            @ Lc0 += Ri0*cross
+    smlal   r12, r14, r5, r4           @ Rc0 += Li0*cross
+    
+    mov     r9, r9, lsr #31            @ Convert to s0.31
+    mov     r12, r12, lsr #31
+    orr     r5, r9, r10, asl #1        @ Lc0 = 64-bit sum >> 31
+    orr     r7, r12, r14, asl #1       @ Rc0 = 64-bit sum >> 31
+    
+    smull   r9, r10, r6, r3            @ Lc1 = Li1*gain
+    smull   r12, r14, r8, r3           @ Rc1 = Ri1*gain
+    smlal   r9, r10, r8, r4            @ Lc1 += Ri1*cross
+    smlal   r12, r14, r6, r4           @ Rc1 += Li1*cross
+    
+    mov     r9, r9, lsr #31            @ Convert to s0.31
+    mov     r12, r12, lsr #31
+    orr     r6, r9, r10, asl #1        @ Lc1 = 64-bit sum >> 31
+    orr     r8, r12, r14, asl #1       @ Rc1 = 64-bit sum >> 31
+    
+    stmia   r1!, { r5, r6 }            @ Store Lc0, Lc1
+    stmia   r2!, { r7, r8 }            @ Store Rc0, Rc1
+
+    bgt     .custom_loop               @ more than one sample left: loop
+    
+    ldmpc   cond=lt, regs=r4-r10       @ < 0? even count
+    
+.custom_single_sample:
+    ldr     r5, [r1]                   @ handle odd sample
+    ldr     r7, [r2]
+
+    smull   r9, r10, r5, r3            @ Lc0 = Li0*gain
+    smull   r12, r14, r7, r3           @ Rc0 = Ri0*gain
+    smlal   r9, r10, r7, r4            @ Lc0 += Ri0*cross
+    smlal   r12, r14, r5, r4           @ Rc0 += Li0*cross
+
+    mov     r9, r9, lsr #31            @ Convert to s0.31
+    mov     r12, r12, lsr #31
+    orr     r5, r9, r10, asl #1
+    orr     r7, r12, r14, asl #1
+
+    str     r5, [r1]                   @ Store Lc0
+    str     r7, [r2]                   @ Store Rc0
+
+    ldmpc   regs=r4-r10
+    .size   channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
+
+/****************************************************************************
+ *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global channels_process_sound_chan_karaoke
+    .type   channels_process_sound_chan_karaoke, %function
+channels_process_sound_chan_karaoke:
+    @ input: r0 = count, r1 = buf
+    stmfd   sp!, { r4, lr }            @ save r4 and the return address
+                                       @
+    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .karaoke_singlesample      @ Zero? Only one sample!
+                                       @
+.karaokeloop:                          @ two samples per iteration
+    ldmia   r1, { r3, r4 }             @ r3, r4  = Li0, Li1
+    ldmia   r2, { r12, r14 }           @ r12, r14 = Ri0, Ri1
+    mov     r3, r3, asr #1             @ Lo0 = Li0 / 2 - Ri0 / 2
+    mov     r4, r4, asr #1             @ Lo1 = Li1 / 2 - Ri1 / 2
+    sub     r3, r3, r12, asr #1        @
+    sub     r4, r4, r14, asr #1        @
+    rsb     r12, r3, #0                @ Ro0 = -Lo0 = Ri0 / 2 - Li0 / 2
+    rsb     r14, r4, #0                @ Ro1 = -Lo1 = Ri1 / 2 - Li1 / 2
+    subs    r0, r0, #2                 @ two samples consumed
+    stmia   r1!, { r3, r4 }            @ store Lo0, Lo1
+    stmia   r2!, { r12, r14 }          @ store Ro0, Ro1
+    bgt     .karaokeloop               @ more than one sample left: loop
+                                       @
+    ldmpc   cond=lt, regs=r4           @ if count was even, we're done
+                                       @
+.karaoke_singlesample:                 @ last sample of an odd count
+    ldr     r3, [r1]                   @ r3 = Li
+    ldr     r12, [r2]                  @ r12 = Ri
+    mov     r3, r3, asr #1             @ Lo = Li / 2 - Ri / 2
+    sub     r3, r3, r12, asr #1        @
+    rsb     r12, r3, #0                @ Ro = -Lo = Ri / 2 - Li / 2
+    str     r3, [r1]                   @ store Lo
+    str     r12, [r2]                  @ store Ro
+                                       @
+    ldmpc   regs=r4                    @
+    .size   channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
+#if ARM_ARCH < 6
+/****************************************************************************
+ *  void sample_output_mono(int count, struct dsp_data *data,
+ *                          const int32_t *src[], int16_t *dst)
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global sample_output_mono
+    .type   sample_output_mono, %function
+sample_output_mono:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4-r6, lr }
+
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldr     r2, [r2]                   @ r2 = src[0]
+
+    mov     r4, #1
+    mov     r4, r4, lsl r1             @ r4 = 1 << (scale-1)
+    mov     r4, r4, lsr #1
+    mvn     r14, #0x8000               @ r14 = 0xffff7fff, needed for
+                                       @ clipping and masking
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .som_singlesample          @ Zero? Only one sample!
+
+.somloop:
+    ldmia   r2!, { r5, r6 }
+    add     r5, r5, r4                 @ r5 = (r5 + 1<<(scale-1)) >> scale
+    mov     r5, r5, asr r1
+    mov     r12, r5, asr #15
+    teq     r12, r12, asr #31
+    eorne   r5, r14, r5, asr #31       @ Clip (-32768...+32767)
+    add     r6, r6, r4
+    mov     r6, r6, asr r1             @ r6 = (r6 + 1<<(scale-1)) >> scale
+    mov     r12, r6, asr #15
+    teq     r12, r12, asr #31
+    eorne   r6, r14, r6, asr #31       @ Clip (-32768...+32767)
+    
+    and     r5, r5, r14, lsr #16       @ keep the low halfword only
+    and     r6, r6, r14, lsr #16
+    orr     r5, r5, r5, lsl #16        @ copy sample 0 into both halfwords
+    orr     r6, r6, r6, lsl #16        @ copy sample 1 into both halfwords
+    stmia   r3!, { r5, r6 }
+    
+    subs    r0, r0, #2
+    bgt     .somloop     
+       
+    ldmpc   cond=lt, regs=r4-r6        @ even 'count'? return
+
+.som_singlesample:
+    ldr     r5, [r2]                   @ do odd sample
+    add     r5, r5, r4
+    mov     r5, r5, asr r1
+    mov     r12, r5, asr #15
+    teq     r12, r12, asr #31
+    eorne   r5, r14, r5, asr #31
+
+    and     r5, r5, r14, lsr #16       @ copy the sample into both halfwords
+    orr     r5, r5, r5, lsl #16
+    str     r5, [r3]
+
+    ldmpc   regs=r4-r6
+    .size   sample_output_mono, .-sample_output_mono
+    
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                           const int32_t *src[], int16_t *dst)
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global sample_output_stereo
+    .type   sample_output_stereo, %function
+sample_output_stereo:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4-r9, lr }
+
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldmia   r2, { r2, r5 }             @ r2 = src[0], r5 = src[1]
+
+    mov     r4, #1
+    mov     r4, r4, lsl r1             @ r4 = 1 << (scale-1)
+    mov     r4, r4, lsr #1             @
+    
+    mvn     r14, #0x8000               @ r14 = 0xffff7fff, needed for
+                                       @ clipping and masking
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .sos_singlesample          @ Zero? Only one sample!
+
+.sosloop:
+    ldmia   r2!, { r6, r7 }            @ 2 left
+    ldmia   r5!, { r8, r9 }            @ 2 right
+
+    add     r6, r6, r4                 @ r6 = (r6 + 1<<(scale-1)) >> scale
+    mov     r6, r6, asr r1
+    mov     r12, r6, asr #15
+    teq     r12, r12, asr #31
+    eorne   r6, r14, r6, asr #31       @ Clip (-32768...+32767)
+    add     r7, r7, r4
+    mov     r7, r7, asr r1             @ r7 = (r7 + 1<<(scale-1)) >> scale
+    mov     r12, r7, asr #15
+    teq     r12, r12, asr #31
+    eorne   r7, r14, r7, asr #31       @ Clip (-32768...+32767)
+    
+    add     r8, r8, r4                 @ r8 = (r8 + 1<<(scale-1)) >> scale
+    mov     r8, r8, asr r1
+    mov     r12, r8, asr #15
+    teq     r12, r12, asr #31
+    eorne   r8, r14, r8, asr #31       @ Clip (-32768...+32767)
+    add     r9, r9, r4                 @ r9 = (r9 + 1<<(scale-1)) >> scale
+    mov     r9, r9, asr r1
+    mov     r12, r9, asr #15
+    teq     r12, r12, asr #31
+    eorne   r9, r14, r9, asr #31       @ Clip (-32768...+32767)
+    
+    and     r6, r6, r14, lsr #16       @ pack first 2 halfwords into 1 word
+    orr     r8, r6, r8, asl #16
+    and     r7, r7, r14, lsr #16       @ pack last 2 halfwords into 1 word
+    orr     r9, r7, r9, asl #16
+
+    stmia   r3!, { r8, r9 }
+
+    subs    r0, r0, #2
+    bgt     .sosloop
+
+    ldmpc   cond=lt, regs=r4-r9        @ even 'count'? return
+
+.sos_singlesample:    
+    ldr     r6, [r2]                   @ left odd sample
+    ldr     r8, [r5]                   @ right odd sample
+
+    add     r6, r6, r4                 @ r6 = (r6 + 1<<(scale-1)) >> scale
+    mov     r6, r6, asr r1
+    mov     r12, r6, asr #15
+    teq     r12, r12, asr #31
+    eorne   r6, r14, r6, asr #31       @ Clip (-32768...+32767)
+    add     r8, r8, r4                 @ r8 = (r8 + 1<<(scale-1)) >> scale
+    mov     r8, r8, asr r1
+    mov     r12, r8, asr #15
+    teq     r12, r12, asr #31
+    eorne   r8, r14, r8, asr #31       @ Clip (-32768...+32767)
+    
+    and     r6, r6, r14, lsr #16       @ pack 2 halfwords into 1 word
+    orr     r8, r6, r8, asl #16
+
+    str     r8, [r3]
+
+    ldmpc   regs=r4-r9
+    .size   sample_output_stereo, .-sample_output_stereo
+#endif /* ARM_ARCH < 6 */    
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t* src[])
+ */
+    .section .text
+    .global apply_crossfeed 
+apply_crossfeed:                       @ input: r0 = count, r1 = src
+    @ unfortunately, we ended up in a bit of a register squeeze here, and need
+    @ to keep the count on the stack :/
+    stmdb   sp!, { r4-r11, lr }        @ stack modified regs
+    ldmia   r1, { r2-r3 }              @ r2 = src[0], r3 = src[1]
+    
+    ldr     r1, =crossfeed_data
+    ldmia   r1!, { r4-r11 }            @ load direct gain and filter data
+    mov     r12, r0                    @ better to ldm delay + count later
+    add     r0, r1, #13*4*2            @ calculate end of delay
+    stmdb   sp!, { r0, r12 }           @ stack end of delay adr and count
+    ldr     r0, [r1, #13*4*2]          @ fetch current delay line address
+
+    /* Register usage in loop:
+     * r0 = &delay[index][0], r1 = accumulator high, r2 = src[0], r3 = src[1],
+     * r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
+     * r8-r11 = filter history, r12 = temp, r14 = accumulator low
+     */
+.cfloop:
+    smull   r14, r1, r6, r8            @ acc = b1*dr[n - 1]
+    smlal   r14, r1, r7, r9            @ acc += a1*y_l[n - 1]
+    ldr     r8, [r0, #4]               @ r8 = dr[n]
+    smlal   r14, r1, r5, r8            @ acc += b0*dr[n]
+    mov     r9, r1, lsl #1             @ fix format for filter history
+    ldr     r12, [r2]                  @ load left input
+    smlal   r14, r1, r4, r12           @ acc += gain*x_l[n] 
+    mov     r1, r1, lsl #1             @ fix format
+    str     r1, [r2], #4               @ save result
+
+    smull   r14, r1, r6, r10           @ acc = b1*dl[n - 1]
+    smlal   r14, r1, r7, r11           @ acc += a1*y_r[n - 1]
+    ldr     r10, [r0]                  @ r10 = dl[n]
+    str     r12, [r0], #4              @ save left input to delay line
+    smlal   r14, r1, r5, r10           @ acc += b0*dl[n]
+    mov     r11, r1, lsl #1            @ fix format for filter history
+    ldr     r12, [r3]                  @ load right input
+    smlal   r14, r1, r4, r12           @ acc += gain*x_r[n]
+    str     r12, [r0], #4              @ save right input to delay line
+    mov     r1, r1, lsl #1             @ fix format
+    ldmia   sp, { r12, r14 }           @ fetch delay line end addr and count from stack
+    str     r1, [r3], #4               @ save result
+
+    cmp     r0, r12                    @ need to wrap to start of delay?
+    subeq   r0, r0, #13*4*2            @ wrap back delay line ptr to start
+ 
+    subs    r14, r14, #1               @ are we finished?
+    strne   r14, [sp, #4]              @ nope, save count back to stack
+    bne     .cfloop
+    
+    @ save data back to struct
+    ldr     r12, =crossfeed_data + 4*4
+    stmia   r12, { r8-r11 }            @ save filter history
+    str     r0, [r12, #30*4]           @ save current delay line pointer
+    add     sp, sp, #8                 @ remove temp variables from stack
+    ldmpc   regs=r4-r11
+    .size   apply_crossfeed, .-apply_crossfeed
+
+/****************************************************************************
+ * int dsp_downsample(int count, struct dsp_data *data,
+ *                    int32_t *src[], int32_t *dst[])
+ */
+    .section    .text
+    .global     dsp_downsample
+dsp_downsample:
+    stmdb   sp!, { r4-r11, lr }     @ stack modified regs
+    ldmib   r1, { r5-r6 }           @ r5 = num_channels,r6 = resample_data.delta
+    sub     r5, r5, #1              @ pre-decrement num_channels for use
+    add     r4, r1, #12             @ r4 = &resample_data.phase
+    mov     r12, #0xff
+    orr     r12, r12, #0xff00       @ r12 = 0xffff
+.dschannel_loop:
+    ldr     r1, [r4]                @ r1 = resample_data.phase
+    ldr     r7, [r2, r5, lsl #2]    @ r7 = s = src[ch - 1]
+    ldr     r8, [r3, r5, lsl #2]    @ r8 = d = dst[ch - 1]
+    add     r9, r4, #4              @ r9 = &last_sample[0]
+    ldr     r10, [r9, r5, lsl #2]   @ r10 = last_sample[ch - 1]
+    sub     r11, r0, #1             @ r11 = count - 1
+    ldr     r14, [r7, r11, lsl #2]  @ load last sample in s[] ...
+    str     r14, [r9, r5, lsl #2]   @ and write as next frame's last_sample
+    movs    r9, r1, lsr #16         @ r9 = pos = phase >> 16
+    ldreq   r11, [r7]               @ if pos = 0, load src[0] and jump into loop
+    beq     .dsuse_last_start
+    cmp     r9, r0                  @ if pos >= count, we're already done
+    bge     .dsloop_skip
+
+    @ Register usage in loop:
+    @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
+    @ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos]
+.dsloop:
+    add     r9, r7, r9, lsl #2      @ r9 = &s[pos]
+    ldmda   r9, { r10, r11 }        @ r10 = s[pos - 1], r11 = s[pos]
+.dsuse_last_start:
+    sub     r11, r11, r10           @ r11 = diff = s[pos] - s[pos - 1]
+    @ keep frac in lower bits to take advantage of multiplier early termination
+    and     r9, r1, r12             @ frac = phase & 0xffff
+    smull   r9, r14, r11, r9        @ r14:r9 = diff*frac
+    add     r1, r1, r6              @ phase += delta
+    add     r10, r10, r9, lsr #16   @ r10 = out = s[pos - 1] + frac*diff
+    add     r10, r10, r14, lsl #16
+    str     r10, [r8], #4           @ *d++ = out
+    mov     r9, r1, lsr #16         @ pos = phase >> 16
+    cmp     r9, r0                  @ pos < count?
+    blt     .dsloop                 @ yup, do more samples
+.dsloop_skip:
+    subs    r5, r5, #1
+    bpl     .dschannel_loop         @ if (--ch) >= 0, do another channel
+    sub     r1, r1, r0, lsl #16     @ wrap phase back to start
+    str     r1, [r4]                @ store back
+    ldr     r1, [r3]                @ r1 = dst[0]
+    sub     r8, r8, r1              @ d - dst[0] = bytes written to channel 0
+    mov     r0, r8, lsr #2          @ convert bytes->samples
+    ldmpc   regs=r4-r11             @ ... and we're out
+    .size   dsp_downsample, .-dsp_downsample
+
+/****************************************************************************
+ * int dsp_upsample(int count, struct dsp_data *dsp,
+ *                  int32_t *src[], int32_t *dst[])
+ */
+    .section    .text
+    .global     dsp_upsample
+dsp_upsample:
+    stmfd   sp!, { r4-r11, lr }     @ stack modified regs
+    ldmib   r1, { r5-r6 }           @ r5 = num_channels,r6 = resample_data.delta
+    sub     r5, r5, #1              @ pre-decrement num_channels for use
+    add     r4, r1, #12             @ r4 = &resample_data.phase
+    mov     r6, r6, lsl #16         @ we'll use carry to detect pos increments
+    stmfd   sp!, { r0, r4 }         @ stack count and &resample_data.phase
+.uschannel_loop:
+    ldr     r12, [r4]               @ r12 = resample_data.phase
+    ldr     r7, [r2, r5, lsl #2]    @ r7 = s = src[ch - 1]
+    ldr     r8, [r3, r5, lsl #2]    @ r8 = d = dst[ch - 1]
+    add     r9, r4, #4              @ r9 = &last_sample[0]
+    mov     r1, r12, lsl #16        @ we'll use carry to detect pos increments
+    sub     r11, r0, #1             @ r11 = count - 1
+    ldr     r14, [r7, r11, lsl #2]  @ load last sample in s[] ...
+    ldr     r10, [r9, r5, lsl #2]   @ r10 = last_sample[ch - 1]
+    str     r14, [r9, r5, lsl #2]   @ and write as next frame's last_sample
+    movs    r14, r12, lsr #16       @ pos = resample_data.phase >> 16
+    beq     .usstart_0              @ pos = 0
+    cmp     r14, r0                 @ if pos >= count, we're already done
+    bge     .usloop_skip
+    add     r7, r7, r14, lsl #2     @ r7 = &s[pos]
+    ldr     r10, [r7, #-4]          @ r10 = s[pos - 1]
+    b       .usstart_0
+
+    @ Register usage in loop:
+    @ r0 = count left, r1 = phase << 16, r4 = frac, r5 = cur_channel, r6 =
+    @ delta << 16, r7 = s, r8 = d, r9 = diff, r10 = s[pos - 1], r11 = s[pos]
+.usloop_1:
+    mov     r10, r11                @ r10 = previous sample
+.usstart_0:
+    ldr     r11, [r7], #4           @ r11 = next sample
+    mov     r4, r1, lsr #16         @ r4 = frac = phase >> 16
+    sub     r9, r11, r10            @ r9 = diff = s[pos] - s[pos - 1]
+.usloop_0:
+    smull   r12, r14, r4, r9        @ r14:r12 = frac*diff
+    adds    r1, r1, r6              @ phase += delta << 16
+    mov     r4, r1, lsr #16         @ r4 = frac = phase >> 16
+    add     r14, r10, r14, lsl #16
+    add     r14, r14, r12, lsr #16  @ r14 = out = s[pos - 1] + frac*diff
+    str     r14, [r8], #4           @ *d++ = out
+    bcc     .usloop_0               @ no carry: pos unchanged, output again
+    subs    r0, r0, #1              @ if count > 0, do another sample
+    bgt     .usloop_1
+.usloop_skip:
+    subs    r5, r5, #1
+    ldmfd   sp, { r0, r4 }          @ reload count and &resample_data.phase
+    bpl     .uschannel_loop         @ if (--ch) >= 0, do another channel
+    mov     r1, r1, lsr #16         @ wrap phase back to start of next frame
+    ldr     r2, [r3]                @ r2 = dst[0]
+    str     r1, [r4]                @ store phase
+    sub     r8, r8, r2              @ d - dst[0] = bytes written to channel 0
+    mov     r0, r8, lsr #2          @ convert bytes->samples
+    add     sp, sp, #8              @ adjust stack for temp variables
+    ldmpc   regs=r4-r11             @ ... and we're out
+    .size       dsp_upsample, .-dsp_upsample
+
+/****************************************************************************
+ *  void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global dsp_apply_gain
+    .type   dsp_apply_gain, %function
+dsp_apply_gain:
+    @ input: r0 = count, r1 = data, r2 = buf[]
+    stmfd   sp!, { r4-r8, lr }
+
+    ldr     r3, [r1,  #4]           @ r3 = data->num_channels
+    ldr     r4, [r1, #32]           @ r4 = data->gain
+
+.dag_outerloop:
+    ldr     r1, [r2], #4            @ r1 = next channel buffer, advance buf[]
+    subs    r12, r0, #1             @ r12 = count - 1
+    beq     .dag_singlesample       @ Zero? Only one sample!
+
+.dag_innerloop:
+    ldmia   r1, { r5, r6 }          @ load r5, r6 from r1
+    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
+    smull   r14, r5, r6, r4         @ r14 = FRACMUL_SHL(r6, r4, 8)
+    subs    r12, r12, #2            @ flags survive until the bgt/blt below
+    mov     r7, r7, lsr #23         @ combine halves: 64-bit product >> 23
+    mov     r14, r14, lsr #23
+    orr     r7, r7, r8, asl #9
+    orr     r14, r14, r5, asl #9
+    stmia   r1!, { r7, r14 }        @ save r7, r14 to [r1] and increment r1
+    bgt     .dag_innerloop          @ end of inner loop
+
+    blt     .dag_evencount          @ < 0? even count
+
+.dag_singlesample:
+    ldr     r5, [r1]                @ handle odd sample
+    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
+    mov     r7, r7, lsr #23
+    orr     r7, r7, r8, asl #9
+    str     r7, [r1]
+
+.dag_evencount:
+    subs    r3, r3, #1              @ one channel done
+    bgt     .dag_outerloop          @ end of outer loop
+               
+    ldmpc   regs=r4-r8
+    .size   dsp_apply_gain, .-dsp_apply_gain
diff --git a/lib/rbcodec/dsp/dsp_arm_v6.S b/lib/rbcodec/dsp/dsp_arm_v6.S
new file mode 100644
index 0000000000..39949498ea
--- /dev/null
+++ b/lib/rbcodec/dsp/dsp_arm_v6.S
@@ -0,0 +1,127 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2010 Michael Sevakis
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ *  void sample_output_mono(int count, struct dsp_data *data,
+ *                          const int32_t *src[], int16_t *dst)
+ *  Rounds, scales and clamps src[0] to 16 bits, duplicated into both halves. */
+    .section .text, "ax", %progbits
+    .align  2
+    .global sample_output_mono
+    .type   sample_output_mono, %function
+sample_output_mono:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4, lr }            @ save r4 and the return address
+                                       @
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldr     r2, [r2]                   @ r2 = src[0]
+                                       @
+    mov     r4, #1                     @ r4 = 1 << (scale - 1)
+    mov     r4, r4, lsl r1             @ ...built as (1 << scale) >> 1
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    mov     r4, r4, lsr #1             @ (plain mov keeps the subs flags)
+    beq     2f                         @ Zero? Only one sample!
+                                       @
+1:                                     @ main loop: two samples per pass
+    ldmia   r2!, { r12, r14 }          @ load Mi0, Mi1
+    qadd    r12, r12, r4               @ round, scale, saturate and
+    qadd    r14, r14, r4               @ pack Mi0 to So0, Mi1 to So1
+    mov     r12, r12, asr r1           @ shift right by output_scale
+    mov     r14, r14, asr r1           @
+    ssat    r12, #16, r12              @ clamp to signed 16 bits
+    ssat    r14, #16, r14              @
+    pkhbt   r12, r12, r12, asl #16     @ same sample in low and high halfword
+    pkhbt   r14, r14, r14, asl #16     @
+    subs    r0, r0, #2                 @ two samples done
+    stmia   r3!, { r12, r14 }          @ store So0, So1
+    bgt     1b                         @ two or more samples left? loop
+                                       @
+    ldmltfd sp!, { r4, pc }            @ if count was even, we're done
+                                       @
+2:                                     @ odd tail: exactly one sample left
+    ldr     r12, [r2]                  @ round, scale, saturate
+    qadd    r12, r12, r4               @ and pack Mi to So
+    mov     r12, r12, asr r1           @
+    ssat    r12, #16, r12              @
+    pkhbt   r12, r12, r12, asl #16     @
+    str     r12, [r3]                  @ store So
+                                       @
+    ldmfd   sp!, { r4, pc }            @ restore r4 and return
+    .size   sample_output_mono, .-sample_output_mono
+
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                           const int32_t *src[], int16_t *dst)
+ * Rounds, scales and clamps src[0]/src[1] to 16 bits, interleaved into dst. */
+    .section .text, "ax", %progbits
+    .align  2
+    .global sample_output_stereo
+    .type   sample_output_stereo, %function
+sample_output_stereo:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4-r7, lr }         @ save r4-r7 and the return address
+                                       @
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldmia   r2, { r2, r4 }             @ r2 = src[0], r4 = src[1]
+                                       @
+    mov     r5, #1                     @ r5 = 1 << (scale - 1)
+    mov     r5, r5, lsl r1             @ ...built as (1 << scale) >> 1
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    mov     r5, r5, lsr #1             @ (plain mov keeps the subs flags)
+    beq     2f                         @ Zero? Only one sample!
+                                       @
+1:                                     @ main loop: two L/R pairs per pass
+    ldmia   r2!, { r6, r7 }            @ r6, r7 = Li0, Li1
+    ldmia   r4!, { r12, r14 }          @ r12, r14 = Ri0, Ri1
+    qadd    r6, r6, r5                 @ round, scale, saturate and pack
+    qadd    r7, r7, r5                 @ Li0+Ri0 to So0, Li1+Ri1 to So1
+    qadd    r12, r12, r5               @
+    qadd    r14, r14, r5               @
+    mov     r6, r6, asr r1             @ shift right by output_scale
+    mov     r7, r7, asr r1             @
+    mov     r12, r12, asr r1           @
+    mov     r14, r14, asr r1           @
+    ssat    r6, #16, r6                @ clamp to signed 16 bits
+    ssat    r12, #16, r12              @
+    ssat    r7, #16, r7                @
+    ssat    r14, #16, r14              @
+    pkhbt   r6, r6, r12, asl #16       @ L in low halfword, R in high halfword
+    pkhbt   r7, r7, r14, asl #16       @
+    subs    r0, r0, #2                 @ two pairs done
+    stmia   r3!, { r6, r7 }            @ store So0, So1
+    bgt     1b                         @ two or more pairs left? loop
+                                       @
+    ldmltfd sp!, { r4-r7, pc }         @ if count was even, we're done
+                                       @
+2:                                     @ odd tail: exactly one pair left
+    ldr     r6, [r2]                   @ r6 = Li
+    ldr     r12, [r4]                  @ r12 = Ri
+    qadd    r6, r6, r5                 @ round, scale, saturate
+    qadd    r12, r12, r5               @ and pack Li+Ri to So
+    mov     r6, r6, asr r1             @
+    mov     r12, r12, asr r1           @
+    ssat    r6, #16, r6                @
+    ssat    r12, #16, r12              @
+    pkhbt   r6, r6, r12, asl #16       @
+    str     r6, [r3]                   @ store So
+                                       @
+    ldmfd   sp!, { r4-r7, pc }         @ restore r4-r7 and return
+    .size   sample_output_stereo, .-sample_output_stereo
diff --git a/lib/rbcodec/dsp/dsp_asm.h b/lib/rbcodec/dsp/dsp_asm.h
new file mode 100644
index 0000000000..7bf18370a3
--- /dev/null
+++ b/lib/rbcodec/dsp/dsp_asm.h
@@ -0,0 +1,86 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include <config.h>
+
+#ifndef _DSP_ASM_H
+#define _DSP_ASM_H
+
+/* Per-CPU feature macros; each one enables the matching prototype below */
+#if defined(CPU_ARM)
+#define DSP_HAVE_ASM_APPLY_GAIN
+#define DSP_HAVE_ASM_RESAMPLING
+#define DSP_HAVE_ASM_CROSSFEED
+#define DSP_HAVE_ASM_SOUND_CHAN_MONO
+#define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+#define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+#elif defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_APPLY_GAIN
+#define DSP_HAVE_ASM_RESAMPLING
+#define DSP_HAVE_ASM_CROSSFEED
+#define DSP_HAVE_ASM_SOUND_CHAN_MONO
+#define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+#define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+#endif /* CPU_ARM / CPU_COLDFIRE */
+
+/* Declare prototypes based upon what's #defined above */
+#ifdef DSP_HAVE_ASM_CROSSFEED
+void apply_crossfeed(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_CROSSFEED */
+
+#ifdef DSP_HAVE_ASM_APPLY_GAIN
+void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
+#ifdef DSP_HAVE_ASM_RESAMPLING
+int dsp_upsample(int count, struct dsp_data *data,
+                 const int32_t *src[], int32_t *dst[]);
+int dsp_downsample(int count, struct dsp_data *data,
+                   const int32_t *src[], int32_t *dst[]);
+#endif /* DSP_HAVE_ASM_RESAMPLING */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_MONO
+void channels_process_sound_chan_mono(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+void channels_process_sound_chan_custom(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_CUSTOM */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_KARAOKE */
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+void sample_output_stereo(int count, struct dsp_data *data,
+                          const int32_t *src[], int16_t *dst);
+#endif /* DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO */
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+void sample_output_mono(int count, struct dsp_data *data,
+                        const int32_t *src[], int16_t *dst);
+#endif /* DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO */
+
+#endif /* _DSP_ASM_H */
diff --git a/lib/rbcodec/dsp/dsp_cf.S b/lib/rbcodec/dsp/dsp_cf.S
new file mode 100644
index 0000000000..cda811a7d5
--- /dev/null
+++ b/lib/rbcodec/dsp/dsp_cf.S
@@ -0,0 +1,611 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Thom Johansen
+ * Portions Copyright (C) 2007 Michael Sevakis
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+ * Scales count samples of every channel in buf[] by data->gain, in place. */
+    .section    .text
+    .align      2
+    .global     dsp_apply_gain
+dsp_apply_gain:
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | (5 registers, 20 bytes)
+    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+                                        | %a1 = buf
+    move.l      4(%a0), %d1             | %d1 = data->num_channels
+    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
+10: | channel loop                      |
+    move.l      24(%sp), %d0            | %d0 = count
+    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      %a2, %a3                | %a3 = d = s
+    move.l      (%a2)+, %d2             | %d2 = *s++,
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    ble.b       30f | loop done         | no? finish up
+20: | loop                              |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    move.b      %d4, %d3                | merge the low byte from %accext01
+    move.l      %d3, (%a3)+             | store through the trailing d pointer
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    bgt.b       20b | loop              | yes? do more samples
+30: | loop done                         |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    move.b      %d4, %d3                | merge the low byte from %accext01
+    move.l      %d3, (%a3)              | store the final sample of the channel
+    subq.l      #1, %d1                 | next channel
+    bgt.b       10b | channel loop      |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       dsp_apply_gain,.-dsp_apply_gain
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t *buf[])
+ * Filters buf[0]/buf[1] in place using the state held in crossfeed_data. */
+    .section    .text
+    .align      2
+    .global     apply_crossfeed 
+apply_crossfeed:
+    lea.l       -44(%sp), %sp           | room for 11 registers (44 bytes)
+    movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
+    movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
+    movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
+    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
+    move.l      (%a1)+, %d6             | %d6 = direct gain
+    movem.l     12(%a1), %d0-%d3        | fetch filter history samples
+    move.l      132(%a1), %a0           | fetch delay line address
+    movem.l     (%a1), %a1-%a3          | load filter coefs
+    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
+    bra.b       20f | loop start        | go to loop start point
+    /* Register usage in loop:
+     * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
+     * %a4 = buf[0], %a5 = buf[1],
+     * %a6 = delay line pointer wrap limit,
+     * %d0..%d3 = history
+     * %d4..%d5 = temp.
+     * %d6 = direct gain,
+     * %d7 = count
+     */
+10: | loop                              |
+    movclr.l    %acc0, %d4              | write outputs
+    move.l      %d4, (%a4)+             | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)+             | .
+20: | loop start                        |
+    mac.l       %a2, %d0, (%a0)+, %d0, %acc0 | %acc0  = b1*dl[n - 1], %d0 = dl[n]
+    mac.l       %a1, %d0             , %acc0 | %acc0 += b0*dl[n]
+    mac.l       %a3, %d1, (%a5),  %d5, %acc0 | %acc0 += a1*y_r[n - 1], load R
+    mac.l       %a2, %d2, (%a0)+, %d2, %acc1 | %acc1  = b1*dr[n - 1], %d2 = dr[n]
+    mac.l       %a1, %d2             , %acc1 | %acc1 += b0*dr[n]
+    mac.l       %a3, %d3, (%a4),  %d4, %acc1 | %acc1 += a1*y_l[n - 1], load L
+    movem.l     %d4-%d5, -8(%a0)        | save left & right inputs to delay line
+    move.l      %acc0, %d3              | get filtered delayed left sample (y_l[n])
+    move.l      %acc1, %d1              | get filtered delayed right sample (y_r[n])
+    mac.l       %d6, %d4, %acc0         | %acc0 += gain*x_l[n]
+    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
+    cmp.l       %a6, %a0                | wrap %a0 if passed end
+    bhs.b       30f | wrap buffer       | at/after limit? take the lea below
+    .word       0x51fb | tpf.l          | trap the buffer wrap
+30: | wrap buffer                       | ...fwd taken branches more costly
+    lea.l       -104(%a0), %a0          | wrap it up
+    subq.l      #1, %d7                 | --count > 0 ?
+    bgt.b       10b | loop              | yes? do more
+    movclr.l    %acc0, %d4              | write last outputs
+    move.l      %d4, (%a4)              | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)              | .
+    lea.l       crossfeed_data+16, %a1  | save data back to struct
+    movem.l     %d0-%d3, (%a1)          | ...history
+    move.l      %a0, 120(%a1)           | ...delay_p
+    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
+    lea.l       44(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       apply_crossfeed,.-apply_crossfeed 
+
+/****************************************************************************
+ * int dsp_downsample(int count, struct dsp_data *data,
+ *                    const int32_t *src[], int32_t *dst[])
+ * Linearly interpolates src into dst; returns samples written to dst[0]. */
+    .section    .text
+    .align      2
+    .global     dsp_downsample
+dsp_downsample:
+    lea.l       -40(%sp), %sp           | save non-clobberables
+    movem.l     %d2-%d7/%a2-%a5, (%sp)  | (10 registers, 40 bytes)
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
+                                        | %a0 = data
+                                        | %a1 = src
+                                        | %a2 = dst
+    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
+                                        | %d4 = delta = data->resample_data.delta
+    moveq.l     #16, %d7                | %d7 = shift
+10: | channel loop                      |
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &data->resample_data.last_sample[ch-1]
+    move.l      (%a5), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
+    move.l      -4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    lsr.l       %d7, %d6                | (shift count held in %d7)
+    cmp.l       %d2, %d6                | past end of samples?
+    bge.b       40f | skip resample loop| yes? skip loop
+    tst.l       %d6                     | need last sample of prev. frame?
+    bne.b       20f | resample loop     | no? start main loop
+    move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
+    bra.b       30f | resample start last | start with last (last in %d0)
+20: | resample loop                     |
+    lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
+    movem.l     (%a5), %d0-%d1          | %d0 = s[pos-1], %d1 = s[pos]
+30: | resample start last               |
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+    move.l      %d0, %acc0              | %acc0 = previous sample
+    move.l      %d5, %d0                | frac = (phase << 16) >> 1
+    lsl.l       %d7, %d0                | ...low 16 bits of phase to the top
+    lsr.l       #1, %d0                 | ...then one bit right (bit 31 clear)
+    mac.l       %d0, %d1, %acc0         | %acc0 += frac * diff
+    add.l       %d4, %d5                | phase += delta
+    move.l      %d5, %d6                | pos = phase >> 16
+    lsr.l       %d7, %d6                |
+    movclr.l    %acc0, %d0              | fetch interpolated sample, clear %acc0
+    move.l      %d0, (%a4)+             | *d++ = %d0
+    cmp.l       %d2, %d6                | pos < count?
+    blt.b       20b | resample loop     | yes? continue resampling
+40: | skip resample loop                |
+    subq.l      #1, %d3                 | ch > 0?
+    bgt.b       10b | channel loop      | yes? process next channel
+    lsl.l       %d7, %d2                | wrap phase to start of next frame
+    sub.l       %d2, %d5                | data->resample_data.phase =
+    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
+    move.l      %a4, %d0                | return d - dst[0] (d is from the last
+    sub.l       (%a2), %d0              | ...channel processed, i.e. dst[0])
+    asr.l       #2, %d0                 | convert bytes->samples
+    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
+    lea.l       40(%sp), %sp            | cleanup stack
+    rts                                 | buh-bye
+    .size       dsp_downsample,.-dsp_downsample
+
+/****************************************************************************
+ * int dsp_upsample(int count, struct dsp_data *data,
+ *                  const int32_t *src[], int32_t *dst[])
+ * Linearly interpolates src into dst; returns samples written to dst[0]. */
+    .section    .text
+    .align      2
+    .global     dsp_upsample
+dsp_upsample:
+    lea.l       -40(%sp), %sp           | save non-clobberables
+    movem.l     %d2-%d7/%a2-%a5, (%sp)  | (10 registers, 40 bytes)
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
+                                        | %a0 = data
+                                        | %a1 = src
+                                        | %a2 = dst
+    movem.l      4(%a0), %d3-%d4        | %d3 = ch = data->num_channels
+                                        | %d4 = delta = data->resample_data.delta
+    swap        %d4                     | swap delta to high word to use...
+                                        | ...carries to increment position
+10: | channel loop                      |
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
+    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &s[count-1]
+    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
+    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
+                                        | ...away later but we'll be preincremented
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
+    swap        %d5                     | swap phase to high word to use
+                                        | carries to increment position
+    move.l      %d5, %d7                | %d7 = pos = phase >> 16
+    clr.w       %d5                     | keep only the fraction, in the high word
+    eor.l       %d5, %d7                | pos == 0?
+    beq.b       40f | loop start        | yes? start loop
+    cmp.l       %d2, %d7                | past end of samples?
+    bge.b       50f | skip resample loop| yes? go to next channel and collect info
+    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
+    movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+    bra.b       40f | loop start        |
+20: | next sample loop                  |
+    move.l      %d6, %d0                | move previous sample to %d0
+    move.l      (%a3)+, %d1             | fetch next sample
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+30: | same sample loop                  |
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+40: | loop start                        |
+    lsr.l       #1, %d5                 | make phase into frac
+    move.l      %d0, %acc0              | %acc0 = s[pos-1]
+    mac.l       %d1, %d5, %acc0         | %acc0 += diff * frac
+    lsl.l       #1, %d5                 | restore frac to phase
+    add.l       %d4, %d5                | phase += delta
+    bcc.b       30b | same sample loop  | load next values?
+    cmp.l       %a5, %a3                | s <= src_end?
+    bls.b       20b | next sample loop  | yes? continue resampling
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+50: | skip resample loop                |
+    subq.l      #1, %d3                 | ch > 0?
+    bgt.b       10b | channel loop      | yes? process next channel
+    swap        %d5                     | wrap phase to start of next frame
+    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
+    move.l      %a4, %d0                | return d - dst[0] (d is from the last
+    sub.l       (%a2), %d0              | ...channel processed, i.e. dst[0])
+    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
+    asr.l       #2, %d0                 | convert bytes->samples
+    lea.l       40(%sp), %sp            | cleanup stack
+    rts                                 | buh-bye
+    .size       dsp_upsample,.-dsp_upsample
+
+/****************************************************************************
+ * void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ *
+ * Mix left and right channels 50/50 into a center channel.
+ */
+    .section    .text
+    .align      2
+    .global     channels_process_sound_chan_mono
+channels_process_sound_chan_mono:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | (5 registers, 20 bytes)
+    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | %acc0  = l*0.5, load next l
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | %acc0 += r*0.5, load next r
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.s       20f | loop done         | no? only the final store is left
+10: | loop                              |
+    movclr.l    %acc0, %d4              | L = R = l/2 + r/2
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | start next sample: l*0.5
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | ...plus r*0.5
+    move.l      %d4, (%a2)+             | output to original buffer
+    move.l      %d4, (%a3)+             | same value to the other channel
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more samples
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
+
+/****************************************************************************
+ * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ *
+ * Apply stereo width (narrowing/expanding) effect.
+ */
+    .section    .text
+    .align      2
+    .global     channels_process_sound_chan_custom
+channels_process_sound_chan_custom:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -28(%sp), %sp           | save registers
+    movem.l     %d2-%d6/%a2-%a3, (%sp)  | (7 registers, 28 bytes)
+    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      dsp_sw_gain, %d3        | load straight (mid) gain
+    move.l      dsp_sw_cross, %d4       | load cross (side) gain
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |  (%acc0 holds L, %acc1 holds R;
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |   next l and r are loaded too)
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.b       20f | loop done         | no? only the final store is left
+10: | loop                              |
+    movclr.l    %acc0, %d5              | %d5 = L of the previous sample
+    movclr.l    %acc1, %d6              | %d6 = R of the previous sample
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
+    move.l      %d5, (%a2)+             | store L through the trailing pointer
+    move.l      %d6, (%a3)+             | store R through the trailing pointer
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more samples
+20: | loop done                         |
+    movclr.l    %acc0, %d5              | output last sample
+    movclr.l    %acc1, %d6              |
+    move.l      %d5, (%a2)              |
+    move.l      %d6, (%a3)              |
+    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
+    lea.l       28(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
+
+/****************************************************************************
+ *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ *
+ *  Separate channels into side channels.
+ */
+    .section    .text
+    .align      2
+    .global     channels_process_sound_chan_karaoke
+channels_process_sound_chan_karaoke:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | (5 registers, 20 bytes)
+    movem.l     (%a0), %a0-%a1          | get channel src pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 | (msac subtracts r*0.5)
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.b       20f | loop done         | no? only the final store is left
+10: | loop                              |
+    movclr.l    %acc0, %d4              | %d4 = L of the previous sample
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 | (msac subtracts r*0.5)
+    move.l      %d4, (%a2)+             | store L through the trailing pointer
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)+             | store R through the trailing pointer
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more samples
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                           const int32_t *src[], int16_t *dst)
+ *
+ * Framework based on the ubiquitous Rockbox line transfer logic for
+ * Coldfire CPUs.
+ *
+ * Does emac clamping and scaling (which proved faster than the usual
+ * checks and branches - even single test clamping) and writes using
+ * line burst transfers. Also better than writing a single L-R pair per
+ * loop but a good deal more code.
+ *
+ * Attempting bursting during reads is rather futile since the source and
+ * destination alignments rarely agree and too much complication will
+ * slow us up. The parallel loads seem to do a bit better at least until
+ * a pcm buffer can always give line aligned chunk and then aligning the
+ * dest can then imply the source is aligned if the source buffers are.
+ * For now longword alignment is assumed of both the source and dest.
+ *
+ */
+    .section   .text
+    .align      2
+    .global    sample_output_stereo
+sample_output_stereo:
+    lea.l       -48(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | do it now as at many lines will
+    movem.l     %d1-%d7/%a2-%a6, (%sp)    | be the far more common condition
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     52(%sp), %a0-%a2/%a4      |
+    lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address     
+    move.l      (%a1), %d1                | %a1 = multiplier: (1 << (16 - scale))
+    sub.l       #16, %d1                  |
+    neg.l       %d1                       |
+    moveq.l     #1, %d0                   |
+    asl.l       %d1, %d0                  |
+    move.l      %d0, %a1                  |
+    move.l      #0x8000, %a6              | %a6 = rounding term
+    movem.l     (%a2), %a2-%a3            | get L/R channel pointers
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a4, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | at least a full line?
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
+    sub.l       #16, %d0                  | %d1 = first line bound
+    cmp.l       %a4, %d0                  | any leading longwords?
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
+    move.l      (%a2)+, %d1               | read longword from L and R
+    move.l      %a6, %acc0                |
+    move.l      %acc0, %acc1              |
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
+    mac.l       %d2, %a1, %acc1           | shift R to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       | move R to low word
+    move.w      %d2, %d1                  | interleave MS 16 bits of each 
+    move.l      %d1, (%a4)+               | ...and write both
+    cmp.l       %a4, %d0                  |
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
+    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
+30: | line loop                           |
+    move.l      (%a3)+, %d4               | get next 4 R samples and scale
+    move.l      %a6, %acc0                |
+    move.l      %acc0, %acc1              |
+    move.l      %acc1, %acc2              |
+    move.l      %acc2, %acc3              |
+    mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
+    mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
+    mac.l       %d6, %a1, (%a3)+, %d7, %acc2 |
+    mac.l       %d7, %a1, (%a2)+, %d0, %acc3 |
+    lea.l       16(%a4), %a4              | increment dest here, mitigate stalls
+    movclr.l    %acc0, %d4                | obtain R results
+    movclr.l    %acc1, %d5                |
+    movclr.l    %acc2, %d6                |
+    movclr.l    %acc3, %d7                |
+    move.l      %a6, %acc0                |
+    move.l      %acc0, %acc1              |
+    move.l      %acc1, %acc2              |
+    move.l      %acc2, %acc3              |
+    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | get next 4 L samples and scale
+    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 | with saturation
+    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %a1             , %acc3 |
+    swap        %d4                       | a) interleave most significant...
+    swap        %d5                       |
+    swap        %d6                       |
+    swap        %d7                       |
+    movclr.l    %acc0, %d0                | obtain L results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.w      %d4, %d0                  | a) ... 16 bits of L and R
+    move.w      %d5, %d1                  |
+    move.w      %d6, %d2                  |
+    move.w      %d7, %d3                  |
+    movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
+    cmp.l       %a4, %a5                  |
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
+    cmp.l       %a4, %a0                  | any longwords left?
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    move.l      %a6, %acc0                |
+    move.l      %acc0, %acc1              |
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
+    mac.l       %d2, %a1, %acc1           |
+    movclr.l    %acc0, %d1                |
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a4)+               |
+    cmp.l       %a4, %a0                  |
+    bhi.b       50b                       | long loop 1
+60: | output end                          |
+    movem.l     (%sp), %d1-%d7/%a2-%a6    | restore registers
+    move.l      %d1, %macsr               |
+    lea.l       48(%sp), %sp              | cleanup
+    rts                                   |
+    .size      sample_output_stereo, .-sample_output_stereo
+
+/****************************************************************************
+ * void sample_output_mono(int count, struct dsp_data *data,
+ *                         const int32_t *src[], int16_t *dst)
+ *
+ * Same treatment as sample_output_stereo but for one channel.
+ */
+    .section   .text
+    .align      2
+    .global    sample_output_mono
+sample_output_mono:
+    lea.l       -32(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | do it now as having many lines will
+    movem.l     %d1-%d5/%a2-%a4, (%sp)    | be the far more common condition
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     36(%sp), %a0-%a3          | %a0=count, %a1=data, %a2=src, %a3=dst
+    lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address (dst + count*4)
+    move.l      (%a1), %d1                | %d5 = multiplier: (1 << (16 - scale))
+    sub.l       #16, %d1                  | %d1 = scale - 16
+    neg.l       %d1                       | %d1 = 16 - scale
+    moveq.l     #1, %d5                   |
+    asl.l       %d1, %d5                  | %d5 = 1 << (16 - scale)
+    move.l      #0x8000, %a4              | %a4 = rounding term
+    movem.l     (%a2), %a2                | get source channel pointer
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a3, %d0                  |
+    and.l       #0xfffffff0, %d0          | = (dst + 28) & ~15
+    cmp.l       %a0, %d0                  | at least a full line?
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a3, %d0                  | any leading longwords?
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
+    move.l      (%a2)+, %d1               | read longword from the source
+    move.l      %a4, %acc0                |
+    mac.l       %d1, %d5, %acc0           | shift sample to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    move.l      %d1, %d2                  |
+    swap        %d2                       | move high word of the copy to low word
+    move.w      %d2, %d1                  | duplicate single channel into
+    move.l      %d1, (%a3)+               | L and R
+    cmp.l       %a3, %d0                  |
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
+    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
+30: | line loop                           |
+    move.l      (%a2)+, %d0               | get next 4 samples and scale
+    move.l      %a4, %acc0                |
+    move.l      %acc0, %acc1              |
+    move.l      %acc1, %acc2              |
+    move.l      %acc2, %acc3              |
+    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
+    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
+    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %d5             , %acc3 |
+    lea.l       16(%a3), %a3              | increment dest here, mitigate stalls
+    movclr.l    %acc0, %d0                | obtain results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.l      %d0, %d4                  | duplicate single channel
+    swap        %d4                       | into L and R
+    move.w      %d4, %d0                  |
+    move.l      %d1, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d1                  |
+    move.l      %d2, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d2                  |
+    move.l      %d3, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d3                  |
+    movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
+    cmp.l       %a3, %a1                  |
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
+    cmp.l       %a3, %a0                  | any longwords left?
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    move.l      %a4, %acc0                |
+    mac.l       %d1, %d5, %acc0           | the same way as leading ones
+    movclr.l    %acc0, %d1                |
+    move.l      %d1, %d2                  |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a3)+               |
+    cmp.l       %a3, %a0                  |
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
+    movem.l     (%sp), %d1-%d5/%a2-%a4    | restore registers
+    move.l      %d1, %macsr               | restore the saved emac mode
+    lea.l       32(%sp), %sp              | cleanup
+    rts                                   |
+    .size      sample_output_mono, .-sample_output_mono
diff --git a/lib/rbcodec/dsp/eq.c b/lib/rbcodec/dsp/eq.c
new file mode 100644
index 0000000000..122a46a4c5
--- /dev/null
+++ b/lib/rbcodec/dsp/eq.c
@@ -0,0 +1,268 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include <inttypes.h>
+#include "config.h"
+#include "fixedpoint.h"
+#include "fracmul.h"
+#include "eq.h"
+#include "replaygain.h"
+
+/**
+ * Compute the coefficients of a first order shelving filter. The result is
+ * not directly usable by the eq_filter() function.
+ * @param cutoff shelf midpoint frequency. See eq_pk_coefs for format.
+ * @param A decibel value multiplied by ten, describing gain/attenuation of
+ * shelf. Max value is 24 dB.
+ * @param low true for low-shelf filter, false for high-shelf filter.
+ * @param c pointer to storage for b0, b1 and -a1, written in s4.27 format.
+ */
+void filter_shelf_coefs(unsigned long cutoff, long A, bool low, int32_t *c)
+{
+    long cs;
+    int32_t num0, num1, den0, den1; /* b0, b1, a0, a1 in s3.28 */
+    const long gain = get_replaygain_int(A*5) << 4; /* 10^(db/40), s3.28 */
+    const long sn = fp_sincos(cutoff/2, &cs);
+
+    if (!low) {
+        const int32_t cs_over_gain = fp_div(cs, gain, 25);
+        const int32_t cs_times_gain = FRACMUL(cs, gain);
+        const long sn_s = sn >> 3;
+        num0 = sn_s + cs_times_gain;            /* 0.25 .. 4.10 */
+        num1 = sn_s - cs_times_gain;            /* -3.98 .. 1 */
+        den0 = sn_s + cs_over_gain;             /* 0.25 .. 4.10 */
+        den1 = sn_s - cs_over_gain;             /* -3.98 .. 1 */
+    } else {
+        const int32_t sn_over_gain = fp_div(sn, gain, 25);
+        const int32_t sn_times_gain = FRACMUL(sn, gain);
+        const long cs_s = cs >> 3;
+        num0 = sn_times_gain + cs_s;            /* 0.25 .. 4.10 */
+        num1 = sn_times_gain - cs_s;            /* -1 .. 3.98 */
+        den0 = sn_over_gain + cs_s;             /* 0.25 .. 4.10 */
+        den1 = sn_over_gain - cs_s;             /* -1 .. 3.98 */
+    }
+
+    const int32_t inv_den0 = fp_div(1, den0, 57); /* 0.24 .. 3.98, s2.29 */
+    c[0] = FRACMUL_SHL(num0, inv_den0, 1);      /* 0.063 .. 15.85 */
+    c[1] = FRACMUL_SHL(num1, inv_den0, 1);      /* -15.85 .. 15.85 */
+    c[2] = -FRACMUL_SHL(den1, inv_den0, 1);     /* -1 .. 1 */
+}
+
+#ifdef HAVE_SW_TONE_CONTROLS
+/**
+ * Compute a second order section made of one low-shelf and one high-shelf
+ * first order section in cascade.
+ * @param cutoff_low low-shelf midpoint frequency. See eq_pk_coefs for format.
+ * @param cutoff_high high-shelf midpoint frequency, same format.
+ * @param A_low decibel value multiplied by ten, describing gain/attenuation of
+ * low-shelf part. Max value is 24 dB.
+ * @param A_high decibel value multiplied by ten, describing gain/attenuation of
+ * high-shelf part. Max value is 24 dB.
+ * @param A decibel value multiplied by ten, describing additional overall gain.
+ * @param c pointer to storage for five coefficients, written in s4.27 format.
+ */
+void filter_bishelf_coefs(unsigned long cutoff_low, unsigned long cutoff_high,
+                          long A_low, long A_high, long A, int32_t *c)
+{
+    const long gain = get_replaygain_int(A*10) << 7; /* 10^(db/20), s0.31 */
+    int32_t lo[3], hi[3]; /* b0, b1, -a1 of the low and the high shelf */
+
+    filter_shelf_coefs(cutoff_low, A_low, true, lo);
+    filter_shelf_coefs(cutoff_high, A_high, false, hi);
+
+    /* the overall gain is folded into the low-shelf numerator */
+    const int32_t lo_b0 = FRACMUL(gain, lo[0]);
+    const int32_t lo_b1 = FRACMUL(gain, lo[1]);
+
+    /* Cascade the two first order sections into a single second order one
+     * that eq_filter() can run. The products span a really wide numerical
+     * range, so the fixed point format used here only works for the cutoff
+     * frequencies selected in dsp.c.
+     */
+    c[0] = FRACMUL_SHL(lo_b0, hi[0], 4);
+    c[1] = FRACMUL_SHL(lo_b0, hi[1], 4) + FRACMUL_SHL(lo_b1, hi[0], 4);
+    c[2] = FRACMUL_SHL(lo_b1, hi[1], 4);
+    c[3] = lo[2] + hi[2];
+    c[4] = -FRACMUL_SHL(lo[2], hi[2], 4);
+}
+#endif
+
+/* Coef calculation taken from Audio-EQ-Cookbook.txt by Robert Bristow-Johnson.
+ * Slightly faster calculation can be done by deriving forms which use tan()
+ * instead of cos() and sin(), but the latter are far easier to use when doing
+ * fixed point math, and performance is not a big point in the calculation part.
+ * All the 'a' filter coefficients are negated so we can use only additions
+ * in the filtering equation.
+ */
+
+/**
+ * Compute the coefficients of a second order peaking filter section.
+ * @param cutoff a value from 0 to 0x80000000, where 0 represents 0 Hz and
+ * 0x80000000 represents the Nyquist frequency (samplerate/2).
+ * @param Q Q factor value multiplied by ten. Lower bound is artificially set
+ * at 0.5. Must not be zero, since it is used as a divisor here.
+ * @param db decibel value multiplied by ten, describing gain/attenuation at
+ * peak freq. Max value is 24 dB.
+ * @param c pointer to coefficient storage. Five values are written, in the
+ * order b0, b1, b2, -a1, -a2, each divided by a0. Coefficients are s3.28
+ * format.
+ */
+void eq_pk_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c)
+{
+    long cs;
+    const long unity = 1 << 28; /* 1.0 in s3.28 */
+    const long A = get_replaygain_int(db*5) << 5; /* 10^(db/40), s2.29 */
+    const long alpha = fp_sincos(cutoff, &cs)/(2*Q)*10 >> 1; /* s1.30 */
+    const long alpha_over_A = fp_div(alpha, A, 27);
+    const long alpha_times_A = FRACMUL(alpha, A);
+
+    /* terms before normalisation, all s3.28; possible ranges in comments */
+    const int32_t b0 = unity + alpha_times_A;   /* [1 .. 5] */
+    const int32_t b2 = unity - alpha_times_A;   /* [-3 .. 1] */
+    const int32_t a0 = unity + alpha_over_A;    /* [1 .. 5] */
+    const int32_t a2 = unity - alpha_over_A;    /* [-3 .. 1] */
+    const int32_t a1 = -2*(cs >> 3);            /* [-2 .. 2], b1 is the same */
+
+    /* range of this is roughly [0.2 .. 1], but we'll never hit 1 completely */
+    const long inv_a0 = fp_div(1, a0, 59); /* s0.31 */
+    c[0] = FRACMUL(b0, inv_a0);             /* [0.25 .. 4] */
+    c[1] = FRACMUL(a1, inv_a0);             /* [-2 .. 2] */
+    c[2] = FRACMUL(b2, inv_a0);             /* [-2.4 .. 1] */
+    c[3] = FRACMUL(-a1, inv_a0);            /* [-2 .. 2] */
+    c[4] = FRACMUL(-a2, inv_a0);            /* [-0.6 .. 1] */
+}
+
+/**
+ * Compute the coefficients of a second order low-shelf filter section.
+ * @param cutoff frequency, in the same format as for eq_pk_coefs.
+ * @param Q Q factor value multiplied by ten, as for eq_pk_coefs.
+ * @param db decibel value multiplied by ten, as for eq_pk_coefs.
+ * @param c pointer to coefficient storage. Five values are written, in the
+ * order b0, b1, b2, -a1, -a2, each divided by a0. Unlike eq_pk_coefs, the
+ * coefficient format is s5.26 fixed point.
+ */
+void eq_ls_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c)
+{
+    long cs;
+    const long unity = 1 << 25; /* 1.0 in s6.25 */
+    const long root_A = get_replaygain_int(db*5/2) << 2; /* 10^(db/80), s5.26 */
+    const long A = FRACMUL_SHL(root_A, root_A, 8); /* s2.29 */
+    const long alpha = fp_sincos(cutoff, &cs)/(2*Q)*10 >> 1; /* s1.30 */
+    const long A_plus1 = (A >> 4) + unity;
+    const long A_minus1 = (A >> 4) - unity;
+    const long A_plus1_cos = FRACMUL(A_plus1, cs);
+    const long A_minus1_cos = FRACMUL(A_minus1, cs);
+    const long beta = 2*FRACMUL(root_A, alpha); /* 2*sqrt(A)*alpha */
+    const long num_sum = A_plus1 - A_minus1_cos;
+    const long den_sum = A_plus1 + A_minus1_cos;
+
+    /* terms before normalisation, all s6.25; possible ranges in comments */
+    const int32_t b0 = FRACMUL_SHL(A, num_sum + beta, 2); /* [0.1 .. 40] */
+    const int32_t b1 = FRACMUL_SHL(A, A_minus1 - A_plus1_cos, 3); /* [-16 .. 63.4] */
+    const int32_t b2 = FRACMUL_SHL(A, num_sum - beta, 2); /* [0 .. 31.7] */
+    const int32_t a0 = den_sum + beta;                    /* [0.5 .. 10] */
+    const int32_t a1 = -2*(A_minus1 + A_plus1_cos);       /* [-16 .. 4] */
+    const int32_t a2 = den_sum - beta;                    /* [0 .. 8] */
+
+    /* reciprocal of a0, range [0.1 .. 1.99] */
+    const long inv_a0 = fp_div(1, a0, 55);    /* s1.30 */
+    c[0] = FRACMUL_SHL(b0, inv_a0, 2);       /* [0.06 .. 15.9] */
+    c[1] = FRACMUL_SHL(b1, inv_a0, 2);       /* [-2 .. 31.7] */
+    c[2] = FRACMUL_SHL(b2, inv_a0, 2);       /* [0 .. 15.9] */
+    c[3] = FRACMUL_SHL(-a1, inv_a0, 2);      /* [-2 .. 2] */
+    c[4] = FRACMUL_SHL(-a2, inv_a0, 2);      /* [0 .. 1] */
+}
+
+/**
+ * Compute the coefficients of a second order high-shelf filter section.
+ * @param cutoff frequency, in the same format as for eq_pk_coefs.
+ * @param Q Q factor value multiplied by ten, as for eq_pk_coefs.
+ * @param db decibel value multiplied by ten, as for eq_pk_coefs.
+ * @param c pointer to coefficient storage. Five values are written, in the
+ * order b0, b1, b2, -a1, -a2, each divided by a0. Unlike eq_pk_coefs, the
+ * coefficient format is s5.26 fixed point.
+ */
+void eq_hs_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c)
+{
+    long cs;
+    const long unity = 1 << 25; /* 1.0 in s6.25 */
+    const long root_A = get_replaygain_int(db*5/2) << 2; /* 10^(db/80), s5.26 */
+    const long A = FRACMUL_SHL(root_A, root_A, 8); /* s2.29 */
+    const long alpha = fp_sincos(cutoff, &cs)/(2*Q)*10 >> 1; /* s1.30 */
+    const long A_plus1 = (A >> 4) + unity;
+    const long A_minus1 = (A >> 4) - unity;
+    const long A_plus1_cos = FRACMUL(A_plus1, cs);
+    const long A_minus1_cos = FRACMUL(A_minus1, cs);
+    const long beta = 2*FRACMUL(root_A, alpha); /* 2*sqrt(A)*alpha */
+    const long num_sum = A_plus1 + A_minus1_cos;
+    const long den_sum = A_plus1 - A_minus1_cos;
+
+    /* terms before normalisation, all s6.25; possible ranges in comments */
+    const int32_t b0 = FRACMUL_SHL(A, num_sum + beta, 2); /* [0.1 .. 40] */
+    const int32_t b1 = -FRACMUL_SHL(A, A_minus1 + A_plus1_cos, 3); /* [-63.5 .. 16] */
+    const int32_t b2 = FRACMUL_SHL(A, num_sum - beta, 2); /* [0 .. 32] */
+    const int32_t a0 = den_sum + beta;                    /* [0.5 .. 10] */
+    const int32_t a1 = 2*(A_minus1 - A_plus1_cos);        /* [-4 .. 16] */
+    const int32_t a2 = den_sum - beta;                    /* [0 .. 8] */
+
+    /* reciprocal of a0, range [0.1 .. 1.99] */
+    const long inv_a0 = fp_div(1, a0, 55);    /* s1.30 */
+    c[0] = FRACMUL_SHL(b0, inv_a0, 2);       /* [0 .. 16] */
+    c[1] = FRACMUL_SHL(b1, inv_a0, 2);       /* [-31.7 .. 2] */
+    c[2] = FRACMUL_SHL(b2, inv_a0, 2);       /* [0 .. 16] */
+    c[3] = FRACMUL_SHL(-a1, inv_a0, 2);      /* [-2 .. 2] */
+    c[4] = FRACMUL_SHL(-a2, inv_a0, 2);      /* [0 .. 1] */
+}
+
+/* We realise the filters as a second order direct form 1 structure. Direct
+ * form 1 was chosen because of better numerical properties for fixed point
+ * implementations.
+ */
+
+#if (!defined(CPU_COLDFIRE) && !defined(CPU_ARM))
+void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+               unsigned channels, unsigned shift)
+{
+    unsigned c, i;
+    long long acc;
+    /* Direct form 1 filtering code, run in place on num samples per channel.
+       y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+       where y[] is output and x[] is input; f->history[c] carries
+       x[i - 1], x[i - 2], y[i - 1], y[i - 2] over to the next call.
+     */
+    for (c = 0; c < channels; c++) {
+        for (i = 0; i < num; i++) {
+            acc  = (long long) x[c][i] * f->coefs[0];
+            acc += (long long) f->history[c][0] * f->coefs[1];
+            acc += (long long) f->history[c][1] * f->coefs[2];
+            acc += (long long) f->history[c][2] * f->coefs[3];
+            acc += (long long) f->history[c][3] * f->coefs[4];
+            f->history[c][1] = f->history[c][0];
+            f->history[c][0] = x[c][i];
+            f->history[c][3] = f->history[c][2];
+            /* shift as unsigned: left shift of a negative value is undefined */
+            x[c][i] = (long long)((unsigned long long) acc << shift) >> 32;
+            f->history[c][2] = x[c][i];
+        }
+    }
+}
+#endif
+
diff --git a/lib/rbcodec/dsp/eq.h b/lib/rbcodec/dsp/eq.h
new file mode 100644
index 0000000000..a44e9153ac
--- /dev/null
+++ b/lib/rbcodec/dsp/eq.h
@@ -0,0 +1,50 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef _EQ_H
+#define _EQ_H
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+/* These depend on the fixed point formats used by the different filter types
+   and need to be changed when they change.
+ */
+#define FILTER_BISHELF_SHIFT 5 /* 32 - 27, for s4.27 bishelf coefficients */
+#define EQ_PEAK_SHIFT 4        /* 32 - 28, for s3.28 peaking coefficients */
+#define EQ_SHELF_SHIFT 6       /* 32 - 26, for s5.26 shelf coefficients */
+
+struct eqfilter {
+    int32_t coefs[5];        /* Order is b0, b1, b2, a1, a2 */
+    int32_t history[2][4];   /* Per channel: x[i-1], x[i-2], y[i-1], y[i-2] */
+};
+
+void filter_shelf_coefs(unsigned long cutoff, long A, bool low, int32_t *c);
+void filter_bishelf_coefs(unsigned long cutoff_low, unsigned long cutoff_high,
+                          long A_low, long A_high, long A, int32_t *c);
+void eq_pk_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c);
+void eq_ls_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c);
+void eq_hs_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c);
+void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+               unsigned channels, unsigned shift);
+
+#endif
+
diff --git a/lib/rbcodec/dsp/eq_arm.S b/lib/rbcodec/dsp/eq_arm.S
new file mode 100644
index 0000000000..b0e1771e89
--- /dev/null
+++ b/lib/rbcodec/dsp/eq_arm.S
@@ -0,0 +1,89 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/* uncomment this to make filtering calculate lower bits after shifting.
+ * without this, "shift" of the lower bits will be lost here.
+ */
+/* #define HIGH_PRECISION */
+
+/*
+ * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+ *                unsigned channels, unsigned shift)
+ */
+#if CONFIG_CPU == PP5002
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .global eq_filter
+eq_filter:
+    ldr r12, [sp]             @ get shift parameter
+    stmdb sp!, { r0-r11, lr } @ save all params and clobbered regs
+    ldmia r1!, { r4-r8 }      @ load coefs, r1 now points at the history
+    mov r10, r1               @ loop prelude expects history addr in r10
+
+.filterloop:
+    ldr r9, [sp]            @ get pointer to this channels data
+    add r0, r9, #4          @ step to the next entry of x
+    str r0, [sp]            @ save back pointer to next channels data
+    ldr r9, [r9]            @ r9 = x[]
+    ldr r14, [sp, #8]       @ r14 = numsamples
+    ldmia r10, { r0-r3 }    @ load history of this channel from r10
+    str r10, [sp, #4]       @ save it for loop end (reuses the slot of f)
+
+    /* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator,
+     * r12 = shift amount, r14 = number of samples.
+     */
+.loop:
+    /* Direct form 1 filtering code.
+     * y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+     * where y[] is output and x[] is input. This is performed out of order to
+     * reuse registers, we're pretty short on regs.
+     */
+    smull r10, r11, r6, r1     @ acc = b2*x[i - 2]
+    mov r1, r0                 @ fix input history
+    smlal r10, r11, r5, r0     @ acc += b1*x[i - 1]
+    ldr r0, [r9]               @ load input and fix history in same operation
+    smlal r10, r11, r7, r2     @ acc += a1*y[i - 1]
+    smlal r10, r11, r8, r3     @ acc += a2*y[i - 2]
+    smlal r10, r11, r4, r0     @ acc += b0*x[i] /* avoid stall on arm9*/
+    mov r3, r2                 @ fix output history
+    mov r2, r11, asl r12       @ get upper part of result and shift left
+#ifdef HIGH_PRECISION
+    rsb r11, r12, #32          @ get shift amount for lower part
+    orr r2, r2, r10, lsr r11   @ then mix in correctly shifted lower part
+#endif
+    str r2, [r9], #4           @ save result
+    subs r14, r14, #1          @ are we done with this channel?
+    bne .loop                  @ no, filter the next sample
+
+    ldr r10, [sp, #4]          @ load history pointer saved before the loop
+    stmia r10!, { r0-r3 }      @ save back history, r10 -> next channel
+    ldr r11, [sp, #12]         @ load number of channels
+    subs r11, r11, #1          @ all channels processed?
+    strne r11, [sp, #12]       @ no, store the remaining count
+    bne .filterloop            @ ...and do the next channel
+
+    add sp, sp, #16            @ compensate for temp storage
+    ldmpc regs=r4-r11
+
+
diff --git a/lib/rbcodec/dsp/eq_cf.S b/lib/rbcodec/dsp/eq_cf.S
new file mode 100644
index 0000000000..30a28b9d99
--- /dev/null
+++ b/lib/rbcodec/dsp/eq_cf.S
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* uncomment this to make filtering calculate lower bits after shifting.
+ * without this, "shift" - 1 of the lower bits will be lost here.
+ */
+/* #define HIGH_PRECISION */
+
+/*
+ * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+ *                unsigned channels, unsigned shift)
+ */
+    .text
+    .global eq_filter
+eq_filter:
+    lea.l (-11*4, %sp), %sp           | room for the 11 saved registers
+    movem.l %d2-%d7/%a2-%a6, (%sp)    | save clobbered regs
+    move.l (11*4+8, %sp), %a5         | fetch filter structure address
+    move.l (11*4+20, %sp), %d7        | load shift count
+    subq.l #1, %d7                    | EMAC gives us one free shift
+#ifdef HIGH_PRECISION
+    moveq.l #8, %d6
+    sub.l %d7, %d6                    | shift for lower part of accumulator
+#endif
+    movem.l (%a5), %a0-%a4            | load coefs
+    lea.l (5*4, %a5), %a5             | point to filter history
+
+.filterloop:
+    move.l (11*4+4, %sp), %a6         | load input channel pointer
+    addq.l #4, (11*4+4, %sp)          | point x to next channel
+    move.l (%a6), %a6
+    move.l (11*4+12, %sp), %d5        | number of samples
+    movem.l (%a5), %d0-%d3            | load filter history
+
+    /* d0-d3 = history, d4 = temp, d5 = sample count, d6 = lower shift amount,
+     * d7 = upper shift amount, a0-a4 = coefs, a5 = history pointer, a6 = x[]
+     */
+.loop:
+    /* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode.
+     * y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+     * where y[] is output and x[] is input. This is performed out of order
+     * to do parallel load of input value.
+     */
+    mac.l %a2, %d1, %acc0               | acc = b2*x[i - 2]
+    move.l %d0, %d1                     | fix input history
+    mac.l %a1, %d0, (%a6), %d0, %acc0   | acc += b1*x[i - 1], x[i] -> d0
+    mac.l %a0, %d0, %acc0               | acc += b0*x[i]
+    mac.l %a3, %d2, %acc0               | acc += a1*y[i - 1]
+    mac.l %a4, %d3, %acc0               | acc += a2*y[i - 2]
+    move.l %d2, %d3                     | fix output history
+#ifdef HIGH_PRECISION
+    move.l %accext01, %d4               | fetch lower part of accumulator
+    and.l #0xff, %d4                    | keep low byte, clear upper three bytes
+    lsr.l %d6, %d4                      | shift lower bits
+#endif
+    movclr.l %acc0, %d2                 | fetch upper part of result
+    asl.l %d7, %d2                      | restore fixed point format
+#ifdef HIGH_PRECISION
+    or.l %d4, %d2                       | merge lower bits into the result in d2
+#endif
+    move.l %d2, (%a6)+                  | save result
+    subq.l #1, %d5                      | are we done with this channel?
+    jne .loop
+    
+    movem.l %d0-%d3, (%a5)              | save history back to struct
+    lea.l (4*4, %a5), %a5               | point to next channel's history
+    subq.l #1, (11*4+16, %sp)           | have we processed all channels?
+    jne .filterloop
+
+    movem.l (%sp), %d2-%d7/%a2-%a6      | restore clobbered regs
+    lea.l (11*4, %sp), %sp
+    rts
+
+
diff --git a/lib/rbcodec/dsp/eqs/Acoustic.cfg b/lib/rbcodec/dsp/eqs/Acoustic.cfg
new file mode 100644
index 0000000000..34b5ed8a2b
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Acoustic.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 45
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 10
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 15
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 30
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 20
diff --git a/lib/rbcodec/dsp/eqs/Bass.cfg b/lib/rbcodec/dsp/eqs/Bass.cfg
new file mode 100644
index 0000000000..2742459081
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Bass.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 50
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 50
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 35
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 15
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 5
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: -5
diff --git a/lib/rbcodec/dsp/eqs/Classical.cfg b/lib/rbcodec/dsp/eqs/Classical.cfg
new file mode 100644
index 0000000000..bf2f9f9566
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Classical.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 50
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 50
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 40
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: -20
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 10
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 20
diff --git a/lib/rbcodec/dsp/eqs/Default.cfg b/lib/rbcodec/dsp/eqs/Default.cfg
new file mode 100644
index 0000000000..d6f345fa9e
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Default.cfg
@@ -0,0 +1,17 @@
+eq enabled: off
+eq precut: 0
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 0
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 0
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 0
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 0
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 0
diff --git a/lib/rbcodec/dsp/eqs/Disco.cfg b/lib/rbcodec/dsp/eqs/Disco.cfg
new file mode 100644
index 0000000000..f894f26da1
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Disco.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 30
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 10
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 45
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 25
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 10
diff --git a/lib/rbcodec/dsp/eqs/Electronic.cfg b/lib/rbcodec/dsp/eqs/Electronic.cfg
new file mode 100644
index 0000000000..e70c911272
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Electronic.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 55
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 45
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 25
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 55
diff --git a/lib/rbcodec/dsp/eqs/Hip-Hop.cfg b/lib/rbcodec/dsp/eqs/Hip-Hop.cfg
new file mode 100644
index 0000000000..2d38425dc4
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Hip-Hop.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 65
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 65
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 25
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: -10
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 35
diff --git a/lib/rbcodec/dsp/eqs/Jazz.cfg b/lib/rbcodec/dsp/eqs/Jazz.cfg
new file mode 100644
index 0000000000..f576f9fcc1
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Jazz.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 60
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 40
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 15
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: -25
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 5
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 60
diff --git a/lib/rbcodec/dsp/eqs/Lounge.cfg b/lib/rbcodec/dsp/eqs/Lounge.cfg
new file mode 100644
index 0000000000..39ae23a7e7
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Lounge.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 20
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: -25
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 20
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: -15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 15
diff --git a/lib/rbcodec/dsp/eqs/Pop.cfg b/lib/rbcodec/dsp/eqs/Pop.cfg
new file mode 100644
index 0000000000..1d8cefe173
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Pop.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 50
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: -10
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 50
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: -10
diff --git a/lib/rbcodec/dsp/eqs/R&B.cfg b/lib/rbcodec/dsp/eqs/R&B.cfg
new file mode 100644
index 0000000000..a460b587f5
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/R&B.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 35
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 45
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 5
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 25
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 30
diff --git a/lib/rbcodec/dsp/eqs/Rock.cfg b/lib/rbcodec/dsp/eqs/Rock.cfg
new file mode 100644
index 0000000000..ec4f0356a8
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Rock.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 25
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 10
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 0
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 20
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 45
diff --git a/lib/rbcodec/dsp/eqs/Vocal.cfg b/lib/rbcodec/dsp/eqs/Vocal.cfg
new file mode 100644
index 0000000000..1de754f07c
--- /dev/null
+++ b/lib/rbcodec/dsp/eqs/Vocal.cfg
@@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: -45
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 45
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 20
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 0
diff --git a/lib/rbcodec/dsp/tdspeed.c b/lib/rbcodec/dsp/tdspeed.c
new file mode 100644
index 0000000000..731be12621
--- /dev/null
+++ b/lib/rbcodec/dsp/tdspeed.c
@@ -0,0 +1,450 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 2006-2007 by Stéphane Doyon <s.doyon@videotron.ca>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include "sound.h"
+#include "core_alloc.h"
+#include "system.h"
+#include "tdspeed.h"
+#include "settings.h"
+
+#define assert(cond) /* assertions compile to nothing in this file */
+
+#define MIN_RATE 8000 /* lowest sample rate tdspeed_config() accepts */
+#define MAX_RATE 48000 /* double buffer for double rate */
+#define MINFREQ 100 /* tdspeed_config() starts from samplerate / MINFREQ */
+
+#define FIXED_BUFSIZE 3072 /* overlap buffer length in int32_t; 48KHz factor 3.0 */
+
+static int32_t** dsp_src; /* caller's channel array, recorded by tdspeed_doit() */
+static int handles[4]; /* [0],[1]: overlap left/right; [2],[3]: output left/right */
+static int32_t *overlap_buffer[2] = { NULL, NULL };
+static int32_t *outbuf[2] = { NULL, NULL };
+
+static int move_callback(int handle, void* current, void* new)
+{
+    /* An output buffer was relocated: handles[2] is left, handles[3] right.
+     * outbuf[] must follow the move even before tdspeed_doit() set dsp_src. */
+    (void)current;
+    int ch = (handle == handles[2]) ? 0 : 1;
+    outbuf[ch] = new;
+    if (dsp_src)    /* TODO: dsp_src is only valid while the caller's array is */
+        dsp_src[ch] = new;
+    return BUFLIB_CB_OK;
+}
+
+static struct buflib_callbacks ops = {  /* used for the two output buffers */
+    .move_callback = move_callback,
+    .shrink_callback = NULL,            /* no shrink handler registered */
+};
+
+static int ovl_move_callback(int handle, void* current, void* new)
+{
+    /* An overlap buffer was relocated: handles[0] is left, handles[1] right.
+     * This must not depend on dsp_src, which only concerns the output
+     * buffers. NOTE: tdspeed_state.ovl_buff keeps its cached copy of the
+     * old address until tdspeed_config() runs again. */
+    (void)current;
+    int ch = (handle == handles[0]) ? 0 : 1;
+    overlap_buffer[ch] = new;
+    return BUFLIB_CB_OK;
+}
+
+static struct buflib_callbacks ovl_ops = {  /* used for the two overlap buffers */
+    .move_callback = ovl_move_callback,
+    .shrink_callback = NULL,                /* no shrink handler registered */
+};
+
+
+static struct tdspeed_state_s
+{
+    bool stereo;            /* stereo flag given to tdspeed_config() */
+    int32_t shift_max;      /* maximum displacement on a frame */
+    int32_t src_step;       /* source window pace */
+    int32_t dst_step;       /* destination window pace (1 << dst_order) */
+    int32_t dst_order;      /* power of two for dst_step */
+    int32_t ovl_shift;      /* overlap buffer frame shift */
+    int32_t ovl_size;       /* overlap buffer used size */
+    int32_t ovl_space;      /* overlap buffer size, capped at FIXED_BUFSIZE */
+    int32_t *ovl_buff[2];   /* overlap buffer; [1] aliases [0] when mono */
+} tdspeed_state;
+
+void tdspeed_init(void)
+{
+    if (!global_settings.timestretch_enabled)
+        return;
+    /* Allocate missing buffers. A handle <= 0 means the allocation failed:
+     * keep the pointer NULL so that tdspeed_config() returns false. */
+    if (overlap_buffer[0] == NULL)
+    {
+        handles[0] = core_alloc_ex("tdspeed ovl left", FIXED_BUFSIZE * sizeof(int32_t), &ovl_ops);
+        overlap_buffer[0] = (handles[0] > 0) ? core_get_data(handles[0]) : NULL;
+    }
+    if (overlap_buffer[1] == NULL)
+    {
+        handles[1] = core_alloc_ex("tdspeed ovl right", FIXED_BUFSIZE * sizeof(int32_t), &ovl_ops);
+        overlap_buffer[1] = (handles[1] > 0) ? core_get_data(handles[1]) : NULL;
+    }
+    if (outbuf[0] == NULL)
+    {
+        handles[2] = core_alloc_ex("tdspeed left", TDSPEED_OUTBUFSIZE * sizeof(int32_t), &ops);
+        outbuf[0] = (handles[2] > 0) ? core_get_data(handles[2]) : NULL;
+    }
+    if (outbuf[1] == NULL)
+    {
+        handles[3] = core_alloc_ex("tdspeed right", TDSPEED_OUTBUFSIZE * sizeof(int32_t), &ops);
+        outbuf[1] = (handles[3] > 0) ? core_get_data(handles[3]) : NULL;
+    }
+}
+
+void tdspeed_finish(void)
+{
+    /* Free every live allocation, then forget all four buffer pointers. */
+    for (unsigned slot = 0; slot < ARRAYLEN(handles); slot++)
+    {
+        if (handles[slot] <= 0)
+            continue;   /* nothing allocated in this slot */
+        core_free(handles[slot]);
+        handles[slot] = 0;
+    }
+    overlap_buffer[0] = overlap_buffer[1] = NULL;
+    outbuf[0] = outbuf[1] = NULL;
+}
+
+bool tdspeed_config(int samplerate, bool stereo, int32_t factor)
+{
+    struct tdspeed_state_s *st = &tdspeed_state;
+    int src_frame_sz;
+    /* Every "return false" below happens before tdspeed_state is modified. */
+    /* Check buffers were allocated ok */
+    if (overlap_buffer[0] == NULL || overlap_buffer[1] == NULL)
+        return false;
+
+    if (outbuf[0] == NULL || outbuf[1] == NULL)
+        return false;
+
+    /* Check parameters; a factor of exactly PITCH_SPEED_100 is rejected too */
+    if (factor == PITCH_SPEED_100)
+        return false;
+
+    if (samplerate < MIN_RATE || samplerate > MAX_RATE)
+        return false;
+
+    if (factor < STRETCH_MIN || factor > STRETCH_MAX)
+        return false;
+    /* Initial destination step: samplerate / MINFREQ samples */
+    st->stereo = stereo;
+    st->dst_step = samplerate / MINFREQ;
+    /* For factors above PITCH_SPEED_100, scale the step down accordingly */
+    if (factor > PITCH_SPEED_100)
+        st->dst_step = st->dst_step * PITCH_SPEED_100 / factor;
+    /* Replace dst_step by the power of two strictly greater than it */
+    st->dst_order = 1;
+
+    while (st->dst_step >>= 1)
+        st->dst_order++;
+
+    st->dst_step = (1 << st->dst_order);
+    st->src_step = st->dst_step * factor / PITCH_SPEED_100;
+    st->shift_max = (st->dst_step > st->src_step) ? st->dst_step : st->src_step;
+    /* Input samples one frame needs (same formula as in tdspeed_apply()) */
+    src_frame_sz = st->shift_max + st->dst_step;
+
+    if (st->dst_step > st->src_step)
+        src_frame_sz += st->dst_step - st->src_step;
+
+    st->ovl_space = ((src_frame_sz - 2) / st->src_step) * st->src_step
+                        + src_frame_sz;
+
+    if (st->src_step > st->dst_step)
+        st->ovl_space += 2*st->src_step - st->dst_step;
+
+    if (st->ovl_space > FIXED_BUFSIZE)
+        st->ovl_space = FIXED_BUFSIZE;
+    /* Start with an empty overlap buffer and no pending frame shift */
+    st->ovl_size = 0;
+    st->ovl_shift = 0;
+    /* Mono: both channel slots share the left overlap buffer */
+    st->ovl_buff[0] = overlap_buffer[0];
+
+    if (stereo)
+        st->ovl_buff[1] = overlap_buffer[1];
+    else
+        st->ovl_buff[1] = st->ovl_buff[0];
+
+    return true;
+}
+
+static int tdspeed_apply(int32_t *buf_out[2], int32_t *buf_in[2],
+                         int data_len, int last, int out_size)
+/* data_len in samples; returns the number of samples written to buf_out[0] */
+{
+    struct tdspeed_state_s *st = &tdspeed_state;
+    int32_t *dest[2];
+    int32_t next_frame, prev_frame, src_frame_sz;
+    bool stereo = buf_in[0] != buf_in[1];
+    /* last: 0 = more input follows, -1 = internal overlap pass, else flush */
+    assert(stereo == st->stereo);
+
+    src_frame_sz = st->shift_max + st->dst_step;
+
+    if (st->dst_step > st->src_step)
+        src_frame_sz += st->dst_step - st->src_step;
+
+    /* deal with overlap data first, if any */
+    if (st->ovl_size)
+    {
+        int32_t have, copy, steps;
+        have = st->ovl_size;
+
+        if (st->ovl_shift > 0)
+            have -= st->ovl_shift;
+
+        /* append just enough data to have all of the overlap buffer consumed */
+        steps = (have - 1) / st->src_step;
+        copy = steps * st->src_step + src_frame_sz - have;
+
+        if (copy < src_frame_sz - st->dst_step)
+            copy += st->src_step;  /* one more step to allow for pregap data */
+
+        if (copy > data_len)
+            copy = data_len;
+
+        assert(st->ovl_size + copy <= FIXED_BUFSIZE);
+        memcpy(st->ovl_buff[0] + st->ovl_size, buf_in[0],
+               copy * sizeof(int32_t));
+
+        if (stereo)
+            memcpy(st->ovl_buff[1] + st->ovl_size, buf_in[1],
+                   copy * sizeof(int32_t));
+
+        if (!last && have + copy < src_frame_sz)
+        {
+            /* still not enough to process at least one frame */
+            st->ovl_size += copy;
+            return 0;
+        }
+
+        /* recursively call ourselves to process the overlap buffer */
+        have = st->ovl_size;
+        st->ovl_size = 0;
+
+        if (copy == data_len)
+        {
+            assert(have + copy <= FIXED_BUFSIZE);
+            return tdspeed_apply(buf_out, st->ovl_buff, have+copy, last,
+                               out_size);
+        }
+
+        assert(have + copy <= FIXED_BUFSIZE);
+        int i = tdspeed_apply(buf_out, st->ovl_buff, have+copy, -1, out_size);
+
+        dest[0] = buf_out[0] + i;
+        dest[1] = buf_out[1] + i;
+
+        /* readjust pointers to account for data already consumed */
+        next_frame = copy - src_frame_sz + st->src_step;
+        prev_frame = next_frame - st->ovl_shift;
+    }
+    else
+    {
+        dest[0] = buf_out[0];
+        dest[1] = buf_out[1];
+
+        next_frame = prev_frame = 0;
+
+        if (st->ovl_shift > 0)
+            next_frame += st->ovl_shift;
+        else
+            prev_frame += -st->ovl_shift;
+    }
+
+    st->ovl_shift = 0;
+
+    /* process all complete frames */
+    while (data_len - next_frame >= src_frame_sz)
+    {
+        /* find frame overlap by autocorrelation */
+        int const INC1 = 8;
+        int const INC2 = 32;
+
+        int64_t min_delta = INT64_MAX;  /* most positive */
+        int shift = 0;
+
+        /* Power of 2 of a 28bit number requires 56bits, can accumulate
+           256times in a 64bit variable. */
+        assert(st->dst_step / INC2 <= 256);
+        assert(next_frame + st->shift_max - 1 + st->dst_step - 1 < data_len);
+        assert(prev_frame + st->dst_step - 1 < data_len);
+
+        for (int i = 0; i < st->shift_max; i += INC1)
+        {
+            int64_t delta = 0;
+
+            int32_t *curr = buf_in[0] + next_frame + i;
+            int32_t *prev = buf_in[0] + prev_frame;
+
+            for (int j = 0; j < st->dst_step; j += INC2, curr += INC2, prev += INC2)
+            {
+                int32_t diff = *curr - *prev;
+                delta += abs(diff);
+
+                if (delta >= min_delta)
+                    goto skip;
+            }
+
+            if (stereo)
+            {
+                curr = buf_in[1] + next_frame + i;
+                prev = buf_in[1] + prev_frame;
+
+                for (int j = 0; j < st->dst_step; j += INC2, curr += INC2, prev += INC2)
+                {
+                    int32_t diff = *curr - *prev;
+                    delta += abs(diff);
+
+                    if (delta >= min_delta)
+                        goto skip;
+                }
+            }
+
+            min_delta = delta;
+            shift = i;
+skip:;
+        }
+
+        /* overlap fading-out previous frame with fading-in current frame */
+        int32_t *curr = buf_in[0] + next_frame + shift;
+        int32_t *prev = buf_in[0] + prev_frame;
+
+        int32_t *d = dest[0];
+
+        assert(next_frame + shift + st->dst_step - 1 < data_len);
+        assert(prev_frame + st->dst_step - 1 < data_len);
+        assert(dest[0] - buf_out[0] + st->dst_step - 1 < out_size);
+
+        for (int i = 0, j = st->dst_step; j; i++, j--)
+        {
+            *d++ = (*curr++ * (int64_t)i +
+                    *prev++ * (int64_t)j) >> st->dst_order;
+        }
+
+        dest[0] = d;
+
+        if (stereo)
+        {
+            curr = buf_in[1] + next_frame + shift;
+            prev = buf_in[1] + prev_frame;
+
+            d = dest[1];
+
+            for (int i = 0, j = st->dst_step; j; i++, j--)
+            {
+                assert(d < buf_out[1] + out_size);
+
+                *d++ = (*curr++ * (int64_t)i +
+                        *prev++ * (int64_t)j) >> st->dst_order;
+            }
+
+            dest[1] = d;
+        }
+
+        /* adjust pointers for next frame */
+        prev_frame = next_frame + shift + st->dst_step;
+        next_frame += st->src_step;
+
+        /* here next_frame - prev_frame = src_step - dst_step - shift */
+        assert(next_frame - prev_frame == st->src_step - st->dst_step - shift);
+    }
+
+    /* now deal with remaining partial frames */
+    if (last == -1)
+    {
+        /* special overlap buffer processing: remember frame shift only */
+        st->ovl_shift = next_frame - prev_frame;
+    }
+    else if (last != 0)
+    {
+        /* last call: purge all remaining data to output buffer */
+        int i = data_len - prev_frame;
+
+        assert(dest[0] + i <= buf_out[0] + out_size);
+        memcpy(dest[0], buf_in[0] + prev_frame, i * sizeof(int32_t));
+
+        dest[0] += i;
+
+        if (stereo)
+        {
+            assert(dest[1] + i <= buf_out[1] + out_size);
+            memcpy(dest[1], buf_in[1] + prev_frame, i * sizeof(int32_t));
+            dest[1] += i;
+        }
+    }
+    else
+    {
+        /* preserve remaining data + needed overlap data for next call */
+        st->ovl_shift = next_frame - prev_frame;
+        int i = (st->ovl_shift < 0) ? next_frame : prev_frame;
+        st->ovl_size = data_len - i;
+        /* buf_in may be st->ovl_buff itself (recursive call): regions overlap */
+        assert(st->ovl_size <= FIXED_BUFSIZE);
+        memmove(st->ovl_buff[0], buf_in[0] + i, st->ovl_size * sizeof(int32_t));
+
+        if (stereo)
+            memmove(st->ovl_buff[1], buf_in[1] + i, st->ovl_size * sizeof(int32_t));
+    }
+
+    return dest[0] - buf_out[0];
+}
+
+long tdspeed_est_output_size(void)
+{
+    return TDSPEED_OUTBUFSIZE; /* fixed: element count of each output buffer */
+}
+
+long tdspeed_est_input_size(long size)
+{
+    /* Computes (size - ovl_size) * src_step / dst_step, clamped at zero.
+     * Relies on tdspeed_config() having set a non-zero dst_step. */
+    const struct tdspeed_state_s *st = &tdspeed_state;
+    long scaled = (size - st->ovl_size) * st->src_step / st->dst_step;
+
+    if (scaled >= 0)
+        return scaled;
+    return 0;
+}
+
+int tdspeed_doit(int32_t *src[], int count)
+{
+    /* Record src for move_callback(), stretch count samples into outbuf and
+     * point src[] at the result; returns what tdspeed_apply() produced. */
+    dsp_src = src;
+    int32_t *dst[2] = { outbuf[0], outbuf[1] };
+    int produced = tdspeed_apply(dst, src, count, 0, TDSPEED_OUTBUFSIZE);
+    src[0] = outbuf[0];
+    src[1] = outbuf[1];
+    return produced;
+}
+
diff --git a/lib/rbcodec/dsp/tdspeed.h b/lib/rbcodec/dsp/tdspeed.h
new file mode 100644
index 0000000000..e91eeb1701
--- /dev/null
+++ b/lib/rbcodec/dsp/tdspeed.h
@@ -0,0 +1,49 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 2006-2007 by Stéphane Doyon <s.doyon@videotron.ca>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef _TDSPEED_H
+#define _TDSPEED_H
+
+#include "dsp.h"
+
+#define TDSPEED_OUTBUFSIZE 4096 /* elements (int32_t) per output buffer */
+
+/* Derive one of pitch, stretch and speed from the other two; params are      */
+/* alphabetical. Arguments are parenthesised but the divisor is used twice.   */
+#define GET_SPEED(pitch, stretch) \
+    (((pitch) * (stretch) + PITCH_SPEED_100 / 2L) / PITCH_SPEED_100)
+#define GET_PITCH(speed, stretch) \
+    (((speed) * PITCH_SPEED_100 + (stretch) / 2L) / (stretch))
+#define GET_STRETCH(pitch, speed) \
+    (((speed) * PITCH_SPEED_100 + (pitch)   / 2L) / (pitch))
+
+void tdspeed_init(void);    /* allocates buffers if timestretch is enabled */
+void tdspeed_finish(void);  /* frees the buffers again */
+bool tdspeed_config(int samplerate, bool stereo, int32_t factor);
+long tdspeed_est_output_size(void);
+long tdspeed_est_input_size(long size);
+int tdspeed_doit(int32_t *src[], int count); /* src[] is redirected to the output */
+
+#define STRETCH_MAX (250L * PITCH_SPEED_PRECISION) /* 250% */
+#define STRETCH_MIN (35L  * PITCH_SPEED_PRECISION) /* 35%  */
+
+#endif
-- 
cgit v1.2.3