1 files changed, 482 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libfaad/filtbank.c b/lib/rbcodec/codecs/libfaad/filtbank.c
new file mode 100644
index 0000000000..fd7a4dc91f
--- /dev/null
+++ b/lib/rbcodec/codecs/libfaad/filtbank.c
@@ -0,0 +1,482 @@
+/*
+** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
+** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
+**  
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+** 
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+** GNU General Public License for more details.
+** 
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software 
+** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+**
+** Any non-GPL usage of this software or parts of this software is strictly
+** forbidden.
+**
+** Commercial non-GPL licensing of this software is possible.
+** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
+**
+** $Id$
+**/
+#include "common.h"
+#include "structs.h"
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32_WCE
+#define assert(x)
+#else
+#include <assert.h>
+#endif
+#include "filtbank.h"
+#include "decoder.h"
+#include "syntax.h"
+#include "kbd_win.h"
+#include "sine_win.h"
+/* static variables
+ *
+ * transf_buf:   scratch buffer that ifilter_bank() clears on every call and
+ *               then fills through ff_imdct_calc() before windowing and
+ *               overlap-add.
+ * windowed_buf: (LTP_DEC builds only) scratch buffer that filter_bank_ltp()
+ *               clears, fills with the windowed input and hands to mdct().
+ *
+ * Both hold 2*FRAME_LEN values and are shared file-scope state, so
+ * overlapping calls would overwrite each other's data.
+ * NOTE(review): IBSS_ATTR and MEM_ALIGN_ATTR are defined outside this file;
+ * presumably they control section placement and alignment - confirm. */
+static real_t transf_buf[2*FRAME_LEN] IBSS_ATTR MEM_ALIGN_ATTR;
+#ifdef LTP_DEC
+static real_t windowed_buf[2*FRAME_LEN] MEM_ALIGN_ATTR = {0};
+#endif
+/*Windowing functions borrowed from libwmai*/
+#ifdef CPU_ARM
+static inline 
+void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
+{
+    /* ARM version: for each i in [0, len) stores
+     *   dst[i] = src2[i] + ((high 32 bits of the 64-bit product
+     *                        src0[i]*src1[i]) << 1).
+     * Elements are accessed as 32-bit words. Two elements are handled per
+     * pass and the loop only exits when the counter reaches exactly zero, so
+     * len must be a non-zero multiple of 2.
+     * The pointers and len are "+r" operands: the asm advances its own
+     * copies, the caller's values are unaffected.
+     * Block sizes are always power of two */
+    asm volatile (
+        "0:"
+        "ldmia %[d]!, {r0, r1};"     /* r0,r1 = next two src0 values */
+        "ldmia %[w]!, {r4, r5};"     /* r4,r5 = next two src1 values */
+        /* consume the first data and window value so we can use those
+         * registers again */
+        "smull r8, r9, r0, r4;"      /* r9:r8 = r0*r4, only r9 (high word) is used */
+        "ldmia %[src2]!, {r0, r4};"  /* r0,r4 = next two src2 values */
+        "add   r0, r0, r9, lsl #1;"  /* r0 = src2 value + (r9<<1) */
+        "smull r8, r9, r1, r5;"      /* second product, high word in r9 */
+        "add   r1, r4, r9, lsl #1;"  /* r1 = second src2 value + (r9<<1) */
+        "stmia %[dst]!, {r0, r1};"   /* store both results, advance dst */
+        "subs  %[n], %[n], #2;"      /* two elements done */
+        "bne   0b;"
+        : [d] "+r" (src0), [w] "+r" (src1), [src2] "+r" (src2), [dst] "+r" (dst), [n] "+r" (len)
+        : 
+        : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+static inline
+void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
+                         int len)
+{
+    /* ARM version: for each i in [0, len) stores
+     *   dst[i] = (high 32 bits of src0[i]*src1[len-1-i]) << 1,
+     * i.e. src0 multiplied by src1 read back to front.
+     * Elements are accessed as 32-bit words. Two elements are handled per
+     * pass and the loop only exits when the counter reaches exactly zero, so
+     * len must be a non-zero multiple of 2.
+     * Block sizes are always power of two */
+    asm volatile (
+        "add   %[s1], %[s1], %[n], lsl #2;"  /* s1 = src1 + len (one past the end) */
+        "0:"
+        "ldmia %[s0]!, {r0, r1};"   /* r0,r1 = next two src0 values */
+        "ldmdb %[s1]!, {r4, r5};"   /* step s1 back two words; r5 holds the higher-address one */
+        "smull r8, r9, r0, r5;"     /* r9 = high word of r0*r5 */
+        "mov   r0, r9, lsl #1;"
+        "smull r8, r9, r1, r4;"     /* r9 = high word of r1*r4 */
+        "mov   r1, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"  /* store both results, advance dst */
+        "subs  %[n], %[n], #2;"     /* two elements done */
+        "bne   0b;"
+        : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
+        : 
+        : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+#elif defined(CPU_COLDFIRE)
+static inline
+void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
+{
+    /* ColdFire version: per element, multiply-accumulates src0[i]*src1[i]
+     * into one of acc0..acc3, fetches the accumulator with movclr.l (which
+     * also clears it), adds src2[i] and stores the sum to dst[i].
+     * Four elements are handled per pass and the loop only exits when the
+     * counter reaches exactly zero, so len must be a non-zero multiple of 4.
+     * NOTE(review): the scaling of the value returned by movclr.l depends on
+     * the EMAC mode, which is configured outside this file - presumably
+     * chosen so the result matches MUL_F; confirm.
+     * NOTE(review): mac.l adds to the accumulator, so acc0..acc3 are
+     * presumably expected to be zero on entry; confirm.
+     * Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.*/
+    asm volatile (
+        "0:"
+        "movem.l (%[src0]), %%d0-%%d3;"            /* four src0 values */
+        "movem.l (%[src1]), %%d4-%%d5/%%a0-%%a1;"  /* four src1 values */
+        "mac.l %%d0, %%d4, %%acc0;"
+        "mac.l %%d1, %%d5, %%acc1;"
+        "mac.l %%d2, %%a0, %%acc2;"
+        "mac.l %%d3, %%a1, %%acc3;"
+        "lea.l (16, %[src0]), %[src0];"            /* advance src0 by 16 bytes */
+        "lea.l (16, %[src1]), %[src1];"            /* advance src1 by 16 bytes */
+        "movclr.l %%acc0, %%d0;"                   /* fetch products, clearing the accumulators */
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l (%[src2]), %%d4-%%d5/%%a0-%%a1;"  /* four src2 values */
+        "lea.l (16, %[src2]), %[src2];"
+        "add.l %%d4, %%d0;"                        /* product + src2 */
+        "add.l %%d5, %%d1;"
+        "add.l %%a0, %%d2;"
+        "add.l %%a1, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"             /* store four results */
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"                         /* four elements done */
+        "jne 0b;"
+        : [src0] "+a" (src0), [src1] "+a" (src1), [src2] "+a" (src2), [dst] "+a" (dst), [n] "+d" (len)
+        : 
+        : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+static inline
+void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
+                         int len)
+{
+    /* ColdFire version: dst[i] = product of src0[i] and src1[len-1-i] for
+     * i in [0, len); src0 is walked forwards, src1 backwards.
+     * Four elements are handled per pass and the loop only exits when the
+     * counter reaches exactly zero, so len must be a non-zero multiple of 4.
+     * NOTE(review): the scaling of the value returned by movclr.l depends on
+     * the EMAC mode, which is configured outside this file - presumably
+     * chosen so the result matches MUL_F; confirm.
+     * NOTE(review): mac.l adds to the accumulator, so acc0..acc3 are
+     * presumably expected to be zero on entry; confirm.
+     * Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.*/
+    asm volatile (
+        "lea.l (-16, %[s1], %[n]*4), %[s1];"       /* s1 = address of the last four src1 values */
+        "0:"
+        "movem.l (%[s0]), %%d0-%%d3;"              /* four src0 values */
+        "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"    /* four src1 values, a1 is the highest address */
+        "mac.l %%d0, %%a1, %%acc0;"                /* pair them up in reverse order */
+        "mac.l %%d1, %%a0, %%acc1;"
+        "mac.l %%d2, %%d5, %%acc2;"
+        "mac.l %%d3, %%d4, %%acc3;"
+        "lea.l (16, %[s0]), %[s0];"                /* src0 forwards by 16 bytes */
+        "lea.l (-16, %[s1]), %[s1];"               /* src1 backwards by 16 bytes */
+        "movclr.l %%acc0, %%d0;"                   /* fetch products, clearing the accumulators */
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"             /* store four results */
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"                         /* four elements done */
+        "jne 0b;"
+        : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+#else
+/* Portable fallback: dst[k] = MUL_F(src0[k], src1[k]) + src2[k] for every
+ * k in [0, len).  Nothing is written when len <= 0. */
+static inline void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
+{
+    int k = 0;
+
+    while (k < len) {
+        dst[k] = MUL_F(src0[k], src1[k]) + src2[k];
+        ++k;
+    }
+}
+/* Portable fallback: multiplies src0 element-wise with src1 read back to
+ * front, i.e. dst[k] = MUL_F(src0[k], src1[len-1-k]) for k in [0, len). */
+static inline void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, int len)
+{
+    int k = 0;
+
+    while (k < len) {
+        /* src1 is consumed from its last element towards its first */
+        dst[k] = MUL_F(src0[k], src1[len - 1 - k]);
+        ++k;
+    }
+}
+#endif
+#ifdef LTP_DEC
+/* Forward MDCT dispatcher: selects the mdct_info stored in fb for the given
+ * transform length and runs faad_mdct() on in_data, writing to out_data.
+ *
+ * fb        filter bank state holding the mdct_info pointers
+ * in_data   windowed input samples
+ * out_data  receives the transform output
+ * len       transform length; 2048/1920 and 256/240 are always accepted,
+ *           1024/960 only when LD_DEC is defined
+ *
+ * If len is not one of the accepted values, or the selected mdct_info
+ * pointer is NULL, nothing is transformed and out_data is left untouched
+ * rather than handing a NULL mdct_info to faad_mdct(). */
+static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
+{
+    mdct_info *tables = NULL;   /* named so it does not shadow this function */
+    switch (len)
+    {
+    case 2048:
+    case 1920:
+        tables = fb->mdct2048;
+        break;
+    case 256:
+    case 240:
+        tables = fb->mdct256;
+        break;
+#ifdef LD_DEC
+    case 1024:
+    case 960:
+        tables = fb->mdct1024;
+        break;
+#endif
+    default:
+        /* unsupported length: tables stays NULL */
+        break;
+    }
+    if (tables == NULL)
+        return;
+    faad_mdct(tables, in_data, out_data);
+}
+#endif
+/* Inverse filter bank: turns one frame of spectral coefficients into
+ * frame_len time-domain samples.
+ *
+ * Steps, as implemented below:
+ *   1. clear transf_buf and run ff_imdct_calc() on freq_in (a single call
+ *      with first argument 11 for the long sequences, eight calls with first
+ *      argument 8 for EIGHT_SHORT_SEQUENCE);
+ *   2. window the first half of the result and add `overlap` to it, writing
+ *      frame_len samples to time_out;
+ *   3. window the second half and store it in `overlap` for the next call.
+ *
+ * window_sequence    ONLY_LONG / LONG_START / EIGHT_SHORT / LONG_STOP; any
+ *                    other value leaves time_out and overlap unwritten.
+ * window_shape       0 selects the sine tables, anything else the KBD
+ *                    tables; applies to the part of this frame that overlaps
+ *                    the next one (and to the inner short-block overlaps).
+ * window_shape_prev  same encoding; applies to the part of this frame that
+ *                    overlaps the previous one.
+ * freq_in            input coefficients.
+ * time_out           output samples.
+ * overlap            in: tail saved by the previous call; out: tail of this
+ *                    frame.
+ * object_type        only read when LD_DEC is defined.
+ * frame_len          samples per frame.
+ *
+ * NOTE(review): the window tables (..._1024 / ..._128) and the constant
+ * ff_imdct_calc() sizes suggest only frame_len == 1024 is supported here -
+ * confirm before passing 960.
+ * NOTE(review): the LD_DEC and PROFILE sections reference `fb`, which is
+ * neither a parameter nor a local of this function; unless it is provided
+ * elsewhere those configurations will not build. */
+void ifilter_bank(uint8_t window_sequence, uint8_t window_shape,
+                  uint8_t window_shape_prev, real_t *freq_in,
+                  real_t *time_out, real_t *overlap,
+                  uint8_t object_type, uint16_t frame_len)
+{
+    int32_t i, idx0, idx1;
+    real_t win0, win1, win2;
+
+    const real_t *window_long       = NULL;
+    const real_t *window_long_prev  = NULL;
+    const real_t *window_short      = NULL;
+    const real_t *window_short_prev = NULL;
+    int32_t nlong    = frame_len;
+    int32_t nshort   = frame_len/8;
+    int32_t nflat_ls = (nlong-nshort)/2;   /* flat (un-windowed) run on each side of a start/stop window */
+#ifdef PROFILE
+    int64_t count = faad_get_ts();
+#endif
+    memset(transf_buf,0,sizeof(transf_buf));
+    /* select windows of current frame and previous frame (Sine or KBD) */
+#ifdef LD_DEC
+    if (object_type == LD)
+    {
+        window_long       = fb->ld_window[window_shape];
+        window_long_prev  = fb->ld_window[window_shape_prev];
+    } else {
+#else
+        (void) object_type;
+#endif
+    /* AAC uses two different window shapes depending on spectral features */
+    if (window_shape == 0) {
+        window_long  = sine_long_1024;
+        window_short = sine_short_128;
+    } else {
+        window_long  = kbd_long_1024;
+        window_short = kbd_short_128;
+    }
+
+    if (window_shape_prev == 0) {
+        window_long_prev  = sine_long_1024;
+        window_short_prev = sine_short_128;
+    } else {
+        window_long_prev  = kbd_long_1024;
+        window_short_prev = kbd_short_128;
+    }
+#ifdef LD_DEC
+    }
+#endif
+#if 0
+    for (i = 0; i < 1024; i++)
+    {
+        printf("%d\n", freq_in[i]);
+    }
+#endif
+#if 0
+    printf("%d %d\n", window_sequence, window_shape);
+#endif
+    switch (window_sequence)
+    {
+    case ONLY_LONG_SEQUENCE:
+        /* perform iMDCT */
+        ff_imdct_calc(11, transf_buf, freq_in);
+        /* add second half output of previous frame to windowed output of current frame */
+        vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap,  nlong);
+        /* window the second half and save as overlap for next frame */
+        vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
+        break;
+    case LONG_START_SEQUENCE:
+        /* perform iMDCT */
+        ff_imdct_calc(11, transf_buf, freq_in);
+        /* add second half output of previous frame to windowed output of current frame */
+        vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap,  nlong);
+        /* window the second half and save as overlap for next frame */
+        /* construct second half window using padding with 1's and 0's */
+
+        memcpy(overlap, transf_buf+nlong, nflat_ls*sizeof(real_t));
+        vector_fmul_reverse(overlap+nflat_ls, transf_buf+nlong+nflat_ls, window_short, nshort);
+        memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
+        break;
+    case EIGHT_SHORT_SEQUENCE:
+        /* this could be assemblerized too, but this case is extremely uncommon */
+
+        /* perform iMDCT for each short block; block k reads nshort
+         * coefficients at freq_in + k*nshort and writes 2*nshort values at
+         * transf_buf + 2*k*nshort */
+        idx0 = 0;       ff_imdct_calc(8, transf_buf            , freq_in       );
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
+        /* Add second half output of previous frame to windowed output of current 
+         * frame */
+        /* Step 1: copy */
+        memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
+        /* Step 2: First window half, first half of nshort.
+         * win0/win1 are the falling/rising current-shape coefficients used
+         * where two short blocks overlap; win2 is the rising coefficient of
+         * the previous shape, used for the very first short block, which
+         * overlaps the previous frame. */
+        for (i = 0; i < nshort/2; i++) {
+            win0 = window_short[nshort-1-i];
+            win1 = window_short[i];
+            win2 = window_short_prev[i];
+            idx0 = nflat_ls + i;
+            idx1 = i;
+            time_out[idx0] = overlap[idx0] +                                        MUL_F(transf_buf[idx1], win2); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1);
+        }
+        /* Step 3: First window half, second half of nshort.
+         * The first short block still overlaps the previous frame here, so
+         * its rising half must use the previous window shape (win2) for all
+         * nshort samples, exactly as in Step 2 and in LONG_STOP_SEQUENCE. */
+        for (; i < nshort; i++) {
+            win0 = window_short[nshort-1-i];
+            win1 = window_short[i];
+            win2 = window_short_prev[i];
+            idx0 = nflat_ls + i;
+            idx1 = i;
+            time_out[idx0] = overlap[idx0] +                                        MUL_F(transf_buf[idx1], win2); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1);
+        }
+        /* Window the second half and save as overlap for next frame */
+        /* Step 1: Second window half, first half of nshort */
+        for (i = 0; i < nshort/2; i++) {
+            win0 = window_short[nshort-1-i];
+            win1 = window_short[i];
+            idx0 = nflat_ls + 5*nshort + i - nlong;
+            idx1 = nshort*10 + i;
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            /* last short block: only its falling half remains */
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0);
+        }
+        /* Step 2: Second window half, second half of nshort */
+        for (; i < nshort; i++) {
+            win0 = window_short[nshort-1-i];
+            win1 = window_short[i];
+            idx0 = nflat_ls + 4*nshort + i - nlong;
+            idx1 = nshort*8 + i;
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
+            /* last short block: only its falling half remains */
+            overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0);
+        }
+        /* Step 3: Set to zero */
+        memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
+        break;
+    case LONG_STOP_SEQUENCE:
+        /* perform iMDCT */
+        ff_imdct_calc(11, transf_buf, freq_in);
+        /* add second half output of previous frame to windowed output of current frame */
+        /* construct first half window using padding with 1's and 0's */
+        memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
+        vector_fmul_add_add(time_out+nflat_ls, transf_buf+nflat_ls, window_short_prev, overlap+nflat_ls,  nshort);
+        /* nflat_ls can be divided by 2. */
+        idx0 = nflat_ls + nshort;
+        for (i = 0; i < nflat_ls; i+=2) {
+            time_out[idx0] = overlap[idx0] + transf_buf[idx0]; idx0++;
+            time_out[idx0] = overlap[idx0] + transf_buf[idx0]; idx0++;
+        }
+        /* window the second half and save as overlap for next frame */
+        vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
+        break;
+    }
+#if 0
+    for (i = 0; i < 1024; i++)
+    {
+        printf("%d\n", time_out[i]);
+        //printf("0x%.8X\n", time_out[i]);
+    }
+#endif
+#ifdef PROFILE
+    count = faad_get_ts() - count;
+    fb->cycles += count;
+#endif
+}
+#ifdef LTP_DEC
+/* Forward filter bank for LTP: windows 2*frame_len samples from in_data into
+ * windowed_buf and transforms them with mdct() into out_mdct.
+ * Only ONLY_LONG, LONG_START and LONG_STOP sequences are handled (no
+ * overlapping, no short blocks); any other window_sequence clears
+ * windowed_buf and returns without calling mdct(). */
+void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
+                     uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
+                     uint8_t object_type, uint16_t frame_len)
+{
+    int16_t k;
+    const real_t *win_long       = NULL;
+    const real_t *win_long_prev  = NULL;
+    const real_t *win_short      = NULL;
+    const real_t *win_short_prev = NULL;
+    uint16_t len_long  = frame_len;
+    uint16_t len_short = frame_len/8;
+    uint16_t len_flat  = (len_long-len_short)/2;
+
+    /* callers are expected not to pass EIGHT_SHORT_SEQUENCE here */
+    memset(windowed_buf,0,sizeof(windowed_buf));
+
+    /* pick the window tables for the current and the previous shape */
+#ifdef LD_DEC
+    if (object_type == LD)
+    {
+        win_long      = fb->ld_window[window_shape];
+        win_long_prev = fb->ld_window[window_shape_prev];
+    } else {
+#else
+        (void) object_type;
+#endif
+        win_long       = fb->long_window[window_shape];
+        win_long_prev  = fb->long_window[window_shape_prev];
+        win_short      = fb->short_window[window_shape];
+        win_short_prev = fb->short_window[window_shape_prev];
+#ifdef LD_DEC
+    }
+#endif
+
+    switch (window_sequence)
+    {
+    case ONLY_LONG_SEQUENCE:
+        /* first half: previous shape rising; second half: current shape
+         * read back to front */
+        k = len_long-1;
+        while (k >= 0)
+        {
+            windowed_buf[k] = MUL_F(in_data[k], win_long_prev[k]);
+            windowed_buf[k+len_long] = MUL_F(in_data[k+len_long], win_long[len_long-1-k]);
+            k--;
+        }
+        mdct(fb, windowed_buf, out_mdct, 2*len_long);
+        break;
+
+    case LONG_START_SEQUENCE:
+        /* first half: long window of the previous shape */
+        for (k = 0; k < len_long; k++) {
+            windowed_buf[k] = MUL_F(in_data[k], win_long_prev[k]);
+        }
+        /* second half: flat copy, short falling window, then zeros */
+        for (k = 0; k < len_flat; k++) {
+            windowed_buf[k+len_long] = in_data[k+len_long];
+        }
+        for (k = 0; k < len_short; k++) {
+            windowed_buf[k+len_long+len_flat] = MUL_F(in_data[k+len_long+len_flat], win_short[len_short-1-k]);
+        }
+        for (k = 0; k < len_flat; k++) {
+            windowed_buf[k+len_long+len_flat+len_short] = 0;
+        }
+        mdct(fb, windowed_buf, out_mdct, 2*len_long);
+        break;
+
+    case LONG_STOP_SEQUENCE:
+        /* first half: zeros, short rising window of the previous shape,
+         * then a flat copy */
+        for (k = 0; k < len_flat; k++) {
+            windowed_buf[k] = 0;
+        }
+        for (k = 0; k < len_short; k++) {
+            windowed_buf[k+len_flat] = MUL_F(in_data[k+len_flat], win_short_prev[k]);
+        }
+        for (k = 0; k < len_flat; k++) {
+            windowed_buf[k+len_flat+len_short] = in_data[k+len_flat+len_short];
+        }
+        /* second half: long window of the current shape, back to front */
+        for (k = 0; k < len_long; k++) {
+            windowed_buf[k+len_long] = MUL_F(in_data[k+len_long], win_long[len_long-1-k]);
+        }
+        mdct(fb, windowed_buf, out_mdct, 2*len_long);
+        break;
+    }
+}
+#endif

diff --git a/lib/rbcodec/codecs/libfaad/filtbank.c b/lib/rbcodec/codecs/libfaad/filtbank.c new file mode 100644 index 0000000000..fd7a4dc91f --- /dev/null +++ b/lib/rbcodec/codecs/libfaad/filtbank.c
@@ -0,0 +1,482 @@
	1	/*
	2	** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
	3	** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
	4	**
	5	** This program is free software; you can redistribute it and/or modify
	6	** it under the terms of the GNU General Public License as published by
	7	** the Free Software Foundation; either version 2 of the License, or
	8	** (at your option) any later version.
	9	**
	10	** This program is distributed in the hope that it will be useful,
	11	** but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	** GNU General Public License for more details.
	14	**
	15	** You should have received a copy of the GNU General Public License
	16	** along with this program; if not, write to the Free Software
	17	** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	18	**
	19	** Any non-GPL usage of this software or parts of this software is strictly
	20	** forbidden.
	21	**
	22	** Commercial non-GPL licensing of this software is possible.
	23	** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
	24	**
	25	** $Id$
	26	**/
	27
	28	#include "common.h"
	29	#include "structs.h"
	30
	31	#include <stdlib.h>
	32	#include <string.h>
	33	#ifdef _WIN32_WCE
	34	#define assert(x)
	35	#else
	36	#include <assert.h>
	37	#endif
	38
	39	#include "filtbank.h"
	40	#include "decoder.h"
	41	#include "syntax.h"
	42	#include "kbd_win.h"
	43	#include "sine_win.h"
	44
	45
	46	/* static variables: scratch buffers shared by the filter bank routines in this file */
	47	static real_t transf_buf[2*FRAME_LEN] IBSS_ATTR MEM_ALIGN_ATTR; /* cleared, then filled via ff_imdct_calc() by ifilter_bank() */
	48	#ifdef LTP_DEC
	49	static real_t windowed_buf[2*FRAME_LEN] MEM_ALIGN_ATTR = {0}; /* windowed input that filter_bank_ltp() hands to mdct() */
	50	#endif
	51
	52
	53	/*Windowing functions borrowed from libwmai*/
	54	#ifdef CPU_ARM
	55	static inline
	56	void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
	57	{
	58	/* ARM version: dst[i] = src2[i] + ((high 32 bits of src0[i]*src1[i]) << 1); two elements per pass, so len must be a non-zero multiple of 2. Block sizes are always power of two */
	59	asm volatile (
	60	"0:"
	61	"ldmia %[d]!, {r0, r1};"
	62	"ldmia %[w]!, {r4, r5};"
	63	/* consume the first data and window value so we can use those
	64	* registers again */
	65	"smull r8, r9, r0, r4;"
	66	"ldmia %[src2]!, {r0, r4};"
	67	"add r0, r0, r9, lsl #1;" /* r0 = src2 value + (r9<<1) */
	68	"smull r8, r9, r1, r5;"
	69	"add r1, r4, r9, lsl #1;"
	70	"stmia %[dst]!, {r0, r1};"
	71	"subs %[n], %[n], #2;"
	72	"bne 0b;"
	73	: [d] "+r" (src0), [w] "+r" (src1), [src2] "+r" (src2), [dst] "+r" (dst), [n] "+r" (len)
	74	:
	75	: "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
	76	}
	77	static inline
	78	void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
	79	int len)
	80	{
	81	/* ARM version: dst[i] = (high 32 bits of src0[i]*src1[len-1-i]) << 1; two elements per pass, so len must be a non-zero multiple of 2. Block sizes are always power of two */
	82	asm volatile (
	83	"add %[s1], %[s1], %[n], lsl #2;"
	84	"0:"
	85	"ldmia %[s0]!, {r0, r1};"
	86	"ldmdb %[s1]!, {r4, r5};"
	87	"smull r8, r9, r0, r5;"
	88	"mov r0, r9, lsl #1;"
	89	"smull r8, r9, r1, r4;"
	90	"mov r1, r9, lsl #1;"
	91	"stmia %[dst]!, {r0, r1};"
	92	"subs %[n], %[n], #2;"
	93	"bne 0b;"
	94	: [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
	95	:
	96	: "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
	97	}
	98
	99	#elif defined(CPU_COLDFIRE)
	100	static inline
	101	void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
	102	{
	103	/* ColdFire version: multiply-accumulates src0[i]*src1[i], fetches the product with movclr.l, adds src2[i] and stores to dst[i]; four elements per pass, so len must be a non-zero multiple of 4. Block sizes are always power of two. Smallest block is always way bigger
	104	* than four too.*/
	105	asm volatile (
	106	"0:"
	107	"movem.l (%[src0]), %%d0-%%d3;"
	108	"movem.l (%[src1]), %%d4-%%d5/%%a0-%%a1;"
	109	"mac.l %%d0, %%d4, %%acc0;"
	110	"mac.l %%d1, %%d5, %%acc1;"
	111	"mac.l %%d2, %%a0, %%acc2;"
	112	"mac.l %%d3, %%a1, %%acc3;"
	113	"lea.l (16, %[src0]), %[src0];"
	114	"lea.l (16, %[src1]), %[src1];"
	115	"movclr.l %%acc0, %%d0;"
	116	"movclr.l %%acc1, %%d1;"
	117	"movclr.l %%acc2, %%d2;"
	118	"movclr.l %%acc3, %%d3;"
	119	"movem.l (%[src2]), %%d4-%%d5/%%a0-%%a1;"
	120	"lea.l (16, %[src2]), %[src2];"
	121	"add.l %%d4, %%d0;"
	122	"add.l %%d5, %%d1;"
	123	"add.l %%a0, %%d2;"
	124	"add.l %%a1, %%d3;"
	125	"movem.l %%d0-%%d3, (%[dst]);"
	126	"lea.l (16, %[dst]), %[dst];"
	127	"subq.l #4, %[n];"
	128	"jne 0b;"
	129	: [src0] "+a" (src0), [src1] "+a" (src1), [src2] "+a" (src2), [dst] "+a" (dst), [n] "+d" (len)
	130	:
	131	: "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
	132	}
	133
	134	static inline
	135	void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
	136	int len)
	137	{
	138	/* ColdFire version: dst[i] = product of src0[i] and src1[len-1-i] (src1 is walked backwards); four elements per pass, so len must be a non-zero multiple of 4. Block sizes are always power of two. Smallest block is always way bigger
	139	* than four too.*/
	140	asm volatile (
	141	"lea.l (-16, %[s1], %[n]*4), %[s1];"
	142	"0:"
	143	"movem.l (%[s0]), %%d0-%%d3;"
	144	"movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
	145	"mac.l %%d0, %%a1, %%acc0;"
	146	"mac.l %%d1, %%a0, %%acc1;"
	147	"mac.l %%d2, %%d5, %%acc2;"
	148	"mac.l %%d3, %%d4, %%acc3;"
	149	"lea.l (16, %[s0]), %[s0];"
	150	"lea.l (-16, %[s1]), %[s1];"
	151	"movclr.l %%acc0, %%d0;"
	152	"movclr.l %%acc1, %%d1;"
	153	"movclr.l %%acc2, %%d2;"
	154	"movclr.l %%acc3, %%d3;"
	155	"movem.l %%d0-%%d3, (%[dst]);"
	156	"lea.l (16, %[dst]), %[dst];"
	157	"subq.l #4, %[n];"
	158	"jne 0b;"
	159	: [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
	160	: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
	161	}
	162
	163	#else
	164	static inline void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len){ /* portable fallback: dst[i] = MUL_F(src0[i], src1[i]) + src2[i] */
	165	int i;
	166	for(i=0; i<len; i++)
	167	dst[i] = MUL_F(src0[i], src1[i]) + src2[i];
	168	}
	169
	170	static inline void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, int len){ /* portable fallback: dst[i] = MUL_F(src0[i], src1[len-1-i]) */
	171	int i;
	172	src1 += len-1; /* start at the last src1 element and walk backwards */
	173	for(i=0; i<len; i++)
	174	dst[i] = MUL_F(src0[i], src1[-i]);
	175	}
	176	#endif
	177
	178	#ifdef LTP_DEC
	179	static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
	180	{ /* selects the mdct_info stored in fb for transform length len and runs faad_mdct() on in_data into out_data */
	181	mdct_info *mdct = NULL;
	182
	183	switch (len)
	184	{
	185	case 2048:
	186	case 1920:
	187	mdct = fb->mdct2048;
	188	break;
	189	case 256:
	190	case 240:
	191	mdct = fb->mdct256;
	192	break;
	193	#ifdef LD_DEC
	194	case 1024:
	195	case 960:
	196	mdct = fb->mdct1024;
	197	break;
	198	#endif
	199	}
	200	if (mdct == NULL) return; /* unsupported len: do not hand a NULL mdct_info to faad_mdct() */
	201	faad_mdct(mdct, in_data, out_data);
	202	}
	203	#endif
	204
	205	void ifilter_bank(uint8_t window_sequence, uint8_t window_shape,
	206	uint8_t window_shape_prev, real_t *freq_in,
	207	real_t *time_out, real_t *overlap,
	208	uint8_t object_type, uint16_t frame_len)
	209	{ /* inverse filter bank: iMDCT of freq_in into transf_buf, window + overlap-add into time_out, windowed second half saved in overlap for the next call */
	210	int32_t i, idx0, idx1;
	211	real_t win0, win1, win2;
	212
	213	const real_t *window_long = NULL;
	214	const real_t *window_long_prev = NULL;
	215	const real_t *window_short = NULL;
	216	const real_t *window_short_prev = NULL;
	217
	218	int32_t nlong = frame_len;
	219	int32_t nshort = frame_len/8;
	220	int32_t nflat_ls = (nlong-nshort)/2;
	221
	222	#ifdef PROFILE
	223	int64_t count = faad_get_ts();
	224	#endif
	225
	226	memset(transf_buf,0,sizeof(transf_buf));
	227	/* select windows of current frame and previous frame (Sine or KBD) */
	228	#ifdef LD_DEC
	229	if (object_type == LD)
	230	{
	231	window_long = fb->ld_window[window_shape]; /* NOTE(review): no `fb` is declared in this function - confirm where it comes from */
	232	window_long_prev = fb->ld_window[window_shape_prev];
	233	} else {
	234	#else
	235	(void) object_type;
	236	#endif
	237
	238	/* AAC uses two different window shapes depending on spectral features */
	239	if (window_shape == 0) {
	240	window_long = sine_long_1024;
	241	window_short = sine_short_128;
	242	} else {
	243	window_long = kbd_long_1024;
	244	window_short = kbd_short_128;
	245	}
	246
	247	if (window_shape_prev == 0) {
	248	window_long_prev = sine_long_1024;
	249	window_short_prev = sine_short_128;
	250	} else {
	251	window_long_prev = kbd_long_1024;
	252	window_short_prev = kbd_short_128;
	253	}
	254
	255	#ifdef LD_DEC
	256	}
	257	#endif
	258
	259	#if 0
	260	for (i = 0; i < 1024; i++)
	261	{
	262	printf("%d\n", freq_in[i]);
	263	}
	264	#endif
	265
	266	#if 0
	267	printf("%d %d\n", window_sequence, window_shape);
	268	#endif
	269	switch (window_sequence)
	270	{
	271	case ONLY_LONG_SEQUENCE:
	272	/* perform iMDCT */
	273	ff_imdct_calc(11, transf_buf, freq_in);
	274
	275	/* add second half output of previous frame to windowed output of current frame */
	276	vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap, nlong);
	277
	278	/* window the second half and save as overlap for next frame */
	279	vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
	280
	281	break;
	282
	283	case LONG_START_SEQUENCE:
	284	/* perform iMDCT */
	285	ff_imdct_calc(11, transf_buf, freq_in);
	286
	287	/* add second half output of previous frame to windowed output of current frame */
	288	vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap, nlong);
	289
	290	/* window the second half and save as overlap for next frame */
	291	/* construct second half window using padding with 1's and 0's */
	292
	293	memcpy(overlap, transf_buf+nlong, nflat_ls*sizeof(real_t));
	294
	295	vector_fmul_reverse(overlap+nflat_ls, transf_buf+nlong+nflat_ls, window_short, nshort);
	296
	297	memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
	298	break;
	299
	300	case EIGHT_SHORT_SEQUENCE:
	301	/* this could be assemblerized too, but this case is extremely uncommon */
	302
	303	/* perform iMDCT for each short block */
	304	idx0 = 0; ff_imdct_calc(8, transf_buf , freq_in );
	305	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	306	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	307	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	308	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	309	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	310	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	311	idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
	312
	313	/* Add second half output of previous frame to windowed output of current
	314	* frame */
	315	/* Step 1: copy */
	316	memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
	317	/* Step 2: First window half, first half of nshort */
	318	for (i = 0; i < nshort/2; i++) {
	319	win0 = window_short[nshort-1-i];
	320	win1 = window_short[i];
	321	win2 = window_short_prev[i];
	322	idx0 = nflat_ls + i;
	323	idx1 = i;
	324	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1], win2); idx0 += nshort; idx1 += (nshort<<1);
	325	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	326	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	327	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	328	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1);
	329	}
	330	/* Step 3: First window half, second half of nshort; the first short block still overlaps the previous frame, so it keeps the previous window shape (win2) */
	331	for (; i < nshort; i++) {
	332	win0 = window_short[nshort-1-i];
	333	win1 = window_short[i]; win2 = window_short_prev[i];
	334	idx0 = nflat_ls + i;
	335	idx1 = i;
	336	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1], win2); idx0 += nshort; idx1 += (nshort<<1);
	337	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	338	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	339	time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1);
	340	}
	341
	342	/* Window the second half and save as overlap for next frame */
	343	/* Step 1: Second window half, first half of nshort */
	344	for (i = 0; i < nshort/2; i++) {
	345	win0 = window_short[nshort-1-i];
	346	win1 = window_short[i];
	347	idx0 = nflat_ls + 5*nshort + i - nlong;
	348	idx1 = nshort*10 + i;
	349	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	350	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	351	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	352	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0);
	353	}
	354	/* Step 2: Second window half, second half of nshort */
	355	for (; i < nshort; i++) {
	356	win0 = window_short[nshort-1-i];
	357	win1 = window_short[i];
	358	idx0 = nflat_ls + 4*nshort + i - nlong;
	359	idx1 = nshort*8 + i;
	360	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	361	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	362	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	363	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
	364	overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0);
	365	}
	366	/* Step 3: Set to zero */
	367	memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
	368
	369	break;
	370
	371	case LONG_STOP_SEQUENCE:
	372	/* perform iMDCT */
	373	ff_imdct_calc(11, transf_buf, freq_in);
	374
	375	/* add second half output of previous frame to windowed output of current frame */
	376	/* construct first half window using padding with 1's and 0's */
	377	memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
	378
	379	vector_fmul_add_add(time_out+nflat_ls, transf_buf+nflat_ls, window_short_prev, overlap+nflat_ls, nshort);
	380
	381	/* nflat_ls can be divided by 2. */
	382	idx0 = nflat_ls + nshort;
	383	for (i = 0; i < nflat_ls; i+=2) {
	384	time_out[idx0] = overlap[idx0] + transf_buf[idx0]; idx0++;
	385	time_out[idx0] = overlap[idx0] + transf_buf[idx0]; idx0++;
	386	}
	387
	388	/* window the second half and save as overlap for next frame */
	389	vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
	390	break;
	391	}
	392
	393	#if 0
	394	for (i = 0; i < 1024; i++)
	395	{
	396	printf("%d\n", time_out[i]);
	397	//printf("0x%.8X\n", time_out[i]);
	398	}
	399	#endif
	400
	401
	402	#ifdef PROFILE
	403	count = faad_get_ts() - count;
	404	fb->cycles += count;
	405	#endif
	406	}
	407
	408
	409	#ifdef LTP_DEC
	410	/* only works for LTP -> no overlapping, no short blocks; windows 2*frame_len samples of in_data into windowed_buf and transforms them with mdct() into out_mdct */
	411	void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
	412	uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
	413	uint8_t object_type, uint16_t frame_len)
	414	{
	415	int16_t i;
	416
	417	const real_t *window_long = NULL;
	418	const real_t *window_long_prev = NULL;
	419	const real_t *window_short = NULL;
	420	const real_t *window_short_prev = NULL;
	421
	422	uint16_t nlong = frame_len;
	423	uint16_t nshort = frame_len/8;
	424	uint16_t nflat_ls = (nlong-nshort)/2;
	425
	426	//assert(window_sequence != EIGHT_SHORT_SEQUENCE);
	427
	428	memset(windowed_buf,0,sizeof(windowed_buf));
	429	#ifdef LD_DEC
	430	if (object_type == LD)
	431	{
	432	window_long = fb->ld_window[window_shape];
	433	window_long_prev = fb->ld_window[window_shape_prev];
	434	} else {
	435	#else
	436	(void) object_type;
	437	#endif
	438	window_long = fb->long_window[window_shape];
	439	window_long_prev = fb->long_window[window_shape_prev];
	440	window_short = fb->short_window[window_shape];
	441	window_short_prev = fb->short_window[window_shape_prev];
	442	#ifdef LD_DEC
	443	}
	444	#endif
	445
	446	switch(window_sequence)
	447	{
	448	case ONLY_LONG_SEQUENCE:
	449	for (i = nlong-1; i >= 0; i--)
	450	{
	451	windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
	452	windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
	453	}
	454	mdct(fb, windowed_buf, out_mdct, 2*nlong);
	455	break;
	456
	457	case LONG_START_SEQUENCE:
	458	for (i = 0; i < nlong; i++)
	459	windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
	460	for (i = 0; i < nflat_ls; i++)
	461	windowed_buf[i+nlong] = in_data[i+nlong];
	462	for (i = 0; i < nshort; i++)
	463	windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
	464	for (i = 0; i < nflat_ls; i++)
	465	windowed_buf[i+nlong+nflat_ls+nshort] = 0;
	466	mdct(fb, windowed_buf, out_mdct, 2*nlong);
	467	break;
	468
	469	case LONG_STOP_SEQUENCE:
	470	for (i = 0; i < nflat_ls; i++)
	471	windowed_buf[i] = 0;
	472	for (i = 0; i < nshort; i++)
	473	windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
	474	for (i = 0; i < nflat_ls; i++)
	475	windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
	476	for (i = 0; i < nlong; i++)
	477	windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
	478	mdct(fb, windowed_buf, out_mdct, 2*nlong);
	479	break;
	480	}
	481	}
	482	#endif