From f51189fa4d16c138c951775a237326b807fdda89 Mon Sep 17 00:00:00 2001 From: Michael Giacomelli Date: Wed, 10 Nov 2010 18:29:29 +0000 Subject: Commit FS#11709 by me. Introduces ARMv5E optimized iQMF for atrac3 based on packed multiply instructions. Additionally, improves scheduling on arm9 and arm11 and forces cache alignment of buffers on all targets. Accuracy is slightly reduced, but still greater than 16 bit. Clip+ CPU clock required for LP2 files decreases by 13MHz and ARM11 by 18MHz. No performance or accuracy changes on armv4 or non-arm. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28549 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/atrac3_rm.c | 2 +- apps/codecs/libatrac/SOURCES | 3 + apps/codecs/libatrac/atrac3.c | 42 +++++++-- apps/codecs/libatrac/atrac3.h | 4 +- apps/codecs/libatrac/atrac3_armv5e.S | 163 +++++++++++++++++++++++++++++++++++ 5 files changed, 205 insertions(+), 9 deletions(-) create mode 100644 apps/codecs/libatrac/atrac3_armv5e.S (limited to 'apps/codecs') diff --git a/apps/codecs/atrac3_rm.c b/apps/codecs/atrac3_rm.c index 6c559ec868..bad9831a25 100644 --- a/apps/codecs/atrac3_rm.c +++ b/apps/codecs/atrac3_rm.c @@ -30,7 +30,7 @@ CODEC_HEADER static RMContext rmctx; static RMPacket pkt; -static ATRAC3Context q IBSS_ATTR; +static ATRAC3Context q IBSS_ATTR __attribute__ ((aligned (32))); static void init_rm(RMContext *rmctx) { diff --git a/apps/codecs/libatrac/SOURCES b/apps/codecs/libatrac/SOURCES index 3eaf4c9c3b..85f011cb87 100644 --- a/apps/codecs/libatrac/SOURCES +++ b/apps/codecs/libatrac/SOURCES @@ -1,5 +1,8 @@ atrac3.c #if defined(CPU_ARM) atrac3_arm.S +#if (ARM_ARCH >= 5) +atrac3_armv5e.S +#endif #endif diff --git a/apps/codecs/libatrac/atrac3.c b/apps/codecs/libatrac/atrac3.c index b93b058bb2..f6085fa2fa 100644 --- a/apps/codecs/libatrac/atrac3.c +++ b/apps/codecs/libatrac/atrac3.c @@ -55,7 +55,11 @@ #define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) static VLC spectral_coeff_tab[7]; +#if 
defined(CPU_ARM) && (ARM_ARCH >= 5) /*ARMv5e+ uses 32x16 multiplication*/ +static int16_t qmf_window[48] IBSS_ATTR __attribute__ ((aligned (32))); +#else static int32_t qmf_window[48] IBSS_ATTR; +#endif static int32_t atrac3_spectrum [2][1024] IBSS_ATTR __attribute__((aligned(16))); static int32_t atrac3_IMDCT_buf[2][ 512] IBSS_ATTR __attribute__((aligned(16))); static int32_t atrac3_prevFrame[2][1024] IBSS_ATTR; @@ -118,12 +122,30 @@ static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM; * } */ -#if defined(CPU_ARM) +#if defined(CPU_ARM) && (ARM_ARCH >= 5) extern void - atrac3_iqmf_dewindowing(int32_t *out, + atrac3_iqmf_dewindowing_armv5e(int32_t *out, int32_t *in, - int32_t *win, + int16_t *win, unsigned int nIn); + static inline void + atrac3_iqmf_dewindowing(int32_t *out, + int32_t *in, + int16_t *win, + unsigned int nIn) + { + atrac3_iqmf_dewindowing_armv5e(out, in, win, nIn); /* forward to the ARMv5E assembly routine (16-bit window coefficients) */ + + } + + +#elif defined(CPU_ARM) /* pre-ARMv5: qmf_window stays int32_t, so the prototype takes int32_t *win */ + extern void + atrac3_iqmf_dewindowing(int32_t *out, + int32_t *in, + int32_t *win, + unsigned int nIn); + #elif defined (CPU_COLDFIRE) #define MULTIPLY_ADD_BLOCK \ "movem.l (%[win]), %%d0-%%d7 \n\t" \ @@ -206,7 +228,9 @@ static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM; out[0] = s2; out[1] = s1; + } + } #endif @@ -244,6 +268,7 @@ atrac3_imdct_windowing(int32_t *buffer, static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut, int32_t *delayBuf, int32_t *temp) { + /* Restore the delay buffer */ memcpy(temp, delayBuf, 46*sizeof(int32_t)); @@ -274,6 +299,7 @@ static void IMLT(int32_t *pInput, int32_t *pOutput) /* Windowing. */ atrac3_imdct_windowing(pOutput, window_lookup); + } @@ -320,9 +346,13 @@ static void init_atrac3_transforms(void) /* Generate the QMF window. 
*/ for (i=0 ; i<24; i++) { s = qmf_48tap_half_fix[i] << 1; - qmf_window[i] = s; - qmf_window[47 - i] = s; + #if defined(CPU_ARM) && (ARM_ARCH >= 5) + qmf_window[i] = qmf_window[47-i] = (int16_t)((s+(1<<15))>>16); + #else + qmf_window[i] = qmf_window[47-i] = s; + #endif } + } @@ -1229,7 +1259,7 @@ int atrac3_decode_init(ATRAC3Context *q, struct mp3entry *id3) vlcs_initialized = 1; } - + init_atrac3_transforms(); /* init the joint-stereo decoding data */ diff --git a/apps/codecs/libatrac/atrac3.h b/apps/codecs/libatrac/atrac3.h index 74dd992e1b..afe582ab72 100644 --- a/apps/codecs/libatrac/atrac3.h +++ b/apps/codecs/libatrac/atrac3.h @@ -67,6 +67,7 @@ typedef struct { } channel_unit; typedef struct { + int32_t outSamples[2048]; GetBitContext gb; //@{ /** stream data */ @@ -90,8 +91,7 @@ typedef struct { int weighting_delay[6]; //@} //@{ - /** data buffers */ - int32_t outSamples[2048]; + /** data buffers */ uint8_t decoded_bytes_buffer[1024]; int32_t tempBuf[1070]; //@} diff --git a/apps/codecs/libatrac/atrac3_armv5e.S b/apps/codecs/libatrac/atrac3_armv5e.S new file mode 100644 index 0000000000..1add5faef5 --- /dev/null +++ b/apps/codecs/libatrac/atrac3_armv5e.S @@ -0,0 +1,163 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id: + * + * Copyright (C) 2010 by Michael Giacomelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ * + ****************************************************************************/ + +#include "config.h" + + .section .text, "ax", %progbits + + +/**************************************************************************** + * atrac3_iqmf_dewindowing_armv5e(int32_t *out, + * int32_t *in, + * int16_t *win, + * unsigned int nIn); + * + * Dewindowing step within iqmf of atrac3 synthesis using 16 bit filter + * coefficients and armv5e packed multiply instructions. Uses 2.5 cycles + * per filter coefficient (ideal). Benchmarked 3.54 per coefficient (Clip+). + * + * Reference implementation: + * + * for (j = nIn; j != 0; j--) { + * s1 = fixmul32(in[0], win[0]); + * s2 = fixmul32(in[1], win[1]); + * for (i = 2; i < 48; i += 2) { + * s1 += fixmul32(in[i ], win[i ]); + * s2 += fixmul32(in[i+1], win[i+1]); + * } + * out[0] = s2 << 1; + * out[1] = s1 << 1; + * in += 2; + * out += 2; + * } + * Note: r12 is a scratch register and can be used without saving/restoring it. + ****************************************************************************/ + .align 2 + .global atrac3_iqmf_dewindowing_armv5e + .type atrac3_iqmf_dewindowing_armv5e, %function + +atrac3_iqmf_dewindowing_armv5e: + /* r0 = dest */ + /* r1 = input samples */ + /* r2 = window coefficients */ + /* r3 = counter */ + stmfd sp!, {r4-r11, lr} /* save non-scratch registers */ + +.iqmf_dewindow_outer_loop: /* outer loop 0...counter-1 */ + /* 0.. 
7 */ + ldmia r2!, {r4, r5, r8, r9} /* load win[0..7] */ + ldmia r1!, {r6, r7, r10, r11} /* load in[0..3] to avoid stall on arm11 */ + smulwb lr, r6, r4 /* s1 = in[0] * win[0] */ + smulwt r12, r7, r4 /* s2 = in[1] * win[1] */ + smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11, r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + /* 8..15 */ + ldmia r2!, {r4, r5, r8, r9} /* load win[8..15] */ + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + /* 16..23 */ + ldmia r2!, {r4, r5, r8, r9} /* load win[16..23] */ + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + /* 
24..31 */ + ldmia r2!, {r4, r5, r8, r9} /* load win[24..31] */ + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + /* 32..39 */ + ldmia r2!, {r4, r5, r8, r9} /* load win[32..39] */ + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + /* 40..47 */ + ldmia r2!, {r4, r5, r8, r9} /* load win[40..47] */ + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + + ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */ + smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */ + smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */ + smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] 
>> 16 */ + + + mov lr , lr , lsl #1 /* s1 <<= 1 */ + mov r12, r12, lsl #1 /* s2 <<= 1 */ + + stmia r0!, {r12, lr} /* store result out[0]=s2, out[1]=s1 */ + sub r1, r1, #184 /* roll back 46 entries = 184 bytes; 48 entries were consumed, so net in += 2 */ + sub r2, r2, #96 /* roll back 48 entries * 2 bytes = 96 bytes = win[0] */ + + subs r3, r3, #1 /* outer loop -= 1 */ + bgt .iqmf_dewindow_outer_loop + + ldmpc regs=r4-r11 /* restore registers */ + +.atrac3_iqmf_dewindowing_armv5e_end: + .size atrac3_iqmf_dewindowing_armv5e,.atrac3_iqmf_dewindowing_armv5e_end-atrac3_iqmf_dewindowing_armv5e -- cgit v1.2.3