From f0d1c96ee435e03af0c92aa5ac5260499ae589ed Mon Sep 17 00:00:00 2001 From: Dave Bryant Date: Thu, 23 Feb 2006 20:53:59 +0000 Subject: Optimization of WavPack decoding in ARM assembler (for iPods). This allows WavPack files encoded in "high" mode to play without skipping, although it's still rather marginal (i.e. can't play with other DSP effects enabled). For now this will not work with 24-bit files either, although that is coming along. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8814 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libwavpack/SOURCES | 3 + apps/codecs/libwavpack/arm.S | 474 ++++++++++++++++++++++++++++++++++++++++ apps/codecs/libwavpack/unpack.c | 6 +- 3 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 apps/codecs/libwavpack/arm.S diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES index f63c55a87a..8e38767ec6 100644 --- a/apps/codecs/libwavpack/SOURCES +++ b/apps/codecs/libwavpack/SOURCES @@ -8,4 +8,7 @@ wputils.c #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) coldfire.S #endif +#if defined(CPU_ARM) && !defined(SIMULATOR) +arm.S +#endif diff --git a/apps/codecs/libwavpack/arm.S b/apps/codecs/libwavpack/arm.S new file mode 100644 index 0000000000..0b92bfccd7 --- /dev/null +++ b/apps/codecs/libwavpack/arm.S @@ -0,0 +1,474 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2006 by David Bryant + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ * + ****************************************************************************/ + +/* This is an assembly-optimized version of the following WavPack function: + * + * void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp, + * long *buffer, long sample_count); + * + * It performs a single pass of stereo decorrelation on the provided buffer. + * Note that this version of the function requires that the 8 previous stereo + * samples are visible and correct. In other words, it ignores the "samples_*" + * fields in the decorr_pass structure and gets the history data directly + * from the buffer. It does, however, return the appropriate history samples + * to the decorr_pass structure before returning. + * + * This is written to work on an ARM7TDMI processor. This version only uses the + * 32-bit multiply-accumulate instruction and so will overflow with 24-bit + * WavPack files. The advanced 64-bit multiply instructions in the ARM will + * provide full resolution for this, but are somewhat slower and have not + * been included yet. 
+ */ + .text + .align + .global decorr_stereo_pass_cont_arm + +/* + * on entry: + * + * r0 = struct decorr_pass *dpp + * r1 = long *buffer + * r2 = long sample_count + */ + +decorr_stereo_pass_cont_arm: + + stmfd sp!, {r4 - r8, r10, r11, lr} + mov r5, r0 @ r5 = dpp + mov r11, #512 @ r11 = 512 for rounding + ldrsh r6, [r0, #2] @ r6 = dpp->delta + ldrsh r4, [r0, #4] @ r4 = dpp->weight_A + ldrsh r0, [r0, #6] @ r0 = dpp->weight_B + cmp r2, #0 @ exit if no samples to process + beq common_exit + + add r7, r1, r2, asl #3 @ r7 = buffer ending position + ldrsh r2, [r5, #0] @ r2 = dpp->term + cmp r2, #0 + bmi minus_term + + ldr lr, [r1, #-16] @ load 2 sample history from buffer + ldr r10, [r1, #-12] @ for terms 2, 17, and 18 + ldr r8, [r1, #-8] + ldr r3, [r1, #-4] + cmp r2, #17 + beq term_17_loop + cmp r2, #18 + beq term_18_loop + cmp r2, #2 + beq term_2_loop + b term_default_loop @ else handle default (1-8, except 2) + +minus_term: + mov r10, #1024 @ r10 = -1024 for weight clipping + rsb r10, r10, #0 @ (only used for negative terms) + cmn r2, #1 + beq term_minus_1 + cmn r2, #2 + beq term_minus_2 + cmn r2, #3 + beq term_minus_3 + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = 17 condition + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current sample r10 = second previous right sample + * r3 = previous right sample r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = current decorrelation value + * r5 = dpp sp = + * r6 = dpp->delta lr = second previous left sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_17_loop: + rsbs ip, lr, r8, asl #1 @ decorr value = (2 * prev) - 2nd prev + mov lr, r8 @ previous becomes 2nd previous + ldr r2, [r1], #4 @ get sample & update pointer + mla r8, ip, r4, r11 @ mult decorr value by weight, round, + add r8, r2, r8, asr #10 @ shift, and add to new sample + strne 
r8, [r1, #-4] @ if change possible, store sample back + cmpne r2, #0 + beq .L325 + teq ip, r2 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L325: rsbs ip, r10, r3, asl #1 @ do same thing for right channel + mov r10, r3 + ldr r2, [r1], #4 + mla r3, ip, r0, r11 + add r3, r2, r3, asr #10 + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L329 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L329: cmp r7, r1 @ loop back if more samples to do + bhi term_17_loop + b store_1718 @ common exit for terms 17 & 18 + +/* + ****************************************************************************** + * Loop to handle term = 18 condition + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current sample r10 = second previous right sample + * r3 = previous right sample r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = decorrelation value + * r5 = dpp sp = + * r6 = dpp->delta lr = second previous left sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_18_loop: + sub ip, r8, lr @ decorr value = + mov lr, r8 @ ((3 * prev) - 2nd prev) >> 1 + adds ip, r8, ip, asr #1 + ldr r2, [r1], #4 @ get sample & update pointer + mla r8, ip, r4, r11 @ mult decorr value by weight, round, + add r8, r2, r8, asr #10 @ shift, and add to new sample + strne r8, [r1, #-4] @ if change possible, store sample back + cmpne r2, #0 + beq .L337 + teq ip, r2 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L337: sub ip, r3, r10 @ do same thing for right channel + mov r10, r3 + adds ip, r3, ip, asr #1 + ldr r2, [r1], #4 + mla r3, ip, r0, r11 + add r3, r2, r3, asr #10 + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L341 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L341: cmp r7, r1 @ loop back if more samples to do + bhi term_18_loop + +/* common exit for terms 17 & 18 */ + +store_1718: + str r3, [r5, #40] @ store sample history into struct + str r8, [r5, #8] 
+ str r10, [r5, #44] + str lr, [r5, #12] + b common_exit @ and return + +/* + ****************************************************************************** + * Loop to handle term = 2 condition + * (note that this case can be handled by the default term handler (1-8), but + * this special case is faster because it doesn't have to read memory twice) + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current sample r10 = second previous right sample + * r3 = previous right sample r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = decorrelation value + * r5 = dpp sp = + * r6 = dpp->delta lr = second previous left sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_2_loop: + movs ip, lr @ get decorrelation value & test + mov lr, r8 @ previous becomes 2nd previous + ldr r2, [r1], #4 @ get sample & update pointer + mla r8, ip, r4, r11 @ mult decorr value by weight, round, + add r8, r2, r8, asr #10 @ shift, and add to new sample + strne r8, [r1, #-4] @ if change possible, store sample back + cmpne r2, #0 + beq .L225 + teq ip, r2 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L225: movs ip, r10 @ do same thing for right channel + mov r10, r3 + ldr r2, [r1], #4 + mla r3, ip, r0, r11 + add r3, r2, r3, asr #10 + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L229 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L229: cmp r7, r1 @ loop back if more samples to do + bhi term_2_loop + b default_term_exit @ this exit updates all dpp->samples + +/* + ****************************************************************************** + * Loop to handle default term condition + * + * r0 = dpp->weight_B r8 = result accumulator + * r1 = bptr r9 = + * r2 = dpp->term r10 = + * r3 = decorrelation value r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = current sample + * r5 = dpp sp = + * r6 = dpp->delta lr = + * r7 = eptr pc = + 
******************************************************************************* + */ + +term_default_loop: + ldr ip, [r1] @ get original sample + ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term + mla r8, r4, r3, r11 @ mult decorr value by weight, round, + add r8, ip, r8, asr #10 @ shift and add to new sample + str r8, [r1], #4 @ store updated sample + cmp r3, #0 + cmpne ip, #0 + beq .L350 + teq ip, r3 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L350: ldr ip, [r1] @ do the same thing for right channel + ldr r3, [r1, -r2, asl #3] + mla r8, r0, r3, r11 + add r8, ip, r8, asr #10 + str r8, [r1], #4 + cmp r3, #0 + cmpne ip, #0 + beq .L354 + teq ip, r3 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L354: cmp r7, r1 @ loop back if more samples to do + bhi term_default_loop + +/* + * This exit is used by terms 1-8 to store the previous 8 samples into the decorr + * structure (even if they are not all used for the given term) + */ + +default_term_exit: + ldrsh r3, [r5, #0] + sub ip, r3, #1 + mov lr, #7 + +.L358: and r3, ip, #7 + add r3, r5, r3, asl #2 + ldr r2, [r1, #-4] + str r2, [r3, #40] + ldr r2, [r1, #-8]! 
+ str r2, [r3, #8] + sub ip, ip, #1 + sub lr, lr, #1 + cmn lr, #1 + bne .L358 + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = -1 condition + * + * r0 = dpp->weight_B r8 = + * r1 = bptr r9 = + * r2 = intermediate result r10 = -1024 (for clipping) + * r3 = previous right sample r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = current sample + * r5 = dpp sp = + * r6 = dpp->delta lr = updated left sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_minus_1: + ldr r3, [r1, #-4] + +term_minus_1_loop: + ldr ip, [r1] @ for left channel the decorrelation value + mla r2, r3, r4, r11 @ is the previous right sample (in r3) + add lr, ip, r2, asr #10 + str lr, [r1], #8 + cmp r3, #0 + cmpne ip, #0 + beq .L361 + teq ip, r3 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + cmp r4, #1024 + movgt r4, #1024 + cmp r4, r10 + movlt r4, r10 + +.L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value + mla r3, r0, lr, r11 @ is the just updated left sample (in lr) + add r3, r2, r3, asr #10 + str r3, [r1, #-4] + cmp lr, #0 + cmpne r2, #0 + beq .L369 + teq r2, lr + submi r0, r0, r6 + addpl r0, r0, r6 + cmp r0, #1024 @ then clip weight to +/-1024 + movgt r0, #1024 + cmp r0, r10 + movlt r0, r10 + +.L369: cmp r7, r1 @ loop back if more samples to do + bhi term_minus_1_loop + + str r3, [r5, #8] @ else store right sample and exit + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = -2 condition + * (note that the channels are processed in the reverse order here) + * + * r0 = dpp->weight_B r8 = + * r1 = bptr r9 = + * r2 = intermediate result r10 = -1024 (for clipping) + * r3 = previous left sample r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = current sample + * r5 = dpp sp = + * r6 = dpp->delta lr = updated right sample + * r7 = 
eptr pc = + ******************************************************************************* + */ + +term_minus_2: + ldr r3, [r1, #-8] + +term_minus_2_loop: + ldr ip, [r1, #4] @ for right channel the decorrelation value + mla r2, r3, r0, r11 @ is the previous left sample (in r3) + add lr, ip, r2, asr #10 + str lr, [r1, #4] + cmp r3, #0 + cmpne ip, #0 + beq .L380 + teq ip, r3 @ update weight based on signs + submi r0, r0, r6 + addpl r0, r0, r6 + cmp r0, #1024 @ then clip weight to +/-1024 + movgt r0, #1024 + cmp r0, r10 + movlt r0, r10 + +.L380: ldr r2, [r1, #0] @ for left channel the decorrelation value + mla r3, r4, lr, r11 @ is the just updated right sample (in lr) + add r3, r2, r3, asr #10 + str r3, [r1], #8 + cmp lr, #0 + cmpne r2, #0 + beq .L388 + teq r2, lr + submi r4, r4, r6 + addpl r4, r4, r6 + cmp r4, #1024 + movgt r4, #1024 + cmp r4, r10 + movlt r4, r10 + +.L388: cmp r7, r1 @ loop back if more samples to do + bhi term_minus_2_loop + + str r3, [r5, #40] @ else store left channel and exit + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = -3 condition + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current left sample r10 = -1024 (for clipping) + * r3 = previous right sample r11 = 512 (for rounding) + * r4 = dpp->weight_A ip = intermediate result + * r5 = dpp sp = + * r6 = dpp->delta lr = + * r7 = eptr pc = + ******************************************************************************* + */ + +term_minus_3: + ldr r3, [r1, #-4] @ load previous samples + ldr r8, [r1, #-8] + +term_minus_3_loop: + ldr ip, [r1] + mla r2, r3, r4, r11 + add r2, ip, r2, asr #10 + str r2, [r1], #4 + cmp r3, #0 + cmpne ip, #0 + beq .L399 + teq ip, r3 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + cmp r4, #1024 @ then clip weight to +/-1024 + movgt r4, #1024 + cmp r4, r10 + movlt r4, r10 + +.L399: movs ip, r8 @ ip = previous left we use now + mov r8, 
r2 @ r8 = current left we use next time + ldr r2, [r1], #4 + mla r3, ip, r0, r11 + add r3, r2, r3, asr #10 + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L407 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + cmp r0, #1024 + movgt r0, #1024 + cmp r0, r10 + movlt r0, r10 + +.L407: cmp r7, r1 @ loop back if more samples to do + bhi term_minus_3_loop + + str r3, [r5, #8] @ else store previous samples & exit + str r8, [r5, #40] + +/* + * Before finally exiting we must store weights back for next time + */ + +common_exit: + strh r4, [r5, #4] + strh r0, [r5, #6] + ldmfd sp!, {r4 - r8, r10, r11, pc} + diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c index 8f5c1ee46f..0c61e0e38a 100644 --- a/apps/codecs/libwavpack/unpack.c +++ b/apps/codecs/libwavpack/unpack.c @@ -288,6 +288,8 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd) #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count); +#elif defined(CPU_ARM) && !defined(SIMULATOR) +extern void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp, long *buffer, long sample_count); #else static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count); #endif @@ -350,6 +352,8 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count) decorr_stereo_pass (dpp, buffer, 8); #if defined(CPU_COLDFIRE) && !defined(SIMULATOR) decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8); +#elif defined(CPU_ARM) && !defined(SIMULATOR) + decorr_stereo_pass_cont_arm (dpp, buffer + 16, sample_count - 8); #else decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8); #endif @@ -510,7 +514,7 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long samp dpp->weight_B = weight_B; } -#if !defined(CPU_COLDFIRE) || defined(SIMULATOR) +#if (!defined(CPU_COLDFIRE) && !defined(CPU_ARM)) || defined(SIMULATOR) static void 
decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count) { -- cgit v1.2.3