From 503116240a4c827bda6091207097c7b7e3772467 Mon Sep 17 00:00:00 2001 From: Dave Bryant Date: Sun, 26 Feb 2006 08:22:34 +0000 Subject: More WavPack optimizations. Restored 24-bit file playback. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8842 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libwavpack/SOURCES | 1 + apps/codecs/libwavpack/arm.S | 12 +- apps/codecs/libwavpack/arml.S | 502 ++++++++++++++++++++++++++++++++++++++++ apps/codecs/libwavpack/unpack.c | 6 +- 4 files changed, 513 insertions(+), 8 deletions(-) create mode 100644 apps/codecs/libwavpack/arml.S diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES index 8e38767ec6..5dcded9112 100644 --- a/apps/codecs/libwavpack/SOURCES +++ b/apps/codecs/libwavpack/SOURCES @@ -10,5 +10,6 @@ coldfire.S #endif #if defined(CPU_ARM) && !defined(SIMULATOR) arm.S +arml.S #endif diff --git a/apps/codecs/libwavpack/arm.S b/apps/codecs/libwavpack/arm.S index 0b92bfccd7..233bfd3a52 100644 --- a/apps/codecs/libwavpack/arm.S +++ b/apps/codecs/libwavpack/arm.S @@ -31,9 +31,7 @@ * * This is written to work on a ARM7TDMI processor. This version only uses the * 32-bit multiply-accumulate instruction and so will overflow with 24-bit - * WavPack files. The advanced 64-bit multiply instructions in the ARM will - * provide full resolution for this, but are somewhat slower and have not - * been included yet. + * WavPack files. 
*/ .text .align @@ -248,7 +246,7 @@ term_2_loop: term_default_loop: ldr ip, [r1] @ get original sample ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term - mla r8, r4, r3, r11 @ mult decorr value by weight, round, + mla r8, r3, r4, r11 @ mult decorr value by weight, round, add r8, ip, r8, asr #10 @ shift and add to new sample str r8, [r1], #4 @ store update sample cmp r3, #0 @@ -260,7 +258,7 @@ term_default_loop: .L350: ldr ip, [r1] @ do the same thing for right channel ldr r3, [r1, -r2, asl #3] - mla r8, r0, r3, r11 + mla r8, r3, r0, r11 add r8, ip, r8, asr #10 str r8, [r1], #4 cmp r3, #0 @@ -330,7 +328,7 @@ term_minus_1_loop: movlt r4, r10 .L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value - mla r3, r0, lr, r11 @ is the just updated right sample (in lr) + mla r3, lr, r0, r11 @ is the just updated right sample (in lr) add r3, r2, r3, asr #10 str r3, [r1, #-4] cmp lr, #0 @@ -386,7 +384,7 @@ term_minus_2_loop: movlt r0, r10 .L380: ldr r2, [r1, #0] @ for left channel the decorrelation value - mla r3, r4, lr, r11 @ is the just updated left sample (in lr) + mla r3, lr, r4, r11 @ is the just updated left sample (in lr) add r3, r2, r3, asr #10 str r3, [r1], #8 cmp lr, #0 diff --git a/apps/codecs/libwavpack/arml.S b/apps/codecs/libwavpack/arml.S new file mode 100644 index 0000000000..97474f93b9 --- /dev/null +++ b/apps/codecs/libwavpack/arml.S @@ -0,0 +1,502 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2006 by David Bryant + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. 
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* This is an assembly-optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
+ *                                    long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on an ARM7TDMI processor. This version uses the
+ * 64-bit multiply-accumulate instruction and so can be used with all
+ * WavPack files. However, for optimum performance with 16-bit WavPack
+ * files, there is a faster version that only uses the 32-bit MLA
+ * instruction.
+ */
+
+        .text
+        .align
+        .global decorr_stereo_pass_cont_arml
+
+/*
+ * on entry:
+ *
+ *  r0 = struct decorr_pass *dpp
+ *  r1 = long *buffer
+ *  r2 = long sample_count
+ */
+
+decorr_stereo_pass_cont_arml:
+
+        stmfd   sp!, {r4 - r8, r10, r11, lr}
+        mov     r5, r0                  @ r5 = dpp
+        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
+        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
+        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
+        @ scale BEFORE the empty-run test: common_exit always shifts back by 18
+        mov     r0, r0, asl #18         @ for 64-bit math we use weights << 18
+        mov     r4, r4, asl #18
+        mov     r6, r6, asl #18         @ delta is scaled the same way
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     common_exit             @  (weights are stored back unchanged)
+
+        add     r7, r1, r2, asl #3      @ r7 = buffer ending position
+        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
+        cmp     r2, #0
+        blt     minus_term
+
+        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
+        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
+        ldr     r8, [r1, #-8]           @  (lr/r8 = left, r10/r3 = right)
+        ldr     r3, [r1, #-4]
+
+        cmp     r2, #18
+        beq     term_18_loop
+        mov     lr, lr, asl #4          @ terms 2 and 17 keep the 2nd previous
+        mov     r10, r10, asl #4        @  samples pre-shifted by 4
+        cmp     r2, #2
+        beq     term_2_loop
+        cmp     r2, #17
+        beq     term_17_loop
+        b       term_default_loop
+
+minus_term:
+        mov     r10, #(1024 << 18)      @ r10 = -1024 << 18 for weight clipping
+        rsb     r10, r10, #0            @  (only used for negative terms)
+        cmn     r2, #1
+        beq     term_minus_1
+        cmn     r2, #2
+        beq     term_minus_2
+        cmn     r2, #3
+        beq     term_minus_3
+        b       common_exit             @ any other negative term: nothing to do
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample << 4
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample << 4
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_17_loop:
+        rsbs    ip, lr, r8, asl #5      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8, asl #4          @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r11, #0x80000000        @ low word = 1/2, rounds the high word
+        mov     r8, r2                  @ high word starts out as the sample
+        smlalne r11, r8, r4, ip         @ r8 += (weight<<18 * decorr<<4) >> 32
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ Z stays set here if decorr was zero
+        beq     .L325                   @ no update if decorr or sample is zero
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+
+.L325:  rsbs    ip, r10, r3, asl #5     @ do same thing for right channel
+        mov     r10, r3, asl #4
+        ldr     r2, [r1], #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L329
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+.L329:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_17_loop
+        mov     lr, lr, asr #4          @ undo the << 4 before storing history
+        mov     r10, r10, asr #4
+        b       store_1718              @ common exit for terms 17 & 18
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_18_loop:
+        rsb     ip, lr, r8              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        add     ip, lr, ip, asr #1
+        movs    ip, ip, asl #4          @ pre-shift for 64-bit math & test
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r11, #0x80000000
+        mov     r8, r2
+        smlalne r11, r8, r4, ip
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     .L337
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+
+.L337:  rsb     ip, r10, r3             @ do same thing for right channel
+        mov     r10, r3
+        add     ip, r10, ip, asr #1
+        movs    ip, ip, asl #4
+        ldr     r2, [r1], #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L341
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+.L341:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_18_loop
+
+/* common exit for terms 17 & 18 */
+
+store_1718:
+        str     r3, [r5, #40]           @ store sample history into struct
+        str     r8, [r5, #8]            @  (#8/#12 = left, #40/#44 = right)
+        str     r10, [r5, #44]
+        str     lr, [r5, #12]
+        b       common_exit             @ and return
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample << 4
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample << 4
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_2_loop:
+        movs    ip, lr                  @ get decorrelation value & test
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     lr, r8, asl #4          @ previous becomes 2nd previous
+        mov     r11, #0x80000000
+        mov     r8, r2
+        smlalne r11, r8, r4, ip
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     .L225
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+
+.L225:  movs    ip, r10                 @ do same thing for right channel
+        ldr     r2, [r1], #4
+        mov     r10, r3, asl #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L229
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+.L229:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_2_loop
+
+        b       default_term_exit       @ this exit updates all dpp->samples
+
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 = dpp->weight_B           r8 = result accumulator
+ * r1 = bptr                    r9 =
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_default_loop:
+        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
+        ldr     ip, [r1], #4            @ get original sample and bump ptr
+        movs    r3, r3, asl #4          @ pre-shift for 64-bit math & test
+        mov     r11, #0x80000000
+        mov     r8, ip
+        smlalne r11, r8, r4, r3
+        strne   r8, [r1, #-4]           @ if possibly changed, store updated sample
+        cmpne   ip, #0
+        beq     .L350
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+
+.L350:  ldr     r3, [r1, -r2, asl #3]   @ do the same thing for right channel
+        ldr     ip, [r1], #4
+        movs    r3, r3, asl #4
+        mov     r11, #0x80000000
+        mov     r8, ip
+        smlalne r11, r8, r0, r3
+        strne   r8, [r1, #-4]
+        cmpne   ip, #0
+        beq     .L354
+        teq     ip, r3
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+.L354:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_default_loop
+
+/*
+ * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
+ * structure (even if they are not all used for the given term)
+ */
+
+default_term_exit:
+        ldrsh   r3, [r5, #0]            @ r3 = dpp->term
+        sub     ip, r3, #1              @ ip = history slot of the newest pair
+        mov     lr, #7                  @ lr counts 8 stereo pairs (7 down to 0)
+
+.L358:  and     r3, ip, #7              @ wrap slot index into 0..7
+        add     r3, r5, r3, asl #2      @ r3 = dpp + slot * 4
+        ldr     r2, [r1, #-4]           @ right sample of the pair
+        str     r2, [r3, #40]
+        ldr     r2, [r1, #-8]!          @ left sample; step back one pair
+        str     r2, [r3, #8]
+        sub     ip, ip, #1
+        sub     lr, lr, #1
+        cmn     lr, #1                  @ done once lr reaches -1
+        bne     .L358
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -1 condition
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 =
+ * r2 = current right sample    r10 = -1024 << 18 (for clipping)
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_1:
+        ldr     r3, [r1, #-4]
+
+term_minus_1_loop:
+        ldr     ip, [r1], #8            @ for left channel the decorrelation value
+        movs    r3, r3, asl #4          @  is the previous right sample (in r3)
+        mov     r11, #0x80000000
+        mov     lr, ip
+        smlalne r11, lr, r4, r3
+        strne   lr, [r1, #-8]
+        cmpne   ip, #0
+        beq     .L361
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+
+.L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
+        movs    lr, lr, asl #4          @  is the just updated left sample (in lr)
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, lr
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L369
+        teq     r2, lr
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+
+.L369:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_1_loop
+
+        str     r3, [r5, #8]            @ else store right sample and exit
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -2 condition
+ * (note that the channels are processed in the reverse order here)
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 =
+ * r2 = current left sample     r10 = -1024 << 18 (for clipping)
+ * r3 = previous left sample    r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated right sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_2:
+        ldr     r3, [r1, #-8]
+
+term_minus_2_loop:
+        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
+        movs    r3, r3, asl #4          @  is the previous left sample (in r3)
+        mov     r11, #0x80000000
+        mov     lr, ip
+        smlalne r11, lr, r0, r3
+        strne   lr, [r1, #4]
+        cmpne   ip, #0
+        beq     .L380
+        teq     ip, r3                  @ update weight based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+
+.L380:  ldr     r2, [r1], #8            @ for left channel the decorrelation value
+        movs    lr, lr, asl #4          @  is the just updated right sample (in lr)
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r4, lr
+        strne   r3, [r1, #-8]
+        cmpne   r2, #0
+        beq     .L388
+        teq     r2, lr
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+
+.L388:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_2_loop
+
+        str     r3, [r5, #40]           @ else store left channel and exit
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -3 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current left sample     r10 = -1024 << 18 (for clipping)
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = intermediate result
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_3:
+        ldr     r3, [r1, #-4]           @ load previous samples
+        ldr     r8, [r1, #-8]
+
+term_minus_3_loop:
+        ldr     ip, [r1], #4            @ left channel is decorrelated against
+        movs    r3, r3, asl #4          @  the previous right sample (in r3)
+        mov     r11, #0x80000000
+        mov     r2, ip
+        smlalne r11, r2, r4, r3
+        strne   r2, [r1, #-4]
+        cmpne   ip, #0
+        beq     .L399
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+
+.L399:  movs    ip, r8, asl #4          @ ip = previous left we use now
+        mov     r8, r2                  @ r8 = current left we use next time
+        ldr     r2, [r1], #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L407
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+
+.L407:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_3_loop
+
+        str     r3, [r5, #8]            @ else store previous samples & exit
+        str     r8, [r5, #40]
+
+/*
+ * Before finally exiting we must store weights back for next time
+ */
+
+common_exit:
+        mov     r0, r0, asr #18         @ restore weights to real magnitude
+        mov     r4, r4, asr #18         @  (every path here has them << 18)
+        strh    r4, [r5, #4]            @ dpp->weight_A
+        strh    r0, [r5, #6]            @ dpp->weight_B
+        ldmfd   sp!, {r4 - r8, r10, r11, pc}
+
diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c
index 0c61e0e38a..af5d71585e 100644
--- a/apps/codecs/libwavpack/unpack.c
+++ b/apps/codecs/libwavpack/unpack.c
@@ -290,6 +290,7 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
 extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count);
 #elif defined(CPU_ARM) && !defined(SIMULATOR)
 extern void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp, long *buffer, long sample_count);
+extern void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp, long *buffer, long sample_count);
 #else
 static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
 #endif
@@ -353,7 +354,10 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
             decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
 #elif defined(CPU_ARM) && !defined(SIMULATOR)
-            decorr_stereo_pass_cont_arm (dpp, buffer + 16, sample_count - 8);
+            if (((flags & MAG_MASK) >> MAG_LSB) > 15)
+                decorr_stereo_pass_cont_arml (dpp, buffer + 16, sample_count - 8);
+            else
+                decorr_stereo_pass_cont_arm (dpp, buffer + 16, sample_count - 8);
 #else
             decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
 #endif
-- 
cgit v1.2.3