From f40bfc9267b13b54e6379dfe7539447662879d24 Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sat, 25 Jun 2011 21:32:25 -0400 Subject: Add codecs to librbcodec. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97 Reviewed-on: http://gerrit.rockbox.org/137 Reviewed-by: Nils Wallménius Tested-by: Nils Wallménius --- lib/rbcodec/codecs/libwavpack/arml.S | 506 +++++++++++++++++++++++++++++++++++ 1 file changed, 506 insertions(+) create mode 100644 lib/rbcodec/codecs/libwavpack/arml.S (limited to 'lib/rbcodec/codecs/libwavpack/arml.S') diff --git a/lib/rbcodec/codecs/libwavpack/arml.S b/lib/rbcodec/codecs/libwavpack/arml.S new file mode 100644 index 0000000000..60818aa1e6 --- /dev/null +++ b/lib/rbcodec/codecs/libwavpack/arml.S @@ -0,0 +1,506 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2006 by David Bryant + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ + +/* This is an assembly optimized version of the following WavPack function: + * + * void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp, + * long *buffer, long sample_count); + * + * It performs a single pass of stereo decorrelation on the provided buffer. + * Note that this version of the function requires that the 8 previous stereo + * samples are visible and correct. In other words, it ignores the "samples_*" + * fields in the decorr_pass structure and gets the history data directly + * from the buffer. It does, however, return the appropriate history samples + * to the decorr_pass structure before returning. + * + * This is written to work on a ARM7TDMI processor. This version uses the + * 64-bit multiply-accumulate instruction and so can be used with all + * WavPack files. However, for optimum performance with 16-bit WavPack + * files, there is a faster version that only uses the 32-bit MLA + * instruction. + */ + +#include "config.h" + + .text + .align + .global decorr_stereo_pass_cont_arml + +/* + * on entry: + * + * r0 = struct decorr_pass *dpp + * r1 = long *buffer + * r2 = long sample_count + */ + +decorr_stereo_pass_cont_arml: + + stmfd sp!, {r4 - r8, r10, r11, lr} + mov r5, r0 @ r5 = dpp + mov r11, #512 @ r11 = 512 for rounding + ldrsh r6, [r0, #2] @ r6 = dpp->delta + ldrsh r4, [r0, #4] @ r4 = dpp->weight_A + ldrsh r0, [r0, #6] @ r0 = dpp->weight_B + cmp r2, #0 @ exit if no samples to process + beq common_exit + + mov r0, r0, asl #18 @ for 64-bit math we use weights << 18 + mov r4, r4, asl #18 + mov r6, r6, asl #18 + add r7, r1, r2, asl #3 @ r7 = buffer ending position + ldrsh r2, [r5, #0] @ r2 = dpp->term + cmp r2, #0 + blt minus_term + + ldr lr, [r1, #-16] @ load 2 sample history from buffer + ldr r10, [r1, #-12] @ for terms 2, 17, and 18 + ldr r8, [r1, #-8] + ldr r3, [r1, #-4] + + cmp r2, #18 + beq term_18_loop + mov lr, lr, asl #4 + mov r10, r10, asl #4 + cmp r2, #2 + beq term_2_loop + cmp r2, #17 + beq term_17_loop + b term_default_loop + +minus_term: + mov r10, #(1024 << 18) @ r10 = -1024 << 18 for weight clipping + rsb r10, r10, #0 @ (only used for negative terms) + cmn r2, #1 + beq term_minus_1 + cmn r2, #2 + beq term_minus_2 + cmn r2, #3 + beq term_minus_3 + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = 17 condition + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current sample r10 = second previous left sample << 4 + * r3 = previous right sample r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = current decorrelation value + * r5 = dpp sp = + * r6 = dpp->delta lr = second previous right sample << 4 + * r7 = eptr pc = + ******************************************************************************* + */ + +term_17_loop: + rsbs ip, lr, r8, asl #5 @ decorr value = (2 * prev) - 2nd prev + mov lr, r8, asl #4 @ previous becomes 2nd previous + ldr r2, [r1], #4 @ get sample & update pointer + mov r11, #0x80000000 + mov r8, r2 + smlalne r11, r8, r4, ip + strne r8, [r1, #-4] @ if change possible, store sample back + cmpne r2, #0 + beq .L325 + teq ip, r2 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L325: rsbs ip, r10, r3, asl #5 @ do same thing for right channel + mov r10, r3, asl #4 + ldr r2, [r1], #4 + mov r11, #0x80000000 + mov r3, r2 + smlalne r11, r3, r0, ip + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L329 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L329: cmp r7, r1 @ loop back if more samples to do + bhi term_17_loop + mov lr, lr, asr #4 + mov r10, r10, asr #4 + b store_1718 @ common exit for terms 17 & 18 + +/* + ****************************************************************************** + * Loop to handle term = 18 condition + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current sample r10 = second previous left sample + * r3 = previous right sample r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = decorrelation value + * r5 = dpp sp = + * r6 = dpp->delta lr = second previous right sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_18_loop: + rsb ip, lr, r8 @ decorr value = + mov lr, r8 @ ((3 * prev) - 2nd prev) >> 1 + add ip, lr, ip, asr #1 + movs ip, ip, asl #4 + ldr r2, [r1], #4 @ get sample & update pointer + mov r11, #0x80000000 + mov r8, r2 + smlalne r11, r8, r4, ip + strne r8, [r1, #-4] @ if change possible, store sample back + cmpne r2, #0 + beq .L337 + teq ip, r2 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L337: rsb ip, r10, r3 @ do same thing for right channel + mov r10, r3 + add ip, r10, ip, asr #1 + movs ip, ip, asl #4 + ldr r2, [r1], #4 + mov r11, #0x80000000 + mov r3, r2 + smlalne r11, r3, r0, ip + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L341 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L341: cmp r7, r1 @ loop back if more samples to do + bhi term_18_loop + +/* common exit for terms 17 & 18 */ + +store_1718: + str r3, [r5, #40] @ store sample history into struct + str r8, [r5, #8] + str r10, [r5, #44] + str lr, [r5, #12] + b common_exit @ and return + +/* + ****************************************************************************** + * Loop to handle term = 2 condition + * (note that this case can be handled by the default term handler (1-8), but + * this special case is faster because it doesn't have to read memory twice) + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current sample r10 = second previous left sample << 4 + * r3 = previous right sample r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = decorrelation value + * r5 = dpp sp = + * r6 = dpp->delta lr = second previous right sample << 4 + * r7 = eptr pc = + ******************************************************************************* + */ + +term_2_loop: + movs ip, lr @ get decorrelation value & test + ldr r2, [r1], #4 @ get sample & update pointer + mov lr, r8, asl #4 @ previous becomes 2nd previous + mov r11, #0x80000000 + mov r8, r2 + smlalne r11, r8, r4, ip + strne r8, [r1, #-4] @ if change possible, store sample back + cmpne r2, #0 + beq .L225 + teq ip, r2 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L225: movs ip, r10 @ do same thing for right channel + ldr r2, [r1], #4 + mov r10, r3, asl #4 + mov r11, #0x80000000 + mov r3, r2 + smlalne r11, r3, r0, ip + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L229 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L229: cmp r7, r1 @ loop back if more samples to do + bhi term_2_loop + + b default_term_exit @ this exit updates all dpp->samples + +/* + ****************************************************************************** + * Loop to handle default term condition + * + * r0 = dpp->weight_B r8 = result accumulator + * r1 = bptr r9 = + * r2 = dpp->term r10 = + * r3 = decorrelation value r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = current sample + * r5 = dpp sp = + * r6 = dpp->delta lr = + * r7 = eptr pc = + ******************************************************************************* + */ + +term_default_loop: + ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term + ldr ip, [r1], #4 @ get original sample and bump ptr + movs r3, r3, asl #4 + mov r11, #0x80000000 + mov r8, ip + smlalne r11, r8, r4, r3 + strne r8, [r1, #-4] @ if possibly changed, store updated sample + cmpne ip, #0 + beq .L350 + teq ip, r3 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + +.L350: ldr r3, [r1, -r2, asl #3] @ do the same thing for right channel + ldr ip, [r1], #4 + movs r3, r3, asl #4 + mov r11, #0x80000000 + mov r8, ip + smlalne r11, r8, r0, r3 + strne r8, [r1, #-4] + cmpne ip, #0 + beq .L354 + teq ip, r3 + submi r0, r0, r6 + addpl r0, r0, r6 + +.L354: cmp r7, r1 @ loop back if more samples to do + bhi term_default_loop + +/* + * This exit is used by terms 1-8 to store the previous 8 samples into the decorr + * structure (even if they are not all used for the given term) + */ + +default_term_exit: + ldrsh r3, [r5, #0] + sub ip, r3, #1 + mov lr, #7 + +.L358: and r3, ip, #7 + add r3, r5, r3, asl #2 + ldr r2, [r1, #-4] + str r2, [r3, #40] + ldr r2, [r1, #-8]! + str r2, [r3, #8] + sub ip, ip, #1 + sub lr, lr, #1 + cmn lr, #1 + bne .L358 + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = -1 condition + * + * r0 = dpp->weight_B r8 = + * r1 = bptr r9 = + * r2 = intermediate result r10 = -1024 (for clipping) + * r3 = previous right sample r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = current sample + * r5 = dpp sp = + * r6 = dpp->delta lr = updated left sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_minus_1: + ldr r3, [r1, #-4] + +term_minus_1_loop: + ldr ip, [r1], #8 @ for left channel the decorrelation value + movs r3, r3, asl #4 @ is the previous right sample (in r3) + mov r11, #0x80000000 + mov lr, ip + smlalne r11, lr, r4, r3 + strne lr, [r1, #-8] + cmpne ip, #0 + beq .L361 + teq ip, r3 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + cmp r4, #(1024 << 18) + movgt r4, #(1024 << 18) + cmp r4, r10 + movlt r4, r10 + +.L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value + movs lr, lr, asl #4 + mov r11, #0x80000000 + mov r3, r2 + smlalne r11, r3, r0, lr + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L369 + teq r2, lr + submi r0, r0, r6 + addpl r0, r0, r6 + cmp r0, #(1024 << 18) @ then clip weight to +/-1024 + movgt r0, #(1024 << 18) + cmp r0, r10 + movlt r0, r10 + +.L369: cmp r7, r1 @ loop back if more samples to do + bhi term_minus_1_loop + + str r3, [r5, #8] @ else store right sample and exit + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = -2 condition + * (note that the channels are processed in the reverse order here) + * + * r0 = dpp->weight_B r8 = + * r1 = bptr r9 = + * r2 = intermediate result r10 = -1024 (for clipping) + * r3 = previous left sample r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = current sample + * r5 = dpp sp = + * r6 = dpp->delta lr = updated right sample + * r7 = eptr pc = + ******************************************************************************* + */ + +term_minus_2: + ldr r3, [r1, #-8] + +term_minus_2_loop: + ldr ip, [r1, #4] @ for right channel the decorrelation value + movs r3, r3, asl #4 @ is the previous left sample (in r3) + mov r11, #0x80000000 + mov lr, ip + smlalne r11, lr, r0, r3 + strne lr, [r1, #4] + cmpne ip, #0 + beq .L380 + teq ip, r3 @ update weight based on signs + submi r0, r0, r6 + addpl r0, r0, r6 + cmp r0, #(1024 << 18) @ then clip weight to +/-1024 + movgt r0, #(1024 << 18) + cmp r0, r10 + movlt r0, r10 + +.L380: ldr r2, [r1], #8 @ for left channel the decorrelation value + movs lr, lr, asl #4 + mov r11, #0x80000000 + mov r3, r2 + smlalne r11, r3, r4, lr + strne r3, [r1, #-8] + cmpne r2, #0 + beq .L388 + teq r2, lr + submi r4, r4, r6 + addpl r4, r4, r6 + cmp r4, #(1024 << 18) + movgt r4, #(1024 << 18) + cmp r4, r10 + movlt r4, r10 + +.L388: cmp r7, r1 @ loop back if more samples to do + bhi term_minus_2_loop + + str r3, [r5, #40] @ else store left channel and exit + b common_exit + +/* + ****************************************************************************** + * Loop to handle term = -3 condition + * + * r0 = dpp->weight_B r8 = previous left sample + * r1 = bptr r9 = + * r2 = current left sample r10 = -1024 (for clipping) + * r3 = previous right sample r11 = lo accumulator (for rounding) + * r4 = dpp->weight_A ip = intermediate result + * r5 = dpp sp = + * r6 = dpp->delta lr = + * r7 = eptr pc = + ******************************************************************************* + */ + +term_minus_3: + ldr r3, [r1, #-4] @ load previous samples + ldr r8, [r1, #-8] + +term_minus_3_loop: + ldr ip, [r1], #4 + movs r3, r3, asl #4 + mov r11, #0x80000000 + mov r2, ip + smlalne r11, r2, r4, r3 + strne r2, [r1, #-4] + cmpne ip, #0 + beq .L399 + teq ip, r3 @ update weight based on signs + submi r4, r4, r6 + addpl r4, r4, r6 + cmp r4, #(1024 << 18) @ then clip weight to +/-1024 + movgt r4, #(1024 << 18) + cmp r4, r10 + movlt r4, r10 + +.L399: movs ip, r8, asl #4 @ ip = previous left we use now + mov r8, r2 @ r8 = current left we use next time + ldr r2, [r1], #4 + mov r11, #0x80000000 + mov r3, r2 + smlalne r11, r3, r0, ip + strne r3, [r1, #-4] + cmpne r2, #0 + beq .L407 + teq ip, r2 + submi r0, r0, r6 + addpl r0, r0, r6 + cmp r0, #(1024 << 18) + movgt r0, #(1024 << 18) + cmp r0, r10 + movlt r0, r10 + +.L407: cmp r7, r1 @ loop back if more samples to do + bhi term_minus_3_loop + + str r3, [r5, #8] @ else store previous samples & exit + str r8, [r5, #40] + +/* + * Before finally exiting we must store weights back for next time + */ + +common_exit: + mov r0, r0, asr #18 @ restore weights to real magnitude + mov r4, r4, asr #18 + strh r4, [r5, #4] + strh r0, [r5, #6] + ldmpc regs="r4-r8, r10-r11" + -- cgit v1.2.3