Optimization of WavPack decoding in ARM assembler (for iPods).

This allows WavPack files encoded in "high" mode to play without skipping, although it's still rather marginal (i.e. it can't play with other DSP effects enabled). For now this will not work with 24-bit files either, although that is coming along.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8814 a1c6a512-1295-4272-9138-f99709370657
author: Dave Bryant <bryant@rockbox.org> 2006-02-23 20:53:59 +0000
committer: Dave Bryant <bryant@rockbox.org> 2006-02-23 20:53:59 +0000
commit: f0d1c96ee435e03af0c92aa5ac5260499ae589ed (patch)
tree: 48ad35f7a5bff47eae27c7488bc32be0e889bd86
parent: eeec278d21ae258da9108bbbccf04d977c3d3bfa (diff)
download: rockbox-f0d1c96ee435e03af0c92aa5ac5260499ae589ed.tar.gz
rockbox-f0d1c96ee435e03af0c92aa5ac5260499ae589ed.zip
3 files changed, 482 insertions, 1 deletions
diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES
index f63c55a87a..8e38767ec6 100644
--- a/apps/codecs/libwavpack/SOURCES
+++ b/apps/codecs/libwavpack/SOURCES
@@ -8,4 +8,7 @@ wputils.c
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
 coldfire.S
 #endif
+#if defined(CPU_ARM) && !defined(SIMULATOR)
+arm.S
+#endif
diff --git a/apps/codecs/libwavpack/arm.S b/apps/codecs/libwavpack/arm.S
new file mode 100644
index 0000000000..0b92bfccd7
--- /dev/null
+++ b/apps/codecs/libwavpack/arm.S
@@ -0,0 +1,474 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by David Bryant
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp,
+ *                                   long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on an ARM7TDMI processor. This version only uses the
+ * 32-bit multiply-accumulate instruction and so will overflow with 24-bit
+ * WavPack files. The advanced 64-bit multiply instructions in the ARM will
+ * provide full resolution for this, but are somewhat slower and have not
+ * been included yet.
+ */
+        .text
+        .align
+        .global         decorr_stereo_pass_cont_arm
+/*
+ * on entry:
+ *
+ * r0 = struct decorr_pass *dpp
+ * r1 = long *buffer
+ * r2 = long sample_count
+ */
+decorr_stereo_pass_cont_arm:
+        stmfd   sp!, {r4 - r8, r10, r11, lr}    @ save work registers & return address
+        mov     r5, r0                  @ r5 = dpp
+        mov     r11, #512               @ r11 = 512 for rounding
+        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
+        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
+        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     common_exit             @  (the weights are just stored back)
+        add     r7, r1, r2, asl #3      @ r7 = buffer + sample_count * 8 (end)
+        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
+        cmp     r2, #0
+        bmi     minus_term              @ negative terms are dispatched below
+        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
+        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18: lr / r10 =
+        ldr     r8, [r1, #-8]           @  2nd previous left / right, r8 / r3 =
+        ldr     r3, [r1, #-4]           @  previous left / right
+        cmp     r2, #17
+        beq     term_17_loop
+        cmp     r2, #18
+        beq     term_18_loop
+        cmp     r2, #2
+        beq     term_2_loop
+        b       term_default_loop       @ else handle default (1-8, except 2)
+minus_term:
+        mov     r10, #1024              @ r10 = -1024 for weight clipping
+        rsb     r10, r10, #0            @  (only used for negative terms)
+        cmn     r2, #1                  @ term == -1 ?
+        beq     term_minus_1
+        cmn     r2, #2                  @ term == -2 ?
+        beq     term_minus_2
+        cmn     r2, #3                  @ term == -3 ?
+        beq     term_minus_3
+        b       common_exit             @ any other negative term: no processing
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_17_loop:
+        rsbs    ip, lr, r8, asl #1      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (flags still from rsbs: Z = decorr is 0)
+        beq     .L325                   @ no weight update if decorr or sample is 0
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L325:  rsbs    ip, r10, r3, asl #1     @ do same thing for right channel
+        mov     r10, r3
+        ldr     r2, [r1], #4
+        mla     r3, ip, r0, r11
+        add     r3, r2, r3, asr #10
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L329
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L329:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_17_loop
+        b       store_1718              @ common exit for terms 17 & 18
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_18_loop:
+        sub     ip, r8, lr              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        adds    ip, r8, ip, asr #1      @  computed as prev + ((prev - 2nd) >> 1)
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (flags still from adds: Z = decorr is 0)
+        beq     .L337                   @ no weight update if decorr or sample is 0
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L337:  sub     ip, r3, r10             @ do same thing for right channel
+        mov     r10, r3
+        adds    ip, r3, ip, asr #1
+        ldr     r2, [r1], #4
+        mla     r3, ip, r0, r11
+        add     r3, r2, r3, asr #10
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L341
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L341:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_18_loop
+/* common exit for terms 17 & 18 (term 17 branches here, term 18 falls in) */
+store_1718:
+        str     r3, [r5, #40]           @ store sample history into struct
+        str     r8, [r5, #8]            @  previous left at +8, right at +40
+        str     r10, [r5, #44]          @  2nd previous right at +44
+        str     lr, [r5, #12]           @  2nd previous left at +12
+        b       common_exit             @ and return
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_2_loop:
+        movs    ip, lr                  @ get decorrelation value & test
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (flags still from movs: Z = decorr is 0)
+        beq     .L225                   @ no weight update if decorr or sample is 0
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L225:  movs    ip, r10                 @ do same thing for right channel
+        mov     r10, r3
+        ldr     r2, [r1], #4
+        mla     r3, ip, r0, r11
+        add     r3, r2, r3, asr #10
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L229
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L229:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_2_loop
+        b       default_term_exit       @ this exit updates all dpp->samples
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 = dpp->weight_B           r8 = result accumulator
+ * r1 = bptr                    r9 =
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_default_loop:
+        ldr     ip, [r1]                @ get original sample
+        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
+        mla     r8, r4, r3, r11         @ mult decorr value by weight, round,
+        add     r8, ip, r8, asr #10     @  shift and add to new sample
+        str     r8, [r1], #4            @ store updated sample
+        cmp     r3, #0                  @ no weight update if the decorr value
+        cmpne   ip, #0                  @  or the original sample is zero
+        beq     .L350
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L350:  ldr     ip, [r1]                @ do the same thing for right channel
+        ldr     r3, [r1, -r2, asl #3]
+        mla     r8, r0, r3, r11
+        add     r8, ip, r8, asr #10
+        str     r8, [r1], #4
+        cmp     r3, #0
+        cmpne   ip, #0
+        beq     .L354
+        teq     ip, r3
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L354:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_default_loop
+/*
+ * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
+ * structure (even if they are not all used for the given term)
+ */
+default_term_exit:
+        ldrsh   r3, [r5, #0]            @ r3 = dpp->term
+        sub     ip, r3, #1              @ ip = term - 1: slot for the newest pair
+        mov     lr, #7                  @ lr counts 7 down to 0 (8 stereo pairs)
+.L358:  and     r3, ip, #7              @ wrap slot index into 0-7
+        add     r3, r5, r3, asl #2      @ r3 = dpp + slot * 4
+        ldr     r2, [r1, #-4]           @ right sample of the pair before bptr
+        str     r2, [r3, #40]           @  goes to dpp + 40 + slot * 4
+        ldr     r2, [r1, #-8]!          @ step bptr back one pair; its left sample
+        str     r2, [r3, #8]            @  goes to dpp + 8 + slot * 4
+        sub     ip, ip, #1              @ next older pair uses the previous slot
+        sub     lr, lr, #1
+        cmn     lr, #1                  @ finished once the counter reaches -1
+        bne     .L358
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -1 condition
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 =
+ * r2 = intermediate result     r10 = -1024 (for clipping)
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_1:
+        ldr     r3, [r1, #-4]           @ r3 = previous right sample from buffer
+term_minus_1_loop:
+        ldr     ip, [r1]                @ for left channel the decorrelation value
+        mla     r2, r3, r4, r11         @  is the previous right sample (in r3)
+        add     lr, ip, r2, asr #10     @ lr = updated left sample
+        str     lr, [r1], #8            @ store it; bptr now points past this pair
+        cmp     r3, #0                  @ no weight update if the decorr value
+        cmpne   ip, #0                  @  or the original sample is zero
+        beq     .L361
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+.L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
+        mla     r3, r0, lr, r11         @  is the just updated left sample (in lr)
+        add     r3, r2, r3, asr #10     @ r3 = updated right (next "previous right")
+        str     r3, [r1, #-4]
+        cmp     lr, #0
+        cmpne   r2, #0
+        beq     .L369
+        teq     r2, lr
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+.L369:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_1_loop
+        str     r3, [r5, #8]            @ else store right sample and exit
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -2 condition
+ * (note that the channels are processed in the reverse order here)
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 =
+ * r2 = intermediate result     r10 = -1024 (for clipping)
+ * r3 = previous left sample    r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated right sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_2:
+        ldr     r3, [r1, #-8]           @ r3 = previous left sample from buffer
+term_minus_2_loop:
+        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
+        mla     r2, r3, r0, r11         @  is the previous left sample (in r3)
+        add     lr, ip, r2, asr #10     @ lr = updated right sample
+        str     lr, [r1, #4]
+        cmp     r3, #0                  @ no weight update if the decorr value
+        cmpne   ip, #0                  @  or the original sample is zero
+        beq     .L380
+        teq     ip, r3                  @ update weight based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+.L380:  ldr     r2, [r1, #0]            @ for left channel the decorrelation value
+        mla     r3, r4, lr, r11         @  is the just updated right sample (in lr)
+        add     r3, r2, r3, asr #10     @ r3 = updated left (next "previous left")
+        str     r3, [r1], #8            @ store it and advance bptr to next pair
+        cmp     lr, #0
+        cmpne   r2, #0
+        beq     .L388
+        teq     r2, lr
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+.L388:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_2_loop
+        str     r3, [r5, #40]           @ else store left channel and exit
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -3 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = new left / orig right   r10 = -1024 (for clipping)
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = orig left / previous left
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_3:
+        ldr     r3, [r1, #-4]           @ load previous samples
+        ldr     r8, [r1, #-8]
+term_minus_3_loop:
+        ldr     ip, [r1]                @ for left channel the decorrelation value
+        mla     r2, r3, r4, r11         @  is the previous right sample (in r3)
+        add     r2, ip, r2, asr #10     @ r2 = updated left sample
+        str     r2, [r1], #4
+        cmp     r3, #0                  @ no weight update if the decorr value
+        cmpne   ip, #0                  @  or the original sample is zero
+        beq     .L399
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+.L399:  movs    ip, r8                  @ ip = previous left we use now
+        mov     r8, r2                  @ r8 = current left we use next time
+        ldr     r2, [r1], #4            @ r2 = original right sample
+        mla     r3, ip, r0, r11         @ right channel decorr value is previous left
+        add     r3, r2, r3, asr #10     @ r3 = updated right sample
+        strne   r3, [r1, #-4]           @ store only if decorr value was non-zero
+        cmpne   r2, #0
+        beq     .L407
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+.L407:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_3_loop
+        str     r3, [r5, #8]            @ else store previous samples & exit
+        str     r8, [r5, #40]
+/*
+ * Before finally exiting we must store weights back for next time
+ */
+common_exit:
+        strh    r4, [r5, #4]            @ dpp->weight_A = r4
+        strh    r0, [r5, #6]            @ dpp->weight_B = r0
+        ldmfd   sp!, {r4 - r8, r10, r11, pc}    @ restore registers and return
diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c
index 8f5c1ee46f..0c61e0e38a 100644
--- a/apps/codecs/libwavpack/unpack.c
+++ b/apps/codecs/libwavpack/unpack.c
@@ -288,6 +288,8 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
 extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count);
+#elif defined(CPU_ARM) && !defined(SIMULATOR)
+extern void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp, long *buffer, long sample_count);
 #else
 static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
 #endif
@@ -350,6 +352,8 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
                decorr_stereo_pass (dpp, buffer, 8);
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
                decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
+#elif defined(CPU_ARM) && !defined(SIMULATOR)
+                decorr_stereo_pass_cont_arm (dpp, buffer + 16, sample_count - 8);
 #else
                decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
 #endif
@@ -510,7 +514,7 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long samp
    dpp->weight_B = weight_B;
 }
-#if !defined(CPU_COLDFIRE) || defined(SIMULATOR)
+#if (!defined(CPU_COLDFIRE) && !defined(CPU_ARM)) || defined(SIMULATOR)
 static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count)
 {
author	Dave Bryant <bryant@rockbox.org>	2006-02-23 20:53:59 +0000
committer	Dave Bryant <bryant@rockbox.org>	2006-02-23 20:53:59 +0000
commit	f0d1c96ee435e03af0c92aa5ac5260499ae589ed (patch)
tree	48ad35f7a5bff47eae27c7488bc32be0e889bd86
parent	eeec278d21ae258da9108bbbccf04d977c3d3bfa (diff)
download	rockbox-f0d1c96ee435e03af0c92aa5ac5260499ae589ed.tar.gz rockbox-f0d1c96ee435e03af0c92aa5ac5260499ae589ed.zip

diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES index f63c55a87a..8e38767ec6 100644 --- a/apps/codecs/libwavpack/SOURCES +++ b/apps/codecs/libwavpack/SOURCES
@@ -8,4 +8,7 @@ wputils.c
8	#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)	8	#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
9	coldfire.S	9	coldfire.S
10	#endif	10	#endif
		11	#if defined(CPU_ARM) && !defined(SIMULATOR)
		12	arm.S
		13	#endif
11		14


diff --git a/apps/codecs/libwavpack/arm.S b/apps/codecs/libwavpack/arm.S new file mode 100644 index 0000000000..0b92bfccd7 --- /dev/null +++ b/apps/codecs/libwavpack/arm.S
@@ -0,0 +1,474 @@
		1	/***************************************************************************
		2	* __________ __ ___.
		3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
		4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
		5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
		6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
		7	* \/ \/ \/ \/ \/
		8	* $Id$
		9	*
		10	* Copyright (C) 2006 by David Bryant
		11	*
		12	* All files in this archive are subject to the GNU General Public License.
		13	* See the file COPYING in the source tree root for full license agreement.
		14	*
		15	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
		16	* KIND, either express or implied.
		17	*
		18	****************************************************************************/
		19
		20	/* This is an assembly optimized version of the following WavPack function:
		21	*
		22	* void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp,
		23	* long *buffer, long sample_count);
		24	*
		25	* It performs a single pass of stereo decorrelation on the provided buffer.
		26	* Note that this version of the function requires that the 8 previous stereo
		27	* samples are visible and correct. In other words, it ignores the "samples_*"
		28	* fields in the decorr_pass structure and gets the history data directly
		29	* from the buffer. It does, however, return the appropriate history samples
		30	* to the decorr_pass structure before returning.
		31	*
		32	* This is written to work on an ARM7TDMI processor. This version only uses the
		33	* 32-bit multiply-accumulate instruction and so will overflow with 24-bit
		34	* WavPack files. The advanced 64-bit multiply instructions in the ARM will
		35	* provide full resolution for this, but are somewhat slower and have not
		36	* been included yet.
		37	*/
		38	.text
		39	.align
		40	.global decorr_stereo_pass_cont_arm
		41
		42	/*
		43	* on entry:
		44	*
		45	* r0 = struct decorr_pass *dpp
		46	* r1 = long *buffer
		47	* r2 = long sample_count
		48	*/
		49
		50	decorr_stereo_pass_cont_arm:
		51
		52	stmfd sp!, {r4 - r8, r10, r11, lr}
		53	mov r5, r0 @ r5 = dpp
		54	mov r11, #512 @ r11 = 512 for rounding
		55	ldrsh r6, [r0, #2] @ r6 = dpp->delta
		56	ldrsh r4, [r0, #4] @ r4 = dpp->weight_A
		57	ldrsh r0, [r0, #6] @ r0 = dpp->weight_B
		58	cmp r2, #0 @ exit if no samples to process
		59	beq common_exit
		60
		61	add r7, r1, r2, asl #3 @ r7 = buffer ending position
		62	ldrsh r2, [r5, #0] @ r2 = dpp->term
		63	cmp r2, #0
		64	bmi minus_term
		65
		66	ldr lr, [r1, #-16] @ load 2 sample history from buffer
		67	ldr r10, [r1, #-12] @ for terms 2, 17, and 18
		68	ldr r8, [r1, #-8]
		69	ldr r3, [r1, #-4]
		70	cmp r2, #17
		71	beq term_17_loop
		72	cmp r2, #18
		73	beq term_18_loop
		74	cmp r2, #2
		75	beq term_2_loop
		76	b term_default_loop @ else handle default (1-8, except 2)
		77
		78	minus_term:
		79	mov r10, #1024 @ r10 = -1024 for weight clipping
		80	rsb r10, r10, #0 @ (only used for negative terms)
		81	cmn r2, #1
		82	beq term_minus_1
		83	cmn r2, #2
		84	beq term_minus_2
		85	cmn r2, #3
		86	beq term_minus_3
		87	b common_exit
		88
		89	/*
		90	******************************************************************************
		91	* Loop to handle term = 17 condition
		92	*
		93	* r0 = dpp->weight_B r8 = previous left sample
		94	* r1 = bptr r9 =
		95	* r2 = current sample r10 = second previous right sample
		96	* r3 = previous right sample r11 = 512 (for rounding)
		97	* r4 = dpp->weight_A ip = current decorrelation value
		98	* r5 = dpp sp =
		99	* r6 = dpp->delta lr = second previous left sample
		100	* r7 = eptr pc =
		101	*******************************************************************************
		102	*/
		103
		104	term_17_loop:
		105	rsbs ip, lr, r8, asl #1 @ decorr value = (2 * prev) - 2nd prev
		106	mov lr, r8 @ previous becomes 2nd previous
		107	ldr r2, [r1], #4 @ get sample & update pointer
		108	mla r8, ip, r4, r11 @ mult decorr value by weight, round,
		109	add r8, r2, r8, asr #10 @ shift, and add to new sample
		110	strne r8, [r1, #-4] @ if change possible, store sample back
		111	cmpne r2, #0
		112	beq .L325
		113	teq ip, r2 @ update weight based on signs
		114	submi r4, r4, r6
		115	addpl r4, r4, r6
		116
		117	.L325: rsbs ip, r10, r3, asl #1 @ do same thing for right channel
		118	mov r10, r3
		119	ldr r2, [r1], #4
		120	mla r3, ip, r0, r11
		121	add r3, r2, r3, asr #10
		122	strne r3, [r1, #-4]
		123	cmpne r2, #0
		124	beq .L329
		125	teq ip, r2
		126	submi r0, r0, r6
		127	addpl r0, r0, r6
		128
		129	.L329: cmp r7, r1 @ loop back if more samples to do
		130	bhi term_17_loop
		131	b store_1718 @ common exit for terms 17 & 18
		132
		133	/*
		134	******************************************************************************
		135	* Loop to handle term = 18 condition
		136	*
		137	* r0 = dpp->weight_B r8 = previous left sample
		138	* r1 = bptr r9 =
		139	* r2 = current sample r10 = second previous right sample
		140	* r3 = previous right sample r11 = 512 (for rounding)
		141	* r4 = dpp->weight_A ip = decorrelation value
		142	* r5 = dpp sp =
		143	* r6 = dpp->delta lr = second previous left sample
		144	* r7 = eptr pc =
		145	*******************************************************************************
		146	*/
		147
		148	term_18_loop:
		149	sub ip, r8, lr @ decorr value =
		150	mov lr, r8 @ ((3 * prev) - 2nd prev) >> 1
		151	adds ip, r8, ip, asr #1
		152	ldr r2, [r1], #4 @ get sample & update pointer
		153	mla r8, ip, r4, r11 @ mult decorr value by weight, round,
		154	add r8, r2, r8, asr #10 @ shift, and add to new sample
		155	strne r8, [r1, #-4] @ if change possible, store sample back
		156	cmpne r2, #0
		157	beq .L337
		158	teq ip, r2 @ update weight based on signs
		159	submi r4, r4, r6
		160	addpl r4, r4, r6
		161
		162	.L337: sub ip, r3, r10 @ do same thing for right channel
		163	mov r10, r3
		164	adds ip, r3, ip, asr #1
		165	ldr r2, [r1], #4
		166	mla r3, ip, r0, r11
		167	add r3, r2, r3, asr #10
		168	strne r3, [r1, #-4]
		169	cmpne r2, #0
		170	beq .L341
		171	teq ip, r2
		172	submi r0, r0, r6
		173	addpl r0, r0, r6
		174
		175	.L341: cmp r7, r1 @ loop back if more samples to do
		176	bhi term_18_loop
		177
		178	/* common exit for terms 17 & 18 */
		179
		180	store_1718:
		181	str r3, [r5, #40] @ store sample history into struct
		182	str r8, [r5, #8]
		183	str r10, [r5, #44]
		184	str lr, [r5, #12]
		185	b common_exit @ and return
		186
		187	/*
		188	******************************************************************************
		189	* Loop to handle term = 2 condition
		190	* (note that this case can be handled by the default term handler (1-8), but
		191	* this special case is faster because it doesn't have to read memory twice)
		192	*
		193	* r0 = dpp->weight_B r8 = previous left sample
		194	* r1 = bptr r9 =
		195	* r2 = current sample r10 = second previous right sample
		196	* r3 = previous right sample r11 = 512 (for rounding)
		197	* r4 = dpp->weight_A ip = decorrelation value
		198	* r5 = dpp sp =
		199	* r6 = dpp->delta lr = second previous left sample
		200	* r7 = eptr pc =
		201	*******************************************************************************
		202	*/
		203
		204	term_2_loop:
		205	movs ip, lr @ get decorrelation value & test
		206	mov lr, r8 @ previous becomes 2nd previous
		207	ldr r2, [r1], #4 @ get sample & update pointer
		208	mla r8, ip, r4, r11 @ mult decorr value by weight, round,
		209	add r8, r2, r8, asr #10 @ shift, and add to new sample
		210	strne r8, [r1, #-4] @ if change possible, store sample back
		211	cmpne r2, #0
		212	beq .L225
		213	teq ip, r2 @ update weight based on signs
		214	submi r4, r4, r6
		215	addpl r4, r4, r6
		216
		217	.L225: movs ip, r10 @ do same thing for right channel
		218	mov r10, r3
		219	ldr r2, [r1], #4
		220	mla r3, ip, r0, r11
		221	add r3, r2, r3, asr #10
		222	strne r3, [r1, #-4]
		223	cmpne r2, #0
		224	beq .L229
		225	teq ip, r2
		226	submi r0, r0, r6
		227	addpl r0, r0, r6
		228
		229	.L229: cmp r7, r1 @ loop back if more samples to do
		230	bhi term_2_loop
		231	b default_term_exit @ this exit updates all dpp->samples
		232
		233	/*
		234	******************************************************************************
		235	* Loop to handle default term condition
		236	*
		237	* r0 = dpp->weight_B r8 = result accumulator
		238	* r1 = bptr r9 =
		239	* r2 = dpp->term r10 =
		240	* r3 = decorrelation value r11 = 512 (for rounding)
		241	* r4 = dpp->weight_A ip = current sample
		242	* r5 = dpp sp =
		243	* r6 = dpp->delta lr =
		244	* r7 = eptr pc =
		245	*******************************************************************************
		246	*/
		247
		248	term_default_loop:
		249	ldr ip, [r1] @ get original sample
		250	ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term
		251	mla r8, r4, r3, r11 @ mult decorr value by weight, round,
		252	add r8, ip, r8, asr #10 @ shift and add to new sample
		253	str r8, [r1], #4 @ store update sample
		254	cmp r3, #0
		255	cmpne ip, #0
		256	beq .L350
		257	teq ip, r3 @ update weight based on signs
		258	submi r4, r4, r6
		259	addpl r4, r4, r6
		260
		261	.L350: ldr ip, [r1] @ do the same thing for right channel
		262	ldr r3, [r1, -r2, asl #3]
		263	mla r8, r0, r3, r11
		264	add r8, ip, r8, asr #10
		265	str r8, [r1], #4
		266	cmp r3, #0
		267	cmpne ip, #0
		268	beq .L354
		269	teq ip, r3
		270	submi r0, r0, r6
		271	addpl r0, r0, r6
		272
		273	.L354: cmp r7, r1 @ loop back if more samples to do
		274	bhi term_default_loop
		275
		276	/*
		277	* This exit is used by terms 1-8 to store the previous 8 samples into the decorr
		278	* structure (even if they are not all used for the given term)
		279	*/
		280
		281	default_term_exit:
		282	ldrsh r3, [r5, #0]
		283	sub ip, r3, #1
		284	mov lr, #7
		285
		286	.L358: and r3, ip, #7
		287	add r3, r5, r3, asl #2
		288	ldr r2, [r1, #-4]
		289	str r2, [r3, #40]
		290	ldr r2, [r1, #-8]!
		291	str r2, [r3, #8]
		292	sub ip, ip, #1
		293	sub lr, lr, #1
		294	cmn lr, #1
		295	bne .L358
		296	b common_exit
		297
		298	/*
		299	******************************************************************************
		300	* Loop to handle term = -1 condition
		301	*
		302	* r0 = dpp->weight_B r8 =
		303	* r1 = bptr r9 =
		304	* r2 = intermediate result r10 = -1024 (for clipping)
		305	* r3 = previous right sample r11 = 512 (for rounding)
		306	* r4 = dpp->weight_A ip = current sample
		307	* r5 = dpp sp =
		308	* r6 = dpp->delta lr = updated left sample
		309	* r7 = eptr pc =
		310	*******************************************************************************
		311	*/
		312
		313	term_minus_1: @ entry: prime r3 with the sample just before bptr
		314	ldr r3, [r1, #-4] @ r3 = previous right sample
		315
		316	term_minus_1_loop:
		317	ldr ip, [r1] @ for left channel the decorrelation value
		318	mla r2, r3, r4, r11 @ is the previous right sample (in r3)
		319	add lr, ip, r2, asr #10 @ lr = updated left sample
		320	str lr, [r1], #8 @ store it, then advance bptr past the whole pair (8 bytes)
		321	cmp r3, #0 @ skip the weight update when the decorr value
		322	cmpne ip, #0 @ or the original sample is zero
		323	beq .L361
		324	teq ip, r3 @ update weight based on signs
		325	submi r4, r4, r6 @ signs differ: weight_A -= delta
		326	addpl r4, r4, r6 @ signs match: weight_A += delta
		327	cmp r4, #1024 @ then clip weight to +/-1024
		328	movgt r4, #1024
		329	cmp r4, r10 @ r10 holds -1024
		330	movlt r4, r10
		331
		332	.L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value
		333	mla r3, r0, lr, r11 @ is the just updated left sample (in lr)
		334	add r3, r2, r3, asr #10 @ r3 = updated right sample (the "previous right" of the next pass)
		335	str r3, [r1, #-4] @ store it back in place (bptr was already advanced)
		336	cmp lr, #0 @ skip the weight update when the decorr value
		337	cmpne r2, #0 @ or the original sample is zero
		338	beq .L369
		339	teq r2, lr @ N flag set when the two signs differ
		340	submi r0, r0, r6 @ signs differ: weight_B -= delta
		341	addpl r0, r0, r6 @ signs match: weight_B += delta
		342	cmp r0, #1024 @ then clip weight to +/-1024
		343	movgt r0, #1024
		344	cmp r0, r10 @ r10 holds -1024
		345	movlt r0, r10
		346
		347	.L369: cmp r7, r1 @ loop back if more samples to do
		348	bhi term_minus_1_loop @ unsigned eptr > bptr
		349
		350	str r3, [r5, #8] @ else store right sample and exit
		351	b common_exit
		352
		353	/*
		354	******************************************************************************
		355	* Loop to handle term = -2 condition
		356	* (note that the channels are processed in the reverse order here)
		357	*
		358	* r0 = dpp->weight_B r8 =
		359	* r1 = bptr r9 =
		360	* r2 = intermediate result r10 = -1024 (for clipping)
		361	* r3 = previous left sample r11 = 512 (for rounding)
		362	* r4 = dpp->weight_A ip = current sample
		363	* r5 = dpp sp =
		364	* r6 = dpp->delta lr = updated right sample
		365	* r7 = eptr pc =
		366	*******************************************************************************
		367	*/
		368
		369	term_minus_2: @ entry: prime r3 with the left sample of the pair before bptr
		370	ldr r3, [r1, #-8] @ r3 = previous left sample
		371
		372	term_minus_2_loop:
		373	ldr ip, [r1, #4] @ for right channel the decorrelation value
		374	mla r2, r3, r0, r11 @ is the previous left sample (in r3)
		375	add lr, ip, r2, asr #10 @ lr = updated right sample
		376	str lr, [r1, #4] @ store it in place; bptr is not advanced yet
		377	cmp r3, #0 @ skip the weight update when the decorr value
		378	cmpne ip, #0 @ or the original sample is zero
		379	beq .L380
		380	teq ip, r3 @ update weight based on signs
		381	submi r0, r0, r6 @ signs differ: weight_B -= delta
		382	addpl r0, r0, r6 @ signs match: weight_B += delta
		383	cmp r0, #1024 @ then clip weight to +/-1024
		384	movgt r0, #1024
		385	cmp r0, r10 @ r10 holds -1024
		386	movlt r0, r10
		387
		388	.L380: ldr r2, [r1, #0] @ for left channel the decorrelation value
		389	mla r3, r4, lr, r11 @ is the just updated right sample (in lr)
		390	add r3, r2, r3, asr #10 @ r3 = updated left sample (the "previous left" of the next pass)
		391	str r3, [r1], #8 @ store it, then advance bptr past the whole pair (8 bytes)
		392	cmp lr, #0 @ skip the weight update when the decorr value
		393	cmpne r2, #0 @ or the original sample is zero
		394	beq .L388
		395	teq r2, lr @ N flag set when the two signs differ
		396	submi r4, r4, r6 @ signs differ: weight_A -= delta
		397	addpl r4, r4, r6 @ signs match: weight_A += delta
		398	cmp r4, #1024 @ then clip weight to +/-1024
		399	movgt r4, #1024
		400	cmp r4, r10 @ r10 holds -1024
		401	movlt r4, r10
		402
		403	.L388: cmp r7, r1 @ loop back if more samples to do
		404	bhi term_minus_2_loop @ unsigned eptr > bptr
		405
		406	str r3, [r5, #40] @ else store left channel and exit
		407	b common_exit
		408
		409	/*
		410	******************************************************************************
		411	* Loop to handle term = -3 condition
		412	*
		413	* r0 = dpp->weight_B r8 = previous left sample
		414	* r1 = bptr r9 =
		415	* r2 = current left sample r10 = -1024 (for clipping)
		416	* r3 = previous right sample r11 = 512 (for rounding)
		417	* r4 = dpp->weight_A ip = intermediate result
		418	* r5 = dpp sp =
		419	* r6 = dpp->delta lr =
		420	* r7 = eptr pc =
		421	*******************************************************************************
		422	*/
		423
		424	term_minus_3: @ entry: prime r3/r8 with the pair just before bptr
		425	ldr r3, [r1, #-4] @ load previous samples
		426	ldr r8, [r1, #-8] @ r8 = previous left sample (r3 above = previous right)
		427
		428	term_minus_3_loop:
		429	ldr ip, [r1] @ for left channel the decorrelation value
		430	mla r2, r3, r4, r11 @ is the previous right sample (in r3)
		431	add r2, ip, r2, asr #10 @ r2 = updated left sample
		432	str r2, [r1], #4 @ store it, then advance bptr by 4 bytes
		433	cmp r3, #0 @ skip the weight update when the decorr value
		434	cmpne ip, #0 @ or the original sample is zero
		435	beq .L399
		436	teq ip, r3 @ update weight based on signs
		437	submi r4, r4, r6 @ signs differ: weight_A -= delta
		438	addpl r4, r4, r6 @ signs match: weight_A += delta
		439	cmp r4, #1024 @ then clip weight to +/-1024
		440	movgt r4, #1024
		441	cmp r4, r10 @ r10 holds -1024
		442	movlt r4, r10
		443
		444	.L399: movs ip, r8 @ ip = previous left we use now
		445	mov r8, r2 @ r8 = current left we use next time
		446	ldr r2, [r1], #4 @ r2 = original right sample; advance bptr by 4 bytes
		447	mla r3, ip, r0, r11 @ for right channel the decorr value is the previous left (in ip)
		448	add r3, r2, r3, asr #10 @ r3 = updated right sample; flags from movs are still intact
		449	strne r3, [r1, #-4] @ store only if ip != 0 (if ip == 0 the product is 512 >> 10 = 0, so r3 == r2 already in memory)
		450	cmpne r2, #0 @ skip the weight update when ip or the original sample is zero
		451	beq .L407
		452	teq ip, r2 @ N flag set when the two signs differ
		453	submi r0, r0, r6 @ signs differ: weight_B -= delta
		454	addpl r0, r0, r6 @ signs match: weight_B += delta
		455	cmp r0, #1024 @ then clip weight to +/-1024
		456	movgt r0, #1024
		457	cmp r0, r10 @ r10 holds -1024
		458	movlt r0, r10
		459
		460	.L407: cmp r7, r1 @ loop back if more samples to do
		461	bhi term_minus_3_loop @ unsigned eptr > bptr
		462
		463	str r3, [r5, #8] @ else store previous samples & exit
		464	str r8, [r5, #40] @ last left sample to dpp + 40; execution then falls through to common_exit
		465
		466	/*
		467	* Before finally exiting we must store weights back for next time
		468	*/
		469
		470	common_exit: @ shared exit path: write the weights back into dpp (r5), then return
		471	strh r4, [r5, #4] @ store r4 (weight_A) as a halfword at dpp + 4
		472	strh r0, [r5, #6] @ store r0 (weight_B) as a halfword at dpp + 6
		473	ldmfd sp!, {r4 - r8, r10, r11, pc} @ pop saved registers; loading pc leaves the routine (matching push is outside this view)
		474


diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c index 8f5c1ee46f..0c61e0e38a 100644 --- a/apps/codecs/libwavpack/unpack.c +++ b/apps/codecs/libwavpack/unpack.c
@@ -288,6 +288,8 @@ int read_config_info (WavpackContext wpc, WavpackMetadata wpmd)
288		288
289	#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)	289	#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
290	extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass dpp, long buffer, long sample_count);	290	extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass dpp, long buffer, long sample_count);
		291	#elif defined(CPU_ARM) && !defined(SIMULATOR)
		292	extern void decorr_stereo_pass_cont_arm (struct decorr_pass dpp, long buffer, long sample_count);
291	#else	293	#else
292	static void decorr_stereo_pass_cont (struct decorr_pass dpp, long buffer, long sample_count);	294	static void decorr_stereo_pass_cont (struct decorr_pass dpp, long buffer, long sample_count);
293	#endif	295	#endif
@@ -350,6 +352,8 @@ long unpack_samples (WavpackContext wpc, long buffer, ulong sample_count)
350	decorr_stereo_pass (dpp, buffer, 8);	352	decorr_stereo_pass (dpp, buffer, 8);
351	#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)	353	#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
352	decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);	354	decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
		355	#elif defined(CPU_ARM) && !defined(SIMULATOR)
		356	decorr_stereo_pass_cont_arm (dpp, buffer + 16, sample_count - 8);
353	#else	357	#else
354	decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);	358	decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
355	#endif	359	#endif
@@ -510,7 +514,7 @@ static void decorr_stereo_pass (struct decorr_pass dpp, long buffer, long samp
510	dpp->weight_B = weight_B;	514	dpp->weight_B = weight_B;
511	}	515	}
512		516
513	#if !defined(CPU_COLDFIRE) \|\| defined(SIMULATOR)	517	#if (!defined(CPU_COLDFIRE) && !defined(CPU_ARM)) \|\| defined(SIMULATOR)
514		518
515	static void decorr_stereo_pass_cont (struct decorr_pass dpp, long buffer, long sample_count)	519	static void decorr_stereo_pass_cont (struct decorr_pass dpp, long buffer, long sample_count)
516	{	520	{