From f0d1c96ee435e03af0c92aa5ac5260499ae589ed Mon Sep 17 00:00:00 2001
From: Dave Bryant <bryant@rockbox.org>
Date: Thu, 23 Feb 2006 20:53:59 +0000
Subject: Optimization of WavPack decoding in ARM assembler (for iPods). This
 allows WavPack files encoded in "high" mode to play without skipping,
 although it's still rather marginal (i.e. can't play with other DSP effects
 enabled). For now this will not work with 24-bit files either, although that
 is coming along.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8814 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libwavpack/SOURCES  |   3 +
 apps/codecs/libwavpack/arm.S    | 474 ++++++++++++++++++++++++++++++++++++++++
 apps/codecs/libwavpack/unpack.c |   6 +-
 3 files changed, 482 insertions(+), 1 deletion(-)
 create mode 100644 apps/codecs/libwavpack/arm.S

diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES
index f63c55a87a..8e38767ec6 100644
--- a/apps/codecs/libwavpack/SOURCES
+++ b/apps/codecs/libwavpack/SOURCES
@@ -8,4 +8,7 @@ wputils.c
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
 coldfire.S
 #endif
+#if defined(CPU_ARM) && !defined(SIMULATOR)
+arm.S
+#endif
 
diff --git a/apps/codecs/libwavpack/arm.S b/apps/codecs/libwavpack/arm.S
new file mode 100644
index 0000000000..0b92bfccd7
--- /dev/null
+++ b/apps/codecs/libwavpack/arm.S
@@ -0,0 +1,474 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by David Bryant
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp,
+ *                                   long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on an ARM7TDMI processor. This version only uses the
+ * 32-bit multiply-accumulate instruction and so will overflow with 24-bit
+ * WavPack files. The advanced 64-bit multiply instructions in the ARM will
+ * provide full resolution for this, but are somewhat slower and have not
+ * been included yet.
+ */
+        .text                           @ place the routine in the code section
+        .align                          @ NOTE(review): no operand given; confirm assembler default alignment
+        .global         decorr_stereo_pass_cont_arm     @ exported; called from unpack.c
+
+/*
+ * on entry:
+ *
+ * r0 = struct decorr_pass *dpp
+ * r1 = long *buffer
+ * r2 = long sample_count
+ */
+
+decorr_stereo_pass_cont_arm:
+
+        stmfd   sp!, {r4 - r8, r10, r11, lr}    @ saved here, restored at common_exit
+        mov     r5, r0                  @ r5 = dpp
+        mov     r11, #512               @ r11 = 512 for rounding
+        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
+        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
+        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     common_exit             @  (weights are still written back)
+
+        add     r7, r1, r2, asl #3      @ r7 = buffer ending position (8 bytes/pair)
+        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
+        cmp     r2, #0
+        bmi     minus_term              @ negative terms are dispatched below
+
+        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
+        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
+        ldr     r8, [r1, #-8]           @  (lr/r10 = left/right two pairs back,
+        ldr     r3, [r1, #-4]           @   r8/r3 = left/right one pair back)
+        cmp     r2, #17
+        beq     term_17_loop
+        cmp     r2, #18
+        beq     term_18_loop
+        cmp     r2, #2
+        beq     term_2_loop
+        b       term_default_loop       @ else handle default (1-8, except 2)
+
+minus_term:
+        mov     r10, #1024              @ r10 = -1024 for weight clipping
+        rsb     r10, r10, #0            @  (only used for negative terms)
+        cmn     r2, #1                  @ term == -1 ?
+        beq     term_minus_1
+        cmn     r2, #2                  @ term == -2 ?
+        beq     term_minus_2
+        cmn     r2, #3                  @ term == -3 ?
+        beq     term_minus_3
+        b       common_exit             @ any other negative term: no processing
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_17_loop:
+        rsbs    ip, lr, r8, asl #1      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (flags from rsbs untouched) test sample too
+        beq     .L325                   @ skip update if either value is zero
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+
+.L325:  rsbs    ip, r10, r3, asl #1     @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mla     r3, ip, r0, r11         @ weight_B * decorr value + 512,
+        add     r3, r2, r3, asr #10     @  shifted and added to the sample
+        strne   r3, [r1, #-4]           @ store back only if decorr value nonzero
+        cmpne   r2, #0
+        beq     .L329                   @ skip update if either value is zero
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+
+.L329:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_17_loop
+        b       store_1718              @ common exit for terms 17 & 18
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_18_loop:
+        sub     ip, r8, lr              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        adds    ip, r8, ip, asr #1      @  computed as prev + ((prev - 2nd prev) >> 1)
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (flags from adds untouched) test sample too
+        beq     .L337                   @ skip update if either value is zero
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+
+.L337:  sub     ip, r3, r10             @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous
+        adds    ip, r3, ip, asr #1
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mla     r3, ip, r0, r11         @ weight_B * decorr value + 512,
+        add     r3, r2, r3, asr #10     @  shifted and added to the sample
+        strne   r3, [r1, #-4]           @ store back only if decorr value nonzero
+        cmpne   r2, #0
+        beq     .L341                   @ skip update if either value is zero
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+
+.L341:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_18_loop
+
+/* common exit for terms 17 & 18 */
+
+store_1718:
+        str     r3, [r5, #40]           @ store sample history into struct
+        str     r8, [r5, #8]            @  (r8 = previous left -> offset 8)
+        str     r10, [r5, #44]          @  (r10 = 2nd previous right -> offset 44)
+        str     lr, [r5, #12]           @  (lr = 2nd previous left -> offset 12)
+        b       common_exit             @ and return
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_2_loop:
+        movs    ip, lr                  @ get decorrelation value & test
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (flags from movs untouched) test sample too
+        beq     .L225                   @ skip update if either value is zero
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+
+.L225:  movs    ip, r10                 @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mla     r3, ip, r0, r11         @ weight_B * decorr value + 512,
+        add     r3, r2, r3, asr #10     @  shifted and added to the sample
+        strne   r3, [r1, #-4]           @ store back only if decorr value nonzero
+        cmpne   r2, #0
+        beq     .L229                   @ skip update if either value is zero
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+
+.L229:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_2_loop
+        b       default_term_exit       @ this exit updates all dpp->samples
+
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 = dpp->weight_B           r8 = result accumulator
+ * r1 = bptr                    r9 = 
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_default_loop:
+        ldr     ip, [r1]                @ get original sample
+        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
+        mla     r8, r4, r3, r11         @ mult decorr value by weight, round,
+        add     r8, ip, r8, asr #10     @  shift and add to new sample
+        str     r8, [r1], #4            @ store updated sample & advance pointer
+        cmp     r3, #0                  @ skip weight update if decorr value
+        cmpne   ip, #0                  @  or original sample is zero
+        beq     .L350
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+
+.L350:  ldr     ip, [r1]                @ do the same thing for right channel
+        ldr     r3, [r1, -r2, asl #3]   @ right sample from (term) pairs back
+        mla     r8, r0, r3, r11         @ weight_B * decorr value + 512,
+        add     r8, ip, r8, asr #10     @  shifted and added to the sample
+        str     r8, [r1], #4            @ store updated sample & advance pointer
+        cmp     r3, #0                  @ skip weight update if decorr value
+        cmpne   ip, #0                  @  or original sample is zero
+        beq     .L354
+        teq     ip, r3
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+
+.L354:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_default_loop
+
+/*
+ * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
+ * structure (even if they are not all used for the given term)
+ */
+
+default_term_exit:
+        ldrsh   r3, [r5, #0]            @ r3 = dpp->term
+        sub     ip, r3, #1              @ ip = history index, starts at term - 1
+        mov     lr, #7                  @ lr = pass counter, 7 down to 0 (8 passes)
+
+.L358:  and     r3, ip, #7              @ wrap index into 0..7
+        add     r3, r5, r3, asl #2      @ r3 = dpp + 4 * index
+        ldr     r2, [r1, #-4]           @ right sample of the pair just before r1
+        str     r2, [r3, #40]           @  goes to offset 40 + 4 * index
+        ldr     r2, [r1, #-8]!          @ left sample of that pair; r1 steps back a pair
+        str     r2, [r3, #8]            @  goes to offset 8 + 4 * index
+        sub     ip, ip, #1              @ next older pair uses the next lower index
+        sub     lr, lr, #1
+        cmn     lr, #1                  @ finished once the counter reaches -1
+        bne     .L358
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -1 condition
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 = 
+ * r2 = intermediate result     r10 = -1024 (for clipping)
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_1:
+        ldr     r3, [r1, #-4]           @ r3 = previous right sample
+
+term_minus_1_loop:
+        ldr     ip, [r1]                @ for left channel the decorrelation value
+        mla     r2, r3, r4, r11         @  is the previous right sample (in r3)
+        add     lr, ip, r2, asr #10     @ lr = updated left sample
+        str     lr, [r1], #8            @ store it & step pointer past this pair
+        cmp     r3, #0                  @ skip weight update if decorr value
+        cmpne   ip, #0                  @  or original sample is zero
+        beq     .L361
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+
+.L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
+        mla     r3, r0, lr, r11         @  is the just updated left sample (in lr)
+        add     r3, r2, r3, asr #10     @ r3 = updated right (previous right next pass)
+        str     r3, [r1, #-4]
+        cmp     lr, #0                  @ skip weight update if decorr value
+        cmpne   r2, #0                  @  or original sample is zero
+        beq     .L369
+        teq     r2, lr
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+
+.L369:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_1_loop
+
+        str     r3, [r5, #8]            @ else store right sample and exit
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -2 condition
+ * (note that the channels are processed in the reverse order here)
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 = 
+ * r2 = intermediate result     r10 = -1024 (for clipping)
+ * r3 = previous left sample    r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated right sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_2:
+        ldr     r3, [r1, #-8]           @ r3 = previous left sample
+
+term_minus_2_loop:
+        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
+        mla     r2, r3, r0, r11         @  is the previous left sample (in r3)
+        add     lr, ip, r2, asr #10     @ lr = updated right sample
+        str     lr, [r1, #4]
+        cmp     r3, #0                  @ skip weight update if decorr value
+        cmpne   ip, #0                  @  or original sample is zero
+        beq     .L380
+        teq     ip, r3                  @ update weight based on signs
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+
+.L380:  ldr     r2, [r1, #0]            @ for left channel the decorrelation value
+        mla     r3, r4, lr, r11         @  is the just updated right sample (in lr)
+        add     r3, r2, r3, asr #10     @ r3 = updated left (previous left next pass)
+        str     r3, [r1], #8            @ store it & step pointer past this pair
+        cmp     lr, #0                  @ skip weight update if decorr value
+        cmpne   r2, #0                  @  or original sample is zero
+        beq     .L388
+        teq     r2, lr
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+
+.L388:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_2_loop
+
+        str     r3, [r5, #40]           @ else store left channel and exit
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -3 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current left sample     r10 = -1024 (for clipping)
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = intermediate result
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_3:
+        ldr     r3, [r1, #-4]           @ load previous samples
+        ldr     r8, [r1, #-8]           @  (r3 = previous right, r8 = previous left)
+
+term_minus_3_loop:
+        ldr     ip, [r1]                @ left: decorr value is previous right (r3)
+        mla     r2, r3, r4, r11         @ weight_A * decorr value + 512,
+        add     r2, ip, r2, asr #10     @  shifted and added: r2 = updated left
+        str     r2, [r1], #4            @ store it & advance to right sample
+        cmp     r3, #0                  @ skip weight update if decorr value
+        cmpne   ip, #0                  @  or original sample is zero
+        beq     .L399
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs agree:  weight_A += delta
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+
+.L399:  movs    ip, r8                  @ ip = previous left we use now
+        mov     r8, r2                  @ r8 = current left we use next time
+        ldr     r2, [r1], #4            @ r2 = original right sample
+        mla     r3, ip, r0, r11         @ weight_B * decorr value + 512,
+        add     r3, r2, r3, asr #10     @  shifted and added: r3 = updated right
+        strne   r3, [r1, #-4]           @ store only if decorr value was nonzero
+        cmpne   r2, #0
+        beq     .L407                   @ skip update if either value is zero
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs agree:  weight_B += delta
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+
+.L407:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_3_loop
+
+        str     r3, [r5, #8]            @ else store previous samples & exit
+        str     r8, [r5, #40]           @  (falls through to common_exit)
+
+/*
+ * Before finally exiting we must store weights back for next time
+ */
+
+common_exit:
+        strh    r4, [r5, #4]            @ dpp->weight_A = r4
+        strh    r0, [r5, #6]            @ dpp->weight_B = r0
+        ldmfd   sp!, {r4 - r8, r10, r11, pc}    @ restore registers; saved lr is popped into pc (return)
+
diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c
index 8f5c1ee46f..0c61e0e38a 100644
--- a/apps/codecs/libwavpack/unpack.c
+++ b/apps/codecs/libwavpack/unpack.c
@@ -288,6 +288,8 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
 
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
 extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count);
+#elif defined(CPU_ARM) && !defined(SIMULATOR)
+extern void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp, long *buffer, long sample_count);
 #else
 static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
 #endif
@@ -350,6 +352,8 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
                 decorr_stereo_pass (dpp, buffer, 8);
 #if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
                 decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
+#elif defined(CPU_ARM) && !defined(SIMULATOR)
+                decorr_stereo_pass_cont_arm (dpp, buffer + 16, sample_count - 8);
 #else
                 decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
 #endif
@@ -510,7 +514,7 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long samp
     dpp->weight_B = weight_B;
 }
 
-#if !defined(CPU_COLDFIRE) || defined(SIMULATOR)
+#if (!defined(CPU_COLDFIRE) && !defined(CPU_ARM)) || defined(SIMULATOR)
 
 static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count)
 {
-- 
cgit v1.2.3