1 files changed, 502 insertions, 0 deletions
diff --git a/apps/codecs/libwavpack/arml.S b/apps/codecs/libwavpack/arml.S
new file mode 100644
index 0000000000..97474f93b9
--- /dev/null
+++ b/apps/codecs/libwavpack/arml.S
@@ -0,0 +1,502 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by David Bryant
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
+ *                                    long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on an ARM7TDMI processor. This version uses the
+ * 64-bit multiply-accumulate instruction and so can be used with all
+ * WavPack files. However, for optimum performance with 16-bit WavPack
+ * files, there is a faster version that only uses the 32-bit MLA
+ * instruction.
+ */
+        .text
+        .align
+        .global         decorr_stereo_pass_cont_arml
+/*
+ * on entry:
+ *
+ * r0 = struct decorr_pass *dpp
+ * r1 = long *buffer
+ * r2 = long sample_count
+ *
+ * The code reads dpp->term, delta, weight_A and weight_B as signed halfwords
+ * at offsets 0, 2, 4 and 6. r4 - r8, r10, r11 and lr are pushed here and
+ * popped again at common_exit; r9 is never used.
+ *
+ * While samples are being processed the weights and delta are kept << 18;
+ * common_exit shifts the weights back down (asr #18) before storing them,
+ * so the weights must already be scaled on every path that reaches it.
+ * r11 (the smlal lo accumulator) is loaded inside each loop, not here.
+ */
+decorr_stereo_pass_cont_arml:
+        stmfd   sp!, {r4 - r8, r10, r11, lr}
+        mov     r5, r0                  @ r5 = dpp
+        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
+        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
+        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
+        mov     r0, r0, asl #18         @ for 64-bit math we use weights << 18
+        mov     r4, r4, asl #18         @  (must precede the early exit, because
+                                        @   common_exit shifts the weights back)
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     common_exit
+        mov     r6, r6, asl #18         @ delta uses the same scale as the weights
+        add     r7, r1, r2, asl #3      @ r7 = buffer ending position
+        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
+        cmp     r2, #0
+        blt     minus_term
+        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
+        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18:
+        ldr     r8, [r1, #-8]           @  lr/r10 = 2nd previous left/right,
+        ldr     r3, [r1, #-4]           @  r8/r3  = previous left/right
+        cmp     r2, #18
+        beq     term_18_loop
+        mov     lr, lr, asl #4          @ terms 2 and 17 keep the 2nd previous
+        mov     r10, r10, asl #4        @  samples pre-shifted << 4
+        cmp     r2, #2
+        beq     term_2_loop
+        cmp     r2, #17
+        beq     term_17_loop
+        b       term_default_loop       @ every other non-negative term
+minus_term:
+        mov     r10, #(1024 << 18)      @ r10 = -1024 << 18 for weight clipping
+        rsb     r10, r10, #0            @  (only used for negative terms)
+        cmn     r2, #1
+        beq     term_minus_1
+        cmn     r2, #2
+        beq     term_minus_2
+        cmn     r2, #3
+        beq     term_minus_3
+        b       common_exit             @ any other negative term: nothing to do
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current sample          r10 = second previous right sample << 4
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample << 4
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_17_loop:                           @ each pass handles one left/right pair
+        rsbs    ip, lr, r8, asl #5      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8, asl #4          @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     r8, r2                  @ hi word starts as the sample itself
+        smlalne r11, r8, r4, ip         @ r8:r11 += weight_A * decorr (if decorr != 0)
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip update if decorr value or sample is 0
+        beq     .L325
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L325:  rsbs    ip, r10, r3, asl #5     @ do same thing for right channel
+        mov     r10, r3, asl #4         @ previous right becomes 2nd previous
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr (if decorr != 0)
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L329
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L329:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_17_loop
+        mov     lr, lr, asr #4          @ undo the << 4 on the 2nd previous
+        mov     r10, r10, asr #4        @  samples before they are stored
+        b       store_1718              @ common exit for terms 17 & 18
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_18_loop:                           @ each pass handles one left/right pair
+        rsb     ip, lr, r8              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        add     ip, lr, ip, asr #1      @  computed as prev + ((prev - 2nd prev) >> 1)
+        movs    ip, ip, asl #4          @ scale << 4 and test for zero
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     r8, r2                  @ hi word starts as the sample itself
+        smlalne r11, r8, r4, ip         @ r8:r11 += weight_A * decorr (if decorr != 0)
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip update if decorr value or sample is 0
+        beq     .L337
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L337:  rsb     ip, r10, r3             @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous
+        add     ip, r10, ip, asr #1
+        movs    ip, ip, asl #4
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr (if decorr != 0)
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L341
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L341:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_18_loop            @ else fall through to store_1718
+/* common exit for terms 17 & 18 */
+store_1718:
+        str     r8, [r5, #8]            @ offset 8:  previous left sample
+        str     lr, [r5, #12]           @ offset 12: second previous left sample
+        str     r3, [r5, #40]           @ offset 40: previous right sample
+        str     r10, [r5, #44]          @ offset 44: second previous right sample
+        b       common_exit             @ weights are written back there
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current sample          r10 = second previous right sample << 4
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample << 4
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_2_loop:                            @ each pass handles one left/right pair
+        movs    ip, lr                  @ get decorrelation value & test
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     lr, r8, asl #4          @ previous becomes 2nd previous
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     r8, r2                  @ hi word starts as the sample itself
+        smlalne r11, r8, r4, ip         @ r8:r11 += weight_A * decorr (if decorr != 0)
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip update if decorr value or sample is 0
+        beq     .L225
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L225:  movs    ip, r10                 @ do same thing for right channel
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r10, r3, asl #4         @ previous right becomes 2nd previous
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr (if decorr != 0)
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L229
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L229:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_2_loop
+        b       default_term_exit       @ this exit updates all dpp->samples
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 = dpp->weight_B           r8 = result accumulator
+ * r1 = bptr                    r9 = 
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_default_loop:                      @ each pass handles one left/right pair
+        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
+        ldr     ip, [r1], #4            @ get original sample and bump ptr
+        movs    r3, r3, asl #4          @ scale decorr value << 4 and test for zero
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     r8, ip                  @ hi word starts as the sample itself
+        smlalne r11, r8, r4, r3         @ r8:r11 += weight_A * decorr (if decorr != 0)
+        strne   r8, [r1, #-4]           @ if possibly changed, store updated sample
+        cmpne   ip, #0                  @ skip update if decorr value or sample is 0
+        beq     .L350
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L350:  ldr     r3, [r1, -r2, asl #3]   @ do the same thing for right channel
+        ldr     ip, [r1], #4            @ get original right sample and bump ptr
+        movs    r3, r3, asl #4
+        mov     r11, #0x80000000
+        mov     r8, ip
+        smlalne r11, r8, r0, r3         @ r8:r11 += weight_B * decorr (if decorr != 0)
+        strne   r8, [r1, #-4]
+        cmpne   ip, #0
+        beq     .L354
+        teq     ip, r3
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+.L354:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_default_loop       @ else fall through to default_term_exit
+/*
+ * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
+ * structure (even if they are not all used for the given term)
+ */
+default_term_exit:
+        mov     lr, #7                  @ eight passes: lr runs from 7 down to 0
+        ldrsh   ip, [r5, #0]            @ ip = dpp->term
+        sub     ip, ip, #1              @ first history slot is (term - 1) & 7
+.Lhist: and     r3, ip, #7              @ wrap the slot index into 0..7
+        sub     ip, ip, #1              @ next pass fills the slot below
+        add     r3, r5, r3, asl #2      @ r3 = dpp + slot * 4
+        ldr     r2, [r1, #-4]           @ word just below bptr goes to the
+        str     r2, [r3, #40]           @  array at offset 40
+        ldr     r2, [r1, #-8]!          @ word below that goes to the array at
+        str     r2, [r3, #8]            @  offset 8; bptr steps back one pair
+        sub     lr, lr, #1
+        cmn     lr, #1                  @ finished once the counter reaches -1
+        bne     .Lhist
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -1 condition
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 = 
+ * r2 = current right sample    r10 = -1024 << 18 (for clipping)
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_1:
+        ldr     r3, [r1, #-4]           @ r3 = right sample just before bptr
+term_minus_1_loop:
+        ldr     ip, [r1], #8            @ for left channel the decorrelation value
+        movs    r3, r3, asl #4          @  is the previous right sample (in r3)
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     lr, ip                  @ hi word starts as the left sample
+        smlalne r11, lr, r4, r3         @ lr:r11 += weight_A * decorr (if decorr != 0)
+        strne   lr, [r1, #-8]           @ store updated left sample if it can differ
+        cmpne   ip, #0                  @ skip update if decorr value or sample is 0
+        beq     .L361
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10                 @ r10 = -(1024 << 18)
+        movlt   r4, r10
+.L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
+        movs    lr, lr, asl #4          @  is the updated left sample (in lr)
+        mov     r11, #0x80000000
+        mov     r3, r2                  @ hi word starts as the right sample
+        smlalne r11, r3, r0, lr         @ r3:r11 += weight_B * decorr (if decorr != 0)
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L369
+        teq     r2, lr
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+        cmp     r0, #(1024 << 18)               @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+.L369:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_1_loop
+        str     r3, [r5, #8]            @ else store right sample and exit
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -2 condition
+ * (note that the channels are processed in the reverse order here)
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 = 
+ * r2 = current left sample     r10 = -1024 << 18 (for clipping)
+ * r3 = previous left sample    r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated right sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_2:
+        ldr     r3, [r1, #-8]           @ r3 = left sample of the pair before bptr
+term_minus_2_loop:
+        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
+        movs    r3, r3, asl #4          @  is the previous left sample (in r3)
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     lr, ip                  @ hi word starts as the right sample
+        smlalne r11, lr, r0, r3         @ lr:r11 += weight_B * decorr (if decorr != 0)
+        strne   lr, [r1, #4]            @ store updated right sample if it can differ
+        cmpne   ip, #0                  @ skip update if decorr value or sample is 0
+        beq     .L380
+        teq     ip, r3                  @ update weight based on signs
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+        cmp     r0, #(1024 << 18)               @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10                 @ r10 = -(1024 << 18)
+        movlt   r0, r10
+.L380:  ldr     r2, [r1], #8            @ for left channel the decorrelation value
+        movs    lr, lr, asl #4          @  is the updated right sample (in lr)
+        mov     r11, #0x80000000
+        mov     r3, r2                  @ hi word starts as the left sample
+        smlalne r11, r3, r4, lr         @ r3:r11 += weight_A * decorr (if decorr != 0)
+        strne   r3, [r1, #-8]
+        cmpne   r2, #0
+        beq     .L388
+        teq     r2, lr
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+.L388:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_2_loop
+        str     r3, [r5, #40]           @ else store left channel and exit
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -3 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = 
+ * r2 = current left sample     r10 = -1024 << 18 (for clipping)
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = intermediate result
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_3:
+        ldr     r3, [r1, #-4]           @ load previous samples
+        ldr     r8, [r1, #-8]           @  (r3 = right, r8 = left)
+term_minus_3_loop:
+        ldr     ip, [r1], #4            @ ip = current left sample, bump ptr
+        movs    r3, r3, asl #4          @ decorr value = previous right sample << 4
+        mov     r11, #0x80000000        @ lo word preset so the hi word gets rounded
+        mov     r2, ip                  @ hi word starts as the left sample
+        smlalne r11, r2, r4, r3         @ r2:r11 += weight_A * decorr (if decorr != 0)
+        strne   r2, [r1, #-4]           @ store updated left sample if it can differ
+        cmpne   ip, #0                  @ skip update if decorr value or sample is 0
+        beq     .L399
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10                 @ r10 = -(1024 << 18)
+        movlt   r4, r10
+.L399:  movs    ip, r8, asl #4          @ ip = previous left we use now
+        mov     r8, r2                  @ r8 = current left we use next time
+        ldr     r2, [r1], #4            @ r2 = current right sample, bump ptr
+        mov     r11, #0x80000000
+        mov     r3, r2                  @ hi word starts as the right sample
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr (if decorr != 0)
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L407
+        teq     ip, r2
+        submi   r0, r0, r6              @ signs differ: weight_B -= delta
+        addpl   r0, r0, r6              @ signs match:  weight_B += delta
+        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+.L407:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_3_loop
+        str     r3, [r5, #8]            @ else store previous samples & exit
+        str     r8, [r5, #40]           @  (execution falls through to common_exit)
+/*
+ * Before finally exiting we must store weights back for next time
+ */
+common_exit:
+        mov     r4, r4, asr #18         @ weight_A back from its << 18 working scale
+        mov     r0, r0, asr #18         @ weight_B likewise
+        strh    r0, [r5, #6]            @ dpp->weight_B
+        strh    r4, [r5, #4]            @ dpp->weight_A
+        ldmia   sp!, {r4 - r8, r10, r11, pc}    @ pop saved registers and return

diff --git a/apps/codecs/libwavpack/arml.S b/apps/codecs/libwavpack/arml.S new file mode 100644 index 0000000000..97474f93b9 --- /dev/null +++ b/apps/codecs/libwavpack/arml.S
@@ -0,0 +1,502 @@
	1	/***************************************************************************
	2	* __________ __ ___.
	3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
	4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
	5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
	6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
	7	* \/ \/ \/ \/ \/
	8	* $Id$
	9	*
	10	* Copyright (C) 2006 by David Bryant
	11	*
	12	* All files in this archive are subject to the GNU General Public License.
	13	* See the file COPYING in the source tree root for full license agreement.
	14	*
	15	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	16	* KIND, either express or implied.
	17	*
	18	****************************************************************************/
	19
	20	/* This is an assembly optimized version of the following WavPack function:
	21	*
	22	* void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
	23	* long *buffer, long sample_count);
	24	*
	25	* It performs a single pass of stereo decorrelation on the provided buffer.
	26	* Note that this version of the function requires that the 8 previous stereo
	27	* samples are visible and correct. In other words, it ignores the "samples_*"
	28	* fields in the decorr_pass structure and gets the history data directly
	29	* from the buffer. It does, however, return the appropriate history samples
	30	* to the decorr_pass structure before returning.
	31	*
	32	* This is written to work on an ARM7TDMI processor. This version uses the
	33	* 64-bit multiply-accumulate instruction and so can be used with all
	34	* WavPack files. However, for optimum performance with 16-bit WavPack
	35	* files, there is a faster version that only uses the 32-bit MLA
	36	* instruction.
	37	*/
	38
	39	.text
	40	.align
	41	.global decorr_stereo_pass_cont_arml
	42
	43	/*
	44	* on entry:
	45	*
	46	* r0 = struct decorr_pass *dpp
	47	* r1 = long *buffer
	48	* r2 = long sample_count
	49	*/
	50
	51	decorr_stereo_pass_cont_arml:
	52
	53	stmfd sp!, {r4 - r8, r10, r11, lr}
	54	mov r5, r0 @ r5 = dpp
	55	ldrsh r6, [r0, #2] @ r6 = dpp->delta
	56	ldrsh r4, [r0, #4] @ r4 = dpp->weight_A
	57	ldrsh r0, [r0, #6] @ r0 = dpp->weight_B
	58	mov r0, r0, asl #18 @ for 64-bit math we use weights << 18
	59	mov r4, r4, asl #18 @ (must precede the early exit, because
	60	@ common_exit shifts the weights back; r11 is loaded inside each loop)
	61	cmp r2, #0 @ exit if no samples to process
	62	beq common_exit
	63
	64	mov r6, r6, asl #18 @ delta uses the same scale as the weights
	65	add r7, r1, r2, asl #3 @ r7 = buffer ending position
	66	ldrsh r2, [r5, #0] @ r2 = dpp->term
	67	cmp r2, #0
	68	blt minus_term
	69
	70	ldr lr, [r1, #-16] @ load 2 sample history from buffer
	71	ldr r10, [r1, #-12] @ for terms 2, 17, and 18:
	72	ldr r8, [r1, #-8] @ lr/r10 = 2nd previous left/right,
	73	ldr r3, [r1, #-4] @ r8/r3 = previous left/right
	74
	75	cmp r2, #18
	76	beq term_18_loop
	77	mov lr, lr, asl #4 @ terms 2 and 17 keep the 2nd previous
	78	mov r10, r10, asl #4 @ samples pre-shifted << 4
	79	cmp r2, #2
	80	beq term_2_loop
	81	cmp r2, #17
	82	beq term_17_loop
	83	b term_default_loop @ every other non-negative term
	84
	85	minus_term:
	86	mov r10, #(1024 << 18) @ r10 = -1024 << 18 for weight clipping
	87	rsb r10, r10, #0 @ (only used for negative terms)
	88	cmn r2, #1
	89	beq term_minus_1
	90	cmn r2, #2
	91	beq term_minus_2
	92	cmn r2, #3
	93	beq term_minus_3
	94	b common_exit @ any other negative term: nothing to do
	95
	96	/*
	97	******************************************************************************
	98	* Loop to handle term = 17 condition
	99	*
	100	* r0 = dpp->weight_B r8 = previous left sample
	101	* r1 = bptr r9 =
	102	* r2 = current sample r10 = second previous right sample << 4
	103	* r3 = previous right sample r11 = lo accumulator (for rounding)
	104	* r4 = dpp->weight_A ip = current decorrelation value
	105	* r5 = dpp sp =
	106	* r6 = dpp->delta lr = second previous left sample << 4
	107	* r7 = eptr pc =
	108	*******************************************************************************
	109	*/
	110
	111	term_17_loop: @ each pass handles one left/right pair
	112	rsbs ip, lr, r8, asl #5 @ decorr value = (2 * prev) - 2nd prev
	113	mov lr, r8, asl #4 @ previous becomes 2nd previous
	114	ldr r2, [r1], #4 @ get sample & update pointer
	115	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	116	mov r8, r2 @ hi word starts as the sample itself
	117	smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr (if decorr != 0)
	118	strne r8, [r1, #-4] @ if change possible, store sample back
	119	cmpne r2, #0 @ skip update if decorr value or sample is 0
	120	beq .L325
	121	teq ip, r2 @ update weight based on signs
	122	submi r4, r4, r6 @ signs differ: weight_A -= delta
	123	addpl r4, r4, r6 @ signs match: weight_A += delta
	124
	125	.L325: rsbs ip, r10, r3, asl #5 @ do same thing for right channel
	126	mov r10, r3, asl #4 @ previous right becomes 2nd previous
	127	ldr r2, [r1], #4 @ get right sample & update pointer
	128	mov r11, #0x80000000
	129	mov r3, r2
	130	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr (if decorr != 0)
	131	strne r3, [r1, #-4]
	132	cmpne r2, #0
	133	beq .L329
	134	teq ip, r2
	135	submi r0, r0, r6 @ signs differ: weight_B -= delta
	136	addpl r0, r0, r6 @ signs match: weight_B += delta
	137
	138	.L329: cmp r7, r1 @ loop back if more samples to do
	139	bhi term_17_loop
	140	mov lr, lr, asr #4 @ undo the << 4 on the 2nd previous
	141	mov r10, r10, asr #4 @ samples before they are stored
	142	b store_1718 @ common exit for terms 17 & 18
	143
	144	/*
	145	******************************************************************************
	146	* Loop to handle term = 18 condition
	147	*
	148	* r0 = dpp->weight_B r8 = previous left sample
	149	* r1 = bptr r9 =
	150	* r2 = current sample r10 = second previous right sample
	151	* r3 = previous right sample r11 = lo accumulator (for rounding)
	152	* r4 = dpp->weight_A ip = decorrelation value
	153	* r5 = dpp sp =
	154	* r6 = dpp->delta lr = second previous left sample
	155	* r7 = eptr pc =
	156	*******************************************************************************
	157	*/
	158
	159	term_18_loop: @ each pass handles one left/right pair
	160	rsb ip, lr, r8 @ decorr value =
	161	mov lr, r8 @ ((3 * prev) - 2nd prev) >> 1
	162	add ip, lr, ip, asr #1 @ computed as prev + ((prev - 2nd prev) >> 1)
	163	movs ip, ip, asl #4 @ scale << 4 and test for zero
	164	ldr r2, [r1], #4 @ get sample & update pointer
	165	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	166	mov r8, r2 @ hi word starts as the sample itself
	167	smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr (if decorr != 0)
	168	strne r8, [r1, #-4] @ if change possible, store sample back
	169	cmpne r2, #0 @ skip update if decorr value or sample is 0
	170	beq .L337
	171	teq ip, r2 @ update weight based on signs
	172	submi r4, r4, r6 @ signs differ: weight_A -= delta
	173	addpl r4, r4, r6 @ signs match: weight_A += delta
	174
	175	.L337: rsb ip, r10, r3 @ do same thing for right channel
	176	mov r10, r3 @ previous right becomes 2nd previous
	177	add ip, r10, ip, asr #1
	178	movs ip, ip, asl #4
	179	ldr r2, [r1], #4 @ get right sample & update pointer
	180	mov r11, #0x80000000
	181	mov r3, r2
	182	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr (if decorr != 0)
	183	strne r3, [r1, #-4]
	184	cmpne r2, #0
	185	beq .L341
	186	teq ip, r2
	187	submi r0, r0, r6 @ signs differ: weight_B -= delta
	188	addpl r0, r0, r6 @ signs match: weight_B += delta
	189
	190	.L341: cmp r7, r1 @ loop back if more samples to do
	191	bhi term_18_loop @ else fall through to store_1718
	192
	193	/* common exit for terms 17 & 18 */
	194
	195	store_1718:
	196	str r8, [r5, #8] @ offset 8: previous left sample
	197	str lr, [r5, #12] @ offset 12: second previous left sample
	198	str r3, [r5, #40] @ offset 40: previous right sample
	199	str r10, [r5, #44] @ offset 44: second previous right sample
	200	b common_exit @ weights are written back there
	201
	202	/*
	203	******************************************************************************
	204	* Loop to handle term = 2 condition
	205	* (note that this case can be handled by the default term handler (1-8), but
	206	* this special case is faster because it doesn't have to read memory twice)
	207	*
	208	* r0 = dpp->weight_B r8 = previous left sample
	209	* r1 = bptr r9 =
	210	* r2 = current sample r10 = second previous right sample << 4
	211	* r3 = previous right sample r11 = lo accumulator (for rounding)
	212	* r4 = dpp->weight_A ip = decorrelation value
	213	* r5 = dpp sp =
	214	* r6 = dpp->delta lr = second previous left sample << 4
	215	* r7 = eptr pc =
	216	*******************************************************************************
	217	*/
	218
	219	term_2_loop: @ each pass handles one left/right pair
	220	movs ip, lr @ get decorrelation value & test
	221	ldr r2, [r1], #4 @ get sample & update pointer
	222	mov lr, r8, asl #4 @ previous becomes 2nd previous
	223	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	224	mov r8, r2 @ hi word starts as the sample itself
	225	smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr (if decorr != 0)
	226	strne r8, [r1, #-4] @ if change possible, store sample back
	227	cmpne r2, #0 @ skip update if decorr value or sample is 0
	228	beq .L225
	229	teq ip, r2 @ update weight based on signs
	230	submi r4, r4, r6 @ signs differ: weight_A -= delta
	231	addpl r4, r4, r6 @ signs match: weight_A += delta
	232
	233	.L225: movs ip, r10 @ do same thing for right channel
	234	ldr r2, [r1], #4 @ get right sample & update pointer
	235	mov r10, r3, asl #4 @ previous right becomes 2nd previous
	236	mov r11, #0x80000000
	237	mov r3, r2
	238	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr (if decorr != 0)
	239	strne r3, [r1, #-4]
	240	cmpne r2, #0
	241	beq .L229
	242	teq ip, r2
	243	submi r0, r0, r6 @ signs differ: weight_B -= delta
	244	addpl r0, r0, r6 @ signs match: weight_B += delta
	245
	246	.L229: cmp r7, r1 @ loop back if more samples to do
	247	bhi term_2_loop
	248
	249	b default_term_exit @ this exit updates all dpp->samples
	250
	251	/*
	252	******************************************************************************
	253	* Loop to handle default term condition
	254	*
	255	* r0 = dpp->weight_B r8 = result accumulator
	256	* r1 = bptr r9 =
	257	* r2 = dpp->term r10 =
	258	* r3 = decorrelation value r11 = lo accumulator (for rounding)
	259	* r4 = dpp->weight_A ip = current sample
	260	* r5 = dpp sp =
	261	* r6 = dpp->delta lr =
	262	* r7 = eptr pc =
	263	*******************************************************************************
	264	*/
	265
	266	term_default_loop: @ each pass handles one left/right pair
	267	ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term
	268	ldr ip, [r1], #4 @ get original sample and bump ptr
	269	movs r3, r3, asl #4 @ scale decorr value << 4 and test for zero
	270	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	271	mov r8, ip @ hi word starts as the sample itself
	272	smlalne r11, r8, r4, r3 @ r8:r11 += weight_A * decorr (if decorr != 0)
	273	strne r8, [r1, #-4] @ if possibly changed, store updated sample
	274	cmpne ip, #0 @ skip update if decorr value or sample is 0
	275	beq .L350
	276	teq ip, r3 @ update weight based on signs
	277	submi r4, r4, r6 @ signs differ: weight_A -= delta
	278	addpl r4, r4, r6 @ signs match: weight_A += delta
	279
	280	.L350: ldr r3, [r1, -r2, asl #3] @ do the same thing for right channel
	281	ldr ip, [r1], #4 @ get original right sample and bump ptr
	282	movs r3, r3, asl #4
	283	mov r11, #0x80000000
	284	mov r8, ip
	285	smlalne r11, r8, r0, r3 @ r8:r11 += weight_B * decorr (if decorr != 0)
	286	strne r8, [r1, #-4]
	287	cmpne ip, #0
	288	beq .L354
	289	teq ip, r3
	290	submi r0, r0, r6 @ signs differ: weight_B -= delta
	291	addpl r0, r0, r6 @ signs match: weight_B += delta
	292
	293	.L354: cmp r7, r1 @ loop back if more samples to do
	294	bhi term_default_loop @ else fall through to default_term_exit
	295
	296	/*
	297	* This exit is used by terms 1-8 to store the previous 8 samples into the decorr
	298	* structure (even if they are not all used for the given term)
	299	*/
	300
	301	default_term_exit:
	302	mov lr, #7 @ eight passes: lr runs from 7 down to 0
	303	ldrsh ip, [r5, #0] @ ip = dpp->term
	304	sub ip, ip, #1 @ first history slot is (term - 1) & 7
	305
	306	.Lhist: and r3, ip, #7 @ wrap the slot index into 0..7
	307	sub ip, ip, #1 @ next pass fills the slot below
	308	add r3, r5, r3, asl #2 @ r3 = dpp + slot * 4
	309	ldr r2, [r1, #-4] @ word just below bptr goes to the
	310	str r2, [r3, #40] @ array at offset 40
	311	ldr r2, [r1, #-8]! @ word below that goes to the array at
	312	str r2, [r3, #8] @ offset 8; bptr steps back one pair
	313	sub lr, lr, #1
	314	cmn lr, #1 @ finished once the counter reaches -1
	315	bne .Lhist
	316	b common_exit
	317
	318	/*
	319	******************************************************************************
	320	* Loop to handle term = -1 condition
	321	*
	322	* r0 = dpp->weight_B r8 =
	323	* r1 = bptr r9 =
	324	* r2 = current right sample r10 = -1024 << 18 (for clipping)
	325	* r3 = previous right sample r11 = lo accumulator (for rounding)
	326	* r4 = dpp->weight_A ip = current sample
	327	* r5 = dpp sp =
	328	* r6 = dpp->delta lr = updated left sample
	329	* r7 = eptr pc =
	330	*******************************************************************************
	331	*/
	332
	333	term_minus_1:
	334	ldr r3, [r1, #-4] @ r3 = right sample just before bptr
	335
	336	term_minus_1_loop:
	337	ldr ip, [r1], #8 @ for left channel the decorrelation value
	338	movs r3, r3, asl #4 @ is the previous right sample (in r3)
	339	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	340	mov lr, ip @ hi word starts as the left sample
	341	smlalne r11, lr, r4, r3 @ lr:r11 += weight_A * decorr (if decorr != 0)
	342	strne lr, [r1, #-8] @ store updated left sample if it can differ
	343	cmpne ip, #0 @ skip update if decorr value or sample is 0
	344	beq .L361
	345	teq ip, r3 @ update weight based on signs
	346	submi r4, r4, r6 @ signs differ: weight_A -= delta
	347	addpl r4, r4, r6 @ signs match: weight_A += delta
	348	cmp r4, #(1024 << 18) @ then clip weight to +/-1024
	349	movgt r4, #(1024 << 18)
	350	cmp r4, r10 @ r10 = -(1024 << 18)
	351	movlt r4, r10
	352
	353	.L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value
	354	movs lr, lr, asl #4 @ is the updated left sample (in lr)
	355	mov r11, #0x80000000
	356	mov r3, r2 @ hi word starts as the right sample
	357	smlalne r11, r3, r0, lr @ r3:r11 += weight_B * decorr (if decorr != 0)
	358	strne r3, [r1, #-4]
	359	cmpne r2, #0
	360	beq .L369
	361	teq r2, lr
	362	submi r0, r0, r6 @ signs differ: weight_B -= delta
	363	addpl r0, r0, r6 @ signs match: weight_B += delta
	364	cmp r0, #(1024 << 18) @ then clip weight to +/-1024
	365	movgt r0, #(1024 << 18)
	366	cmp r0, r10
	367	movlt r0, r10
	368
	369	.L369: cmp r7, r1 @ loop back if more samples to do
	370	bhi term_minus_1_loop
	371
	372	str r3, [r5, #8] @ else store right sample and exit
	373	b common_exit
	374
	375	/*
	376	******************************************************************************
	377	* Loop to handle term = -2 condition
	378	* (note that the channels are processed in the reverse order here)
	379	*
	380	* r0 = dpp->weight_B r8 =
	381	* r1 = bptr r9 =
	382	* r2 = current left sample r10 = -1024 << 18 (for clipping)
	383	* r3 = previous left sample r11 = lo accumulator (for rounding)
	384	* r4 = dpp->weight_A ip = current sample
	385	* r5 = dpp sp =
	386	* r6 = dpp->delta lr = updated right sample
	387	* r7 = eptr pc =
	388	*******************************************************************************
	389	*/
	390
	391	term_minus_2:
	392	ldr r3, [r1, #-8] @ r3 = left sample of the pair before bptr
	393
	394	term_minus_2_loop:
	395	ldr ip, [r1, #4] @ for right channel the decorrelation value
	396	movs r3, r3, asl #4 @ is the previous left sample (in r3)
	397	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	398	mov lr, ip @ hi word starts as the right sample
	399	smlalne r11, lr, r0, r3 @ lr:r11 += weight_B * decorr (if decorr != 0)
	400	strne lr, [r1, #4] @ store updated right sample if it can differ
	401	cmpne ip, #0 @ skip update if decorr value or sample is 0
	402	beq .L380
	403	teq ip, r3 @ update weight based on signs
	404	submi r0, r0, r6 @ signs differ: weight_B -= delta
	405	addpl r0, r0, r6 @ signs match: weight_B += delta
	406	cmp r0, #(1024 << 18) @ then clip weight to +/-1024
	407	movgt r0, #(1024 << 18)
	408	cmp r0, r10 @ r10 = -(1024 << 18)
	409	movlt r0, r10
	410
	411	.L380: ldr r2, [r1], #8 @ for left channel the decorrelation value
	412	movs lr, lr, asl #4 @ is the updated right sample (in lr)
	413	mov r11, #0x80000000
	414	mov r3, r2 @ hi word starts as the left sample
	415	smlalne r11, r3, r4, lr @ r3:r11 += weight_A * decorr (if decorr != 0)
	416	strne r3, [r1, #-8]
	417	cmpne r2, #0
	418	beq .L388
	419	teq r2, lr
	420	submi r4, r4, r6 @ signs differ: weight_A -= delta
	421	addpl r4, r4, r6 @ signs match: weight_A += delta
	422	cmp r4, #(1024 << 18) @ then clip weight to +/-1024
	423	movgt r4, #(1024 << 18)
	424	cmp r4, r10
	425	movlt r4, r10
	426
	427	.L388: cmp r7, r1 @ loop back if more samples to do
	428	bhi term_minus_2_loop
	429
	430	str r3, [r5, #40] @ else store left channel and exit
	431	b common_exit
	432
	433	/*
	434	******************************************************************************
	435	* Loop to handle term = -3 condition
	436	*
	437	* r0 = dpp->weight_B r8 = previous left sample
	438	* r1 = bptr r9 =
	439	* r2 = current left sample r10 = -1024 << 18 (for clipping)
	440	* r3 = previous right sample r11 = lo accumulator (for rounding)
	441	* r4 = dpp->weight_A ip = intermediate result
	442	* r5 = dpp sp =
	443	* r6 = dpp->delta lr =
	444	* r7 = eptr pc =
	445	*******************************************************************************
	446	*/
	447
	448	term_minus_3:
	449	ldr r3, [r1, #-4] @ load previous samples
	450	ldr r8, [r1, #-8] @ (r3 = right, r8 = left)
	451
	452	term_minus_3_loop:
	453	ldr ip, [r1], #4 @ ip = current left sample, bump ptr
	454	movs r3, r3, asl #4 @ decorr value = previous right sample << 4
	455	mov r11, #0x80000000 @ lo word preset so the hi word gets rounded
	456	mov r2, ip @ hi word starts as the left sample
	457	smlalne r11, r2, r4, r3 @ r2:r11 += weight_A * decorr (if decorr != 0)
	458	strne r2, [r1, #-4] @ store updated left sample if it can differ
	459	cmpne ip, #0 @ skip update if decorr value or sample is 0
	460	beq .L399
	461	teq ip, r3 @ update weight based on signs
	462	submi r4, r4, r6 @ signs differ: weight_A -= delta
	463	addpl r4, r4, r6 @ signs match: weight_A += delta
	464	cmp r4, #(1024 << 18) @ then clip weight to +/-1024
	465	movgt r4, #(1024 << 18)
	466	cmp r4, r10 @ r10 = -(1024 << 18)
	467	movlt r4, r10
	468
	469	.L399: movs ip, r8, asl #4 @ ip = previous left we use now
	470	mov r8, r2 @ r8 = current left we use next time
	471	ldr r2, [r1], #4 @ r2 = current right sample, bump ptr
	472	mov r11, #0x80000000
	473	mov r3, r2 @ hi word starts as the right sample
	474	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr (if decorr != 0)
	475	strne r3, [r1, #-4]
	476	cmpne r2, #0
	477	beq .L407
	478	teq ip, r2
	479	submi r0, r0, r6 @ signs differ: weight_B -= delta
	480	addpl r0, r0, r6 @ signs match: weight_B += delta
	481	cmp r0, #(1024 << 18) @ then clip weight to +/-1024
	482	movgt r0, #(1024 << 18)
	483	cmp r0, r10
	484	movlt r0, r10
	485
	486	.L407: cmp r7, r1 @ loop back if more samples to do
	487	bhi term_minus_3_loop
	488
	489	str r3, [r5, #8] @ else store previous samples & exit
	490	str r8, [r5, #40] @ (execution falls through to common_exit)
	491
	492	/*
	493	* Before finally exiting we must store weights back for next time
	494	*/
	495
	496	common_exit:
	497	mov r4, r4, asr #18 @ weight_A back from its << 18 working scale
	498	mov r0, r0, asr #18 @ weight_B likewise
	499	strh r0, [r5, #6] @ dpp->weight_B
	500	strh r4, [r5, #4] @ dpp->weight_A
	501	ldmia sp!, {r4 - r8, r10, r11, pc} @ pop saved registers and return
	502