1 files changed, 506 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libwavpack/arml.S b/lib/rbcodec/codecs/libwavpack/arml.S
new file mode 100644
index 0000000000..60818aa1e6
--- /dev/null
+++ b/lib/rbcodec/codecs/libwavpack/arml.S
@@ -0,0 +1,506 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by David Bryant
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
+ *                                    long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on an ARM7TDMI processor. This version uses the
+ * 64-bit multiply-accumulate instruction and so can be used with all
+ * WavPack files. However, for optimum performance with 16-bit WavPack
+ * files, there is a faster version that only uses the 32-bit MLA
+ * instruction.
+ */
+#include "config.h"
+        .text
+        .align
+        .global         decorr_stereo_pass_cont_arml
+/*
+ * on entry:
+ *
+ * r0 = struct decorr_pass *dpp
+ * r1 = long *buffer
+ * r2 = long sample_count
+ *      (may be 0: the weights are then written back unchanged)
+ */
+decorr_stereo_pass_cont_arml:
+        stmfd   sp!, {r4 - r8, r10, r11, lr}
+        mov     r5, r0                  @ r5 = dpp
+        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
+        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
+        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
+        mov     r0, r0, asl #18         @ for 64-bit math we use weights << 18
+        mov     r4, r4, asl #18         @  (scaled before the empty-buffer test
+        mov     r6, r6, asl #18         @   because common_exit shifts them back)
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     common_exit
+        add     r7, r1, r2, asl #3      @ r7 = buffer ending position
+        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
+        cmp     r2, #0
+        blt     minus_term
+        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
+        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
+        ldr     r8, [r1, #-8]           @  (lr/r10 = 2nd previous left/right,
+        ldr     r3, [r1, #-4]           @   r8/r3 = previous left/right)
+        cmp     r2, #18
+        beq     term_18_loop            @ term 18 uses the history unscaled
+        mov     lr, lr, asl #4          @ terms 2 and 17 keep the 2nd previous
+        mov     r10, r10, asl #4        @  samples pre-scaled by 16
+        cmp     r2, #2
+        beq     term_2_loop
+        cmp     r2, #17
+        beq     term_17_loop
+        b       term_default_loop
+minus_term:
+        mov     r10, #(1024 << 18)      @ r10 = -1024 << 18 for weight clipping
+        rsb     r10, r10, #0            @  (only used for negative terms)
+        cmn     r2, #1
+        beq     term_minus_1
+        cmn     r2, #2
+        beq     term_minus_2
+        cmn     r2, #3
+        beq     term_minus_3
+        b       common_exit             @ any other negative term: do nothing
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = (not used)
+ * r2 = current sample          r10 = second previous right sample << 4
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample << 4
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_17_loop:
+        rsbs    ip, lr, r8, asl #5      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8, asl #4          @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     r8, r2                  @ hi word starts out as the sample
+        smlalne r11, r8, r4, ip         @ r8:r11 += weight_A * decorr value
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ (Z is still set if decorr value was 0)
+        beq     .L325                   @ skip update if value or sample is 0
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+.L325:  rsbs    ip, r10, r3, asl #5     @ do same thing for right channel
+        mov     r10, r3, asl #4
+        ldr     r2, [r1], #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr value
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L329
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+.L329:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_17_loop
+        mov     lr, lr, asr #4          @ undo the << 4 on the 2nd previous
+        mov     r10, r10, asr #4        @  samples before they are stored
+        b       store_1718              @ common exit for terms 17 & 18
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = (not used)
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_18_loop:
+        rsb     ip, lr, r8              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        add     ip, lr, ip, asr #1
+        movs    ip, ip, asl #4          @ scale by 16 and test for zero
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     r8, r2                  @ hi word starts out as the sample
+        smlalne r11, r8, r4, ip         @ r8:r11 += weight_A * decorr value
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     .L337                   @ skip update if value or sample is 0
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+.L337:  rsb     ip, r10, r3             @ do same thing for right channel
+        mov     r10, r3
+        add     ip, r10, ip, asr #1
+        movs    ip, ip, asl #4
+        ldr     r2, [r1], #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr value
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L341
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+.L341:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_18_loop
+/* common exit for terms 17 & 18 */
+store_1718:
+        str     r3, [r5, #40]           @ store sample history into struct
+        str     r8, [r5, #8]            @  dpp+8 / dpp+40: previous left / right
+        str     r10, [r5, #44]          @  dpp+12 / dpp+44: 2nd previous
+        str     lr, [r5, #12]           @   left / right
+        b       common_exit             @ and return
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = (not used)
+ * r2 = current sample          r10 = second previous right sample << 4
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample << 4
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_2_loop:
+        movs    ip, lr                  @ get decorrelation value & test
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     lr, r8, asl #4          @ previous becomes 2nd previous
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     r8, r2                  @ hi word starts out as the sample
+        smlalne r11, r8, r4, ip         @ r8:r11 += weight_A * decorr value
+        strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     .L225                   @ skip update if value or sample is 0
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+.L225:  movs    ip, r10                 @ do same thing for right channel
+        ldr     r2, [r1], #4
+        mov     r10, r3, asl #4
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr value
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L229
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+.L229:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_2_loop
+        b       default_term_exit       @ this exit updates all dpp->samples
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 = dpp->weight_B           r8 = result accumulator
+ * r1 = bptr                    r9 = (not used)
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_default_loop:
+        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
+        ldr     ip, [r1], #4            @ get original sample and bump ptr
+        movs    r3, r3, asl #4          @ scale by 16 and test for zero
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     r8, ip                  @ hi word starts out as the sample
+        smlalne r11, r8, r4, r3         @ r8:r11 += weight_A * decorr value
+        strne   r8, [r1, #-4]           @ if possibly changed, store updated sample
+        cmpne   ip, #0
+        beq     .L350                   @ skip update if value or sample is 0
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+.L350:  ldr     r3, [r1, -r2, asl #3]   @ do the same thing for right channel
+        ldr     ip, [r1], #4
+        movs    r3, r3, asl #4
+        mov     r11, #0x80000000
+        mov     r8, ip
+        smlalne r11, r8, r0, r3         @ r8:r11 += weight_B * decorr value
+        strne   r8, [r1, #-4]
+        cmpne   ip, #0
+        beq     .L354
+        teq     ip, r3
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+.L354:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_default_loop
+/*
+ * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
+ * structure (even if they are not all used for the given term)
+ */
+default_term_exit:
+        ldrsh   r3, [r5, #0]            @ reload term (term_2_loop reuses r2)
+        sub     ip, r3, #1              @ ip = history slot index, from term - 1
+        mov     lr, #7                  @ lr counts 7..0: eight stereo samples
+.L358:  and     r3, ip, #7              @ wrap the slot index into 0..7
+        add     r3, r5, r3, asl #2      @ r3 = dpp + 4 * slot
+        ldr     r2, [r1, #-4]           @ right sample just before bptr
+        str     r2, [r3, #40]           @  -> slot in the array at dpp + 40
+        ldr     r2, [r1, #-8]!          @ left sample; bptr steps back one pair
+        str     r2, [r3, #8]            @  -> slot in the array at dpp + 8
+        sub     ip, ip, #1
+        sub     lr, lr, #1
+        cmn     lr, #1                  @ done once lr reaches -1
+        bne     .L358
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -1 condition
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 = (not used)
+ * r2 = current right sample    r10 = -1024 << 18 (for clipping)
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current left sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_1:
+        ldr     r3, [r1, #-4]           @ r3 = right sample preceding the buffer
+term_minus_1_loop:
+        ldr     ip, [r1], #8            @ for left channel the decorrelation value
+        movs    r3, r3, asl #4          @  is the previous right sample (in r3)
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     lr, ip                  @ hi word starts out as the sample
+        smlalne r11, lr, r4, r3         @ lr:r11 += weight_A * decorr value
+        strne   lr, [r1, #-8]
+        cmpne   ip, #0
+        beq     .L361                   @ skip update if value or sample is 0
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+.L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
+        movs    lr, lr, asl #4          @  is the updated left sample (in lr)
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, lr         @ r3:r11 += weight_B * decorr value
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L369
+        teq     r2, lr
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #(1024 << 18)               @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+.L369:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_1_loop
+        str     r3, [r5, #8]            @ else store right sample and exit
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -2 condition
+ * (note that the channels are processed in the reverse order here)
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 = (not used)
+ * r2 = current left sample     r10 = -1024 << 18 (for clipping)
+ * r3 = previous left sample    r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current right sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated right sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+term_minus_2:
+        ldr     r3, [r1, #-8]           @ r3 = left sample preceding the buffer
+term_minus_2_loop:
+        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
+        movs    r3, r3, asl #4          @  is the previous left sample (in r3)
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     lr, ip                  @ hi word starts out as the sample
+        smlalne r11, lr, r0, r3         @ lr:r11 += weight_B * decorr value
+        strne   lr, [r1, #4]
+        cmpne   ip, #0
+        beq     .L380                   @ skip update if value or sample is 0
+        teq     ip, r3                  @ update weight based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #(1024 << 18)               @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+.L380:  ldr     r2, [r1], #8            @ for left channel the decorrelation value
+        movs    lr, lr, asl #4          @  is the updated right sample (in lr)
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r4, lr         @ r3:r11 += weight_A * decorr value
+        strne   r3, [r1, #-8]
+        cmpne   r2, #0
+        beq     .L388
+        teq     r2, lr
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+.L388:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_2_loop
+        str     r3, [r5, #40]           @ else store left channel and exit
+        b       common_exit
+/*
+ ******************************************************************************
+ * Loop to handle term = -3 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 = (not used)
+ * r2 = updated left, then      r10 = -1024 << 18 (for clipping)
+ *      current right sample
+ * r3 = previous right sample   r11 = lo accumulator (for rounding)
+ * r4 = dpp->weight_A           ip = current left, then prev left << 4
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =       r7 = eptr
+ *******************************************************************************
+ */
+term_minus_3:
+        ldr     r3, [r1, #-4]           @ load previous samples
+        ldr     r8, [r1, #-8]
+term_minus_3_loop:
+        ldr     ip, [r1], #4            @ left sample; decorr value is prev right
+        movs    r3, r3, asl #4          @ scale by 16 and test for zero
+        mov     r11, #0x80000000        @ lo word = 1 << 31 rounds the hi word
+        mov     r2, ip                  @ hi word starts out as the sample
+        smlalne r11, r2, r4, r3         @ r2:r11 += weight_A * decorr value
+        strne   r2, [r1, #-4]
+        cmpne   ip, #0
+        beq     .L399                   @ skip update if value or sample is 0
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r4, #(1024 << 18)
+        cmp     r4, r10
+        movlt   r4, r10
+.L399:  movs    ip, r8, asl #4          @ ip = previous left we use now
+        mov     r8, r2                  @ r8 = current left we use next time
+        ldr     r2, [r1], #4            @ right sample
+        mov     r11, #0x80000000
+        mov     r3, r2
+        smlalne r11, r3, r0, ip         @ r3:r11 += weight_B * decorr value
+        strne   r3, [r1, #-4]
+        cmpne   r2, #0
+        beq     .L407
+        teq     ip, r2
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
+        movgt   r0, #(1024 << 18)
+        cmp     r0, r10
+        movlt   r0, r10
+.L407:  cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_3_loop
+        str     r3, [r5, #8]            @ else store previous samples & exit
+        str     r8, [r5, #40]           @  (falls through into common_exit)
+/*
+ * Before finally exiting we must store weights back for next time
+ * (r0 and r4 are expected to hold weight_B and weight_A scaled by << 18)
+ */
+common_exit:
+        mov     r0, r0, asr #18         @ restore weights to real magnitude
+        mov     r4, r4, asr #18
+        strh    r4, [r5, #4]            @ dpp->weight_A
+        strh    r0, [r5, #6]            @ dpp->weight_B
+        ldmpc   regs="r4-r8, r10-r11"

diff --git a/lib/rbcodec/codecs/libwavpack/arml.S b/lib/rbcodec/codecs/libwavpack/arml.S new file mode 100644 index 0000000000..60818aa1e6 --- /dev/null +++ b/lib/rbcodec/codecs/libwavpack/arml.S
@@ -0,0 +1,506 @@
	1	/***************************************************************************
	2	* __________ __ ___.
	3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
	4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
	5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
	6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
	7	* \/ \/ \/ \/ \/
	8	* $Id$
	9	*
	10	* Copyright (C) 2006 by David Bryant
	11	*
	12	* This program is free software; you can redistribute it and/or
	13	* modify it under the terms of the GNU General Public License
	14	* as published by the Free Software Foundation; either version 2
	15	* of the License, or (at your option) any later version.
	16	*
	17	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	18	* KIND, either express or implied.
	19	*
	20	****************************************************************************/
	21
	22	/* This is an assembly optimized version of the following WavPack function:
	23	*
	24	* void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
	25	* long *buffer, long sample_count);
	26	*
	27	* It performs a single pass of stereo decorrelation on the provided buffer.
	28	* Note that this version of the function requires that the 8 previous stereo
	29	* samples are visible and correct. In other words, it ignores the "samples_*"
	30	* fields in the decorr_pass structure and gets the history data directly
	31	* from the buffer. It does, however, return the appropriate history samples
	32	* to the decorr_pass structure before returning.
	33	*
	34	* This is written to work on an ARM7TDMI processor. This version uses the
	35	* 64-bit multiply-accumulate instruction and so can be used with all
	36	* WavPack files. However, for optimum performance with 16-bit WavPack
	37	* files, there is a faster version that only uses the 32-bit MLA
	38	* instruction.
	39	*/
	40
	41	#include "config.h"
	42
	43	.text
	44	.align
	45	.global decorr_stereo_pass_cont_arml
	46
	47	/*
	48	* on entry:
	49	*
	50	* r0 = struct decorr_pass *dpp
	51	* r1 = long *buffer
	52	* r2 = long sample_count
	53	* (may be 0: the weights are then written back unchanged)
	54	*/
	55
	56	decorr_stereo_pass_cont_arml:
	57
	58	stmfd sp!, {r4 - r8, r10, r11, lr}
	59	mov r5, r0 @ r5 = dpp
	60	ldrsh r6, [r0, #2] @ r6 = dpp->delta
	61	ldrsh r4, [r0, #4] @ r4 = dpp->weight_A
	62	ldrsh r0, [r0, #6] @ r0 = dpp->weight_B
	63	mov r0, r0, asl #18 @ for 64-bit math we use weights << 18
	64	mov r4, r4, asl #18 @ (scaled before the empty-buffer test
	65	mov r6, r6, asl #18 @ because common_exit shifts them back)
	66	cmp r2, #0 @ exit if no samples to process
	67	beq common_exit
	68
	69	add r7, r1, r2, asl #3 @ r7 = buffer ending position
	70	ldrsh r2, [r5, #0] @ r2 = dpp->term
	71	cmp r2, #0
	72	blt minus_term
	73
	74	ldr lr, [r1, #-16] @ load 2 sample history from buffer
	75	ldr r10, [r1, #-12] @ for terms 2, 17, and 18
	76	ldr r8, [r1, #-8] @ (lr/r10 = 2nd previous left/right,
	77	ldr r3, [r1, #-4] @ r8/r3 = previous left/right)
	78
	79	cmp r2, #18
	80	beq term_18_loop @ term 18 uses the history unscaled
	81	mov lr, lr, asl #4 @ terms 2 and 17 keep the 2nd previous
	82	mov r10, r10, asl #4 @ samples pre-scaled by 16
	83	cmp r2, #2
	84	beq term_2_loop
	85	cmp r2, #17
	86	beq term_17_loop
	87	b term_default_loop
	88
	89	minus_term:
	90	mov r10, #(1024 << 18) @ r10 = -1024 << 18 for weight clipping
	91	rsb r10, r10, #0 @ (only used for negative terms)
	92	cmn r2, #1
	93	beq term_minus_1
	94	cmn r2, #2
	95	beq term_minus_2
	96	cmn r2, #3
	97	beq term_minus_3
	98	b common_exit @ any other negative term: do nothing
	99
	100	/*
	101	******************************************************************************
	102	* Loop to handle term = 17 condition
	103	*
	104	* r0 = dpp->weight_B r8 = previous left sample
	105	* r1 = bptr r9 = (not used)
	106	* r2 = current sample r10 = second previous right sample << 4
	107	* r3 = previous right sample r11 = lo accumulator (for rounding)
	108	* r4 = dpp->weight_A ip = current decorrelation value
	109	* r5 = dpp sp =
	110	* r6 = dpp->delta lr = second previous left sample << 4
	111	* r7 = eptr pc =
	112	*******************************************************************************
	113	*/
	114
	115	term_17_loop:
	116	rsbs ip, lr, r8, asl #5 @ decorr value = (2 * prev) - 2nd prev
	117	mov lr, r8, asl #4 @ previous becomes 2nd previous
	118	ldr r2, [r1], #4 @ get sample & update pointer
	119	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	120	mov r8, r2 @ hi word starts out as the sample
	121	smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr value
	122	strne r8, [r1, #-4] @ if change possible, store sample back
	123	cmpne r2, #0 @ (Z is still set if decorr value was 0)
	124	beq .L325 @ skip update if value or sample is 0
	125	teq ip, r2 @ update weight based on signs
	126	submi r4, r4, r6 @ signs differ: weight_A -= delta
	127	addpl r4, r4, r6 @ signs match: weight_A += delta
	128
	129	.L325: rsbs ip, r10, r3, asl #5 @ do same thing for right channel
	130	mov r10, r3, asl #4
	131	ldr r2, [r1], #4
	132	mov r11, #0x80000000
	133	mov r3, r2
	134	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
	135	strne r3, [r1, #-4]
	136	cmpne r2, #0
	137	beq .L329
	138	teq ip, r2
	139	submi r0, r0, r6
	140	addpl r0, r0, r6
	141
	142	.L329: cmp r7, r1 @ loop back if more samples to do
	143	bhi term_17_loop
	144	mov lr, lr, asr #4 @ undo the << 4 on the 2nd previous
	145	mov r10, r10, asr #4 @ samples before they are stored
	146	b store_1718 @ common exit for terms 17 & 18
	147
	148	/*
	149	******************************************************************************
	150	* Loop to handle term = 18 condition
	151	*
	152	* r0 = dpp->weight_B r8 = previous left sample
	153	* r1 = bptr r9 = (not used)
	154	* r2 = current sample r10 = second previous right sample
	155	* r3 = previous right sample r11 = lo accumulator (for rounding)
	156	* r4 = dpp->weight_A ip = decorrelation value
	157	* r5 = dpp sp =
	158	* r6 = dpp->delta lr = second previous left sample
	159	* r7 = eptr pc =
	160	*******************************************************************************
	161	*/
	162
	163	term_18_loop:
	164	rsb ip, lr, r8 @ decorr value =
	165	mov lr, r8 @ ((3 * prev) - 2nd prev) >> 1
	166	add ip, lr, ip, asr #1
	167	movs ip, ip, asl #4 @ scale by 16 and test for zero
	168	ldr r2, [r1], #4 @ get sample & update pointer
	169	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	170	mov r8, r2 @ hi word starts out as the sample
	171	smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr value
	172	strne r8, [r1, #-4] @ if change possible, store sample back
	173	cmpne r2, #0
	174	beq .L337 @ skip update if value or sample is 0
	175	teq ip, r2 @ update weight based on signs
	176	submi r4, r4, r6
	177	addpl r4, r4, r6
	178
	179	.L337: rsb ip, r10, r3 @ do same thing for right channel
	180	mov r10, r3
	181	add ip, r10, ip, asr #1
	182	movs ip, ip, asl #4
	183	ldr r2, [r1], #4
	184	mov r11, #0x80000000
	185	mov r3, r2
	186	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
	187	strne r3, [r1, #-4]
	188	cmpne r2, #0
	189	beq .L341
	190	teq ip, r2
	191	submi r0, r0, r6
	192	addpl r0, r0, r6
	193
	194	.L341: cmp r7, r1 @ loop back if more samples to do
	195	bhi term_18_loop
	196
	197	/* common exit for terms 17 & 18 */
	198
	199	store_1718:
	200	str r3, [r5, #40] @ store sample history into struct
	201	str r8, [r5, #8] @ dpp+8 / dpp+40: previous left / right
	202	str r10, [r5, #44] @ dpp+12 / dpp+44: 2nd previous
	203	str lr, [r5, #12] @ left / right
	204	b common_exit @ and return
	205
	206	/*
	207	******************************************************************************
	208	* Loop to handle term = 2 condition
	209	* (note that this case can be handled by the default term handler (1-8), but
	210	* this special case is faster because it doesn't have to read memory twice)
	211	*
	212	* r0 = dpp->weight_B r8 = previous left sample
	213	* r1 = bptr r9 = (not used)
	214	* r2 = current sample r10 = second previous right sample << 4
	215	* r3 = previous right sample r11 = lo accumulator (for rounding)
	216	* r4 = dpp->weight_A ip = decorrelation value
	217	* r5 = dpp sp =
	218	* r6 = dpp->delta lr = second previous left sample << 4
	219	* r7 = eptr pc =
	220	*******************************************************************************
	221	*/
	222
	223	term_2_loop:
	224	movs ip, lr @ get decorrelation value & test
	225	ldr r2, [r1], #4 @ get sample & update pointer
	226	mov lr, r8, asl #4 @ previous becomes 2nd previous
	227	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	228	mov r8, r2 @ hi word starts out as the sample
	229	smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr value
	230	strne r8, [r1, #-4] @ if change possible, store sample back
	231	cmpne r2, #0
	232	beq .L225 @ skip update if value or sample is 0
	233	teq ip, r2 @ update weight based on signs
	234	submi r4, r4, r6
	235	addpl r4, r4, r6
	236
	237	.L225: movs ip, r10 @ do same thing for right channel
	238	ldr r2, [r1], #4
	239	mov r10, r3, asl #4
	240	mov r11, #0x80000000
	241	mov r3, r2
	242	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
	243	strne r3, [r1, #-4]
	244	cmpne r2, #0
	245	beq .L229
	246	teq ip, r2
	247	submi r0, r0, r6
	248	addpl r0, r0, r6
	249
	250	.L229: cmp r7, r1 @ loop back if more samples to do
	251	bhi term_2_loop
	252
	253	b default_term_exit @ this exit updates all dpp->samples
	254
	255	/*
	256	******************************************************************************
	257	* Loop to handle default term condition
	258	*
	259	* r0 = dpp->weight_B r8 = result accumulator
	260	* r1 = bptr r9 = (not used)
	261	* r2 = dpp->term r10 =
	262	* r3 = decorrelation value r11 = lo accumulator (for rounding)
	263	* r4 = dpp->weight_A ip = current sample
	264	* r5 = dpp sp =
	265	* r6 = dpp->delta lr =
	266	* r7 = eptr pc =
	267	*******************************************************************************
	268	*/
	269
	270	term_default_loop:
	271	ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term
	272	ldr ip, [r1], #4 @ get original sample and bump ptr
	273	movs r3, r3, asl #4 @ scale by 16 and test for zero
	274	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	275	mov r8, ip @ hi word starts out as the sample
	276	smlalne r11, r8, r4, r3 @ r8:r11 += weight_A * decorr value
	277	strne r8, [r1, #-4] @ if possibly changed, store updated sample
	278	cmpne ip, #0
	279	beq .L350 @ skip update if value or sample is 0
	280	teq ip, r3 @ update weight based on signs
	281	submi r4, r4, r6
	282	addpl r4, r4, r6
	283
	284	.L350: ldr r3, [r1, -r2, asl #3] @ do the same thing for right channel
	285	ldr ip, [r1], #4
	286	movs r3, r3, asl #4
	287	mov r11, #0x80000000
	288	mov r8, ip
	289	smlalne r11, r8, r0, r3 @ r8:r11 += weight_B * decorr value
	290	strne r8, [r1, #-4]
	291	cmpne ip, #0
	292	beq .L354
	293	teq ip, r3
	294	submi r0, r0, r6
	295	addpl r0, r0, r6
	296
	297	.L354: cmp r7, r1 @ loop back if more samples to do
	298	bhi term_default_loop
	299
	300	/*
	301	* This exit is used by terms 1-8 to store the previous 8 samples into the decorr
	302	* structure (even if they are not all used for the given term)
	303	*/
	304
	305	default_term_exit:
	306	ldrsh r3, [r5, #0] @ reload term (term_2_loop reuses r2)
	307	sub ip, r3, #1 @ ip = history slot index, from term - 1
	308	mov lr, #7 @ lr counts 7..0: eight stereo samples
	309
	310	.L358: and r3, ip, #7 @ wrap the slot index into 0..7
	311	add r3, r5, r3, asl #2 @ r3 = dpp + 4 * slot
	312	ldr r2, [r1, #-4] @ right sample just before bptr
	313	str r2, [r3, #40] @ -> slot in the array at dpp + 40
	314	ldr r2, [r1, #-8]! @ left sample; bptr steps back one pair
	315	str r2, [r3, #8] @ -> slot in the array at dpp + 8
	316	sub ip, ip, #1
	317	sub lr, lr, #1
	318	cmn lr, #1 @ done once lr reaches -1
	319	bne .L358
	320	b common_exit
	321
	322	/*
	323	******************************************************************************
	324	* Loop to handle term = -1 condition
	325	*
	326	* r0 = dpp->weight_B r8 =
	327	* r1 = bptr r9 = (not used)
	328	* r2 = current right sample r10 = -1024 << 18 (for clipping)
	329	* r3 = previous right sample r11 = lo accumulator (for rounding)
	330	* r4 = dpp->weight_A ip = current left sample
	331	* r5 = dpp sp =
	332	* r6 = dpp->delta lr = updated left sample
	333	* r7 = eptr pc =
	334	*******************************************************************************
	335	*/
	336
	337	term_minus_1:
	338	ldr r3, [r1, #-4] @ r3 = right sample preceding the buffer
	339
	340	term_minus_1_loop:
	341	ldr ip, [r1], #8 @ for left channel the decorrelation value
	342	movs r3, r3, asl #4 @ is the previous right sample (in r3)
	343	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	344	mov lr, ip @ hi word starts out as the sample
	345	smlalne r11, lr, r4, r3 @ lr:r11 += weight_A * decorr value
	346	strne lr, [r1, #-8]
	347	cmpne ip, #0
	348	beq .L361 @ skip update if value or sample is 0
	349	teq ip, r3 @ update weight based on signs
	350	submi r4, r4, r6
	351	addpl r4, r4, r6
	352	cmp r4, #(1024 << 18) @ then clip weight to +/-1024
	353	movgt r4, #(1024 << 18)
	354	cmp r4, r10
	355	movlt r4, r10
	356
	357	.L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value
	358	movs lr, lr, asl #4 @ is the updated left sample (in lr)
	359	mov r11, #0x80000000
	360	mov r3, r2
	361	smlalne r11, r3, r0, lr @ r3:r11 += weight_B * decorr value
	362	strne r3, [r1, #-4]
	363	cmpne r2, #0
	364	beq .L369
	365	teq r2, lr
	366	submi r0, r0, r6
	367	addpl r0, r0, r6
	368	cmp r0, #(1024 << 18) @ then clip weight to +/-1024
	369	movgt r0, #(1024 << 18)
	370	cmp r0, r10
	371	movlt r0, r10
	372
	373	.L369: cmp r7, r1 @ loop back if more samples to do
	374	bhi term_minus_1_loop
	375
	376	str r3, [r5, #8] @ else store right sample and exit
	377	b common_exit
	378
	379	/*
	380	******************************************************************************
	381	* Loop to handle term = -2 condition
	382	* (note that the channels are processed in the reverse order here)
	383	*
	384	* r0 = dpp->weight_B r8 =
	385	* r1 = bptr r9 = (not used)
	386	* r2 = current left sample r10 = -1024 << 18 (for clipping)
	387	* r3 = previous left sample r11 = lo accumulator (for rounding)
	388	* r4 = dpp->weight_A ip = current right sample
	389	* r5 = dpp sp =
	390	* r6 = dpp->delta lr = updated right sample
	391	* r7 = eptr pc =
	392	*******************************************************************************
	393	*/
	394
	395	term_minus_2:
	396	ldr r3, [r1, #-8] @ r3 = left sample preceding the buffer
	397
	398	term_minus_2_loop:
	399	ldr ip, [r1, #4] @ for right channel the decorrelation value
	400	movs r3, r3, asl #4 @ is the previous left sample (in r3)
	401	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	402	mov lr, ip @ hi word starts out as the sample
	403	smlalne r11, lr, r0, r3 @ lr:r11 += weight_B * decorr value
	404	strne lr, [r1, #4]
	405	cmpne ip, #0
	406	beq .L380 @ skip update if value or sample is 0
	407	teq ip, r3 @ update weight based on signs
	408	submi r0, r0, r6
	409	addpl r0, r0, r6
	410	cmp r0, #(1024 << 18) @ then clip weight to +/-1024
	411	movgt r0, #(1024 << 18)
	412	cmp r0, r10
	413	movlt r0, r10
	414
	415	.L380: ldr r2, [r1], #8 @ for left channel the decorrelation value
	416	movs lr, lr, asl #4 @ is the updated right sample (in lr)
	417	mov r11, #0x80000000
	418	mov r3, r2
	419	smlalne r11, r3, r4, lr @ r3:r11 += weight_A * decorr value
	420	strne r3, [r1, #-8]
	421	cmpne r2, #0
	422	beq .L388
	423	teq r2, lr
	424	submi r4, r4, r6
	425	addpl r4, r4, r6
	426	cmp r4, #(1024 << 18) @ then clip weight to +/-1024
	427	movgt r4, #(1024 << 18)
	428	cmp r4, r10
	429	movlt r4, r10
	430
	431	.L388: cmp r7, r1 @ loop back if more samples to do
	432	bhi term_minus_2_loop
	433
	434	str r3, [r5, #40] @ else store left channel and exit
	435	b common_exit
	436
	437	/*
	438	******************************************************************************
	439	* Loop to handle term = -3 condition
	440	*
	441	* r0 = dpp->weight_B r8 = previous left sample
	442	* r1 = bptr r9 = (not used)
	443	* r2 = updated left, then current right sample r10 = -1024 << 18 (for clipping)
	444	* r3 = previous right sample r11 = lo accumulator (for rounding)
	445	* r4 = dpp->weight_A ip = current left, then prev left << 4
	446	* r5 = dpp sp =
	447	* r6 = dpp->delta lr =
	448	* r7 = eptr pc =
	449	*******************************************************************************
	450	*/
	451
	452	term_minus_3:
	453	ldr r3, [r1, #-4] @ load previous samples
	454	ldr r8, [r1, #-8]
	455
	456	term_minus_3_loop:
	457	ldr ip, [r1], #4 @ left sample; decorr value is prev right
	458	movs r3, r3, asl #4 @ scale by 16 and test for zero
	459	mov r11, #0x80000000 @ lo word = 1 << 31 rounds the hi word
	460	mov r2, ip @ hi word starts out as the sample
	461	smlalne r11, r2, r4, r3 @ r2:r11 += weight_A * decorr value
	462	strne r2, [r1, #-4]
	463	cmpne ip, #0
	464	beq .L399 @ skip update if value or sample is 0
	465	teq ip, r3 @ update weight based on signs
	466	submi r4, r4, r6
	467	addpl r4, r4, r6
	468	cmp r4, #(1024 << 18) @ then clip weight to +/-1024
	469	movgt r4, #(1024 << 18)
	470	cmp r4, r10
	471	movlt r4, r10
	472
	473	.L399: movs ip, r8, asl #4 @ ip = previous left we use now
	474	mov r8, r2 @ r8 = current left we use next time
	475	ldr r2, [r1], #4 @ right sample
	476	mov r11, #0x80000000
	477	mov r3, r2
	478	smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
	479	strne r3, [r1, #-4]
	480	cmpne r2, #0
	481	beq .L407
	482	teq ip, r2
	483	submi r0, r0, r6
	484	addpl r0, r0, r6
	485	cmp r0, #(1024 << 18) @ then clip weight to +/-1024
	486	movgt r0, #(1024 << 18)
	487	cmp r0, r10
	488	movlt r0, r10
	489
	490	.L407: cmp r7, r1 @ loop back if more samples to do
	491	bhi term_minus_3_loop
	492
	493	str r3, [r5, #8] @ else store previous samples & exit
	494	str r8, [r5, #40] @ (falls through into common_exit)
	495
	496	/*
	497	* Before finally exiting we must store weights back for next time
	498	* (r0 and r4 are expected to hold weight_B and weight_A scaled by << 18)
	499	*/
	500	common_exit:
	501	mov r0, r0, asr #18 @ restore weights to real magnitude
	502	mov r4, r4, asr #18
	503	strh r4, [r5, #4] @ dpp->weight_A
	504	strh r0, [r5, #6] @ dpp->weight_B
	505	ldmpc regs="r4-r8, r10-r11"
	506