2 files changed, 72 insertions, 32 deletions
diff --git a/apps/eq_arm.S b/apps/eq_arm.S
index 85617dc2fb..0c1961d2d3 100644
--- a/apps/eq_arm.S
+++ b/apps/eq_arm.S
@@ -7,7 +7,7 @@
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
- * Copyright (C) 2006 Thom Johansen
+ * Copyright (C) 2006-2007 Thom Johansen
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
@@ -17,6 +17,15 @@
 *
 ****************************************************************************/
+/* Uncomment this to mix the lower accumulator bits back into the result
+ * after shifting. Without it, the low "shift" bits of each output are lost.
+ */
+/* #define HIGH_PRECISION */
+/*
+ * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+ *                unsigned channels, unsigned shift)
+ */
    .text
    .global eq_filter
 eq_filter:
@@ -33,35 +42,40 @@ eq_filter:
    ldr r14, [sp, #8]       @ r14 = numsamples
    ldmia r10, { r0-r3 }    @ load history, r10 should be filter struct addr
    str r10, [sp, #4]       @ save it for loop end
-.loop:
    /* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator,
-       r12 = shift amount, r14 = number of samples.
+     * r12 = shift amount, r14 = number of samples.
-       See eq_cf.S for explanation of what this loop does. Primary difference
-       is the reordering of the equation we do here, which is done for register
-       reuse reasons, we're pretty short on regs.
     */
-    smull r10, r11, r6, r1  @ acc = b2*x[i - 2]
+.loop:
-    mov r1, r0              @ fix input history
+    /* Direct form 1 filtering code.
-    smlal r10, r11, r5, r0  @ acc += b1*x[i - 1]
+     * y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
-    ldr r0, [r9]            @ load input and fix history in same operation
+     * where y[] is output and x[] is input. This is performed out of order to
-    smlal r10, r11, r4, r0  @ acc += b0*x[i]
+     * reuse registers, we're pretty short on regs.
-    smlal r10, r11, r7, r2  @ acc += a1*y[i - 1]
+     */
-    smlal r10, r11, r8, r3  @ acc += a2*y[i - 2]
+    smull r10, r11, r6, r1     @ acc = b2*x[i - 2]
-    mov r3, r2              @ fix output history
+    mov r1, r0                 @ fix input history
-    mov r2, r11, lsl r12    @ get result
+    smlal r10, r11, r5, r0     @ acc += b1*x[i - 1]
-    @ TODO: arm makes it easy to mix in lower bits from r10 for extended
+    ldr r0, [r9]               @ load input and fix history in same operation
-    @ precision here, but we don't have enough regs to save the shift factor
+    smlal r10, r11, r4, r0     @ acc += b0*x[i]
-    @ we would need (32 - r12).
+    smlal r10, r11, r7, r2     @ acc += a1*y[i - 1]
-    str r2, [r9], #4        @ save result
+    smlal r10, r11, r8, r3     @ acc += a2*y[i - 2]
-    subs r14, r14, #1       @ are we done with this channel?
+    mov r3, r2                 @ fix output history
+    mov r2, r11, asl r12       @ get upper part of result and shift left
+#ifdef HIGH_PRECISION
+    rsb r11, r12, #32          @ get shift amount for lower part
+    orr r2, r2, r10, lsr r11   @ then mix in correctly shifted lower part
+#endif
+    str r2, [r9], #4           @ save result
+    subs r14, r14, #1          @ are we done with this channel?
    bne .loop
-    ldr r10, [sp, #4]       @ load filter struct pointer
+    ldr r10, [sp, #4]          @ load filter struct pointer
-    stmia r10!, { r0-r3 }   @ save back history
+    stmia r10!, { r0-r3 }      @ save back history
-    ldr r11, [sp, #12]      @ load number of channels
+    ldr r11, [sp, #12]         @ load number of channels
-    subs r11, r11, #1       @ all channels processed?
+    subs r11, r11, #1          @ all channels processed?
    strne r11, [sp, #12]
    bne .filterloop
-    add sp, sp, #16         @ compensate for temp storage
+    add sp, sp, #16            @ compensate for temp storage
    ldmia sp!, { r4-r11, pc }
diff --git a/apps/eq_cf.S b/apps/eq_cf.S
index c9458cdc77..75bfcafb3a 100644
--- a/apps/eq_cf.S
+++ b/apps/eq_cf.S
@@ -7,7 +7,7 @@
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
- * Copyright (C) 2006 Thom Johansen
+ * Copyright (C) 2006-2007 Thom Johansen
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
@@ -17,14 +17,27 @@
 *
 ****************************************************************************/
+/* Uncomment this to mix the lower accumulator bits back into the result
+ * after shifting. Without it, the low ("shift" - 1) bits of output are lost.
+ */
+/* #define HIGH_PRECISION */
+/*
+ * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+ *                unsigned channels, unsigned shift)
+ */
    .text
    .global eq_filter
 eq_filter:
    lea.l (-11*4, %sp), %sp 
    movem.l %d2-%d7/%a2-%a6, (%sp)    | save clobbered regs
    move.l (11*4+8, %sp), %a5         | fetch filter structure address
-    movem.l (11*4+16, %sp), %d6-%d7   | load num. channels and shift count
+    move.l (11*4+20, %sp), %d7        | load shift count
    subq.l #1, %d7                    | EMAC gives us one free shift
+#ifdef HIGH_PRECISION
+    moveq.l #8, %d6
+    sub.l %d7, %d6                    | shift for lower part of accumulator
+#endif
    movem.l (%a5), %a0-%a4            | load coefs
    lea.l (5*4, %a5), %a5             | point to filter history
@@ -34,11 +47,16 @@ eq_filter:
    move.l (%a6), %a6
    move.l (11*4+12, %sp), %d5        | number of samples
    movem.l (%a5), %d0-%d3            | load filter history
+    /* d0-d3 = history, d4 = scratch for lower bits (HIGH_PRECISION only),
+     * d5 = sample count, d6 = lower shift amount, d7 = upper shift amount,
+     * a0-a4 = coefs, a5 = history pointer, a6 = x[]; channel count is on stack
+     */
 .loop:
    /* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode.
-       y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+     * y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
-       where y[] is output and x[] is input. This is performed out of order
+     * where y[] is output and x[] is input. This is performed out of order
-       to do parallel load of input value.
+     * to do parallel load of input value.
     */
    mac.l %a2, %d1, %acc0               | acc = b2*x[i - 2]
    move.l %d0, %d1                     | fix input history
@@ -47,15 +65,23 @@ eq_filter:
    mac.l %a3, %d2, %acc0               | acc += a1*y[i - 1]
    mac.l %a4, %d3, %acc0               | acc += a2*y[i - 2]
    move.l %d2, %d3                     | fix output history
-    movclr.l %acc0, %d2                 | fetch and write result
+#ifdef HIGH_PRECISION
+    move.l %accext01, %d2               | fetch lower part of accumulator
+    move.b %d2, %d4                     | clear upper three bytes
+    lsr.l %d6, %d4                      | shift lower bits
+#endif
+    movclr.l %acc0, %d2                 | fetch upper part of result
    asl.l %d7, %d2                      | restore fixed point format
+#ifdef HIGH_PRECISION
+    or.l %d2, %d4                       | combine lower and upper parts
+#endif
    move.l %d2, (%a6)+                  | save result
    subq.l #1, %d5                      | are we done with this channel?
    jne .loop
    
    movem.l %d0-%d3, (%a5)              | save history back to struct
    lea.l (4*4, %a5), %a5               | point to next channel's history
-    subq.l #1, %d6                      | have we processed both channels?
+    subq.l #1, (11*4+16, %sp)           | have we processed both channels?
    jne .filterloop
    movem.l (%sp), %d2-%d7/%a2-%a6

diff --git a/apps/eq_arm.S b/apps/eq_arm.S index 85617dc2fb..0c1961d2d3 100644 --- a/apps/eq_arm.S +++ b/apps/eq_arm.S
@@ -7,7 +7,7 @@
7	* \/ \/ \/ \/ \/	7	* \/ \/ \/ \/ \/
8	* $Id$	8	* $Id$
9	*	9	*
10	* Copyright (C) 2006 Thom Johansen	10	* Copyright (C) 2006-2007 Thom Johansen
11	*	11	*
12	* All files in this archive are subject to the GNU General Public License.	12	* All files in this archive are subject to the GNU General Public License.
13	* See the file COPYING in the source tree root for full license agreement.	13	* See the file COPYING in the source tree root for full license agreement.
@@ -17,6 +17,15 @@
17	*	17	*
18	****************************************************************************/	18	****************************************************************************/
19		19
		20	/* uncomment this to make filtering calculate lower bits after shifting.
		21	* without this, "shift" of the lower bits will be lost here.
		22	*/
		23	/* #define HIGH_PRECISION */
		24
		25	/*
		26	* void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
		27	* unsigned channels, unsigned shift)
		28	*/
20	.text	29	.text
21	.global eq_filter	30	.global eq_filter
22	eq_filter:	31	eq_filter:
@@ -33,35 +42,40 @@ eq_filter:
33	ldr r14, [sp, #8] @ r14 = numsamples	42	ldr r14, [sp, #8] @ r14 = numsamples
34	ldmia r10, { r0-r3 } @ load history, r10 should be filter struct addr	43	ldmia r10, { r0-r3 } @ load history, r10 should be filter struct addr
35	str r10, [sp, #4] @ save it for loop end	44	str r10, [sp, #4] @ save it for loop end
36	.loop:	45
37	/* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator,	46	/* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator,
38	r12 = shift amount, r14 = number of samples.	47	* r12 = shift amount, r14 = number of samples.
39	See eq_cf.S for explanation of what this loop does. Primary difference
40	is the reordering of the equation we do here, which is done for register
41	reuse reasons, we're pretty short on regs.
42	*/	48	*/
43	smull r10, r11, r6, r1 @ acc = b2*x[i - 2]	49	.loop:
44	mov r1, r0 @ fix input history	50	/* Direct form 1 filtering code.
45	smlal r10, r11, r5, r0 @ acc += b1*x[i - 1]	51	* y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
46	ldr r0, [r9] @ load input and fix history in same operation	52	* where y[] is output and x[] is input. This is performed out of order to
47	smlal r10, r11, r4, r0 @ acc += b0*x[i]	53	* reuse registers, we're pretty short on regs.
48	smlal r10, r11, r7, r2 @ acc += a1*y[i - 1]	54	*/
49	smlal r10, r11, r8, r3 @ acc += a2*y[i - 2]	55	smull r10, r11, r6, r1 @ acc = b2*x[i - 2]
50	mov r3, r2 @ fix output history	56	mov r1, r0 @ fix input history
51	mov r2, r11, lsl r12 @ get result	57	smlal r10, r11, r5, r0 @ acc += b1*x[i - 1]
52	@ TODO: arm makes it easy to mix in lower bits from r10 for extended	58	ldr r0, [r9] @ load input and fix history in same operation
53	@ precision here, but we don't have enough regs to save the shift factor	59	smlal r10, r11, r4, r0 @ acc += b0*x[i]
54	@ we would need (32 - r12).	60	smlal r10, r11, r7, r2 @ acc += a1*y[i - 1]
55	str r2, [r9], #4 @ save result	61	smlal r10, r11, r8, r3 @ acc += a2*y[i - 2]
56	subs r14, r14, #1 @ are we done with this channel?	62	mov r3, r2 @ fix output history
		63	mov r2, r11, asl r12 @ get upper part of result and shift left
		64	#ifdef HIGH_PRECISION
		65	rsb r11, r12, #32 @ get shift amount for lower part
		66	orr r2, r2, r10, lsr r11 @ then mix in correctly shifted lower part
		67	#endif
		68	str r2, [r9], #4 @ save result
		69	subs r14, r14, #1 @ are we done with this channel?
57	bne .loop	70	bne .loop
58		71
59	ldr r10, [sp, #4] @ load filter struct pointer	72	ldr r10, [sp, #4] @ load filter struct pointer
60	stmia r10!, { r0-r3 } @ save back history	73	stmia r10!, { r0-r3 } @ save back history
61	ldr r11, [sp, #12] @ load number of channels	74	ldr r11, [sp, #12] @ load number of channels
62	subs r11, r11, #1 @ all channels processed?	75	subs r11, r11, #1 @ all channels processed?
63	strne r11, [sp, #12]	76	strne r11, [sp, #12]
64	bne .filterloop	77	bne .filterloop
65		78
66	add sp, sp, #16 @ compensate for temp storage	79	add sp, sp, #16 @ compensate for temp storage
67	ldmia sp!, { r4-r11, pc }	80	ldmia sp!, { r4-r11, pc }
		81


diff --git a/apps/eq_cf.S b/apps/eq_cf.S index c9458cdc77..75bfcafb3a 100644 --- a/apps/eq_cf.S +++ b/apps/eq_cf.S
@@ -7,7 +7,7 @@
7	* \/ \/ \/ \/ \/	7	* \/ \/ \/ \/ \/
8	* $Id$	8	* $Id$
9	*	9	*
10	* Copyright (C) 2006 Thom Johansen	10	* Copyright (C) 2006-2007 Thom Johansen
11	*	11	*
12	* All files in this archive are subject to the GNU General Public License.	12	* All files in this archive are subject to the GNU General Public License.
13	* See the file COPYING in the source tree root for full license agreement.	13	* See the file COPYING in the source tree root for full license agreement.
@@ -17,14 +17,27 @@
17	*	17	*
18	****************************************************************************/	18	****************************************************************************/
19		19
		20	/* uncomment this to make filtering calculate lower bits after shifting.
		21	* without this, "shift" - 1 of the lower bits will be lost here.
		22	*/
		23	/* #define HIGH_PRECISION */
		24
		25	/*
		26	* void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
		27	* unsigned channels, unsigned shift)
		28	*/
20	.text	29	.text
21	.global eq_filter	30	.global eq_filter
22	eq_filter:	31	eq_filter:
23	lea.l (-11*4, %sp), %sp	32	lea.l (-11*4, %sp), %sp
24	movem.l %d2-%d7/%a2-%a6, (%sp) \| save clobbered regs	33	movem.l %d2-%d7/%a2-%a6, (%sp) \| save clobbered regs
25	move.l (11*4+8, %sp), %a5 \| fetch filter structure address	34	move.l (11*4+8, %sp), %a5 \| fetch filter structure address
26	movem.l (11*4+16, %sp), %d6-%d7 \| load num. channels and shift count	35	move.l (11*4+20, %sp), %d7 \| load shift count
27	subq.l #1, %d7 \| EMAC gives us one free shift	36	subq.l #1, %d7 \| EMAC gives us one free shift
		37	#ifdef HIGH_PRECISION
		38	moveq.l #8, %d6
		39	sub.l %d7, %d6 \| shift for lower part of accumulator
		40	#endif
28	movem.l (%a5), %a0-%a4 \| load coefs	41	movem.l (%a5), %a0-%a4 \| load coefs
29	lea.l (5*4, %a5), %a5 \| point to filter history	42	lea.l (5*4, %a5), %a5 \| point to filter history
30		43
@@ -34,11 +47,16 @@ eq_filter:
34	move.l (%a6), %a6	47	move.l (%a6), %a6
35	move.l (11*4+12, %sp), %d5 \| number of samples	48	move.l (11*4+12, %sp), %d5 \| number of samples
36	movem.l (%a5), %d0-%d3 \| load filter history	49	movem.l (%a5), %d0-%d3 \| load filter history
		50
		51	/* d0-d3 = history, d4 = scratch for lower bits (HIGH_PRECISION only),
		52	* d5 = sample count, d6 = lower shift amount, d7 = upper shift amount,
		53	* a0-a4 = coefs, a5 = history pointer, a6 = x[]; channel count is on stack
		54	*/
37	.loop:	55	.loop:
38	/* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode.	56	/* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode.
39	y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],	57	* y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
40	where y[] is output and x[] is input. This is performed out of order	58	* where y[] is output and x[] is input. This is performed out of order
41	to do parallel load of input value.	59	* to do parallel load of input value.
42	*/	60	*/
43	mac.l %a2, %d1, %acc0 \| acc = b2*x[i - 2]	61	mac.l %a2, %d1, %acc0 \| acc = b2*x[i - 2]
44	move.l %d0, %d1 \| fix input history	62	move.l %d0, %d1 \| fix input history
@@ -47,15 +65,23 @@ eq_filter:
47	mac.l %a3, %d2, %acc0 \| acc += a1*y[i - 1]	65	mac.l %a3, %d2, %acc0 \| acc += a1*y[i - 1]
48	mac.l %a4, %d3, %acc0 \| acc += a2*y[i - 2]	66	mac.l %a4, %d3, %acc0 \| acc += a2*y[i - 2]
49	move.l %d2, %d3 \| fix output history	67	move.l %d2, %d3 \| fix output history
50	movclr.l %acc0, %d2 \| fetch and write result	68	#ifdef HIGH_PRECISION
		69	move.l %accext01, %d2 \| fetch lower part of accumulator
		70	move.b %d2, %d4 \| clear upper three bytes
		71	lsr.l %d6, %d4 \| shift lower bits
		72	#endif
		73	movclr.l %acc0, %d2 \| fetch upper part of result
51	asl.l %d7, %d2 \| restore fixed point format	74	asl.l %d7, %d2 \| restore fixed point format
		75	#ifdef HIGH_PRECISION
		76	or.l %d2, %d4 \| combine lower and upper parts
		77	#endif
52	move.l %d2, (%a6)+ \| save result	78	move.l %d2, (%a6)+ \| save result
53	subq.l #1, %d5 \| are we done with this channel?	79	subq.l #1, %d5 \| are we done with this channel?
54	jne .loop	80	jne .loop
55		81
56	movem.l %d0-%d3, (%a5) \| save history back to struct	82	movem.l %d0-%d3, (%a5) \| save history back to struct
57	lea.l (4*4, %a5), %a5 \| point to next channel's history	83	lea.l (4*4, %a5), %a5 \| point to next channel's history
58	subq.l #1, %d6 \| have we processed both channels?	84	subq.l #1, (11*4+16, %sp) \| have we processed both channels?
59	jne .filterloop	85	jne .filterloop
60		86
61	movem.l (%sp), %d2-%d7/%a2-%a6	87	movem.l (%sp), %d2-%d7/%a2-%a6