diff options
-rw-r--r-- | apps/eq_arm.S | 64 | ||||
-rw-r--r-- | apps/eq_cf.S | 40 |
2 files changed, 72 insertions, 32 deletions
diff --git a/apps/eq_arm.S b/apps/eq_arm.S index 85617dc2fb..0c1961d2d3 100644 --- a/apps/eq_arm.S +++ b/apps/eq_arm.S | |||
@@ -7,7 +7,7 @@ | |||
7 | * \/ \/ \/ \/ \/ | 7 | * \/ \/ \/ \/ \/ |
8 | * $Id$ | 8 | * $Id$ |
9 | * | 9 | * |
10 | * Copyright (C) 2006 Thom Johansen | 10 | * Copyright (C) 2006-2007 Thom Johansen |
11 | * | 11 | * |
12 | * All files in this archive are subject to the GNU General Public License. | 12 | * All files in this archive are subject to the GNU General Public License. |
13 | * See the file COPYING in the source tree root for full license agreement. | 13 | * See the file COPYING in the source tree root for full license agreement. |
@@ -17,6 +17,15 @@ | |||
17 | * | 17 | * |
18 | ****************************************************************************/ | 18 | ****************************************************************************/ |
19 | 19 | ||
20 | /* uncomment this to make filtering calculate lower bits after shifting. | ||
21 | * without this, the lowest "shift" bits of the result will be lost here. | ||
22 | */ | ||
23 | /* #define HIGH_PRECISION */ | ||
24 | |||
25 | /* | ||
26 | * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num, | ||
27 | * unsigned channels, unsigned shift) | ||
28 | */ | ||
20 | .text | 29 | .text |
21 | .global eq_filter | 30 | .global eq_filter |
22 | eq_filter: | 31 | eq_filter: |
@@ -33,35 +42,40 @@ eq_filter: | |||
33 | ldr r14, [sp, #8] @ r14 = numsamples | 42 | ldr r14, [sp, #8] @ r14 = numsamples |
34 | ldmia r10, { r0-r3 } @ load history, r10 should be filter struct addr | 43 | ldmia r10, { r0-r3 } @ load history, r10 should be filter struct addr |
35 | str r10, [sp, #4] @ save it for loop end | 44 | str r10, [sp, #4] @ save it for loop end |
36 | .loop: | 45 | |
37 | /* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator, | 46 | /* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator, |
38 | r12 = shift amount, r14 = number of samples. | 47 | * r12 = shift amount, r14 = number of samples. |
39 | See eq_cf.S for explanation of what this loop does. Primary difference | ||
40 | is the reordering of the equation we do here, which is done for register | ||
41 | reuse reasons, we're pretty short on regs. | ||
42 | */ | 48 | */ |
43 | smull r10, r11, r6, r1 @ acc = b2*x[i - 2] | 49 | .loop: |
44 | mov r1, r0 @ fix input history | 50 | /* Direct form 1 filtering code. |
45 | smlal r10, r11, r5, r0 @ acc += b1*x[i - 1] | 51 | * y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], |
46 | ldr r0, [r9] @ load input and fix history in same operation | 52 | * where y[] is output and x[] is input. This is performed out of order to |
47 | smlal r10, r11, r4, r0 @ acc += b0*x[i] | 53 | * reuse registers, we're pretty short on regs. |
48 | smlal r10, r11, r7, r2 @ acc += a1*y[i - 1] | 54 | */ |
49 | smlal r10, r11, r8, r3 @ acc += a2*y[i - 2] | 55 | smull r10, r11, r6, r1 @ acc = b2*x[i - 2] |
50 | mov r3, r2 @ fix output history | 56 | mov r1, r0 @ fix input history |
51 | mov r2, r11, lsl r12 @ get result | 57 | smlal r10, r11, r5, r0 @ acc += b1*x[i - 1] |
52 | @ TODO: arm makes it easy to mix in lower bits from r10 for extended | 58 | ldr r0, [r9] @ load input and fix history in same operation |
53 | @ precision here, but we don't have enough regs to save the shift factor | 59 | smlal r10, r11, r4, r0 @ acc += b0*x[i] |
54 | @ we would need (32 - r12). | 60 | smlal r10, r11, r7, r2 @ acc += a1*y[i - 1] |
55 | str r2, [r9], #4 @ save result | 61 | smlal r10, r11, r8, r3 @ acc += a2*y[i - 2] |
56 | subs r14, r14, #1 @ are we done with this channel? | 62 | mov r3, r2 @ fix output history |
63 | mov r2, r11, asl r12 @ get upper part of result and shift left | ||
64 | #ifdef HIGH_PRECISION | ||
65 | rsb r11, r12, #32 @ get shift amount for lower part | ||
66 | orr r2, r2, r10, lsr r11 @ then mix in correctly shifted lower part | ||
67 | #endif | ||
68 | str r2, [r9], #4 @ save result | ||
69 | subs r14, r14, #1 @ are we done with this channel? | ||
57 | bne .loop | 70 | bne .loop |
58 | 71 | ||
59 | ldr r10, [sp, #4] @ load filter struct pointer | 72 | ldr r10, [sp, #4] @ load filter struct pointer |
60 | stmia r10!, { r0-r3 } @ save back history | 73 | stmia r10!, { r0-r3 } @ save back history |
61 | ldr r11, [sp, #12] @ load number of channels | 74 | ldr r11, [sp, #12] @ load number of channels |
62 | subs r11, r11, #1 @ all channels processed? | 75 | subs r11, r11, #1 @ all channels processed? |
63 | strne r11, [sp, #12] | 76 | strne r11, [sp, #12] |
64 | bne .filterloop | 77 | bne .filterloop |
65 | 78 | ||
66 | add sp, sp, #16 @ compensate for temp storage | 79 | add sp, sp, #16 @ compensate for temp storage |
67 | ldmia sp!, { r4-r11, pc } | 80 | ldmia sp!, { r4-r11, pc } |
81 | |||
diff --git a/apps/eq_cf.S b/apps/eq_cf.S index c9458cdc77..75bfcafb3a 100644 --- a/apps/eq_cf.S +++ b/apps/eq_cf.S | |||
@@ -7,7 +7,7 @@ | |||
7 | * \/ \/ \/ \/ \/ | 7 | * \/ \/ \/ \/ \/ |
8 | * $Id$ | 8 | * $Id$ |
9 | * | 9 | * |
10 | * Copyright (C) 2006 Thom Johansen | 10 | * Copyright (C) 2006-2007 Thom Johansen |
11 | * | 11 | * |
12 | * All files in this archive are subject to the GNU General Public License. | 12 | * All files in this archive are subject to the GNU General Public License. |
13 | * See the file COPYING in the source tree root for full license agreement. | 13 | * See the file COPYING in the source tree root for full license agreement. |
@@ -17,14 +17,27 @@ | |||
17 | * | 17 | * |
18 | ****************************************************************************/ | 18 | ****************************************************************************/ |
19 | 19 | ||
20 | /* uncomment this to make filtering calculate lower bits after shifting. | ||
21 | * without this, the lowest "shift" - 1 bits of the result will be lost here. | ||
22 | */ | ||
23 | /* #define HIGH_PRECISION */ | ||
24 | |||
25 | /* | ||
26 | * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num, | ||
27 | * unsigned channels, unsigned shift) | ||
28 | */ | ||
20 | .text | 29 | .text |
21 | .global eq_filter | 30 | .global eq_filter |
22 | eq_filter: | 31 | eq_filter: |
23 | lea.l (-11*4, %sp), %sp | 32 | lea.l (-11*4, %sp), %sp |
24 | movem.l %d2-%d7/%a2-%a6, (%sp) | save clobbered regs | 33 | movem.l %d2-%d7/%a2-%a6, (%sp) | save clobbered regs |
25 | move.l (11*4+8, %sp), %a5 | fetch filter structure address | 34 | move.l (11*4+8, %sp), %a5 | fetch filter structure address |
26 | movem.l (11*4+16, %sp), %d6-%d7 | load num. channels and shift count | 35 | move.l (11*4+20, %sp), %d7 | load shift count |
27 | subq.l #1, %d7 | EMAC gives us one free shift | 36 | subq.l #1, %d7 | EMAC gives us one free shift |
37 | #ifdef HIGH_PRECISION | ||
38 | moveq.l #8, %d6 | ||
39 | sub.l %d7, %d6 | shift for lower part of accumulator | ||
40 | #endif | ||
28 | movem.l (%a5), %a0-%a4 | load coefs | 41 | movem.l (%a5), %a0-%a4 | load coefs |
29 | lea.l (5*4, %a5), %a5 | point to filter history | 42 | lea.l (5*4, %a5), %a5 | point to filter history |
30 | 43 | ||
@@ -34,11 +47,16 @@ eq_filter: | |||
34 | move.l (%a6), %a6 | 47 | move.l (%a6), %a6 |
35 | move.l (11*4+12, %sp), %d5 | number of samples | 48 | move.l (11*4+12, %sp), %d5 | number of samples |
36 | movem.l (%a5), %d0-%d3 | load filter history | 49 | movem.l (%a5), %d0-%d3 | load filter history |
50 | |||
51 | /* d0-d3 = history, d4 = low-bits temp (HIGH_PRECISION), d5 = sample count, | ||
52 | * d6 = lower shift amount, d7 = upper shift amount, a0-a4 = coefs, | ||
53 | * a5 = history pointer, a6 = x[] | ||
54 | */ | ||
37 | .loop: | 55 | .loop: |
38 | /* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode. | 56 | /* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode. |
39 | y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], | 57 | * y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], |
40 | where y[] is output and x[] is input. This is performed out of order | 58 | * where y[] is output and x[] is input. This is performed out of order |
41 | to do parallel load of input value. | 59 | * to do parallel load of input value. |
42 | */ | 60 | */ |
43 | mac.l %a2, %d1, %acc0 | acc = b2*x[i - 2] | 61 | mac.l %a2, %d1, %acc0 | acc = b2*x[i - 2] |
44 | move.l %d0, %d1 | fix input history | 62 | move.l %d0, %d1 | fix input history |
@@ -47,15 +65,23 @@ eq_filter: | |||
47 | mac.l %a3, %d2, %acc0 | acc += a1*y[i - 1] | 65 | mac.l %a3, %d2, %acc0 | acc += a1*y[i - 1] |
48 | mac.l %a4, %d3, %acc0 | acc += a2*y[i - 2] | 66 | mac.l %a4, %d3, %acc0 | acc += a2*y[i - 2] |
49 | move.l %d2, %d3 | fix output history | 67 | move.l %d2, %d3 | fix output history |
50 | movclr.l %acc0, %d2 | fetch and write result | 68 | #ifdef HIGH_PRECISION |
69 | move.l %accext01, %d2 | fetch lower part of accumulator | ||
70 | move.b %d2, %d4 | take low byte; NOTE(review): move.b does not clear d4's upper three bytes | ||
71 | lsr.l %d6, %d4 | shift lower bits | ||
72 | #endif | ||
73 | movclr.l %acc0, %d2 | fetch upper part of result | ||
51 | asl.l %d7, %d2 | restore fixed point format | 74 | asl.l %d7, %d2 | restore fixed point format |
75 | #ifdef HIGH_PRECISION | ||
76 | or.l %d2, %d4 | combine lower and upper parts; NOTE(review): result lands in d4, but d2 is what gets stored | ||
77 | #endif | ||
52 | move.l %d2, (%a6)+ | save result | 78 | move.l %d2, (%a6)+ | save result |
53 | subq.l #1, %d5 | are we done with this channel? | 79 | subq.l #1, %d5 | are we done with this channel? |
54 | jne .loop | 80 | jne .loop |
55 | 81 | ||
56 | movem.l %d0-%d3, (%a5) | save history back to struct | 82 | movem.l %d0-%d3, (%a5) | save history back to struct |
57 | lea.l (4*4, %a5), %a5 | point to next channel's history | 83 | lea.l (4*4, %a5), %a5 | point to next channel's history |
58 | subq.l #1, %d6 | have we processed both channels? | 84 | subq.l #1, (11*4+16, %sp) | have we processed all channels? |
59 | jne .filterloop | 85 | jne .filterloop |
60 | 86 | ||
61 | movem.l (%sp), %d2-%d7/%a2-%a6 | 87 | movem.l (%sp), %d2-%d7/%a2-%a6 |