2 files changed, 168 insertions, 17 deletions
diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c
index 02f93a27b1..e64f087a5d 100644
--- a/apps/codecs/libspeex/filters.c
+++ b/apps/codecs/libspeex/filters.c
@@ -47,6 +47,7 @@
 #include "filters_arm4.h"
 #elif defined (COLDFIRE_ASM)
 #define OVERRIDE_IIR_MEM16
+#define OVERRIDE_QMF_SYNTH
 #elif defined (BFIN_ASM)
 #include "filters_bfin.h"
 #endif
@@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
   }
 }
+#ifndef OVERRIDE_QMF_SYNTH
 /* Re-synthesised a signal from the QMF low-band and high-band signals */
 void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
   /* assumptions:
@@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
   for (i = 0; i < M2; i++)
      mem2[2*i+1] = xx2[i];
 }
+#endif
 #ifdef FIXED_POINT
 #if 0
diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S
index 579af11581..dd650844c8 100644
--- a/apps/codecs/libspeex/filters_cf.S
+++ b/apps/codecs/libspeex/filters_cf.S
@@ -31,7 +31,6 @@
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
    .text
 /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
    .global iir_mem16
@@ -59,14 +58,18 @@ iir_mem16:
    move.w   (%a3)+, %d0
    ext.l    %d0
    add.l    %d1, %d0           | Add with x[i]
-    move.l   #32768, %d1
+    move.l   #32767, %d1
-    add.l    %d1, %d0           | Bias result to [0..65535]
+    move.l   #65534, %a6
-    cmp.l    #65535, %d0        | Clip to [0..65535] range
+    add.l    %d1, %d0           | Bias result to [-1..65534]
-    jle      1f
+    cmp.l    %a6, %d0           | Now do clip to [0..65534] range
-    spl.b    %d0                
+    jls      2f
-    ext.w    %d0
+    jpl      1f
+    clr.l    %d0                | Clip low
+    .word    0x51fa             | trapf.w, shadow next insn
 1:
-    sub.l    %d1, %d0           | Bias clipped result back to [-32768..32767]
+    move.l   %a6, %d0           | Clip high
+2:
+    sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
    neg.l    %d0                | msac.w is bugged in gas, do this for now
    move.w   %d0, (%a5)+        | Write result to y[i]
    move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
@@ -111,14 +114,18 @@ iir_mem16:
    move.w   (%a3)+, %d0
    ext.l    %d0
    add.l    %d1, %d0           | Add with x[i]
-    move.l   #32768, %d1
+    move.l   #32767, %d1
-    add.l    %d1, %d0           | Bias result to [0..65535]
+    move.l   #65534, %a6
-    cmp.l    #65535, %d0        | Clip to [0..65535] range
+    add.l    %d1, %d0           | Bias result to [-1..65534]
-    jle      1f
+    cmp.l    %a6, %d0           | Now do clip to [0..65534] range
-    spl.b    %d0                
+    jls      2f
-    ext.w    %d0
+    jpl      1f
+    clr.l    %d0                | Clip low
+    .word    0x51fa             | trapf.w, shadow next insn
 1:
-    sub.l    %d1, %d0           | Bias clipped result back to [-32768..32767]
+    move.l   %a6, %d0           | Clip high
+2:
+    sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
    neg.l    %d0                | msac.w is bugged in gas, do this for now
    move.w   %d0, (%a5)+        | Write result to y[i]
    move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
@@ -159,7 +166,148 @@ iir_mem16:
    movem.l  %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
 .exit:
-    movem.l (%sp), %d2-%d7/%a2-%a6
+    movem.l  (%sp), %d2-%d7/%a2-%a6
-    lea.l (44, %sp), %sp
+    lea.l    (44, %sp), %sp
+    rts
+/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
+ *
+ * Assembly replacement for the C qmf_synth() (selected through
+ * OVERRIDE_QMF_SYNTH): re-synthesises y[] from the two band signals x1[]
+ * and x2[] with the coefficients a[], using MAC accumulators acc0-acc3.
+ *
+ * Arguments are read from the caller stack frame: x1, x2, a, y, N, M,
+ * mem1, mem2. The trailing stack argument is never loaded; the two scratch
+ * arrays xx1 and xx2 (N/2 + M/2 halfwords each) are carved out below sp.
+ *
+ * Requirements implied by the loop structure (every loop counter is
+ * decremented and then tested with jne, so it must hit exactly zero):
+ *   - N/2 must be non-zero and even (copy loop steps by 1, outer loop by 2)
+ *   - M/2 must be non-zero and even (copy loops step by 1, inner loop by 2)
+ *
+ * State carried between calls: only the odd-indexed 32-bit entries of
+ * mem1[] and mem2[] are read (low 16 bits) and written (sign-extended).
+ *
+ * Registers d2-d7/a2-a6 are saved on entry and restored on exit.
+ * macsr is set to 0x80 on entry and to 0 on exit; the value it held on
+ * entry is not saved. NOTE(review): confirm callers expect macsr == 0.
+ *
+ * NOTE(review): the last inner-loop pass of every outer iteration fetches
+ * one more longword from a[] than is multiplied (the parallel load in the
+ * final mac.w). Presumably harmless; confirm against the size of a[].
+ */
+    .global qmf_synth
+qmf_synth:
+    lea.l    (-44, %sp), %sp               | Room for the 11 saved registers
+    movem.l  %d2-%d7/%a2-%a6, (%sp)        | Save d2-d7/a2-a6
+    movem.l  (44+4, %sp), %a0-%a3          | a0 = x1, a1 = x2, a2 = a, a3 = y
+    movem.l  (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1,a5 = mem2
+    move.l   #0x80, %macsr                 | Enable saturation
+    | Names used in the comments below (N2, M2, xx1, xx2, x10, x11, ...) follow
+    | the reference C version in filters.c
+    move.l   %a2, %d6                   | Backup a (a2 is reused as xx1 cursor)
+    lsr.l    #1, %d0                    | N2 = N >> 1
+    lsr.l    #1, %d1                    | M2 = M >> 1
+    move.l   %d1, %d7                   | Backup M2
+    clr.l    %d2
+    sub.l    %d0, %d2                   | d2 = -N2
+    sub.l    %d1, %d2                   | d2 = -(N2 + M2)
+    lea.l    (%sp, %d2.l*2), %a2        | Alloc two buffers of N2 + M2 shorts
+    lea.l    (%a2, %d2.l*2), %a6        | a2 = xx1, a6 = xx2
+    move.l   %sp, %d3                   | d3 = sp before the allocation
+    move.l   %a6, %sp                   | Move sp below both buffers
+    move.l   %d3, -(%sp)                | Stack old %sp, reloaded at exit
+    | Backwards copy x1 and x2 arrays to xx1 and xx2
+    | TODO: these copying loops probably have more potential for optimization
+    lea.l    (%a0, %d0.l*2), %a0        | x1 += N2
+    lea.l    (%a1, %d0.l*2), %a1        | x2 += N2
+    move.l   %d0, %d2                   | Loop counter is N2
+0:
+    move.w   -(%a0), (%a2)+             | xx1[i] = x1[N2 - 1 - i]
+    move.w   -(%a1), (%a6)+             | xx2[i] = x2[N2 - 1 - i]
+    subq.l   #1, %d2
+    jne      0b
+    | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
+    move.l   %d1, %d2                   | Loop counter is M2
+    addq.l   #4, %a4                    | a4 = &mem1[1]
+    addq.l   #4, %a5                    | a5 = &mem2[1]
+    move.l   %a4, %d3                   | Backup &mem1[1] and &mem2[1]
+    move.l   %a5, %d4
+0:
+    move.l   (%a4), %d5                 | d5 = mem1[2*i + 1]
+    move.w   %d5, (%a2)+                | xx1[N2 + i] = low half of d5
+    move.l   (%a5), %d5                 | d5 = mem2[2*i + 1]
+    move.w   %d5, (%a6)+                | xx2[N2 + i] = low half of d5
+    addq.l   #8, %a4                    | Step over the even-indexed entry
+    addq.l   #8, %a5
+    subq.l   #1, %d2
+    jne      0b
+    move.l   %d3, %a4                   | a4 = &mem1[1]
+    move.l   %d4, %a5                   | a5 = &mem2[1]
+    clr.l    %d2
+    sub.l    %d1, %d2                   | d2 = -M2
+    lea.l    (-4, %a2, %d2.l*2), %a0    | a0 = &xx1[N2 - 2]
+    lea.l    (-4, %a6, %d2.l*2), %a1    | a1 = &xx2[N2 - 2]
+    move.l   %d6, %a2                   | a2 = a
+    | Main loop, register usage:
+    | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
+    | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
+    | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = &mem1[1], a5 = &mem2[1]
+    | (a0/a1 inside [a0, a1] are filter coefficients, not address registers)
+0:  | Outer loop: produces four y[] samples per pass
+    move.l   #32768, %d2                        | Rounding constant
+    move.l   %d2, %acc0
+    move.l   %d2, %acc1
+    move.l   %d2, %acc2
+    move.l   %d2, %acc3
+    move.w   (%a0)+, %d2                        | d2 = x10
+    move.w   (%a1)+, %d4                        | d4 = x20
+    move.l   (%a2)+, %d6                        | d6 = [a0, a1] coefficient pair
+1:  | Inner loop: consumes two [a0, a1] coefficient pairs per pass
+    move.w   (%a0)+, %d3                        | d3 = x11
+    move.w   (%a1)+, %d5                        | d5 = x21
+    mac.w    %d6u, %d3l, #1, %acc0              | acc0 += a0*x11
+    msac.w   %d6u, %d5l, #1, %acc0              | acc0 -= a0*x21
+    mac.w    %d6l, %d3l, #1, %acc1              | acc1 += a1*x11
+    mac.w    %d6l, %d5l, #1, %acc1              | acc1 += a1*x21
+    mac.w    %d6u, %d2l, #1, %acc2              | acc2 += a0*x10
+    msac.w   %d6u, %d4l, #1, %acc2              | acc2 -= a0*x20
+    mac.w    %d6l, %d2l, #1, %acc3              | acc3 += a1*x10
+    mac.w    %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20, d6 = next pair
+    move.w   (%a0)+, %d2                        | d2 = x10
+    move.w   (%a1)+, %d4                        | d4 = x20
+    mac.w    %d6u, %d2l, #1, %acc0              | acc0 += a0*x10
+    msac.w   %d6u, %d4l, #1, %acc0              | acc0 -= a0*x20
+    mac.w    %d6l, %d2l, #1, %acc1              | acc1 += a1*x10
+    mac.w    %d6l, %d4l, #1, %acc1              | acc1 += a1*x20
+    mac.w    %d6u, %d3l, #1, %acc2              | acc2 += a0*x11
+    msac.w   %d6u, %d5l, #1, %acc2              | acc2 -= a0*x21
+    mac.w    %d6l, %d3l, #1, %acc3              | acc3 += a1*x11
+    mac.w    %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21, d6 = next pair
+    subq.l   #2, %d1                            | Two pairs done per pass
+    jne      1b
+ 
+    sub.l    %d7, %d1                           | d1 = -M2 (d1 is 0 here)
+    lea.l    (-4, %a2, %d1.l*4), %a2            | a2 = &a[0]
+    lea.l    (-6, %a0, %d1.l*2), %a0            | a0 = &xx1[N2 - 2 - i]
+    lea.l    (-6, %a1, %d1.l*2), %a1            | a1 = &xx2[N2 - 2 - i]
+    neg.l    %d1                                | d1 = M2
+    movclr.l %acc0, %d2                         | Fetch and clear accumulators
+    movclr.l %acc1, %d3
+    movclr.l %acc2, %d4
+    movclr.l %acc3, %d5
+    swap.w   %d2                                | Shift 16 right (into low word)
+    swap.w   %d3
+    swap.w   %d4
+    swap.w   %d5
+    | Thanks to the extra shift in the mac chain, we get clipping for free.
+    | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
+    | but since qmf_synth() is called so late in the signal chain, it should
+    | work fine.
+    move.w   %d2, (%a3)+        | Write results to y[]
+    move.w   %d3, (%a3)+
+    move.w   %d4, (%a3)+
+    move.w   %d5, (%a3)+
+    subq.l   #2, %d0            | Outer counter steps by 2, as in the C loop
+    jne      0b
+    | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
+    addq.l   #4, %a0            | a0 = &xx1[0]
+    addq.l   #4, %a1            | a1 = &xx2[0]
+0:
+    move.w   (%a0)+, %d2        | d2 = xx1[i]
+    move.w   (%a1)+, %d3        | d3 = xx2[i]
+    ext.l    %d2                | Sign-extend to 32 bits
+    ext.l    %d3
+    move.l   %d2, (%a4)         | mem1[2*i + 1] = xx1[i]
+    move.l   %d3, (%a5)         | mem2[2*i + 1] = xx2[i]
+    addq.l   #8, %a4
+    addq.l   #8, %a5
+    subq.l   #1, %d1            | d1 counts M2 entries
+    jne      0b
+    move.l   #0, %macsr         | Saturation off; entry value is not restored
+    move.l   (%sp), %sp         | Reload the sp stacked after the allocation
+    movem.l  (%sp), %d2-%d7/%a2-%a6
+    lea.l    (44, %sp), %sp
    rts

diff --git a/apps/codecs/libspeex/filters.c b/apps/codecs/libspeex/filters.c index 02f93a27b1..e64f087a5d 100644 --- a/apps/codecs/libspeex/filters.c +++ b/apps/codecs/libspeex/filters.c
@@ -47,6 +47,7 @@
47	#include "filters_arm4.h"	47	#include "filters_arm4.h"
48	#elif defined (COLDFIRE_ASM)	48	#elif defined (COLDFIRE_ASM)
49	#define OVERRIDE_IIR_MEM16	49	#define OVERRIDE_IIR_MEM16
		50	#define OVERRIDE_QMF_SYNTH
50	#elif defined (BFIN_ASM)	51	#elif defined (BFIN_ASM)
51	#include "filters_bfin.h"	52	#include "filters_bfin.h"
52	#endif	53	#endif
@@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t xx, const spx_word16_t aa, spx_word16_t *y1
475	}	476	}
476	}	477	}
477		478
		479	#ifndef OVERRIDE_QMF_SYNTH
478	/* Re-synthesised a signal from the QMF low-band and high-band signals */	480	/* Re-synthesised a signal from the QMF low-band and high-band signals */
479	void qmf_synth(const spx_word16_t x1, const spx_word16_t x2, const spx_word16_t a, spx_word16_t y, int N, int M, spx_word32_t mem1, spx_word32_t mem2, char *stack)	481	void qmf_synth(const spx_word16_t x1, const spx_word16_t x2, const spx_word16_t a, spx_word16_t y, int N, int M, spx_word32_t mem1, spx_word32_t mem2, char *stack)
480	/* assumptions:	482	/* assumptions:
@@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t x1, const spx_word16_t x2, const spx_word16_
566	for (i = 0; i < M2; i++)	568	for (i = 0; i < M2; i++)
567	mem2[2*i+1] = xx2[i];	569	mem2[2*i+1] = xx2[i];
568	}	570	}
		571	#endif
569		572
570	#ifdef FIXED_POINT	573	#ifdef FIXED_POINT
571	#if 0	574	#if 0


diff --git a/apps/codecs/libspeex/filters_cf.S b/apps/codecs/libspeex/filters_cf.S index 579af11581..dd650844c8 100644 --- a/apps/codecs/libspeex/filters_cf.S +++ b/apps/codecs/libspeex/filters_cf.S
@@ -31,7 +31,6 @@
31	NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS	31	NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	32	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33	*/	33	*/
34
35	.text	34	.text
36	/* void iir_mem16(const spx_word16_t x, const spx_coef_t den, spx_word16_t y, int N, int ord, spx_mem_t mem, char stack) /	35	/* void iir_mem16(const spx_word16_t x, const spx_coef_t den, spx_word16_t y, int N, int ord, spx_mem_t mem, char stack) /
37	.global iir_mem16	36	.global iir_mem16
@@ -59,14 +58,18 @@ iir_mem16:
59	move.w (%a3)+, %d0	58	move.w (%a3)+, %d0
60	ext.l %d0	59	ext.l %d0
61	add.l %d1, %d0 \| Add with x[i]	60	add.l %d1, %d0 \| Add with x[i]
62	move.l #32768, %d1	61	move.l #32767, %d1
63	add.l %d1, %d0 \| Bias result to [0..65535]	62	move.l #65534, %a6
64	cmp.l #65535, %d0 \| Clip to [0..65535] range	63	add.l %d1, %d0 \| Bias result to [-1..65534]
65	jle 1f	64	cmp.l %a6, %d0 \| Now do clip to [0..65534] range
66	spl.b %d0	65	jls 2f
67	ext.w %d0	66	jpl 1f
		67	clr.l %d0 \| Clip low
		68	.word 0x51fa \| trapf.w, shadow next insn
68	1:	69	1:
69	sub.l %d1, %d0 \| Bias clipped result back to [-32768..32767]	70	move.l %a6, %d0 \| Clip high
		71	2:
		72	sub.l %d1, %d0 \| Bias clipped result back to [-32767..32767]
70	neg.l %d0 \| msac.w is bugged in gas, do this for now	73	neg.l %d0 \| msac.w is bugged in gas, do this for now
71	move.w %d0, (%a5)+ \| Write result to y[i]	74	move.w %d0, (%a5)+ \| Write result to y[i]
72	move.l (%a4)+, %a6 \| Fetch den[0] and den[1]	75	move.l (%a4)+, %a6 \| Fetch den[0] and den[1]
@@ -111,14 +114,18 @@ iir_mem16:
111	move.w (%a3)+, %d0	114	move.w (%a3)+, %d0
112	ext.l %d0	115	ext.l %d0
113	add.l %d1, %d0 \| Add with x[i]	116	add.l %d1, %d0 \| Add with x[i]
114	move.l #32768, %d1	117	move.l #32767, %d1
115	add.l %d1, %d0 \| Bias result to [0..65535]	118	move.l #65534, %a6
116	cmp.l #65535, %d0 \| Clip to [0..65535] range	119	add.l %d1, %d0 \| Bias result to [-1..65534]
117	jle 1f	120	cmp.l %a6, %d0 \| Now do clip to [0..65534] range
118	spl.b %d0	121	jls 2f
119	ext.w %d0	122	jpl 1f
		123	clr.l %d0 \| Clip low
		124	.word 0x51fa \| trapf.w, shadow next insn
120	1:	125	1:
121	sub.l %d1, %d0 \| Bias clipped result back to [-32768..32767]	126	move.l %a6, %d0 \| Clip high
		127	2:
		128	sub.l %d1, %d0 \| Bias clipped result back to [-32767..32767]
122	neg.l %d0 \| msac.w is bugged in gas, do this for now	129	neg.l %d0 \| msac.w is bugged in gas, do this for now
123	move.w %d0, (%a5)+ \| Write result to y[i]	130	move.w %d0, (%a5)+ \| Write result to y[i]
124	move.l (%a4)+, %a6 \| Fetch den[0] and den[1]	131	move.l (%a4)+, %a6 \| Fetch den[0] and den[1]
@@ -159,7 +166,148 @@ iir_mem16:
159	movem.l %d1-%d7/%a0-%a2, (%a6) \| Save back mem[]	166	movem.l %d1-%d7/%a0-%a2, (%a6) \| Save back mem[]
160		167
161	.exit:	168	.exit:
162	movem.l (%sp), %d2-%d7/%a2-%a6	169	movem.l (%sp), %d2-%d7/%a2-%a6
163	lea.l (44, %sp), %sp	170	lea.l (44, %sp), %sp
		171	rts
		172
		173	/* void qmf_synth(const spx_word16_t x1, const spx_word16_t x2, const spx_word16_t a, spx_word16_t y, int N, int M, spx_word32_t mem1, spx_word32_t mem2, char stack) /
		174	.global qmf_synth
		175	qmf_synth:
		176	lea.l (-44, %sp), %sp
		177	movem.l %d2-%d7/%a2-%a6, (%sp)
		178	movem.l (44+4, %sp), %a0-%a3 \| a0 = x1, a1 = x2, a2 = a, a3 = y
		179	movem.l (44+20, %sp), %d0-%d1/%a4-%a5 \| d0 = N, d1 = M, a4 = mem1,a5 = mem2
		180	move.l #0x80, %macsr \| Enable saturation
		181
		182	\| Comments make more sense when compared to the reference C version
		183	move.l %a2, %d6 \| Backup a
		184	lsr.l #1, %d0 \| N2 = N >> 1
		185	lsr.l #1, %d1 \| M2 = M >> 1
		186	move.l %d1, %d7 \| Backup M2
		187	clr.l %d2
		188	sub.l %d0, %d2
		189	sub.l %d1, %d2 \| d2 = -(N2 + M2)
		190	lea.l (%sp, %d2.l*2), %a2 \| Alloc two buffers of N2 + M2 shorts
		191	lea.l (%a2, %d2.l*2), %a6 \| a2 = xx1, a6 = xx2
		192	move.l %sp, %d3
		193	move.l %a6, %sp \| Update sp
		194	move.l %d3, -(%sp) \| Stack old %sp
		195
		196	\| Backwards copy x1 and x2 arrays to xx1 and xx2
		197	\| TODO: these copying loops probably have more potential for optimization
		198	lea.l (%a0, %d0.l*2), %a0 \| x1 += N2
		199	lea.l (%a1, %d0.l*2), %a1 \| x2 += N2
		200	move.l %d0, %d2 \| Loop counter is N2
		201	0:
		202	move.w -(%a0), (%a2)+
		203	move.w -(%a1), (%a6)+
		204	subq.l #1, %d2
		205	jne 0b
		206
		207	\| Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
		208	move.l %d1, %d2 \| Loop counter is M2
		209	addq.l #4, %a4 \| a4 = &mem1[1]
		210	addq.l #4, %a5 \| a5 = &mem2[1]
		211	move.l %a4, %d3 \| Backup mem1 and mem2
		212	move.l %a5, %d4
		213	0:
		214	move.l (%a4), %d5
		215	move.w %d5, (%a2)+
		216	move.l (%a5), %d5
		217	move.w %d5, (%a6)+
		218	addq.l #8, %a4
		219	addq.l #8, %a5
		220	subq.l #1, %d2
		221	jne 0b
		222	move.l %d3, %a4 \| a4 = &mem1[1]
		223	move.l %d4, %a5 \| a5 = &mem2[1]
		224
		225	clr.l %d2
		226	sub.l %d1, %d2 \| d2 = -M2
		227	lea.l (-4, %a2, %d2.l*2), %a0 \| a0 = &xx1[N2 - 2]
		228	lea.l (-4, %a6, %d2.l*2), %a1 \| a1 = &xx2[N2 - 2]
		229	move.l %d6, %a2 \| a2 = a
		230
		231	\| Main loop, register usage:
		232	\| d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
		233	\| d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
		234	\| a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
		235	0: \| Outerloop
		236	move.l #32768, %d2 \| Rounding constant
		237	move.l %d2, %acc0
		238	move.l %d2, %acc1
		239	move.l %d2, %acc2
		240	move.l %d2, %acc3
		241	move.w (%a0)+, %d2 \| d2 = x10
		242	move.w (%a1)+, %d4 \| d4 = x20
		243	move.l (%a2)+, %d6 \| d6 = [a0, a1]
		244	1: \| Innerloop
		245	move.w (%a0)+, %d3 \| d3 = x11
		246	move.w (%a1)+, %d5 \| d5 = x21
		247	mac.w %d6u, %d3l, #1, %acc0 \| acc0 += a0*x11
		248	msac.w %d6u, %d5l, #1, %acc0 \| acc0 -= a0*x21
		249	mac.w %d6l, %d3l, #1, %acc1 \| acc1 += a1*x11
		250	mac.w %d6l, %d5l, #1, %acc1 \| acc1 += a1*x21
		251	mac.w %d6u, %d2l, #1, %acc2 \| acc2 += a0*x10
		252	msac.w %d6u, %d4l, #1, %acc2 \| acc2 -= a0*x20
		253	mac.w %d6l, %d2l, #1, %acc3 \| acc3 += a1*x10
		254	mac.w %d6l, %d4l, #1, (%a2)+, %d6, %acc3 \| acc3 += a1*x20
		255
		256	move.w (%a0)+, %d2 \| d2 = x10
		257	move.w (%a1)+, %d4 \| d4 = x20
		258	mac.w %d6u, %d2l, #1, %acc0 \| acc0 += a0*x10
		259	msac.w %d6u, %d4l, #1, %acc0 \| acc0 -= a0*x20
		260	mac.w %d6l, %d2l, #1, %acc1 \| acc1 += a1*x10
		261	mac.w %d6l, %d4l, #1, %acc1 \| acc1 += a1*x20
		262	mac.w %d6u, %d3l, #1, %acc2 \| acc2 += a0*x11
		263	msac.w %d6u, %d5l, #1, %acc2 \| acc2 -= a0*x21
		264	mac.w %d6l, %d3l, #1, %acc3 \| acc3 += a1*x11
		265	mac.w %d6l, %d5l, #1, (%a2)+, %d6, %acc3 \| acc3 += a1*x21
		266	subq.l #2, %d1
		267	jne 1b
		268
		269	sub.l %d7, %d1 \| d1 = -M2
		270	lea.l (-4, %a2, %d1.l*4), %a2 \| a2 = &a[0]
		271	lea.l (-6, %a0, %d1.l*2), %a0 \| a0 = &xx1[N2 - 2 - i]
		272	lea.l (-6, %a1, %d1.l*2), %a1 \| a1 = &xx2[N2 - 2 - i]
		273	neg.l %d1 \| d1 = M2
		274	movclr.l %acc0, %d2
		275	movclr.l %acc1, %d3
		276	movclr.l %acc2, %d4
		277	movclr.l %acc3, %d5
		278	swap.w %d2 \| Shift 16 right
		279	swap.w %d3
		280	swap.w %d4
		281	swap.w %d5
		282	\| Thanks to the extra shift in the mac chain, we get clipping for free.
		283	\| The clipping will be [-32768..32767], not Speex standard [-32767..32767],
		284	\| but since qmf_synth() is called so late in the signal chain, it should
		285	\| work fine.
		286	move.w %d2, (%a3)+ \| Write results to y[]
		287	move.w %d3, (%a3)+
		288	move.w %d4, (%a3)+
		289	move.w %d5, (%a3)+
		290	subq.l #2, %d0
		291	jne 0b
		292
		293	\| Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
		294	addq.l #4, %a0 \| a0 = &xx1[0]
		295	addq.l #4, %a1 \| a1 = &xx2[0]
		296	0:
		297	move.w (%a0)+, %d2
		298	move.w (%a1)+, %d3
		299	ext.l %d2
		300	ext.l %d3
		301	move.l %d2, (%a4)
		302	move.l %d3, (%a5)
		303	addq.l #8, %a4
		304	addq.l #8, %a5
		305	subq.l #1, %d1
		306	jne 0b
		307
		308	move.l #0, %macsr
		309	move.l (%sp), %sp
		310	movem.l (%sp), %d2-%d7/%a2-%a6
		311	lea.l (44, %sp), %sp
164	rts	312	rts
165		313