Diffstat (limited to 'apps/codecs/libtremor/asm_mcf5249.h')
-rw-r--r--  apps/codecs/libtremor/asm_mcf5249.h  327
1 file changed, 327 insertions, 0 deletions
diff --git a/apps/codecs/libtremor/asm_mcf5249.h b/apps/codecs/libtremor/asm_mcf5249.h
new file mode 100644
index 0000000000..64dfb1b785
--- /dev/null
+++ b/apps/codecs/libtremor/asm_mcf5249.h
@@ -0,0 +1,327 @@
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 *
 * Copyright (C) 2005 by Pedro Vasconcelos
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
/* asm routines for wide math on the MCF5249 */

#include "os_types.h"

#if defined(CPU_COLDFIRE)

/* attribute for 16-byte alignment */
#define LINE_ATTR __attribute__ ((aligned (16)))

#ifndef _V_WIDE_MATH
#define _V_WIDE_MATH

#define MB()

static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {

  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply & shift */
                "movclr.l %%acc0, %[x];"     /* move & clear acc */
                "asr.l #1, %[x];"            /* no overflow test */
                : [x] "+&d" (x)
                : [y] "r" (y)
                : "cc");
  return x;
}
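
/* For reference, a portable C sketch of what MULT32 computes: the top 32
   bits of the 64-bit product, i.e. (x*y)>>32. This matches the asm above
   assuming the EMAC runs in fractional mode, as the vector ops below note.
   Kept under #if 0, documentation only; the asm is what gets compiled. */
#if 0
static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
  return (ogg_int32_t)(((ogg_int64_t)x * y) >> 32);
}
#endif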

static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {

  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                "movclr.l %%acc0, %[x];"     /* move and clear */
                : [x] "+&r" (x)
                : [y] "r" (y)
                : "cc");
  return x;
}
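
/* Portable C sketch of MULT31: a 1.31 x 1.31 fixed-point multiply keeping
   the top 31 bits of the product, i.e. (x*y)>>31. Documentation only. */
#if 0
static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
  return (ogg_int32_t)(((ogg_int64_t)x * y) >> 31);
}
#endif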


static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
  ogg_int32_t r;

  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
                "movclr.l %%acc0, %[r];"     /* get higher half */
                "asl.l #8, %[r];"            /* hi<<16, plus one free */
                "asl.l #8, %[r];"
                "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
                "lsr.l #7, %[x];"
                "or.l %[x], %[r];"           /* logical-or results */
                : [r] "=&d" (r), [x] "+d" (x)
                : [y] "d" (y)
                : "cc");
  return r;
}
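
/* Portable C sketch of MULT31_SHIFT15: the 64-bit product shifted down by
   15, which the asm above reassembles from the high half (shifted left 16)
   or'ed with the low half (shifted right 15). Documentation only. */
#if 0
static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
  return (ogg_int32_t)(((ogg_int64_t)x * y) >> 15);
}
#endif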


static inline
void XPROD31(ogg_int32_t a, ogg_int32_t b,
             ogg_int32_t t, ogg_int32_t v,
             ogg_int32_t *x, ogg_int32_t *y)
{
  asm volatile ("mac.l %[a], %[t], %%acc0;"
                "mac.l %[b], %[v], %%acc0;"
                "mac.l %[b], %[t], %%acc1;"
                "msac.l %[a], %[v], %%acc1;"
                "movclr.l %%acc0, %[a];"
                "move.l %[a], (%[x]);"
                "movclr.l %%acc1, %[a];"
                "move.l %[a], (%[y]);"
                : [a] "+&r" (a)
                : [x] "a" (x), [y] "a" (y),
                  [b] "r" (b), [t] "r" (t), [v] "r" (v)
                : "cc", "memory");
}
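
/* Portable C sketch of XPROD31, a complex-style cross product built from
   two MULT31 pairs; the asm fuses the four multiplies into the two EMAC
   accumulators. Documentation only. */
#if 0
static inline
void XPROD31(ogg_int32_t a, ogg_int32_t b,
             ogg_int32_t t, ogg_int32_t v,
             ogg_int32_t *x, ogg_int32_t *y)
{
  *x = MULT31(a, t) + MULT31(b, v);
  *y = MULT31(b, t) - MULT31(a, v);
}
#endif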


static inline
void XNPROD31(ogg_int32_t a, ogg_int32_t b,
              ogg_int32_t t, ogg_int32_t v,
              ogg_int32_t *x, ogg_int32_t *y)
{
  asm volatile ("mac.l %[a], %[t], %%acc0;"
                "msac.l %[b], %[v], %%acc0;"
                "mac.l %[b], %[t], %%acc1;"
                "mac.l %[a], %[v], %%acc1;"
                "movclr.l %%acc0, %[a];"
                "move.l %[a], (%[x]);"
                "movclr.l %%acc1, %[a];"
                "move.l %[a], (%[y]);"
                : [a] "+&r" (a)
                : [x] "a" (x), [y] "a" (y),
                  [b] "r" (b), [t] "r" (t), [v] "r" (v)
                : "cc", "memory");
}
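
/* Portable C sketch of XNPROD31: same shape as XPROD31 with the signs of
   the two v terms flipped. Documentation only. */
#if 0
static inline
void XNPROD31(ogg_int32_t a, ogg_int32_t b,
              ogg_int32_t t, ogg_int32_t v,
              ogg_int32_t *x, ogg_int32_t *y)
{
  *x = MULT31(a, t) - MULT31(b, v);
  *y = MULT31(b, t) + MULT31(a, v);
}
#endif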


#if 0 /* canonical Tremor definition */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
  { (_x)=MULT32(_a,_t)+MULT32(_b,_v); \
    (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
#endif

/* this could lose the LSB by overflow, but I don't think it'll ever happen.
   If anyone thinks they can hear a bug caused by this, please try the above
   version. */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
  asm volatile ("mac.l %[a], %[t], %%acc0;" \
                "mac.l %[b], %[v], %%acc0;" \
                "mac.l %[b], %[t], %%acc1;" \
                "msac.l %[a], %[v], %%acc1;" \
                "movclr.l %%acc0, %[x];" \
                "asr.l #1, %[x];" \
                "movclr.l %%acc1, %[y];" \
                "asr.l #1, %[y];" \
                : [x] "=&d" (_x), [y] "=&d" (_y) \
                : [a] "r" (_a), [b] "r" (_b), \
                  [t] "r" (_t), [v] "r" (_v) \
                : "cc");

#ifndef _V_VECT_OPS
#define _V_VECT_OPS

/* asm versions of vector operations for block.c, window.c */
/* assumes MAC is initialized & accumulators cleared */
static inline
void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
{
  /* align to 16 bytes */
  while(n>0 && (int)x&15) {
    *x++ += *y++;
    n--;
  }
  asm volatile ("bra 1f;"
                "0:"                          /* loop start */
                "movem.l (%[x]), %%d0-%%d3;"  /* fetch values */
                "movem.l (%[y]), %%a0-%%a3;"
                /* add */
                "add.l %%a0, %%d0;"
                "add.l %%a1, %%d1;"
                "add.l %%a2, %%d2;"
                "add.l %%a3, %%d3;"
                /* store and advance */
                "movem.l %%d0-%%d3, (%[x]);"
                "lea.l (4*4, %[x]), %[x];"
                "lea.l (4*4, %[y]), %[y];"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                    "cc", "memory");
  /* add final elements */
  while (n>0) {
    *x++ += *y++;
    n--;
  }
}

static inline
void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
{
  /* align to 16 bytes */
  while(n>0 && (int)x&15) {
    *x++ = *y++;
    n--;
  }
  asm volatile ("bra 1f;"
                "0:"                          /* loop start */
                "movem.l (%[y]), %%d0-%%d3;"  /* fetch values */
                "movem.l %%d0-%%d3, (%[x]);"  /* store */
                "lea.l (4*4, %[x]), %[x];"    /* advance */
                "lea.l (4*4, %[y]), %[y];"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
                : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
  /* copy final elements */
  while (n>0) {
    *x++ = *y++;
    n--;
  }
}


static inline
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
  /* ensure data is 16-byte aligned */
  while(n>0 && (int)data%16) {
    *data = MULT31(*data, *window);
    data++;
    window++;
    n--;
  }
  asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* loop start */
                "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch registers */
                "lea.l (4*4, %[w]), %[w];"
                "bra 1f;"                     /* jump to loop condition */
                "0:"                          /* loop body */
                /* multiply and load next window values */
                "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
                "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
                "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
                "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
                "movclr.l %%acc0, %%d0;"      /* get the products */
                "movclr.l %%acc1, %%d1;"
                "movclr.l %%acc2, %%d2;"
                "movclr.l %%acc3, %%d3;"
                /* store and advance */
                "movem.l %%d0-%%d3, (%[d]);"
                "lea.l (4*4, %[d]), %[d];"
                "movem.l (%[d]), %%d0-%%d3;"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                /* multiply final elements */
                "tst.l %[n];"
                "beq 1f;"                     /* n=0 */
                "mac.l %%d0, %%a0, %%acc0;"
                "movclr.l %%acc0, %%d0;"
                "move.l %%d0, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n=1 */
                "mac.l %%d1, %%a1, %%acc0;"
                "movclr.l %%acc0, %%d1;"
                "move.l %%d1, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n=2 */
                /* otherwise n = 3 */
                "mac.l %%d2, %%a2, %%acc0;"
                "movclr.l %%acc0, %%d2;"
                "move.l %%d2, (%[d])+;"
                "1:"
                : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                    "cc", "memory");
}
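
/* Portable C sketch of vect_mult_fw: apply the window forwards, one MULT31
   per element, exactly the loop used above for the unaligned head elements.
   Documentation only. */
#if 0
static inline
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
  while (n > 0) {
    *data = MULT31(*data, *window);
    data++;
    window++;
    n--;
  }
}
#endif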

static inline
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
  /* ensure data (at least) is 16-byte aligned */
  while(n>0 && (int)data%16) {
    *data = MULT31(*data, *window);
    data++;
    window--;
    n--;
  }
  asm volatile ("lea.l (-3*4, %[w]), %[w];"   /* loop start */
                "movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch registers */
                "movem.l (%[w]), %%a0-%%a3;"
                "bra 1f;"                     /* jump to loop condition */
                "0:"                          /* loop body */
                /* multiply and load next window value */
                "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
                "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
                "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
                "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
                "movclr.l %%acc0, %%d0;"      /* get the products */
                "movclr.l %%acc1, %%d1;"
                "movclr.l %%acc2, %%d2;"
                "movclr.l %%acc3, %%d3;"
                /* store and advance */
                "movem.l %%d0-%%d3, (%[d]);"
                "lea.l (4*4, %[d]), %[d];"
                "movem.l (%[d]), %%d0-%%d3;"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                /* multiply final elements */
                "tst.l %[n];"
                "beq 1f;"                     /* n=0 */
                "mac.l %%d0, %%a3, %%acc0;"
                "movclr.l %%acc0, %%d0;"
                "move.l %%d0, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n=1 */
                "mac.l %%d1, %%a2, %%acc0;"
                "movclr.l %%acc0, %%d1;"
                "move.l %%d1, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n=2 */
                /* otherwise n = 3 */
                "mac.l %%d2, %%a1, %%acc0;"
                "movclr.l %%acc0, %%d2;"
                "move.l %%d2, (%[d])+;"
                "1:"
                : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                    "cc", "memory");
}
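
/* Portable C sketch of vect_mult_bw: same as vect_mult_fw but walking the
   window backwards, matching the unaligned-head loop above. Documentation
   only. */
#if 0
static inline
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
  while (n > 0) {
    *data = MULT31(*data, *window);
    data++;
    window--;
    n--;
  }
}
#endif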

#endif /* _V_VECT_OPS */

#endif /* _V_WIDE_MATH */

#ifndef _V_CLIP_MATH
#define _V_CLIP_MATH

/* this is portable C and simple; why not use this as default? */
static inline ogg_int32_t CLIP_TO_15(register ogg_int32_t x) {
  register ogg_int32_t hi=32767, lo=-32768;
  return (x>=hi ? hi : (x<=lo ? lo : x));
}

#endif /* _V_CLIP_MATH */
#else  /* !CPU_COLDFIRE */
#define LINE_ATTR
#endif /* CPU_COLDFIRE */