diff options
Diffstat (limited to 'apps/codecs/lib/asm_mcf5249.h')
-rw-r--r-- | apps/codecs/lib/asm_mcf5249.h | 353 |
1 files changed, 0 insertions, 353 deletions
diff --git a/apps/codecs/lib/asm_mcf5249.h b/apps/codecs/lib/asm_mcf5249.h deleted file mode 100644 index 841c413a94..0000000000 --- a/apps/codecs/lib/asm_mcf5249.h +++ /dev/null | |||
@@ -1,353 +0,0 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * | ||
9 | * Copyright (C) 2005 by Pedro Vasconcelos | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version 2 | ||
14 | * of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
17 | * KIND, either express or implied. | ||
18 | * | ||
19 | ****************************************************************************/ | ||
20 | /* asm routines for wide math on the MCF5249 */ | ||
21 | |||
22 | #if defined(CPU_COLDFIRE) | ||
23 | |||
#define INCL_OPTIMIZED_MULT32
/* Full-precision fixed-point multiply: returns the top 32 bits of the
 * 64-bit product, i.e. (x*y) >> 32.  The EMAC runs in fractional mode
 * (product shifted left by one before accumulation), so the result is
 * arithmetic-shifted right once to compensate.
 * NOTE(review): assumes %acc0 is clear on entry and EMAC rounding is
 * disabled (see MULT31_SHIFT15) — confirm against the codec init code. */
static inline int32_t MULT32(int32_t x, int32_t y) {

    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply & shift */
                  "movclr.l %%acc0, %[x];"     /* move & clear acc */
                  "asr.l #1, %[x];"            /* undo fractional-mode <<1; no overflow test */
                  : [x] "+&d" (x)
                  : [y] "r" (y)
                  : "cc");
    return x;
}
35 | |||
#define INCL_OPTIMIZED_MULT31
/* Multiply of two signed 1.31 fixed-point fractions: (x*y) >> 31.
 * The EMAC's fractional-mode left shift means the accumulator's high
 * longword already holds the product >> 31, so no correction shift is
 * needed here (contrast MULT32). */
static inline int32_t MULT31(int32_t x, int32_t y) {
    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                  "movclr.l %%acc0, %[x];"     /* move and clear */
                  : [x] "+&r" (x)
                  : [y] "r" (y)
                  : "cc");
    return x;
}
45 | |||
#define INCL_OPTIMIZED_MULT31_SHIFT15
/* NOTE: this requires that the emac is *NOT* rounding */
/* Returns (x*y) >> 15.  The EMAC supplies the high half of the product
 * (already effectively >> 31 in fractional mode); a plain mulu.l
 * recomputes the low 32 bits — both to get the bits the EMAC discards
 * and to avoid stalling while waiting on the accumulator.  The two
 * halves are then merged:
 *   swap   : positions the high half as (hi << 16)
 *   2x lsr : (unsigned)lo >> 15, split as 8+7 (lsr immediate max is 8)
 *   move.w : drops the low word of the swapped half in favour of lo's
 *            bits — works as a logical OR because they don't overlap. */
static inline int32_t MULT31_SHIFT15(int32_t x, int32_t y) {
    int32_t r;

    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                  "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
                  "movclr.l %%acc0, %[r];"     /* get higher half */
                  "swap %[r];"                 /* hi<<16, plus one free */
                  "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
                  "lsr.l #7, %[x];"
                  "move.w %[x], %[r];"         /* logical-or results */
                  : [r] "=&d" (r), [x] "+d" (x)
                  : [y] "d" (y)
                  : "cc");
    return r;
}
63 | |||
#define INCL_OPTIMIZED_MULT31_SHIFT16
/* Returns (x*y) >> 16.  Like MULT31_SHIFT15: EMAC high half plus a
 * mulu.l low half.  The lsr #1 first converts the fractional-mode high
 * half to the true product >> 32; move.w/swap then paste its low word
 * above the top 16 bits of the mulu.l result, yielding bits [47:16]. */
static inline int32_t MULT31_SHIFT16(int32_t x, int32_t y) {
    int32_t r;

    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                  "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
                  "movclr.l %%acc0, %[r];"     /* get higher half */
                  "lsr.l #1, %[r];"            /* hi >> 1, to compensate emac shift */
                  "move.w %[r], %[x];"         /* x = x & 0xffff0000 | r & 0xffff */
                  "swap %[x];"                 /* x = (unsigned)x << 16 | (unsigned)x >> 16 */
                  : [r] "=&d" (r), [x] "+d" (x)
                  : [y] "d" (y)
                  : "cc");
    return x;
}
79 | |||
#define INCL_OPTIMIZED_XPROD31
/* 1.31 fixed-point complex-rotation butterfly:
 *   *x = MULT31(a,t) + MULT31(b,v)
 *   *y = MULT31(b,t) - MULT31(a,v)
 * The two sums are accumulated at full width in acc0/acc1 (mac adds,
 * msac subtracts) before extraction, so no intermediate precision is
 * lost.  Results are stored through x/y directly from the asm. */
static inline
void XPROD31(int32_t a, int32_t b,
             int32_t t, int32_t v,
             int32_t *x, int32_t *y)
{
    asm volatile ("mac.l %[a], %[t], %%acc0;"
                  "mac.l %[b], %[v], %%acc0;"
                  "mac.l %[b], %[t], %%acc1;"
                  "msac.l %[a], %[v], %%acc1;"
                  "movclr.l %%acc0, %[a];"   /* a is reused as a scratch reg */
                  "move.l %[a], (%[x]);"
                  "movclr.l %%acc1, %[a];"
                  "move.l %[a], (%[y]);"
                  : [a] "+&r" (a)
                  : [x] "a" (x), [y] "a" (y),
                    [b] "r" (b), [t] "r" (t), [v] "r" (v)
                  : "cc", "memory");
}
99 | |||
#define INCL_OPTIMIZED_XNPROD31
/* Negated-rotation counterpart of XPROD31:
 *   *x = MULT31(a,t) - MULT31(b,v)
 *   *y = MULT31(b,t) + MULT31(a,v)
 * Same structure as XPROD31 with the msac (subtract) moved from acc1
 * to acc0. */
static inline
void XNPROD31(int32_t a, int32_t b,
              int32_t t, int32_t v,
              int32_t *x, int32_t *y)
{
    asm volatile ("mac.l %[a], %[t], %%acc0;"
                  "msac.l %[b], %[v], %%acc0;"
                  "mac.l %[b], %[t], %%acc1;"
                  "mac.l %[a], %[v], %%acc1;"
                  "movclr.l %%acc0, %[a];"   /* a is reused as a scratch reg */
                  "move.l %[a], (%[x]);"
                  "movclr.l %%acc1, %[a];"
                  "move.l %[a], (%[y]);"
                  : [a] "+&r" (a)
                  : [x] "a" (x), [y] "a" (y),
                    [b] "r" (b), [t] "r" (t), [v] "r" (v)
                  : "cc", "memory");
}
119 | |||
120 | |||
/* this could lose the LSB by overflow, but i don't think it'll ever happen.
   if anyone think they can hear a bug caused by this, please try the above
   version. */
#define INCL_OPTIMIZED_XPROD32
/* Butterfly with MULT32 scaling:
 *   _x = (_a*_t + _b*_v) >> 32
 *   _y = (_b*_t - _a*_v) >> 32
 * Sums form at full width in acc0/acc1; the asr #1 then undoes the
 * EMAC fractional shift.  The extracted 32-bit value is the sum's high
 * half, hence the LSB-overflow caveat above. */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "mac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "msac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "asr.l #1, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  "asr.l #1, %[y];" \
                  : [x] "=d" (_x), [y] "=d" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");
138 | |||
#define INCL_OPTIMIZED_XPROD31_R
/* XPROD31 variant that returns results in variables instead of storing
 * through pointers:
 *   _x = MULT31(_a,_t) + MULT31(_b,_v)
 *   _y = MULT31(_b,_t) - MULT31(_a,_v)
 * No shift correction needed: fractional mode already leaves the
 * accumulator high half at MULT31 scaling. */
#define XPROD31_R(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "mac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "msac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  : [x] "=r" (_x), [y] "=r" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");
151 | |||
#define INCL_OPTIMIZED_XNPROD31_R
/* XNPROD31 variant returning results in variables:
 *   _x = MULT31(_a,_t) - MULT31(_b,_v)
 *   _y = MULT31(_b,_t) + MULT31(_a,_v)
 * Mirrors XPROD31_R with the msac on acc0 instead of acc1. */
#define XNPROD31_R(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "msac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "mac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  : [x] "=r" (_x), [y] "=r" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");
164 | |||
165 | #ifndef _V_VECT_OPS | ||
166 | #define _V_VECT_OPS | ||
167 | |||
168 | /* asm versions of vector operations for block.c, window.c */ | ||
169 | /* assumes MAC is initialized & accumulators cleared */ | ||
170 | static inline | ||
171 | void vect_add(int32_t *x, const int32_t *y, int n) | ||
172 | { | ||
173 | /* align to 16 bytes */ | ||
174 | while(n>0 && (int)x&15) { | ||
175 | *x++ += *y++; | ||
176 | n--; | ||
177 | } | ||
178 | asm volatile ("bra 1f;" | ||
179 | "0:" /* loop start */ | ||
180 | "movem.l (%[x]), %%d0-%%d3;" /* fetch values */ | ||
181 | "movem.l (%[y]), %%a0-%%a3;" | ||
182 | /* add */ | ||
183 | "add.l %%a0, %%d0;" | ||
184 | "add.l %%a1, %%d1;" | ||
185 | "add.l %%a2, %%d2;" | ||
186 | "add.l %%a3, %%d3;" | ||
187 | /* store and advance */ | ||
188 | "movem.l %%d0-%%d3, (%[x]);" | ||
189 | "lea.l (4*4, %[x]), %[x];" | ||
190 | "lea.l (4*4, %[y]), %[y];" | ||
191 | "subq.l #4, %[n];" /* done 4 elements */ | ||
192 | "1: cmpi.l #4, %[n];" | ||
193 | "bge 0b;" | ||
194 | : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y) | ||
195 | : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3", | ||
196 | "cc", "memory"); | ||
197 | /* add final elements */ | ||
198 | while (n>0) { | ||
199 | *x++ += *y++; | ||
200 | n--; | ||
201 | } | ||
202 | } | ||
203 | |||
204 | static inline | ||
205 | void vect_copy(int32_t *x, const int32_t *y, int n) | ||
206 | { | ||
207 | /* align to 16 bytes */ | ||
208 | while(n>0 && (int)x&15) { | ||
209 | *x++ = *y++; | ||
210 | n--; | ||
211 | } | ||
212 | asm volatile ("bra 1f;" | ||
213 | "0:" /* loop start */ | ||
214 | "movem.l (%[y]), %%d0-%%d3;" /* fetch values */ | ||
215 | "movem.l %%d0-%%d3, (%[x]);" /* store */ | ||
216 | "lea.l (4*4, %[x]), %[x];" /* advance */ | ||
217 | "lea.l (4*4, %[y]), %[y];" | ||
218 | "subq.l #4, %[n];" /* done 4 elements */ | ||
219 | "1: cmpi.l #4, %[n];" | ||
220 | "bge 0b;" | ||
221 | : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y) | ||
222 | : : "%d0", "%d1", "%d2", "%d3", "cc", "memory"); | ||
223 | /* copy final elements */ | ||
224 | while (n>0) { | ||
225 | *x++ = *y++; | ||
226 | n--; | ||
227 | } | ||
228 | } | ||
229 | |||
230 | static inline | ||
231 | void vect_mult_fw(int32_t *data, const int32_t *window, int n) | ||
232 | { | ||
233 | /* ensure data is aligned to 16-bytes */ | ||
234 | while(n>0 && (int)data&15) { | ||
235 | *data = MULT31(*data, *window); | ||
236 | data++; | ||
237 | window++; | ||
238 | n--; | ||
239 | } | ||
240 | asm volatile ("movem.l (%[d]), %%d0-%%d3;" /* loop start */ | ||
241 | "movem.l (%[w]), %%a0-%%a3;" /* pre-fetch registers */ | ||
242 | "lea.l (4*4, %[w]), %[w];" | ||
243 | "bra 1f;" /* jump to loop condition */ | ||
244 | "0:" /* loop body */ | ||
245 | /* multiply and load next window values */ | ||
246 | "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;" | ||
247 | "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;" | ||
248 | "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;" | ||
249 | "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;" | ||
250 | "movclr.l %%acc0, %%d0;" /* get the products */ | ||
251 | "movclr.l %%acc1, %%d1;" | ||
252 | "movclr.l %%acc2, %%d2;" | ||
253 | "movclr.l %%acc3, %%d3;" | ||
254 | /* store and advance */ | ||
255 | "movem.l %%d0-%%d3, (%[d]);" | ||
256 | "lea.l (4*4, %[d]), %[d];" | ||
257 | "movem.l (%[d]), %%d0-%%d3;" | ||
258 | "subq.l #4, %[n];" /* done 4 elements */ | ||
259 | "1: cmpi.l #4, %[n];" | ||
260 | "bge 0b;" | ||
261 | /* multiply final elements */ | ||
262 | "tst.l %[n];" | ||
263 | "beq 1f;" /* n=0 */ | ||
264 | "mac.l %%d0, %%a0, %%acc0;" | ||
265 | "movclr.l %%acc0, %%d0;" | ||
266 | "move.l %%d0, (%[d])+;" | ||
267 | "subq.l #1, %[n];" | ||
268 | "beq 1f;" /* n=1 */ | ||
269 | "mac.l %%d1, %%a1, %%acc0;" | ||
270 | "movclr.l %%acc0, %%d1;" | ||
271 | "move.l %%d1, (%[d])+;" | ||
272 | "subq.l #1, %[n];" | ||
273 | "beq 1f;" /* n=2 */ | ||
274 | /* otherwise n = 3 */ | ||
275 | "mac.l %%d2, %%a2, %%acc0;" | ||
276 | "movclr.l %%acc0, %%d2;" | ||
277 | "move.l %%d2, (%[d])+;" | ||
278 | "1:" | ||
279 | : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window) | ||
280 | : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3", | ||
281 | "cc", "memory"); | ||
282 | } | ||
283 | |||
284 | static inline | ||
285 | void vect_mult_bw(int32_t *data, const int32_t *window, int n) | ||
286 | { | ||
287 | /* ensure at least data is aligned to 16-bytes */ | ||
288 | while(n>0 && (int)data&15) { | ||
289 | *data = MULT31(*data, *window); | ||
290 | data++; | ||
291 | window--; | ||
292 | n--; | ||
293 | } | ||
294 | asm volatile ("lea.l (-3*4, %[w]), %[w];" /* loop start */ | ||
295 | "movem.l (%[d]), %%d0-%%d3;" /* pre-fetch registers */ | ||
296 | "movem.l (%[w]), %%a0-%%a3;" | ||
297 | "bra 1f;" /* jump to loop condition */ | ||
298 | "0:" /* loop body */ | ||
299 | /* multiply and load next window value */ | ||
300 | "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;" | ||
301 | "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;" | ||
302 | "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;" | ||
303 | "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;" | ||
304 | "movclr.l %%acc0, %%d0;" /* get the products */ | ||
305 | "movclr.l %%acc1, %%d1;" | ||
306 | "movclr.l %%acc2, %%d2;" | ||
307 | "movclr.l %%acc3, %%d3;" | ||
308 | /* store and advance */ | ||
309 | "movem.l %%d0-%%d3, (%[d]);" | ||
310 | "lea.l (4*4, %[d]), %[d];" | ||
311 | "movem.l (%[d]), %%d0-%%d3;" | ||
312 | "subq.l #4, %[n];" /* done 4 elements */ | ||
313 | "1: cmpi.l #4, %[n];" | ||
314 | "bge 0b;" | ||
315 | /* multiply final elements */ | ||
316 | "tst.l %[n];" | ||
317 | "beq 1f;" /* n=0 */ | ||
318 | "mac.l %%d0, %%a3, %%acc0;" | ||
319 | "movclr.l %%acc0, %%d0;" | ||
320 | "move.l %%d0, (%[d])+;" | ||
321 | "subq.l #1, %[n];" | ||
322 | "beq 1f;" /* n=1 */ | ||
323 | "mac.l %%d1, %%a2, %%acc0;" | ||
324 | "movclr.l %%acc0, %%d1;" | ||
325 | "move.l %%d1, (%[d])+;" | ||
326 | "subq.l #1, %[n];" | ||
327 | "beq 1f;" /* n=2 */ | ||
328 | /* otherwise n = 3 */ | ||
329 | "mac.l %%d2, %%a1, %%acc0;" | ||
330 | "movclr.l %%acc0, %%d2;" | ||
331 | "move.l %%d2, (%[d])+;" | ||
332 | "1:" | ||
333 | : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window) | ||
334 | : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3", | ||
335 | "cc", "memory"); | ||
336 | } | ||
337 | |||
338 | #endif | ||
339 | |||
340 | /* not used anymore */ | ||
341 | /* | ||
342 | #ifndef _V_CLIP_MATH | ||
343 | #define _V_CLIP_MATH | ||
344 | |||
345 | * this is portable C and simple; why not use this as default? | ||
346 | static inline int32_t CLIP_TO_15(register int32_t x) { | ||
347 | register int32_t hi=32767, lo=-32768; | ||
348 | return (x>=hi ? hi : (x<=lo ? lo : x)); | ||
349 | } | ||
350 | |||
351 | #endif | ||
352 | */ | ||
353 | #endif | ||