Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h')
-rw-r--r--  lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h  214
1 file changed, 214 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h
new file mode 100644
index 0000000000..84afda3e5d
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

ARMv7 NEON vector math copyright (C) 2010 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

#define FUSED_VECTOR_MATH
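/* The define above tells the including filter code (which presumably
   checks it with #ifdef) that fused scalar-product-plus-add/sub variants
   are provided here, so it need not run a separate vector add or subtract
   pass after each scalarproduct() call. */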

#if ORDER > 32
#define REPEAT_BLOCK(x) x x x
#elif ORDER > 16
#define REPEAT_BLOCK(x) x
#else
#define REPEAT_BLOCK(x)
#endif
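
/* Note on unrolling: each asm block below consumes 16 int16_t elements
   (four d registers per operand), so REPEAT_BLOCK expands one block into
   four for ORDER > 32 and into two for 16 < ORDER <= 32, leaving a single
   block otherwise.  For ORDER > 64 the resulting 64-element pass is also
   wrapped in a loop executing ORDER/64 times, i.e. ORDER is assumed to be
   16, 32, 64 or a multiple of 64. */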

/* Calculate the scalar product of v1 and f2, then add the second vector
 * s2 to v1 in place (fused for performance) */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0                 \n"
    "1:                                  \n"
        "subs     %[cnt], %[cnt], #1     \n"
#endif
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
#if ORDER > 64
        "vmlal.s16 q0, d2, d6            \n"
#else
        "vmull.s16 q0, d2, d6            \n"
#endif
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vadd.i16 q1, q1, q5             \n"
        "vadd.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"

        REPEAT_BLOCK(
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
        "vmlal.s16 q0, d2, d6            \n"
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vadd.i16 q1, q1, q5             \n"
        "vadd.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"
        )
#if ORDER > 64
        "bne      1b                     \n"
#endif
        "vpadd.i32 d0, d0, d1            \n"
        "vpaddl.s32 d0, d0               \n"
        "vmov.32  %[res], d0[0]          \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
    );
    return res;
}
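
/* For illustration, a minimal plain-C sketch of what the asm above
   computes.  The name vector_sp_add_ref is hypothetical (not part of
   libdemac); ORDER comes from the including code, and unlike the NEON
   version the sketch does not advance the three pointers. */
static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* widening MAC, as vmull/vmlal.s16 */
        v1[i] += s2[i];                /* in effect a wrapping 16-bit add,
                                          as vadd.i16 */
    }
    return res;
}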

/* Calculate the scalar product of v1 and f2, then subtract the second
 * vector s2 from v1 in place (fused for performance) */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0                 \n"
    "1:                                  \n"
        "subs     %[cnt], %[cnt], #1     \n"
#endif
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
#if ORDER > 64
        "vmlal.s16 q0, d2, d6            \n"
#else
        "vmull.s16 q0, d2, d6            \n"
#endif
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vsub.i16 q1, q1, q5             \n"
        "vsub.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"

        REPEAT_BLOCK(
        "vld1.16  {d6-d9}, [%[f2]]!      \n"
        "vld1.16  {d2-d5}, [%[v1]]       \n"
        "vld1.16  {d10-d13}, [%[s2]]!    \n"
        "vmlal.s16 q0, d2, d6            \n"
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        "vsub.i16 q1, q1, q5             \n"
        "vsub.i16 q2, q2, q6             \n"
        "vst1.16  {d2-d5}, [%[v1]]!      \n"
        )
#if ORDER > 64
        "bne      1b                     \n"
#endif
        "vpadd.i32 d0, d0, d1            \n"
        "vpaddl.s32 d0, d0               \n"
        "vmov.32  %[res], d0[0]          \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
    );
    return res;
}
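
/* The matching plain-C sketch for the subtracting variant; again,
   vector_sp_sub_ref is a hypothetical name used only for illustration. */
static inline int32_t vector_sp_sub_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* product uses the old v1 value */
        v1[i] -= s2[i];                /* wrapping 16-bit subtract, as vsub.i16 */
    }
    return res;
}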

static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        "vmov.i16 q0, #0                 \n"
    "1:                                  \n"
        "subs     %[cnt], %[cnt], #1     \n"
#endif
        "vld1.16  {d2-d5}, [%[v1]]!      \n"
        "vld1.16  {d6-d9}, [%[v2]]!      \n"
#if ORDER > 64
        "vmlal.s16 q0, d2, d6            \n"
#else
        "vmull.s16 q0, d2, d6            \n"
#endif
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"

        REPEAT_BLOCK(
        "vld1.16  {d2-d5}, [%[v1]]!      \n"
        "vld1.16  {d6-d9}, [%[v2]]!      \n"
        "vmlal.s16 q0, d2, d6            \n"
        "vmlal.s16 q0, d3, d7            \n"
        "vmlal.s16 q0, d4, d8            \n"
        "vmlal.s16 q0, d5, d9            \n"
        )
#if ORDER > 64
        "bne      1b                     \n"
#endif
        "vpadd.i32 d0, d0, d1            \n"
        "vpaddl.s32 d0, d0               \n"
        "vmov.32  %[res], d0[0]          \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4",
        "d5", "d6", "d7", "d8", "d9"
    );
    return res;
}
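
/* And a plain-C sketch of scalarproduct() itself, under the same caveats
   (hypothetical name, pointers not advanced).  The NEON code accumulates
   into the four 32-bit lanes of q0 and folds them with vpadd/vpaddl at
   the end, which matches this sequential sum modulo 2^32. */
static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i];
    return res;
}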