libdemac: ARMv7 assembler optimisation for the filters, tested on Nokia N900. Speedup is 2.1x for -c5000 compared to the ARMv6 asm. Note that actually compiling it on device requires hand-assembling the 'vadd' and 'vsub' instructions due to a bug in binutils 2.18.50, and making the standalone decoder use it requires Makefile and demac_config.h hacks.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27944 a1c6a512-1295-4272-9138-f99709370657
author: Jens Arnold <amiconn@rockbox.org> 2010-08-30 06:31:47 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2010-08-30 06:31:47 +0000
commit: 811877e5b3ae95b70e285b786bb7cc9d73d333e0 (patch)
tree: c4e7865faaaad715566f7b1ebb559eeba25d7221 /apps/codecs/demac/libdemac
parent: dd5e3eb5424a66a5399f99386b59a8ee86d6cde0 (diff)
download: rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.tar.gz
rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.zip
2 files changed, 216 insertions, 0 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 8055098301..903885cf00 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
+#include "vector_math16_armv7.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 6)
 #include "vector_math16_armv6.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv7.h b/apps/codecs/demac/libdemac/vector_math16_armv7.h
new file mode 100644
index 0000000000..84afda3e5d
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+ARMv7 neon vector math copyright (C) 2010 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+#define FUSED_VECTOR_MATH /* NOTE(review): presumably tells filter.c to use the fused vector_sp_add()/vector_sp_sub() below - confirm */
+#if ORDER > 32 /* REPEAT_BLOCK(x) emits x three more times: 4 groups of 16 elements per pass */
+#define REPEAT_BLOCK(x) x x x
+#elif ORDER > 16 /* one more copy: 2 groups of 16 elements */
+#define REPEAT_BLOCK(x) x
+#else /* no extra copy: only the first, hand-written group of 16 elements */
+#define REPEAT_BLOCK(x)
+#endif
+/*
+ * Calculate the scalar product of v1 and f2, then add s2 to v1 in place
+ * (fused into a single pass for performance).
+ *
+ *   v1  vector that is multiplied with f2 and then overwritten with v1 + s2
+ *   f2  second factor of the scalar product (only read)
+ *   s2  vector added element-wise to v1 (only read)
+ *
+ * Every load/multiply/add/store group below handles 16 int16_t elements.
+ * One group is written out, REPEAT_BLOCK appends the others, and for
+ * ORDER > 64 the whole sequence runs ORDER>>6 times.
+ *
+ * Returns the low 32 bits of the sum of v1[i] * f2[i], computed from the
+ * v1 values as they were before s2 was added.
+ */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;
+#endif
+    asm volatile (
+#if ORDER > 64
+        /* Loop variant: clear the four 32-bit accumulators once. */
+        "vmov.i16    q0, #0              \n"
+    "1:                                  \n"
+        "subs        %[cnt], %[cnt], #1  \n"
+#endif
+        "vld1.16     {d6-d9}, [%[f2]]!   \n"
+        "vld1.16     {d2-d5}, [%[v1]]    \n"
+        "vld1.16     {d10-d13}, [%[s2]]! \n"
+#if ORDER > 64
+        "vmlal.s16   q0, d2, d6          \n"
+#else
+        /* No loop: the first multiply initialises the accumulators. */
+        "vmull.s16   q0, d2, d6          \n"
+#endif
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        /* Update v1 after it has been multiplied; the store advances v1. */
+        "vadd.i16    q1, q1, q5          \n"
+        "vadd.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n"
+        REPEAT_BLOCK(
+        "vld1.16     {d6-d9}, [%[f2]]!   \n"
+        "vld1.16     {d2-d5}, [%[v1]]    \n"
+        "vld1.16     {d10-d13}, [%[s2]]! \n"
+        "vmlal.s16   q0, d2, d6          \n"
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        "vadd.i16    q1, q1, q5          \n"
+        "vadd.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n"
+        )
+#if ORDER > 64
+        "bne         1b                  \n"
+#endif
+        /* Fold the four partial sums into one and return its low word. */
+        "vpadd.i32   d0, d0, d1          \n"
+        "vpaddl.s32  d0, d0              \n"
+        "vmov.32     %[res], d0[0]       \n"
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        /* "cc": "subs" in the ORDER > 64 loop changes the condition flags;
+         * declaring it when there is no loop is harmless. */
+        "cc",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
+    );
+    return res;
+}
+/*
+ * Calculate the scalar product of v1 and f2, then subtract s2 from v1 in
+ * place (fused into a single pass for performance).
+ *
+ *   v1  vector that is multiplied with f2 and then overwritten with v1 - s2
+ *   f2  second factor of the scalar product (only read)
+ *   s2  vector subtracted element-wise from v1 (only read)
+ *
+ * Every load/multiply/subtract/store group below handles 16 int16_t
+ * elements. One group is written out, REPEAT_BLOCK appends the others, and
+ * for ORDER > 64 the whole sequence runs ORDER>>6 times.
+ *
+ * Returns the low 32 bits of the sum of v1[i] * f2[i], computed from the
+ * v1 values as they were before s2 was subtracted.
+ */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;
+#endif
+    asm volatile (
+#if ORDER > 64
+        /* Loop variant: clear the four 32-bit accumulators once. */
+        "vmov.i16    q0, #0              \n"
+    "1:                                  \n"
+        "subs        %[cnt], %[cnt], #1  \n"
+#endif
+        "vld1.16     {d6-d9}, [%[f2]]!   \n"
+        "vld1.16     {d2-d5}, [%[v1]]    \n"
+        "vld1.16     {d10-d13}, [%[s2]]! \n"
+#if ORDER > 64
+        "vmlal.s16   q0, d2, d6          \n"
+#else
+        /* No loop: the first multiply initialises the accumulators. */
+        "vmull.s16   q0, d2, d6          \n"
+#endif
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        /* Update v1 after it has been multiplied; the store advances v1. */
+        "vsub.i16    q1, q1, q5          \n"
+        "vsub.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n"
+        REPEAT_BLOCK(
+        "vld1.16     {d6-d9}, [%[f2]]!   \n"
+        "vld1.16     {d2-d5}, [%[v1]]    \n"
+        "vld1.16     {d10-d13}, [%[s2]]! \n"
+        "vmlal.s16   q0, d2, d6          \n"
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        "vsub.i16    q1, q1, q5          \n"
+        "vsub.i16    q2, q2, q6          \n"
+        "vst1.16     {d2-d5}, [%[v1]]!   \n"
+        )
+#if ORDER > 64
+        "bne         1b                  \n"
+#endif
+        /* Fold the four partial sums into one and return its low word. */
+        "vpadd.i32   d0, d0, d1          \n"
+        "vpaddl.s32  d0, d0              \n"
+        "vmov.32     %[res], d0[0]       \n"
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        /* "cc": "subs" in the ORDER > 64 loop changes the condition flags;
+         * declaring it when there is no loop is harmless. */
+        "cc",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
+    );
+    return res;
+}
+/*
+ * Calculate the scalar product of v1 and v2; neither vector is modified.
+ *
+ * Every load/multiply group below handles 16 int16_t elements. One group
+ * is written out, REPEAT_BLOCK appends the others, and for ORDER > 64 the
+ * whole sequence runs ORDER>>6 times.
+ *
+ * Returns the low 32 bits of the sum of v1[i] * v2[i].
+ */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;
+#endif
+    asm volatile (
+#if ORDER > 64
+        /* Loop variant: clear the four 32-bit accumulators once. */
+        "vmov.i16    q0, #0              \n"
+    "1:                                  \n"
+        "subs        %[cnt], %[cnt], #1  \n"
+#endif
+        "vld1.16     {d2-d5}, [%[v1]]!   \n"
+        "vld1.16     {d6-d9}, [%[v2]]!   \n"
+#if ORDER > 64
+        "vmlal.s16   q0, d2, d6          \n"
+#else
+        /* No loop: the first multiply initialises the accumulators. */
+        "vmull.s16   q0, d2, d6          \n"
+#endif
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        REPEAT_BLOCK(
+        "vld1.16     {d2-d5}, [%[v1]]!   \n"
+        "vld1.16     {d6-d9}, [%[v2]]!   \n"
+        "vmlal.s16   q0, d2, d6          \n"
+        "vmlal.s16   q0, d3, d7          \n"
+        "vmlal.s16   q0, d4, d8          \n"
+        "vmlal.s16   q0, d5, d9          \n"
+        )
+#if ORDER > 64
+        "bne         1b                  \n"
+#endif
+        /* Fold the four partial sums into one and return its low word. */
+        "vpadd.i32   d0, d0, d1          \n"
+        "vpaddl.s32  d0, d0              \n"
+        "vmov.32     %[res], d0[0]       \n"
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        /* "cc": "subs" in the ORDER > 64 loop changes the condition flags;
+         * declaring it when there is no loop is harmless.
+         * "memory": the asm reads v1[] and v2[] through the pointer
+         * registers, which the operand list alone does not express. */
+        "cc",
+        "d0", "d1", "d2", "d3", "d4",
+        "d5", "d6", "d7", "d8", "d9", "memory"
+    );
+    return res;
+}
author	Jens Arnold <amiconn@rockbox.org>	2010-08-30 06:31:47 +0000
committer	Jens Arnold <amiconn@rockbox.org>	2010-08-30 06:31:47 +0000
commit	811877e5b3ae95b70e285b786bb7cc9d73d333e0 (patch)
tree	c4e7865faaaad715566f7b1ebb559eeba25d7221 /apps/codecs/demac/libdemac
parent	dd5e3eb5424a66a5399f99386b59a8ee86d6cde0 (diff)
download	rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.tar.gz rockbox-811877e5b3ae95b70e285b786bb7cc9d73d333e0.zip

diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index 8055098301..903885cf00 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c
@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
41		41
42	#ifdef CPU_COLDFIRE	42	#ifdef CPU_COLDFIRE
43	#include "vector_math16_cf.h"	43	#include "vector_math16_cf.h"
		44	#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
		45	#include "vector_math16_armv7.h"
44	#elif defined(CPU_ARM) && (ARM_ARCH >= 6)	46	#elif defined(CPU_ARM) && (ARM_ARCH >= 6)
45	#include "vector_math16_armv6.h"	47	#include "vector_math16_armv6.h"
46	#elif defined(CPU_ARM) && (ARM_ARCH >= 5)	48	#elif defined(CPU_ARM) && (ARM_ARCH >= 5)


diff --git a/apps/codecs/demac/libdemac/vector_math16_armv7.h b/apps/codecs/demac/libdemac/vector_math16_armv7.h new file mode 100644 index 0000000000..84afda3e5d --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
		1	/*
		2
		3	libdemac - A Monkey's Audio decoder
		4
		5	$Id$
		6
		7	Copyright (C) Dave Chapman 2007
		8
		9	ARMv7 neon vector math copyright (C) 2010 Jens Arnold
		10
		11	This program is free software; you can redistribute it and/or modify
		12	it under the terms of the GNU General Public License as published by
		13	the Free Software Foundation; either version 2 of the License, or
		14	(at your option) any later version.
		15
		16	This program is distributed in the hope that it will be useful,
		17	but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	GNU General Public License for more details.
		20
		21	You should have received a copy of the GNU General Public License
		22	along with this program; if not, write to the Free Software
		23	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
		24
		25	*/
		26
		27	#define FUSED_VECTOR_MATH /* NOTE(review): presumably tells filter.c to use the fused vector_sp_add()/vector_sp_sub() below - confirm */
		28
		29	#if ORDER > 32 /* REPEAT_BLOCK(x) emits x three more times: 4 groups of 16 elements per pass */
		30	#define REPEAT_BLOCK(x) x x x
		31	#elif ORDER > 16 /* one more copy: 2 groups of 16 elements */
		32	#define REPEAT_BLOCK(x) x
		33	#else /* no extra copy: only the first, hand-written group of 16 elements */
		34	#define REPEAT_BLOCK(x)
		35	#endif
		36
		37	/* Calculate scalarproduct of v1 and f2, then add s2 to v1 in place (fused for performance); returns the low 32 bits of the sum, taken from v1 before the addition */
		38	static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
		39	{
		40	int res;
		41	#if ORDER > 64
		42	int cnt = ORDER>>6; /* number of passes over the unrolled sequence */
		43	#endif
		44
		45	asm volatile (
		46	#if ORDER > 64
		47	"vmov.i16 q0, #0 \n" /* loop variant: clear the accumulators once */
		48	"1: \n"
		49	"subs %[cnt], %[cnt], #1 \n"
		50	#endif
		51	"vld1.16 {d6-d9}, [%[f2]]! \n"
		52	"vld1.16 {d2-d5}, [%[v1]] \n"
		53	"vld1.16 {d10-d13}, [%[s2]]! \n"
		54	#if ORDER > 64
		55	"vmlal.s16 q0, d2, d6 \n"
		56	#else
		57	"vmull.s16 q0, d2, d6 \n" /* no loop: the first multiply initialises the accumulators */
		58	#endif
		59	"vmlal.s16 q0, d3, d7 \n"
		60	"vmlal.s16 q0, d4, d8 \n"
		61	"vmlal.s16 q0, d5, d9 \n"
		62	"vadd.i16 q1, q1, q5 \n" /* update v1 after it has been multiplied */
		63	"vadd.i16 q2, q2, q6 \n"
		64	"vst1.16 {d2-d5}, [%[v1]]! \n" /* the store advances v1 */
		65
		66	REPEAT_BLOCK(
		67	"vld1.16 {d6-d9}, [%[f2]]! \n"
		68	"vld1.16 {d2-d5}, [%[v1]] \n"
		69	"vld1.16 {d10-d13}, [%[s2]]! \n"
		70	"vmlal.s16 q0, d2, d6 \n"
		71	"vmlal.s16 q0, d3, d7 \n"
		72	"vmlal.s16 q0, d4, d8 \n"
		73	"vmlal.s16 q0, d5, d9 \n"
		74	"vadd.i16 q1, q1, q5 \n"
		75	"vadd.i16 q2, q2, q6 \n"
		76	"vst1.16 {d2-d5}, [%[v1]]! \n"
		77	)
		78	#if ORDER > 64
		79	"bne 1b \n"
		80	#endif
		81	"vpadd.i32 d0, d0, d1 \n" /* fold the four partial sums into one */
		82	"vpaddl.s32 d0, d0 \n"
		83	"vmov.32 %[res], d0[0] \n" /* return the low word */
		84	: /* outputs */
		85	#if ORDER > 64
		86	[cnt]"+r"(cnt),
		87	#endif
		88	[v1] "+r"(v1),
		89	[f2] "+r"(f2),
		90	[s2] "+r"(s2),
		91	[res]"=r"(res)
		92	: /* inputs */
		93	: /* clobbers; "cc" because "subs" in the ORDER > 64 loop changes the condition flags */
		94	"cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		95	"d8", "d9", "d10", "d11", "d12", "d13", "memory"
		96	);
		97	return res;
		98	}
		99
		100	/* Calculate scalarproduct of v1 and f2, then subtract s2 from v1 in place (fused for performance); returns the low 32 bits of the sum, taken from v1 before the subtraction */
		101	static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
		102	{
		103	int res;
		104	#if ORDER > 64
		105	int cnt = ORDER>>6; /* number of passes over the unrolled sequence */
		106	#endif
		107
		108	asm volatile (
		109	#if ORDER > 64
		110	"vmov.i16 q0, #0 \n" /* loop variant: clear the accumulators once */
		111	"1: \n"
		112	"subs %[cnt], %[cnt], #1 \n"
		113	#endif
		114	"vld1.16 {d6-d9}, [%[f2]]! \n"
		115	"vld1.16 {d2-d5}, [%[v1]] \n"
		116	"vld1.16 {d10-d13}, [%[s2]]! \n"
		117	#if ORDER > 64
		118	"vmlal.s16 q0, d2, d6 \n"
		119	#else
		120	"vmull.s16 q0, d2, d6 \n" /* no loop: the first multiply initialises the accumulators */
		121	#endif
		122	"vmlal.s16 q0, d3, d7 \n"
		123	"vmlal.s16 q0, d4, d8 \n"
		124	"vmlal.s16 q0, d5, d9 \n"
		125	"vsub.i16 q1, q1, q5 \n" /* update v1 after it has been multiplied */
		126	"vsub.i16 q2, q2, q6 \n"
		127	"vst1.16 {d2-d5}, [%[v1]]! \n" /* the store advances v1 */
		128
		129	REPEAT_BLOCK(
		130	"vld1.16 {d6-d9}, [%[f2]]! \n"
		131	"vld1.16 {d2-d5}, [%[v1]] \n"
		132	"vld1.16 {d10-d13}, [%[s2]]! \n"
		133	"vmlal.s16 q0, d2, d6 \n"
		134	"vmlal.s16 q0, d3, d7 \n"
		135	"vmlal.s16 q0, d4, d8 \n"
		136	"vmlal.s16 q0, d5, d9 \n"
		137	"vsub.i16 q1, q1, q5 \n"
		138	"vsub.i16 q2, q2, q6 \n"
		139	"vst1.16 {d2-d5}, [%[v1]]! \n"
		140	)
		141	#if ORDER > 64
		142	"bne 1b \n"
		143	#endif
		144	"vpadd.i32 d0, d0, d1 \n" /* fold the four partial sums into one */
		145	"vpaddl.s32 d0, d0 \n"
		146	"vmov.32 %[res], d0[0] \n" /* return the low word */
		147	: /* outputs */
		148	#if ORDER > 64
		149	[cnt]"+r"(cnt),
		150	#endif
		151	[v1] "+r"(v1),
		152	[f2] "+r"(f2),
		153	[s2] "+r"(s2),
		154	[res]"=r"(res)
		155	: /* inputs */
		156	: /* clobbers; "cc" because "subs" in the ORDER > 64 loop changes the condition flags */
		157	"cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		158	"d8", "d9", "d10", "d11", "d12", "d13", "memory"
		159	);
		160	return res;
		161	}
		162
		163	static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) /* returns the low 32 bits of the sum of v1[i] * v2[i]; both vectors are only read */
		164	{
		165	int res;
		166	#if ORDER > 64
		167	int cnt = ORDER>>6; /* number of passes over the unrolled sequence */
		168	#endif
		169
		170	asm volatile (
		171	#if ORDER > 64
		172	"vmov.i16 q0, #0 \n" /* loop variant: clear the accumulators once */
		173	"1: \n"
		174	"subs %[cnt], %[cnt], #1 \n"
		175	#endif
		176	"vld1.16 {d2-d5}, [%[v1]]! \n"
		177	"vld1.16 {d6-d9}, [%[v2]]! \n"
		178	#if ORDER > 64
		179	"vmlal.s16 q0, d2, d6 \n"
		180	#else
		181	"vmull.s16 q0, d2, d6 \n" /* no loop: the first multiply initialises the accumulators */
		182	#endif
		183	"vmlal.s16 q0, d3, d7 \n"
		184	"vmlal.s16 q0, d4, d8 \n"
		185	"vmlal.s16 q0, d5, d9 \n"
		186
		187	REPEAT_BLOCK(
		188	"vld1.16 {d2-d5}, [%[v1]]! \n"
		189	"vld1.16 {d6-d9}, [%[v2]]! \n"
		190	"vmlal.s16 q0, d2, d6 \n"
		191	"vmlal.s16 q0, d3, d7 \n"
		192	"vmlal.s16 q0, d4, d8 \n"
		193	"vmlal.s16 q0, d5, d9 \n"
		194	)
		195	#if ORDER > 64
		196	"bne 1b \n"
		197	#endif
		198	"vpadd.i32 d0, d0, d1 \n" /* fold the four partial sums into one */
		199	"vpaddl.s32 d0, d0 \n"
		200	"vmov.32 %[res], d0[0] \n" /* return the low word */
		201	: /* outputs */
		202	#if ORDER > 64
		203	[cnt]"+r"(cnt),
		204	#endif
		205	[v1] "+r"(v1),
		206	[v2] "+r"(v2),
		207	[res]"=r"(res)
		208	: /* inputs */
		209	: /* clobbers; "cc" because "subs" in the ORDER > 64 loop changes the flags, "memory" because v1[] and v2[] are read through the pointer registers */
		210	"cc", "d0", "d1", "d2", "d3", "d4",
		211	"d5", "d6", "d7", "d8", "d9", "memory"
		212	);
		213	return res;
		214	}