libdemac: Add x86/x86_64 MMX asm for the filters. Not relevant for target but speeds up decoding on x86/x86_64 sims. Average speedup ranges from 25% for -c2000 to 3 times for -c5000; on Intel Atom it's even 45% for -c2000 to 6 times for -c5000.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24663 a1c6a512-1295-4272-9138-f99709370657
author: Jens Arnold <amiconn@rockbox.org> 2010-02-15 01:27:04 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2010-02-15 01:27:04 +0000
commit: b8eb272e48b98de6ce9fba04798e4652119e0a0a (patch)
tree: f55f5f0e2192d34f658000f537d96767bfa5533d /apps/codecs
parent: 1bef4c66501893b4b7e154979a80f5386cbe964f (diff)
download: rockbox-b8eb272e48b98de6ce9fba04798e4652119e0a0a.tar.gz
rockbox-b8eb272e48b98de6ce9fba04798e4652119e0a0a.zip
2 files changed, 222 insertions, 0 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index ed6f3c8dc6..275f12f6ae 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -46,6 +46,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)
 /* Assume all our ARMv5 targets are ARMv5te(j) */
 #include "vector_math16_armv5te.h"
+#elif (defined(__i386__) || defined(__i486__))  && defined(__MMX__) \
+    || defined(__x86_64__)
+#include "vector_math16_mmx.h"
 #else
 #include "vector_math_generic.h"
 #endif
diff --git a/apps/codecs/demac/libdemac/vector_math16_mmx.h b/apps/codecs/demac/libdemac/vector_math16_mmx.h
new file mode 100644
index 0000000000..a7f9c73af7
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_mmx.h
@@ -0,0 +1,219 @@
+/*
+libdemac - A Monkey's Audio decoder
+$Id$
+Copyright (C) Dave Chapman 2007
+MMX vector math copyright (C) 2010 Jens Arnold
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+*/
+/* Presumably tells the including filter code that this header supplies the
+   fused vector_sp_add()/vector_sp_sub() routines -- TODO confirm in filter.c */
+#define FUSED_VECTOR_MATH
+/* Two-level stringification: __S() lets its argument be macro-expanded first,
+   then __E() turns the expanded tokens into a string literal. The functions
+   in this header use it to paste an ORDER-derived repeat count into the
+   inline assembly text. */
+#define __E(__e) #__e
+#define __S(__e) __E(__e)
+/*
+ * Fused scalar product and in-place vector add over ORDER 16-bit elements.
+ *
+ * For every element i:  sum += v1[i] * f2[i]   (using v1 as it was on entry)
+ *                       v1[i] = v1[i] + s2[i]  (wrapping 16-bit add, paddw)
+ *
+ * v1: vector that is both multiplied and updated in place
+ * f2: vector multiplied against v1 (pmaddwd)
+ * s2: vector added to v1
+ *
+ * Returns the sum of the two 32-bit lanes of the MMX accumulator.
+ *
+ * ORDER is assumed to be a multiple of 4, and a multiple of 256 when it is
+ * larger than 256 -- the code processes whole quadwords / whole 256-element
+ * loop passes only.
+ *
+ * NOTE(review): no emms is issued after the MMX code; confirm that no x87
+ * floating point code runs afterwards on 32-bit builds.
+ */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8;     /* one loop pass = 64 quadwords = 256 elements */
+#endif
+    asm volatile (
+#if ORDER > 256
+        "pxor    %%mm2, %%mm2        \n"    /* clear the accumulator */
+        ".set    ofs, 0              \n"    /* assembly-time offset counter */
+    "1:                              \n"
+        ".rept   64                  \n"
+#else
+        /* The first quadword initialises the accumulator directly */
+        "movq    (%[v1]), %%mm2      \n"
+        "movq    %%mm2, %%mm0        \n"
+        "pmaddwd (%[f2]), %%mm2      \n"
+        "paddw   (%[s2]), %%mm0      \n"
+        "movq    %%mm0, (%[v1])      \n"
+        ".set    ofs, 8              \n"
+        /* Remaining quadwords. The count is parenthesised so that it does not
+           depend on the assembler's operator precedence: GNU as binds >>
+           tighter than -, assemblers with C-like precedence do not. */
+        ".rept  " __S((ORDER>>2) - 1) "\n"
+#endif
+        "movq    ofs(%[v1]), %%mm1   \n"
+        "movq    %%mm1, %%mm0        \n"
+        "pmaddwd ofs(%[f2]), %%mm1   \n"
+        "paddw   ofs(%[s2]), %%mm0   \n"
+        "movq    %%mm0, ofs(%[v1])   \n"
+        "paddd   %%mm1, %%mm2        \n"
+        ".set    ofs, ofs + 8        \n"
+        ".endr                       \n"
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[s2]         \n"
+        "add     $512, %[f2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+        /* Horizontal add of the two 32-bit accumulator lanes */
+        "movd    %%mm2, %[t]         \n"
+        "psrlq   $32, %%mm2          \n"
+        "movd    %%mm2, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [s2] "+r"(s2),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and f2 are tied to the res and t output registers (operands 2
+           and 3), so advancing them inside the loop modifies outputs only */
+        [v1]"2"(v1),
+        [f2]"3"(f2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [f2]"r"(f2),
+        [s2]"r"(s2)
+#endif
+        : /* clobbers */
+        /* "memory": the asm reads f2/s2 and reads and writes v1 through
+           pointers that are not declared as memory operands */
+        "mm0", "mm1", "mm2", "memory"
+    );
+    return res;
+}
+/*
+ * Fused scalar product and in-place vector subtract over ORDER 16-bit
+ * elements.
+ *
+ * For every element i:  sum += v1[i] * f2[i]   (using v1 as it was on entry)
+ *                       v1[i] = v1[i] - s2[i]  (wrapping 16-bit sub, psubw)
+ *
+ * v1: vector that is both multiplied and updated in place
+ * f2: vector multiplied against v1 (pmaddwd)
+ * s2: vector subtracted from v1
+ *
+ * Returns the sum of the two 32-bit lanes of the MMX accumulator.
+ *
+ * ORDER is assumed to be a multiple of 4, and a multiple of 256 when it is
+ * larger than 256 -- the code processes whole quadwords / whole 256-element
+ * loop passes only.
+ *
+ * NOTE(review): no emms is issued after the MMX code; confirm that no x87
+ * floating point code runs afterwards on 32-bit builds.
+ */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8;     /* one loop pass = 64 quadwords = 256 elements */
+#endif
+    asm volatile (
+#if ORDER > 256
+        "pxor    %%mm2, %%mm2        \n"    /* clear the accumulator */
+        ".set    ofs, 0              \n"    /* assembly-time offset counter */
+    "1:                              \n"
+        ".rept   64                  \n"
+#else
+        /* The first quadword initialises the accumulator directly */
+        "movq    (%[v1]), %%mm2      \n"
+        "movq    %%mm2, %%mm0        \n"
+        "pmaddwd (%[f2]), %%mm2      \n"
+        "psubw   (%[s2]), %%mm0      \n"
+        "movq    %%mm0, (%[v1])      \n"
+        ".set    ofs, 8              \n"
+        /* Remaining quadwords. The count is parenthesised so that it does not
+           depend on the assembler's operator precedence: GNU as binds >>
+           tighter than -, assemblers with C-like precedence do not. */
+        ".rept  " __S((ORDER>>2) - 1) "\n"
+#endif
+        "movq    ofs(%[v1]), %%mm1   \n"
+        "movq    %%mm1, %%mm0        \n"
+        "pmaddwd ofs(%[f2]), %%mm1   \n"
+        "psubw   ofs(%[s2]), %%mm0   \n"
+        "movq    %%mm0, ofs(%[v1])   \n"
+        "paddd   %%mm1, %%mm2        \n"
+        ".set    ofs, ofs + 8        \n"
+        ".endr                       \n"
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[s2]         \n"
+        "add     $512, %[f2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+        /* Horizontal add of the two 32-bit accumulator lanes */
+        "movd    %%mm2, %[t]         \n"
+        "psrlq   $32, %%mm2          \n"
+        "movd    %%mm2, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [s2] "+r"(s2),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and f2 are tied to the res and t output registers (operands 2
+           and 3), so advancing them inside the loop modifies outputs only */
+        [v1]"2"(v1),
+        [f2]"3"(f2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [f2]"r"(f2),
+        [s2]"r"(s2)
+#endif
+        : /* clobbers */
+        /* "memory": the asm reads f2/s2 and reads and writes v1 through
+           pointers that are not declared as memory operands */
+        "mm0", "mm1", "mm2", "memory"
+    );
+    return res;
+}
+/*
+ * Scalar product of two vectors of ORDER 16-bit elements.
+ *
+ * v1, v2: the vectors to multiply element-wise (pmaddwd); neither is written.
+ *
+ * Returns the sum of the two 32-bit lanes of the MMX accumulator, i.e. the
+ * sum of v1[i] * v2[i] over all elements.
+ *
+ * ORDER is assumed to be a multiple of 4, and a multiple of 256 when it is
+ * larger than 256 -- the code processes whole quadwords / whole 256-element
+ * loop passes only.
+ *
+ * NOTE(review): no emms is issued after the MMX code; confirm that no x87
+ * floating point code runs afterwards on 32-bit builds.
+ */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8;     /* one loop pass = 64 quadwords = 256 elements */
+#endif
+    asm volatile (
+#if ORDER > 256
+        "pxor    %%mm1, %%mm1        \n"    /* clear the accumulator */
+        ".set    ofs, 0              \n"    /* assembly-time offset counter */
+    "1:                              \n"
+        ".rept   64                  \n"
+#else
+        /* The first quadword initialises the accumulator directly */
+        "movq    (%[v1]), %%mm1      \n"
+        "pmaddwd (%[v2]), %%mm1      \n"
+        ".set    ofs, 8              \n"
+        /* Remaining quadwords. The count is parenthesised so that it does not
+           depend on the assembler's operator precedence: GNU as binds >>
+           tighter than -, assemblers with C-like precedence do not. */
+        ".rept  " __S((ORDER>>2) - 1) "\n"
+#endif
+        "movq    ofs(%[v1]), %%mm0   \n"
+        "pmaddwd ofs(%[v2]), %%mm0   \n"
+        "paddd   %%mm0, %%mm1        \n"
+        ".set    ofs, ofs + 8        \n"
+        ".endr                       \n"
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[v2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+        /* Horizontal add of the two 32-bit accumulator lanes */
+        "movd    %%mm1, %[t]         \n"
+        "psrlq   $32, %%mm1          \n"
+        "movd    %%mm1, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and v2 are tied to the res and t output registers (operands 1
+           and 2), so advancing them inside the loop modifies outputs only */
+        [v1]"1"(v1),
+        [v2]"2"(v2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [v2]"r"(v2)
+#endif
+        : /* clobbers */
+        /* "memory": the asm reads v1/v2 through pointers that are not
+           declared as memory operands, so pending stores must be visible */
+        "mm0", "mm1", "memory"
+    );
+    return res;
+}
author	Jens Arnold <amiconn@rockbox.org>	2010-02-15 01:27:04 +0000
committer	Jens Arnold <amiconn@rockbox.org>	2010-02-15 01:27:04 +0000
commit	b8eb272e48b98de6ce9fba04798e4652119e0a0a (patch)
tree	f55f5f0e2192d34f658000f537d96767bfa5533d /apps/codecs
parent	1bef4c66501893b4b7e154979a80f5386cbe964f (diff)
download	rockbox-b8eb272e48b98de6ce9fba04798e4652119e0a0a.tar.gz rockbox-b8eb272e48b98de6ce9fba04798e4652119e0a0a.zip

diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index ed6f3c8dc6..275f12f6ae 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c
@@ -46,6 +46,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
46	#elif defined(CPU_ARM) && (ARM_ARCH >= 5)	46	#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
47	/* Assume all our ARMv5 targets are ARMv5te(j) */	47	/* Assume all our ARMv5 targets are ARMv5te(j) */
48	#include "vector_math16_armv5te.h"	48	#include "vector_math16_armv5te.h"
		49	#elif (defined(__i386__) \|\| defined(__i486__)) && defined(__MMX__) \
		50	\|\| defined(__x86_64__)
		51	#include "vector_math16_mmx.h"
49	#else	52	#else
50	#include "vector_math_generic.h"	53	#include "vector_math_generic.h"
51	#endif	54	#endif


diff --git a/apps/codecs/demac/libdemac/vector_math16_mmx.h b/apps/codecs/demac/libdemac/vector_math16_mmx.h new file mode 100644 index 0000000000..a7f9c73af7 --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math16_mmx.h
@@ -0,0 +1,219 @@
		1	/*
		2
		3	libdemac - A Monkey's Audio decoder
		4
		5	$Id$
		6
		7	Copyright (C) Dave Chapman 2007
		8
		9	MMX vector math copyright (C) 2010 Jens Arnold
		10
		11	This program is free software; you can redistribute it and/or modify
		12	it under the terms of the GNU General Public License as published by
		13	the Free Software Foundation; either version 2 of the License, or
		14	(at your option) any later version.
		15
		16	This program is distributed in the hope that it will be useful,
		17	but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	GNU General Public License for more details.
		20
		21	You should have received a copy of the GNU General Public License
		22	along with this program; if not, write to the Free Software
		23	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
		24
		25	*/
		26
		27	#define FUSED_VECTOR_MATH
		28
		29	#define __E(__e) #__e
		30	#define __S(__e) __E(__e)
		31
		32	static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)
		33	{
		34	int res, t;
		35	#if ORDER > 256
		36	int cnt = ORDER>>8;
		37	#endif
		38
		39	asm volatile (
		40	#if ORDER > 256
		41	"pxor %%mm2, %%mm2 \n"
		42	".set ofs, 0 \n"
		43	"1: \n"
		44	".rept 64 \n"
		45	#else
		46	"movq (%[v1]), %%mm2 \n"
		47	"movq %%mm2, %%mm0 \n"
		48	"pmaddwd (%[f2]), %%mm2 \n"
		49	"paddw (%[s2]), %%mm0 \n"
		50	"movq %%mm0, (%[v1]) \n"
		51	".set ofs, 8 \n"
		52
		53	".rept " __S(ORDER>>2 - 1) "\n"
		54	#endif
		55	"movq ofs(%[v1]), %%mm1 \n"
		56	"movq %%mm1, %%mm0 \n"
		57	"pmaddwd ofs(%[f2]), %%mm1 \n"
		58	"paddw ofs(%[s2]), %%mm0 \n"
		59	"movq %%mm0, ofs(%[v1]) \n"
		60	"paddd %%mm1, %%mm2 \n"
		61	".set ofs, ofs + 8 \n"
		62	".endr \n"
		63	#if ORDER > 256
		64	"add $512, %[v1] \n"
		65	"add $512, %[s2] \n"
		66	"add $512, %[f2] \n"
		67	"dec %[cnt] \n"
		68	"jne 1b \n"
		69	#endif
		70
		71	"movd %%mm2, %[t] \n"
		72	"psrlq $32, %%mm2 \n"
		73	"movd %%mm2, %[res] \n"
		74	"add %[t], %[res] \n"
		75	: /* outputs */
		76	#if ORDER > 256
		77	[cnt]"+r"(cnt),
		78	[s2] "+r"(s2),
		79	[res]"=r"(res),
		80	[t] "=r"(t)
		81	: /* inputs */
		82	[v1]"2"(v1),
		83	[f2]"3"(f2)
		84	#else
		85	[res]"=r"(res),
		86	[t] "=r"(t)
		87	: /* inputs */
		88	[v1]"r"(v1),
		89	[f2]"r"(f2),
		90	[s2]"r"(s2)
		91	#endif
		92	: /* clobbers */
		93	"mm0", "mm1", "mm2"
		94	);
		95	return res;
		96	}
		97
		98	static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)
		99	{
		100	int res, t;
		101	#if ORDER > 256
		102	int cnt = ORDER>>8;
		103	#endif
		104
		105	asm volatile (
		106	#if ORDER > 256
		107	"pxor %%mm2, %%mm2 \n"
		108	".set ofs, 0 \n"
		109	"1: \n"
		110	".rept 64 \n"
		111	#else
		112	"movq (%[v1]), %%mm2 \n"
		113	"movq %%mm2, %%mm0 \n"
		114	"pmaddwd (%[f2]), %%mm2 \n"
		115	"psubw (%[s2]), %%mm0 \n"
		116	"movq %%mm0, (%[v1]) \n"
		117	".set ofs, 8 \n"
		118
		119	".rept " __S(ORDER>>2 - 1) "\n"
		120	#endif
		121	"movq ofs(%[v1]), %%mm1 \n"
		122	"movq %%mm1, %%mm0 \n"
		123	"pmaddwd ofs(%[f2]), %%mm1 \n"
		124	"psubw ofs(%[s2]), %%mm0 \n"
		125	"movq %%mm0, ofs(%[v1]) \n"
		126	"paddd %%mm1, %%mm2 \n"
		127	".set ofs, ofs + 8 \n"
		128	".endr \n"
		129	#if ORDER > 256
		130	"add $512, %[v1] \n"
		131	"add $512, %[s2] \n"
		132	"add $512, %[f2] \n"
		133	"dec %[cnt] \n"
		134	"jne 1b \n"
		135	#endif
		136
		137	"movd %%mm2, %[t] \n"
		138	"psrlq $32, %%mm2 \n"
		139	"movd %%mm2, %[res] \n"
		140	"add %[t], %[res] \n"
		141	: /* outputs */
		142	#if ORDER > 256
		143	[cnt]"+r"(cnt),
		144	[s2] "+r"(s2),
		145	[res]"=r"(res),
		146	[t] "=r"(t)
		147	: /* inputs */
		148	[v1]"2"(v1),
		149	[f2]"3"(f2)
		150	#else
		151	[res]"=r"(res),
		152	[t] "=r"(t)
		153	: /* inputs */
		154	[v1]"r"(v1),
		155	[f2]"r"(f2),
		156	[s2]"r"(s2)
		157	#endif
		158	: /* clobbers */
		159	"mm0", "mm1", "mm2"
		160	);
		161	return res;
		162	}
		163
		164	static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
		165	{
		166	int res, t;
		167	#if ORDER > 256
		168	int cnt = ORDER>>8;
		169	#endif
		170
		171	asm volatile (
		172	#if ORDER > 256
		173	"pxor %%mm1, %%mm1 \n"
		174	".set ofs, 0 \n"
		175	"1: \n"
		176	".rept 64 \n"
		177	#else
		178	"movq (%[v1]), %%mm1 \n"
		179	"pmaddwd (%[v2]), %%mm1 \n"
		180	".set ofs, 8 \n"
		181
		182	".rept " __S(ORDER>>2 - 1) "\n"
		183	#endif
		184	"movq ofs(%[v1]), %%mm0 \n"
		185	"pmaddwd ofs(%[v2]), %%mm0 \n"
		186	"paddd %%mm0, %%mm1 \n"
		187	".set ofs, ofs + 8 \n"
		188	".endr \n"
		189	#if ORDER > 256
		190	"add $512, %[v1] \n"
		191	"add $512, %[v2] \n"
		192	"dec %[cnt] \n"
		193	"jne 1b \n"
		194	#endif
		195
		196	"movd %%mm1, %[t] \n"
		197	"psrlq $32, %%mm1 \n"
		198	"movd %%mm1, %[res] \n"
		199	"add %[t], %[res] \n"
		200	: /* outputs */
		201	#if ORDER > 256
		202	[cnt]"+r"(cnt),
		203	[res]"=r"(res),
		204	[t] "=r"(t)
		205	: /* inputs */
		206	[v1]"1"(v1),
		207	[v2]"2"(v2)
		208	#else
		209	[res]"=r"(res),
		210	[t] "=r"(t)
		211	: /* inputs */
		212	[v1]"r"(v1),
		213	[v2]"r"(v2)
		214	#endif
		215	: /* clobbers */
		216	"mm0", "mm1"
		217	);
		218	return res;
		219	}