Several tweaks and cleanups:
* Use .rept instead of repeated macros for repeating blocks.
* Use MUL (variant) instead of MLA (variant) in the first step of the ARM scalarproduct() if there's no loop.
* Unroll ARM assembler functions to 32 where not already done, plus the generic scalarproduct().

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19144 a1c6a512-1295-4272-9138-f99709370657
author: Jens Arnold <amiconn@rockbox.org> 2008-11-19 21:31:33 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2008-11-19 21:31:33 +0000
commit: 2a5053f58c1a33334776cc90264c67dde815cef3 (patch)
tree: 7acc0727874ff6b307eff293a18172a3239cd895 /apps/codecs/demac/libdemac/vector_math32_armv4.h
parent: 14d37cb4555703d216e954db15ccca2c34642dc3 (diff)
download: rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.tar.gz
rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.zip
1 files changed, 45 insertions, 50 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
index b729bd3a0a..89b24f2b06 100644
--- a/apps/codecs/demac/libdemac/vector_math32_armv4.h
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2)
    int cnt = ORDER>>5;
 #endif
-#define ADDBLOCK4                        \
+#if ORDER > 16
-        "ldmia   %[v1],  {r0-r3}     \n" \
+#define ADD_SUB_BLOCKS "8"
-        "ldmia   %[v2]!, {r4-r7}     \n" \
+#else
-        "add     r0, r0, r4          \n" \
+#define ADD_SUB_BLOCKS "4"
-        "add     r1, r1, r5          \n" \
+#endif
-        "add     r2, r2, r6          \n" \
-        "add     r3, r3, r7          \n" \
-        "stmia   %[v1]!, {r0-r3}     \n"
    asm volatile (
    "1:                              \n"
-        ADDBLOCK4
+        ".rept " ADD_SUB_BLOCKS     "\n"
-        ADDBLOCK4
+        "ldmia   %[v1],  {r0-r3}     \n"
-        ADDBLOCK4
+        "ldmia   %[v2]!, {r4-r7}     \n"
-        ADDBLOCK4
+        "add     r0, r0, r4          \n"
-#if ORDER > 16
+        "add     r1, r1, r5          \n"
-        ADDBLOCK4
+        "add     r2, r2, r6          \n"
-        ADDBLOCK4
+        "add     r3, r3, r7          \n"
-        ADDBLOCK4
+        "stmia   %[v1]!, {r0-r3}     \n"
-        ADDBLOCK4
+        ".endr                       \n"
-#endif
 #if ORDER > 32
        "subs    %[cnt], %[cnt], #1  \n"
        "bne     1b                  \n"
@@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
    int cnt = ORDER>>5;
 #endif
-#define SUBBLOCK4                        \
-        "ldmia   %[v1],  {r0-r3}     \n" \
-        "ldmia   %[v2]!, {r4-r7}     \n" \
-        "sub     r0, r0, r4          \n" \
-        "sub     r1, r1, r5          \n" \
-        "sub     r2, r2, r6          \n" \
-        "sub     r3, r3, r7          \n" \
-        "stmia   %[v1]!, {r0-r3}     \n"
    asm volatile (
    "1:                              \n"
-        SUBBLOCK4
+        ".rept " ADD_SUB_BLOCKS     "\n"
-        SUBBLOCK4
+        "ldmia   %[v1],  {r0-r3}     \n"
-        SUBBLOCK4
+        "ldmia   %[v2]!, {r4-r7}     \n"
-        SUBBLOCK4
+        "sub     r0, r0, r4          \n"
-#if ORDER > 16
+        "sub     r1, r1, r5          \n"
-        SUBBLOCK4
+        "sub     r2, r2, r6          \n"
-        SUBBLOCK4
+        "sub     r3, r3, r7          \n"
-        SUBBLOCK4
+        "stmia   %[v1]!, {r0-r3}     \n"
-        SUBBLOCK4
+        ".endr                       \n"
-#endif
 #if ORDER > 32
        "subs    %[cnt], %[cnt], #1  \n"
        "bne     1b                  \n"
@@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
 static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 {
-    int res = 0;
+    int res;
 #if ORDER > 32
    int cnt = ORDER>>5;
 #endif
    asm volatile (
 #if ORDER > 16
+#if ORDER > 32
+        "mov     %[res], #0              \n"
+#endif
        "ldmia   %[v2]!, {r6-r7}         \n"
    "1:                                  \n"
        "ldmia   %[v1]!, {r0,r1,r3-r5}   \n"
+#if ORDER > 32
        "mla     %[res], r6, r0, %[res]  \n"
+#else
+        "mul     %[res], r6, r0          \n"
+#endif
        "mla     %[res], r7, r1, %[res]  \n"
        "ldmia   %[v2]!, {r0-r2,r6-r8}   \n"
        "mla     %[res], r0, r3, %[res]  \n"
@@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 #endif
 #else /* ORDER <= 16 */
+        "ldmia   %[v1]!, {r0-r3}         \n"
-#define MLABLOCK4                            \
+        "ldmia   %[v2]!, {r4-r7}         \n"
-        "ldmia   %[v1]!, {r0-r3}         \n" \
+        "mul     %[res], r4, r0          \n"
-        "ldmia   %[v2]!, {r4-r7}         \n" \
+        "mla     %[res], r5, r1, %[res]  \n"
-        "mla     %[res], r4, r0, %[res]  \n" \
+        "mla     %[res], r6, r2, %[res]  \n"
-        "mla     %[res], r5, r1, %[res]  \n" \
-        "mla     %[res], r6, r2, %[res]  \n" \
        "mla     %[res], r7, r3, %[res]  \n"
-        MLABLOCK4
+        ".rept   3                       \n"
-        MLABLOCK4
+        "ldmia   %[v1]!, {r0-r3}         \n"
-        MLABLOCK4
+        "ldmia   %[v2]!, {r4-r7}         \n"
-        MLABLOCK4
+        "mla     %[res], r4, r0, %[res]  \n"
+        "mla     %[res], r5, r1, %[res]  \n"
+        "mla     %[res], r6, r2, %[res]  \n"
+        "mla     %[res], r7, r3, %[res]  \n"
+        ".endr                           \n"
 #endif /* ORDER <= 16 */
        : /* outputs */
 #if ORDER > 32
@@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 #endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
-        [res]"+r"(res)
+        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "r0", "r1", "r2", "r3",
author	Jens Arnold <amiconn@rockbox.org>	2008-11-19 21:31:33 +0000
committer	Jens Arnold <amiconn@rockbox.org>	2008-11-19 21:31:33 +0000
commit	2a5053f58c1a33334776cc90264c67dde815cef3 (patch)
tree	7acc0727874ff6b307eff293a18172a3239cd895 /apps/codecs/demac/libdemac/vector_math32_armv4.h
parent	14d37cb4555703d216e954db15ccca2c34642dc3 (diff)
download	rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.tar.gz rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.zip

diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index b729bd3a0a..89b24f2b06 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2)
30	int cnt = ORDER>>5;	30	int cnt = ORDER>>5;
31	#endif	31	#endif
32		32
33	#define ADDBLOCK4 \	33	#if ORDER > 16
34	"ldmia %[v1], {r0-r3} \n" \	34	#define ADD_SUB_BLOCKS "8"
35	"ldmia %[v2]!, {r4-r7} \n" \	35	#else
36	"add r0, r0, r4 \n" \	36	#define ADD_SUB_BLOCKS "4"
37	"add r1, r1, r5 \n" \	37	#endif
38	"add r2, r2, r6 \n" \
39	"add r3, r3, r7 \n" \
40	"stmia %[v1]!, {r0-r3} \n"
41		38
42	asm volatile (	39	asm volatile (
43	"1: \n"	40	"1: \n"
44	ADDBLOCK4	41	".rept " ADD_SUB_BLOCKS "\n"
45	ADDBLOCK4	42	"ldmia %[v1], {r0-r3} \n"
46	ADDBLOCK4	43	"ldmia %[v2]!, {r4-r7} \n"
47	ADDBLOCK4	44	"add r0, r0, r4 \n"
48	#if ORDER > 16	45	"add r1, r1, r5 \n"
49	ADDBLOCK4	46	"add r2, r2, r6 \n"
50	ADDBLOCK4	47	"add r3, r3, r7 \n"
51	ADDBLOCK4	48	"stmia %[v1]!, {r0-r3} \n"
52	ADDBLOCK4	49	".endr \n"
53	#endif
54	#if ORDER > 32	50	#if ORDER > 32
55	"subs %[cnt], %[cnt], #1 \n"	51	"subs %[cnt], %[cnt], #1 \n"
56	"bne 1b \n"	52	"bne 1b \n"
@@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
74	int cnt = ORDER>>5;	70	int cnt = ORDER>>5;
75	#endif	71	#endif
76		72
77	#define SUBBLOCK4 \
78	"ldmia %[v1], {r0-r3} \n" \
79	"ldmia %[v2]!, {r4-r7} \n" \
80	"sub r0, r0, r4 \n" \
81	"sub r1, r1, r5 \n" \
82	"sub r2, r2, r6 \n" \
83	"sub r3, r3, r7 \n" \
84	"stmia %[v1]!, {r0-r3} \n"
85
86	asm volatile (	73	asm volatile (
87	"1: \n"	74	"1: \n"
88	SUBBLOCK4	75	".rept " ADD_SUB_BLOCKS "\n"
89	SUBBLOCK4	76	"ldmia %[v1], {r0-r3} \n"
90	SUBBLOCK4	77	"ldmia %[v2]!, {r4-r7} \n"
91	SUBBLOCK4	78	"sub r0, r0, r4 \n"
92	#if ORDER > 16	79	"sub r1, r1, r5 \n"
93	SUBBLOCK4	80	"sub r2, r2, r6 \n"
94	SUBBLOCK4	81	"sub r3, r3, r7 \n"
95	SUBBLOCK4	82	"stmia %[v1]!, {r0-r3} \n"
96	SUBBLOCK4	83	".endr \n"
97	#endif
98	#if ORDER > 32	84	#if ORDER > 32
99	"subs %[cnt], %[cnt], #1 \n"	85	"subs %[cnt], %[cnt], #1 \n"
100	"bne 1b \n"	86	"bne 1b \n"
@@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
114		100
115	static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)	101	static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
116	{	102	{
117	int res = 0;	103	int res;
118	#if ORDER > 32	104	#if ORDER > 32
119	int cnt = ORDER>>5;	105	int cnt = ORDER>>5;
120	#endif	106	#endif
121		107
122	asm volatile (	108	asm volatile (
123	#if ORDER > 16	109	#if ORDER > 16
		110	#if ORDER > 32
		111	"mov %[res], #0 \n"
		112	#endif
124	"ldmia %[v2]!, {r6-r7} \n"	113	"ldmia %[v2]!, {r6-r7} \n"
125	"1: \n"	114	"1: \n"
126	"ldmia %[v1]!, {r0,r1,r3-r5} \n"	115	"ldmia %[v1]!, {r0,r1,r3-r5} \n"
		116	#if ORDER > 32
127	"mla %[res], r6, r0, %[res] \n"	117	"mla %[res], r6, r0, %[res] \n"
		118	#else
		119	"mul %[res], r6, r0 \n"
		120	#endif
128	"mla %[res], r7, r1, %[res] \n"	121	"mla %[res], r7, r1, %[res] \n"
129	"ldmia %[v2]!, {r0-r2,r6-r8} \n"	122	"ldmia %[v2]!, {r0-r2,r6-r8} \n"
130	"mla %[res], r0, r3, %[res] \n"	123	"mla %[res], r0, r3, %[res] \n"
@@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
177	#endif	170	#endif
178		171
179	#else /* ORDER <= 16 */	172	#else /* ORDER <= 16 */
180		173	"ldmia %[v1]!, {r0-r3} \n"
181	#define MLABLOCK4 \	174	"ldmia %[v2]!, {r4-r7} \n"
182	"ldmia %[v1]!, {r0-r3} \n" \	175	"mul %[res], r4, r0 \n"
183	"ldmia %[v2]!, {r4-r7} \n" \	176	"mla %[res], r5, r1, %[res] \n"
184	"mla %[res], r4, r0, %[res] \n" \	177	"mla %[res], r6, r2, %[res] \n"
185	"mla %[res], r5, r1, %[res] \n" \
186	"mla %[res], r6, r2, %[res] \n" \
187	"mla %[res], r7, r3, %[res] \n"	178	"mla %[res], r7, r3, %[res] \n"
188		179
189	MLABLOCK4	180	".rept 3 \n"
190	MLABLOCK4	181	"ldmia %[v1]!, {r0-r3} \n"
191	MLABLOCK4	182	"ldmia %[v2]!, {r4-r7} \n"
192	MLABLOCK4	183	"mla %[res], r4, r0, %[res] \n"
		184	"mla %[res], r5, r1, %[res] \n"
		185	"mla %[res], r6, r2, %[res] \n"
		186	"mla %[res], r7, r3, %[res] \n"
		187	".endr \n"
193	#endif /* ORDER <= 16 */	188	#endif /* ORDER <= 16 */
194	: /* outputs */	189	: /* outputs */
195	#if ORDER > 32	190	#if ORDER > 32
@@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
197	#endif	192	#endif
198	[v1] "+r"(v1),	193	[v1] "+r"(v1),
199	[v2] "+r"(v2),	194	[v2] "+r"(v2),
200	[res]"+r"(res)	195	[res]"=r"(res)
201	: /* inputs */	196	: /* inputs */
202	: /* clobbers */	197	: /* clobbers */
203	"r0", "r1", "r2", "r3",	198	"r0", "r1", "r2", "r3",