diff options
author | Jens Arnold <amiconn@rockbox.org> | 2008-11-19 21:31:33 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2008-11-19 21:31:33 +0000 |
commit | 2a5053f58c1a33334776cc90264c67dde815cef3 (patch) | |
tree | 7acc0727874ff6b307eff293a18172a3239cd895 /apps/codecs/demac/libdemac/vector_math32_armv4.h | |
parent | 14d37cb4555703d216e954db15ccca2c34642dc3 (diff) | |
download | rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.tar.gz rockbox-2a5053f58c1a33334776cc90264c67dde815cef3.zip |
Several tweaks and cleanups: * Use .rept instead of repeated macros for repeating blocks. * Use MUL (variant) instead of MLA (variant) in the first step of the ARM scalarproduct() if there's no loop. * Unroll ARM assembler functions to 32 where not already done, plus the generic scalarproduct().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19144 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math32_armv4.h')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math32_armv4.h | 95 |
1 file changed, 45 insertions, 50 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index b729bd3a0a..89b24f2b06 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h | |||
@@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2) | |||
30 | int cnt = ORDER>>5; | 30 | int cnt = ORDER>>5; |
31 | #endif | 31 | #endif |
32 | 32 | ||
33 | #define ADDBLOCK4 \ | 33 | #if ORDER > 16 |
34 | "ldmia %[v1], {r0-r3} \n" \ | 34 | #define ADD_SUB_BLOCKS "8" |
35 | "ldmia %[v2]!, {r4-r7} \n" \ | 35 | #else |
36 | "add r0, r0, r4 \n" \ | 36 | #define ADD_SUB_BLOCKS "4" |
37 | "add r1, r1, r5 \n" \ | 37 | #endif |
38 | "add r2, r2, r6 \n" \ | ||
39 | "add r3, r3, r7 \n" \ | ||
40 | "stmia %[v1]!, {r0-r3} \n" | ||
41 | 38 | ||
42 | asm volatile ( | 39 | asm volatile ( |
43 | "1: \n" | 40 | "1: \n" |
44 | ADDBLOCK4 | 41 | ".rept " ADD_SUB_BLOCKS "\n" |
45 | ADDBLOCK4 | 42 | "ldmia %[v1], {r0-r3} \n" |
46 | ADDBLOCK4 | 43 | "ldmia %[v2]!, {r4-r7} \n" |
47 | ADDBLOCK4 | 44 | "add r0, r0, r4 \n" |
48 | #if ORDER > 16 | 45 | "add r1, r1, r5 \n" |
49 | ADDBLOCK4 | 46 | "add r2, r2, r6 \n" |
50 | ADDBLOCK4 | 47 | "add r3, r3, r7 \n" |
51 | ADDBLOCK4 | 48 | "stmia %[v1]!, {r0-r3} \n" |
52 | ADDBLOCK4 | 49 | ".endr \n" |
53 | #endif | ||
54 | #if ORDER > 32 | 50 | #if ORDER > 32 |
55 | "subs %[cnt], %[cnt], #1 \n" | 51 | "subs %[cnt], %[cnt], #1 \n" |
56 | "bne 1b \n" | 52 | "bne 1b \n" |
@@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2) | |||
74 | int cnt = ORDER>>5; | 70 | int cnt = ORDER>>5; |
75 | #endif | 71 | #endif |
76 | 72 | ||
77 | #define SUBBLOCK4 \ | ||
78 | "ldmia %[v1], {r0-r3} \n" \ | ||
79 | "ldmia %[v2]!, {r4-r7} \n" \ | ||
80 | "sub r0, r0, r4 \n" \ | ||
81 | "sub r1, r1, r5 \n" \ | ||
82 | "sub r2, r2, r6 \n" \ | ||
83 | "sub r3, r3, r7 \n" \ | ||
84 | "stmia %[v1]!, {r0-r3} \n" | ||
85 | |||
86 | asm volatile ( | 73 | asm volatile ( |
87 | "1: \n" | 74 | "1: \n" |
88 | SUBBLOCK4 | 75 | ".rept " ADD_SUB_BLOCKS "\n" |
89 | SUBBLOCK4 | 76 | "ldmia %[v1], {r0-r3} \n" |
90 | SUBBLOCK4 | 77 | "ldmia %[v2]!, {r4-r7} \n" |
91 | SUBBLOCK4 | 78 | "sub r0, r0, r4 \n" |
92 | #if ORDER > 16 | 79 | "sub r1, r1, r5 \n" |
93 | SUBBLOCK4 | 80 | "sub r2, r2, r6 \n" |
94 | SUBBLOCK4 | 81 | "sub r3, r3, r7 \n" |
95 | SUBBLOCK4 | 82 | "stmia %[v1]!, {r0-r3} \n" |
96 | SUBBLOCK4 | 83 | ".endr \n" |
97 | #endif | ||
98 | #if ORDER > 32 | 84 | #if ORDER > 32 |
99 | "subs %[cnt], %[cnt], #1 \n" | 85 | "subs %[cnt], %[cnt], #1 \n" |
100 | "bne 1b \n" | 86 | "bne 1b \n" |
@@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2) | |||
114 | 100 | ||
115 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | 101 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) |
116 | { | 102 | { |
117 | int res = 0; | 103 | int res; |
118 | #if ORDER > 32 | 104 | #if ORDER > 32 |
119 | int cnt = ORDER>>5; | 105 | int cnt = ORDER>>5; |
120 | #endif | 106 | #endif |
121 | 107 | ||
122 | asm volatile ( | 108 | asm volatile ( |
123 | #if ORDER > 16 | 109 | #if ORDER > 16 |
110 | #if ORDER > 32 | ||
111 | "mov %[res], #0 \n" | ||
112 | #endif | ||
124 | "ldmia %[v2]!, {r6-r7} \n" | 113 | "ldmia %[v2]!, {r6-r7} \n" |
125 | "1: \n" | 114 | "1: \n" |
126 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" | 115 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" |
116 | #if ORDER > 32 | ||
127 | "mla %[res], r6, r0, %[res] \n" | 117 | "mla %[res], r6, r0, %[res] \n" |
118 | #else | ||
119 | "mul %[res], r6, r0 \n" | ||
120 | #endif | ||
128 | "mla %[res], r7, r1, %[res] \n" | 121 | "mla %[res], r7, r1, %[res] \n" |
129 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" | 122 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" |
130 | "mla %[res], r0, r3, %[res] \n" | 123 | "mla %[res], r0, r3, %[res] \n" |
@@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
177 | #endif | 170 | #endif |
178 | 171 | ||
179 | #else /* ORDER <= 16 */ | 172 | #else /* ORDER <= 16 */ |
180 | 173 | "ldmia %[v1]!, {r0-r3} \n" | |
181 | #define MLABLOCK4 \ | 174 | "ldmia %[v2]!, {r4-r7} \n" |
182 | "ldmia %[v1]!, {r0-r3} \n" \ | 175 | "mul %[res], r4, r0 \n" |
183 | "ldmia %[v2]!, {r4-r7} \n" \ | 176 | "mla %[res], r5, r1, %[res] \n" |
184 | "mla %[res], r4, r0, %[res] \n" \ | 177 | "mla %[res], r6, r2, %[res] \n" |
185 | "mla %[res], r5, r1, %[res] \n" \ | ||
186 | "mla %[res], r6, r2, %[res] \n" \ | ||
187 | "mla %[res], r7, r3, %[res] \n" | 178 | "mla %[res], r7, r3, %[res] \n" |
188 | 179 | ||
189 | MLABLOCK4 | 180 | ".rept 3 \n" |
190 | MLABLOCK4 | 181 | "ldmia %[v1]!, {r0-r3} \n" |
191 | MLABLOCK4 | 182 | "ldmia %[v2]!, {r4-r7} \n" |
192 | MLABLOCK4 | 183 | "mla %[res], r4, r0, %[res] \n" |
184 | "mla %[res], r5, r1, %[res] \n" | ||
185 | "mla %[res], r6, r2, %[res] \n" | ||
186 | "mla %[res], r7, r3, %[res] \n" | ||
187 | ".endr \n" | ||
193 | #endif /* ORDER <= 16 */ | 188 | #endif /* ORDER <= 16 */ |
194 | : /* outputs */ | 189 | : /* outputs */ |
195 | #if ORDER > 32 | 190 | #if ORDER > 32 |
@@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
197 | #endif | 192 | #endif |
198 | [v1] "+r"(v1), | 193 | [v1] "+r"(v1), |
199 | [v2] "+r"(v2), | 194 | [v2] "+r"(v2), |
200 | [res]"+r"(res) | 195 | [res]"=r"(res) |
201 | : /* inputs */ | 196 | : /* inputs */ |
202 | : /* clobbers */ | 197 | : /* clobbers */ |
203 | "r0", "r1", "r2", "r3", | 198 | "r0", "r1", "r2", "r3", |