author    Jens Arnold <amiconn@rockbox.org>    2008-11-19 21:31:33 +0000
committer Jens Arnold <amiconn@rockbox.org>    2008-11-19 21:31:33 +0000
commit    2a5053f58c1a33334776cc90264c67dde815cef3 (patch)
tree      7acc0727874ff6b307eff293a18172a3239cd895
parent    14d37cb4555703d216e954db15ccca2c34642dc3 (diff)
Several tweaks and cleanups:
 * Use .rept instead of repeated macros for repeating blocks.
 * Use MUL (variant) instead of MLA (variant) in the first step of the ARM scalarproduct() if there's no loop.
 * Unroll ARM assembler functions to 32 where not already done, plus the generic scalarproduct().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19144 a1c6a512-1295-4272-9138-f99709370657
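For reference, the two idioms the message mentions look roughly like this minimal sketch (hypothetical code, not taken from the patch: a toy int32 dot product over 2 + 2*MLA_BLOCKS elements, assuming an ARM target, GCC inline asm and the GNU assembler; only the MLA_BLOCKS name mirrors the patch, and ORDER stands in for libdemac's filter order):

    #include <stdint.h>

    #ifndef ORDER
    #define ORDER 32              /* stand-in for libdemac's filter order */
    #endif

    #if ORDER > 16
    #define MLA_BLOCKS "3"        /* repeat count, pasted into the asm template as text */
    #else
    #define MLA_BLOCKS "1"
    #endif

    static inline int32_t scalarproduct_sketch(int32_t* v1, int32_t* v2)
    {
        int res;
        asm volatile (
            "ldmia %[v1]!, {r0-r1}        \n"
            "ldmia %[v2]!, {r2-r3}        \n"
            "mul   %[res], r2, r0         \n" /* first step uses MUL, so no 'mov %[res], #0' */
            "mla   %[res], r3, r1, %[res] \n"
            ".rept " MLA_BLOCKS "         \n" /* the assembler duplicates this block */
            "ldmia %[v1]!, {r0-r1}        \n"
            "ldmia %[v2]!, {r2-r3}        \n"
            "mla   %[res], r2, r0, %[res] \n"
            "mla   %[res], r3, r1, %[res] \n"
            ".endr                        \n"
            : [v1]"+r"(v1), [v2]"+r"(v2), [res]"=r"(res)
            : /* no inputs */
            : "r0", "r1", "r2", "r3", "memory"
        );
        return res;
    }

Expanding the block with .rept/.endr keeps the repetition in the assembler instead of the C preprocessor, and starting with MUL lets %[res] become a write-only ("=r") output instead of a read-modify-write one.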
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_armv5te.h    36
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_armv6.h      111
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h         55
-rw-r--r--  apps/codecs/demac/libdemac/vector_math32_armv4.h      95
-rw-r--r--  apps/codecs/demac/libdemac/vector_math_generic.h      22
5 files changed, 171 insertions(+), 148 deletions(-)
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
index a999c0333a..826aaa3f80 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
@@ -117,21 +117,35 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
  * incorrect results (if ARM aligncheck is disabled). */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
-    int res = 0;
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
+
 #if ORDER > 16
-    int cnt = ORDER>>4;
+#define MLA_BLOCKS "3"
+#else
+#define MLA_BLOCKS "1"
 #endif
 
     asm volatile (
+#if ORDER > 32
+        "mov %[res], #0 \n"
+#endif
         "tst %[v2], #2 \n"
         "beq 20f \n"
 
         "10: \n"
         "ldrh r7, [%[v2]], #2 \n"
+#if ORDER > 32
         "mov r7, r7, lsl #16 \n"
         "1: \n"
         "ldmia %[v1]!, {r0-r3} \n"
         "smlabt %[res], r0, r7, %[res] \n"
+#else
+        "ldmia %[v1]!, {r0-r3} \n"
+        "smulbb %[res], r0, r7 \n"
+#endif
         "ldmia %[v2]!, {r4-r7} \n"
         "smlatb %[res], r0, r4, %[res] \n"
         "smlabt %[res], r1, r4, %[res] \n"
@@ -140,6 +154,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "smlatb %[res], r2, r6, %[res] \n"
         "smlabt %[res], r3, r6, %[res] \n"
         "smlatb %[res], r3, r7, %[res] \n"
+
+        ".rept " MLA_BLOCKS "\n"
         "ldmia %[v1]!, {r0-r3} \n"
         "smlabt %[res], r0, r7, %[res] \n"
         "ldmia %[v2]!, {r4-r7} \n"
@@ -150,7 +166,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "smlatb %[res], r2, r6, %[res] \n"
         "smlabt %[res], r3, r6, %[res] \n"
         "smlatb %[res], r3, r7, %[res] \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
@@ -160,7 +177,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "1: \n"
         "ldmia %[v1]!, {r0-r3} \n"
         "ldmia %[v2]!, {r4-r7} \n"
+#if ORDER > 32
         "smlabb %[res], r0, r4, %[res] \n"
+#else
+        "smulbb %[res], r0, r4 \n"
+#endif
         "smlatt %[res], r0, r4, %[res] \n"
         "smlabb %[res], r1, r5, %[res] \n"
         "smlatt %[res], r1, r5, %[res] \n"
@@ -168,6 +189,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "smlatt %[res], r2, r6, %[res] \n"
         "smlabb %[res], r3, r7, %[res] \n"
         "smlatt %[res], r3, r7, %[res] \n"
+
+        ".rept " MLA_BLOCKS "\n"
         "ldmia %[v1]!, {r0-r3} \n"
         "ldmia %[v2]!, {r4-r7} \n"
         "smlabb %[res], r0, r4, %[res] \n"
@@ -178,19 +201,20 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "smlatt %[res], r2, r6, %[res] \n"
         "smlabb %[res], r3, r7, %[res] \n"
         "smlatt %[res], r3, r7, %[res] \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
 
         "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
         [v2] "+r"(v2),
-        [res]"+r"(res)
+        [res]"=r"(res)
         : /* inputs */
         : /* clobbers */
         "r0", "r1", "r2", "r3",
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h
index 49fa2ceb7d..cd27b271af 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv6.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h
@@ -29,8 +29,14 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
  * incorrect results (if ARM aligncheck is disabled). */
 static inline void vector_add(int16_t* v1, int16_t* v2)
 {
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
+
 #if ORDER > 16
-    int cnt = ORDER>>4;
+#define ADD_SUB_BLOCKS "4"
+#else
+#define ADD_SUB_BLOCKS "2"
 #endif
 
     asm volatile (
@@ -42,6 +48,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "ldr r5, [%[v2]], #4 \n"
         "mov r4, r4, lsl #16 \n"
         "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r6-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "mov r5, r5, ror #16 \n"
@@ -56,21 +63,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "pkhbt r7, r7, r4, lsl #16 \n"
         "sadd16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r6-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "mov r5, r5, ror #16 \n"
-        "pkhtb r4, r5, r4, asr #16 \n"
-        "sadd16 r0, r0, r4 \n"
-        "pkhbt r5, r5, r6, lsl #16 \n"
-        "sadd16 r1, r1, r5 \n"
-        "ldmia %[v2]!, {r4-r5} \n"
-        "mov r7, r7, ror #16 \n"
-        "pkhtb r6, r7, r6, asr #16 \n"
-        "sadd16 r2, r2, r6 \n"
-        "pkhbt r7, r7, r4, lsl #16 \n"
-        "sadd16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
@@ -78,6 +72,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
 
         "20: \n"
         "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r4-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "sadd16 r0, r0, r4 \n"
@@ -85,21 +80,15 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "sadd16 r2, r2, r6 \n"
         "sadd16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r4-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "sadd16 r0, r0, r4 \n"
-        "sadd16 r1, r1, r5 \n"
-        "sadd16 r2, r2, r6 \n"
-        "sadd16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
 
         "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
@@ -116,8 +105,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
  * incorrect results (if ARM aligncheck is disabled). */
 static inline void vector_sub(int16_t* v1, int16_t* v2)
 {
-#if ORDER > 16
-    int cnt = ORDER>>4;
+#if ORDER > 32
+    int cnt = ORDER>>5;
 #endif
 
     asm volatile (
@@ -129,6 +118,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "ldr r5, [%[v2]], #4 \n"
         "mov r4, r4, lsl #16 \n"
         "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r6-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "mov r5, r5, ror #16 \n"
@@ -143,21 +133,8 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "pkhbt r7, r7, r4, lsl #16 \n"
         "ssub16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r6-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "mov r5, r5, ror #16 \n"
-        "pkhtb r4, r5, r4, asr #16 \n"
-        "ssub16 r0, r0, r4 \n"
-        "pkhbt r5, r5, r6, lsl #16 \n"
-        "ssub16 r1, r1, r5 \n"
-        "ldmia %[v2]!, {r4-r5} \n"
-        "mov r7, r7, ror #16 \n"
-        "pkhtb r6, r7, r6, asr #16 \n"
-        "ssub16 r2, r2, r6 \n"
-        "pkhbt r7, r7, r4, lsl #16 \n"
-        "ssub16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
@@ -165,6 +142,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
 
         "20: \n"
         "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r4-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "ssub16 r0, r0, r4 \n"
@@ -172,21 +150,15 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "ssub16 r2, r2, r6 \n"
         "ssub16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r4-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "ssub16 r0, r0, r4 \n"
-        "ssub16 r1, r1, r5 \n"
-        "ssub16 r2, r2, r6 \n"
-        "ssub16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
 
         "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
@@ -203,12 +175,21 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
  * incorrect results (if ARM aligncheck is disabled). */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
-    int res = 0;
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
+
 #if ORDER > 16
-    int cnt = ORDER>>4;
+#define MLA_BLOCKS "3"
+#else
+#define MLA_BLOCKS "1"
 #endif
 
     asm volatile (
+#if ORDER > 32
+        "mov %[res], #0 \n"
+#endif
         "tst %[v2], #2 \n"
         "beq 20f \n"
 
@@ -216,11 +197,18 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "ldrh r7, [%[v2]], #2 \n"
         "ldmia %[v2]!, {r4-r5} \n"
         "ldmia %[v1]!, {r0-r1} \n"
+#if ORDER > 32
         "mov r7, r7, lsl #16 \n"
         "1: \n"
         "pkhbt r8, r4, r7 \n"
         "ldmia %[v2]!, {r6-r7} \n"
         "smladx %[res], r0, r8, %[res] \n"
+#else
+        "pkhbt r8, r4, r7, lsl #16 \n"
+        "ldmia %[v2]!, {r6-r7} \n"
+        "smuadx %[res], r0, r8 \n"
+#endif
+        ".rept " MLA_BLOCKS "\n"
         "pkhbt r8, r5, r4 \n"
         "ldmia %[v1]!, {r2-r3} \n"
         "smladx %[res], r1, r8, %[res] \n"
@@ -233,11 +221,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "pkhbt r8, r4, r7 \n"
         "ldmia %[v2]!, {r6-r7} \n"
         "smladx %[res], r0, r8, %[res] \n"
+        ".endr \n"
+
         "pkhbt r8, r5, r4 \n"
         "ldmia %[v1]!, {r2-r3} \n"
         "smladx %[res], r1, r8, %[res] \n"
         "pkhbt r8, r6, r5 \n"
-#if ORDER > 16
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "ldmneia %[v2]!, {r4-r5} \n"
         "smladx %[res], r2, r8, %[res] \n"
@@ -257,7 +247,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "ldmia %[v2]!, {r5-r7} \n"
         "1: \n"
         "ldmia %[v1]!, {r2-r3} \n"
+#if ORDER > 32
         "smlad %[res], r0, r5, %[res] \n"
+#else
+        "smuad %[res], r0, r5 \n"
+#endif
+        ".rept " MLA_BLOCKS "\n"
         "ldmia %[v2]!, {r4-r5} \n"
         "smlad %[res], r1, r6, %[res] \n"
         "ldmia %[v1]!, {r0-r1} \n"
@@ -266,9 +261,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "smlad %[res], r3, r4, %[res] \n"
         "ldmia %[v1]!, {r2-r3} \n"
         "smlad %[res], r0, r5, %[res] \n"
+        ".endr \n"
+
         "ldmia %[v2]!, {r4-r5} \n"
         "smlad %[res], r1, r6, %[res] \n"
-#if ORDER > 16
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "ldmneia %[v1]!, {r0-r1} \n"
         "smlad %[res], r2, r7, %[res] \n"
@@ -282,12 +279,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 
         "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
         [v2] "+r"(v2),
-        [res]"+r"(res)
+        [res]"=r"(res)
         : /* inputs */
         : /* clobbers */
         "r0", "r1", "r2", "r3", "r4",
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 0c3aaca223..11e7f07adf 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -67,7 +67,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "move.l %%d3, (%[v1])+ \n"
         "lea.l (16, %[v2]), %[v2] \n"
         "move.l %%d4, %%d0 \n"
-        
+
         "movem.l (%[v1]), %%a0-%%a3 \n"
         "movem.l (%[v2]), %%d1-%%d4 \n"
         ADDHALFXREGS(%%a0, %%d1, %%d0)
@@ -175,7 +175,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "move.l %%d3, (%[v1])+ \n"
         "lea.l (16, %[v2]), %[v2] \n"
         "move.l %%d4, %%d0 \n"
-        
+
         "movem.l (%[v2]), %%d1-%%d4 \n"
         "movem.l (%[v1]), %%a0-%%a3 \n"
         SUBHALFXREGS(%%a0, %%d1, %%d0)
@@ -207,7 +207,6 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "move.l %%d2, (%[v1])+ \n"
         SUBHALFREGS(%%a3, %%d4, %%d3)
         "move.l %%d3, (%[v1])+ \n"
-
         "lea.l (16, %[v2]), %[v2] \n"
 
         "movem.l (%[v2]), %%d1-%%d4 \n"
@@ -248,22 +247,16 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
  * in signed integer mode - call above macro before use. */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
-    int res = 0;
+    int res;
 #if ORDER > 32
     int cnt = ORDER>>5;
 #endif
 
-#define MACBLOCK4 \
-        "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \
-        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
-        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
-        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-
-#define MACBLOCK4_U2 \
-        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
-        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \
-        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \
-        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+#if ORDER > 16
+#define MAC_BLOCKS "7"
+#else
+#define MAC_BLOCKS "3"
+#endif
 
     asm volatile (
         "move.l %[v2], %%d0 \n"
@@ -274,15 +267,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "move.l (%[v1])+, %%d0 \n"
         "move.w (%[v2])+, %%d1 \n"
         "1: \n"
-#if ORDER > 16
-        MACBLOCK4_U2
-        MACBLOCK4_U2
-        MACBLOCK4_U2
-        MACBLOCK4_U2
-#endif
-        MACBLOCK4_U2
-        MACBLOCK4_U2
-        MACBLOCK4_U2
+        ".rept " MAC_BLOCKS "\n"
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        ".endr \n"
+
         "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
         "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
@@ -299,15 +290,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "move.l (%[v1])+, %%d0 \n"
         "move.l (%[v2])+, %%d1 \n"
         "1: \n"
-#if ORDER > 16
-        MACBLOCK4
-        MACBLOCK4
-        MACBLOCK4
-        MACBLOCK4
-#endif
-        MACBLOCK4
-        MACBLOCK4
-        MACBLOCK4
+        ".rept " MAC_BLOCKS "\n"
+        "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
+        "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
+        ".endr \n"
+
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
         "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 #if ORDER > 32
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
index b729bd3a0a..89b24f2b06 100644
--- a/apps/codecs/demac/libdemac/vector_math32_armv4.h
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2)
     int cnt = ORDER>>5;
 #endif
 
-#define ADDBLOCK4 \
-        "ldmia %[v1], {r0-r3} \n" \
-        "ldmia %[v2]!, {r4-r7} \n" \
-        "add r0, r0, r4 \n" \
-        "add r1, r1, r5 \n" \
-        "add r2, r2, r6 \n" \
-        "add r3, r3, r7 \n" \
-        "stmia %[v1]!, {r0-r3} \n"
+#if ORDER > 16
+#define ADD_SUB_BLOCKS "8"
+#else
+#define ADD_SUB_BLOCKS "4"
+#endif
 
     asm volatile (
         "1: \n"
-        ADDBLOCK4
-        ADDBLOCK4
-        ADDBLOCK4
-        ADDBLOCK4
-#if ORDER > 16
-        ADDBLOCK4
-        ADDBLOCK4
-        ADDBLOCK4
-        ADDBLOCK4
-#endif
+        ".rept " ADD_SUB_BLOCKS "\n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        "add r0, r0, r4 \n"
+        "add r1, r1, r5 \n"
+        "add r2, r2, r6 \n"
+        "add r3, r3, r7 \n"
+        "stmia %[v1]!, {r0-r3} \n"
+        ".endr \n"
 #if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
@@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
     int cnt = ORDER>>5;
 #endif
 
-#define SUBBLOCK4 \
-        "ldmia %[v1], {r0-r3} \n" \
-        "ldmia %[v2]!, {r4-r7} \n" \
-        "sub r0, r0, r4 \n" \
-        "sub r1, r1, r5 \n" \
-        "sub r2, r2, r6 \n" \
-        "sub r3, r3, r7 \n" \
-        "stmia %[v1]!, {r0-r3} \n"
-
     asm volatile (
         "1: \n"
-        SUBBLOCK4
-        SUBBLOCK4
-        SUBBLOCK4
-        SUBBLOCK4
-#if ORDER > 16
-        SUBBLOCK4
-        SUBBLOCK4
-        SUBBLOCK4
-        SUBBLOCK4
-#endif
+        ".rept " ADD_SUB_BLOCKS "\n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        "sub r0, r0, r4 \n"
+        "sub r1, r1, r5 \n"
+        "sub r2, r2, r6 \n"
+        "sub r3, r3, r7 \n"
+        "stmia %[v1]!, {r0-r3} \n"
+        ".endr \n"
 #if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
@@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2)
 
 static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 {
-    int res = 0;
+    int res;
 #if ORDER > 32
     int cnt = ORDER>>5;
 #endif
 
     asm volatile (
 #if ORDER > 16
+#if ORDER > 32
+        "mov %[res], #0 \n"
+#endif
         "ldmia %[v2]!, {r6-r7} \n"
         "1: \n"
         "ldmia %[v1]!, {r0,r1,r3-r5} \n"
+#if ORDER > 32
         "mla %[res], r6, r0, %[res] \n"
+#else
+        "mul %[res], r6, r0 \n"
+#endif
         "mla %[res], r7, r1, %[res] \n"
         "ldmia %[v2]!, {r0-r2,r6-r8} \n"
         "mla %[res], r0, r3, %[res] \n"
@@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 #endif
 
 #else /* ORDER <= 16 */
-
-#define MLABLOCK4 \
-        "ldmia %[v1]!, {r0-r3} \n" \
-        "ldmia %[v2]!, {r4-r7} \n" \
-        "mla %[res], r4, r0, %[res] \n" \
-        "mla %[res], r5, r1, %[res] \n" \
-        "mla %[res], r6, r2, %[res] \n" \
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        "mul %[res], r4, r0 \n"
+        "mla %[res], r5, r1, %[res] \n"
+        "mla %[res], r6, r2, %[res] \n"
         "mla %[res], r7, r3, %[res] \n"
 
-        MLABLOCK4
-        MLABLOCK4
-        MLABLOCK4
-        MLABLOCK4
+        ".rept 3 \n"
+        "ldmia %[v1]!, {r0-r3} \n"
+        "ldmia %[v2]!, {r4-r7} \n"
+        "mla %[res], r4, r0, %[res] \n"
+        "mla %[res], r5, r1, %[res] \n"
+        "mla %[res], r6, r2, %[res] \n"
+        "mla %[res], r7, r3, %[res] \n"
+        ".endr \n"
 #endif /* ORDER <= 16 */
         : /* outputs */
 #if ORDER > 32
@@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 #endif
         [v1] "+r"(v1),
         [v2] "+r"(v2),
-        [res]"+r"(res)
+        [res]"=r"(res)
         : /* inputs */
         : /* clobbers */
         "r0", "r1", "r2", "r3",
diff --git a/apps/codecs/demac/libdemac/vector_math_generic.h b/apps/codecs/demac/libdemac/vector_math_generic.h
index 7b61db77be..00bf07a007 100644
--- a/apps/codecs/demac/libdemac/vector_math_generic.h
+++ b/apps/codecs/demac/libdemac/vector_math_generic.h
@@ -116,8 +116,8 @@ static inline int32_t scalarproduct(filter_int* v1, filter_int* v2)
 {
     int res = 0;
 
-#if ORDER > 16
-    int order = (ORDER >> 4);
+#if ORDER > 32
+    int order = (ORDER >> 5);
     while (order--)
 #endif
     {
@@ -137,6 +137,24 @@ static inline int32_t scalarproduct(filter_int* v1, filter_int* v2)
         res += *v1++ * *v2++;
         res += *v1++ * *v2++;
         res += *v1++ * *v2++;
+#if ORDER > 16
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+        res += *v1++ * *v2++;
+#endif
     }
     return res;
 }