Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math16_armv6.h')
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_armv6.h  111
1 file changed, 54 insertions(+), 57 deletions(-)
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h
index 49fa2ceb7d..cd27b271af 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv6.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h
@@ -29,8 +29,14 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
  * incorrect results (if ARM aligncheck is disabled). */
 static inline void vector_add(int16_t* v1, int16_t* v2)
 {
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
+
 #if ORDER > 16
-    int cnt = ORDER>>4;
+#define ADD_SUB_BLOCKS "4"
+#else
+#define ADD_SUB_BLOCKS "2"
 #endif
 
     asm volatile (
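The repeat count reaches the assembler through plain C string pasting: ADD_SUB_BLOCKS is defined as a string literal, so the preprocessor splices it into the asm template, and the GNU assembler's .rept/.endr directives then duplicate the enclosed block at assembly time. A minimal sketch of the same trick, using a hypothetical add_four() that is not part of this file:

    #define BLOCKS "4"   /* string literal, pasted into the asm template */

    static inline int add_four(int val)
    {
        asm (
            ".rept " BLOCKS "\n"       /* the assembler sees ".rept 4" */
            "add %[x], %[x], #1 \n"    /* this block is emitted 4 times */
            ".endr \n"
            : [x] "+r" (val)
        );
        return val;
    }

This way the unroll factor can vary per build (here with ORDER) without maintaining two hand-copied instruction sequences.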
@@ -42,6 +48,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "ldr r5, [%[v2]], #4 \n"
         "mov r4, r4, lsl #16 \n"
     "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r6-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "mov r5, r5, ror #16 \n"
@@ -56,21 +63,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "pkhbt r7, r7, r4, lsl #16 \n"
         "sadd16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r6-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "mov r5, r5, ror #16 \n"
-        "pkhtb r4, r5, r4, asr #16 \n"
-        "sadd16 r0, r0, r4 \n"
-        "pkhbt r5, r5, r6, lsl #16 \n"
-        "sadd16 r1, r1, r5 \n"
-        "ldmia %[v2]!, {r4-r5} \n"
-        "mov r7, r7, ror #16 \n"
-        "pkhtb r6, r7, r6, asr #16 \n"
-        "sadd16 r2, r2, r6 \n"
-        "pkhbt r7, r7, r4, lsl #16 \n"
-        "sadd16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
@@ -78,6 +72,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
 
     "20: \n"
     "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r4-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "sadd16 r0, r0, r4 \n"
@@ -85,21 +80,15 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
         "sadd16 r2, r2, r6 \n"
         "sadd16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r4-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "sadd16 r0, r0, r4 \n"
-        "sadd16 r1, r1, r5 \n"
-        "sadd16 r2, r2, r6 \n"
-        "sadd16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
 
     "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
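Modulo scheduling and the unaligned-entry fixup, vector_add() computes a plain elementwise sum: SADD16 adds the two 16-bit halves of each word with wraparound (it sets the GE flags but, unlike QADD16, does not saturate). A C sketch of the intended result, using a hypothetical vector_add_ref() and assuming, as the file does, that ORDER is a multiple of 16:

    static inline void vector_add_ref(int16_t* v1, int16_t* v2)
    {
        int i;
        for (i = 0; i < ORDER; i++)
            v1[i] += v2[i];    /* wraps modulo 2^16, like SADD16 */
    }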
@@ -116,8 +105,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
  * incorrect results (if ARM aligncheck is disabled). */
 static inline void vector_sub(int16_t* v1, int16_t* v2)
 {
-#if ORDER > 16
-    int cnt = ORDER>>4;
+#if ORDER > 32
+    int cnt = ORDER>>5;
 #endif
 
     asm volatile (
@@ -129,6 +118,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "ldr r5, [%[v2]], #4 \n"
         "mov r4, r4, lsl #16 \n"
     "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r6-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "mov r5, r5, ror #16 \n"
@@ -143,21 +133,8 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "pkhbt r7, r7, r4, lsl #16 \n"
         "ssub16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r6-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "mov r5, r5, ror #16 \n"
-        "pkhtb r4, r5, r4, asr #16 \n"
-        "ssub16 r0, r0, r4 \n"
-        "pkhbt r5, r5, r6, lsl #16 \n"
-        "ssub16 r1, r1, r5 \n"
-        "ldmia %[v2]!, {r4-r5} \n"
-        "mov r7, r7, ror #16 \n"
-        "pkhtb r6, r7, r6, asr #16 \n"
-        "ssub16 r2, r2, r6 \n"
-        "pkhbt r7, r7, r4, lsl #16 \n"
-        "ssub16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
@@ -165,6 +142,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
 
     "20: \n"
     "1: \n"
+        ".rept " ADD_SUB_BLOCKS "\n"
         "ldmia %[v2]!, {r4-r7} \n"
         "ldmia %[v1], {r0-r3} \n"
         "ssub16 r0, r0, r4 \n"
@@ -172,21 +150,15 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
         "ssub16 r2, r2, r6 \n"
         "ssub16 r3, r3, r7 \n"
         "stmia %[v1]!, {r0-r3} \n"
-        "ldmia %[v2]!, {r4-r7} \n"
-        "ldmia %[v1], {r0-r3} \n"
-        "ssub16 r0, r0, r4 \n"
-        "ssub16 r1, r1, r5 \n"
-        "ssub16 r2, r2, r6 \n"
-        "ssub16 r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-#if ORDER > 16
+        ".endr \n"
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
 
     "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
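vector_sub() gets the identical restructuring with SSUB16 in place of SADD16. The guard change from ORDER > 16 to ORDER > 32 follows from the new unroll factor: one pass of the .rept body now consumes 4 blocks of 8 samples, 32 in total, so the counter (cnt = ORDER>>5) is only needed when more than one pass is required. For reference, a C sketch of the subtraction itself, with a hypothetical vector_sub_ref():

    static inline void vector_sub_ref(int16_t* v1, int16_t* v2)
    {
        int i;
        for (i = 0; i < ORDER; i++)    /* one asm pass covers 32 samples */
            v1[i] -= v2[i];
    }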
@@ -203,12 +175,21 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
  * incorrect results (if ARM aligncheck is disabled). */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
-    int res = 0;
+    int res;
+#if ORDER > 32
+    int cnt = ORDER>>5;
+#endif
+
 #if ORDER > 16
-    int cnt = ORDER>>4;
+#define MLA_BLOCKS "3"
+#else
+#define MLA_BLOCKS "1"
 #endif
 
     asm volatile (
+#if ORDER > 32
+        "mov %[res], #0 \n"
+#endif
         "tst %[v2], #2 \n"
         "beq 20f \n"
 
@@ -216,11 +197,18 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "ldrh r7, [%[v2]], #2 \n"
         "ldmia %[v2]!, {r4-r5} \n"
         "ldmia %[v1]!, {r0-r1} \n"
+#if ORDER > 32
         "mov r7, r7, lsl #16 \n"
     "1: \n"
         "pkhbt r8, r4, r7 \n"
         "ldmia %[v2]!, {r6-r7} \n"
         "smladx %[res], r0, r8, %[res] \n"
+#else
+        "pkhbt r8, r4, r7, lsl #16 \n"
+        "ldmia %[v2]!, {r6-r7} \n"
+        "smuadx %[res], r0, r8 \n"
+#endif
+        ".rept " MLA_BLOCKS "\n"
         "pkhbt r8, r5, r4 \n"
         "ldmia %[v1]!, {r2-r3} \n"
         "smladx %[res], r1, r8, %[res] \n"
@@ -233,11 +221,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "pkhbt r8, r4, r7 \n"
         "ldmia %[v2]!, {r6-r7} \n"
         "smladx %[res], r0, r8, %[res] \n"
+        ".endr \n"
+
         "pkhbt r8, r5, r4 \n"
         "ldmia %[v1]!, {r2-r3} \n"
         "smladx %[res], r1, r8, %[res] \n"
         "pkhbt r8, r6, r5 \n"
-#if ORDER > 16
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "ldmneia %[v2]!, {r4-r5} \n"
         "smladx %[res], r2, r8, %[res] \n"
@@ -257,7 +247,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "ldmia %[v2]!, {r5-r7} \n"
     "1: \n"
         "ldmia %[v1]!, {r2-r3} \n"
+#if ORDER > 32
         "smlad %[res], r0, r5, %[res] \n"
+#else
+        "smuad %[res], r0, r5 \n"
+#endif
+        ".rept " MLA_BLOCKS "\n"
         "ldmia %[v2]!, {r4-r5} \n"
         "smlad %[res], r1, r6, %[res] \n"
         "ldmia %[v1]!, {r0-r1} \n"
@@ -266,9 +261,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "smlad %[res], r3, r4, %[res] \n"
         "ldmia %[v1]!, {r2-r3} \n"
         "smlad %[res], r0, r5, %[res] \n"
+        ".endr \n"
+
         "ldmia %[v2]!, {r4-r5} \n"
         "smlad %[res], r1, r6, %[res] \n"
-#if ORDER > 16
+#if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "ldmneia %[v1]!, {r0-r1} \n"
         "smlad %[res], r2, r7, %[res] \n"
@@ -282,12 +279,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 
     "99: \n"
         : /* outputs */
-#if ORDER > 16
+#if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
         [v2] "+r"(v2),
-        [res]"+r"(res)
+        [res]"=r"(res)
         : /* inputs */
         : /* clobbers */
         "r0", "r1", "r2", "r3", "r4",