diff options
Diffstat (limited to 'apps/codecs/demac')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv5te.h | 36 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv6.h | 111 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_cf.h | 55 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math32_armv4.h | 95 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math_generic.h | 22 |
5 files changed, 171 insertions, 148 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h index a999c0333a..826aaa3f80 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h | |||
@@ -117,21 +117,35 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
117 | * incorrect results (if ARM aligncheck is disabled). */ | 117 | * incorrect results (if ARM aligncheck is disabled). */ |
118 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 118 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
119 | { | 119 | { |
120 | int res = 0; | 120 | int res; |
121 | #if ORDER > 32 | ||
122 | int cnt = ORDER>>5; | ||
123 | #endif | ||
124 | |||
121 | #if ORDER > 16 | 125 | #if ORDER > 16 |
122 | int cnt = ORDER>>4; | 126 | #define MLA_BLOCKS "3" |
127 | #else | ||
128 | #define MLA_BLOCKS "1" | ||
123 | #endif | 129 | #endif |
124 | 130 | ||
125 | asm volatile ( | 131 | asm volatile ( |
132 | #if ORDER > 32 | ||
133 | "mov %[res], #0 \n" | ||
134 | #endif | ||
126 | "tst %[v2], #2 \n" | 135 | "tst %[v2], #2 \n" |
127 | "beq 20f \n" | 136 | "beq 20f \n" |
128 | 137 | ||
129 | "10: \n" | 138 | "10: \n" |
130 | "ldrh r7, [%[v2]], #2 \n" | 139 | "ldrh r7, [%[v2]], #2 \n" |
140 | #if ORDER > 32 | ||
131 | "mov r7, r7, lsl #16 \n" | 141 | "mov r7, r7, lsl #16 \n" |
132 | "1: \n" | 142 | "1: \n" |
133 | "ldmia %[v1]!, {r0-r3} \n" | 143 | "ldmia %[v1]!, {r0-r3} \n" |
134 | "smlabt %[res], r0, r7, %[res] \n" | 144 | "smlabt %[res], r0, r7, %[res] \n" |
145 | #else | ||
146 | "ldmia %[v1]!, {r0-r3} \n" | ||
147 | "smulbb %[res], r0, r7 \n" | ||
148 | #endif | ||
135 | "ldmia %[v2]!, {r4-r7} \n" | 149 | "ldmia %[v2]!, {r4-r7} \n" |
136 | "smlatb %[res], r0, r4, %[res] \n" | 150 | "smlatb %[res], r0, r4, %[res] \n" |
137 | "smlabt %[res], r1, r4, %[res] \n" | 151 | "smlabt %[res], r1, r4, %[res] \n" |
@@ -140,6 +154,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
140 | "smlatb %[res], r2, r6, %[res] \n" | 154 | "smlatb %[res], r2, r6, %[res] \n" |
141 | "smlabt %[res], r3, r6, %[res] \n" | 155 | "smlabt %[res], r3, r6, %[res] \n" |
142 | "smlatb %[res], r3, r7, %[res] \n" | 156 | "smlatb %[res], r3, r7, %[res] \n" |
157 | |||
158 | ".rept " MLA_BLOCKS "\n" | ||
143 | "ldmia %[v1]!, {r0-r3} \n" | 159 | "ldmia %[v1]!, {r0-r3} \n" |
144 | "smlabt %[res], r0, r7, %[res] \n" | 160 | "smlabt %[res], r0, r7, %[res] \n" |
145 | "ldmia %[v2]!, {r4-r7} \n" | 161 | "ldmia %[v2]!, {r4-r7} \n" |
@@ -150,7 +166,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
150 | "smlatb %[res], r2, r6, %[res] \n" | 166 | "smlatb %[res], r2, r6, %[res] \n" |
151 | "smlabt %[res], r3, r6, %[res] \n" | 167 | "smlabt %[res], r3, r6, %[res] \n" |
152 | "smlatb %[res], r3, r7, %[res] \n" | 168 | "smlatb %[res], r3, r7, %[res] \n" |
153 | #if ORDER > 16 | 169 | ".endr \n" |
170 | #if ORDER > 32 | ||
154 | "subs %[cnt], %[cnt], #1 \n" | 171 | "subs %[cnt], %[cnt], #1 \n" |
155 | "bne 1b \n" | 172 | "bne 1b \n" |
156 | #endif | 173 | #endif |
@@ -160,7 +177,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
160 | "1: \n" | 177 | "1: \n" |
161 | "ldmia %[v1]!, {r0-r3} \n" | 178 | "ldmia %[v1]!, {r0-r3} \n" |
162 | "ldmia %[v2]!, {r4-r7} \n" | 179 | "ldmia %[v2]!, {r4-r7} \n" |
180 | #if ORDER > 32 | ||
163 | "smlabb %[res], r0, r4, %[res] \n" | 181 | "smlabb %[res], r0, r4, %[res] \n" |
182 | #else | ||
183 | "smulbb %[res], r0, r4 \n" | ||
184 | #endif | ||
164 | "smlatt %[res], r0, r4, %[res] \n" | 185 | "smlatt %[res], r0, r4, %[res] \n" |
165 | "smlabb %[res], r1, r5, %[res] \n" | 186 | "smlabb %[res], r1, r5, %[res] \n" |
166 | "smlatt %[res], r1, r5, %[res] \n" | 187 | "smlatt %[res], r1, r5, %[res] \n" |
@@ -168,6 +189,8 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
168 | "smlatt %[res], r2, r6, %[res] \n" | 189 | "smlatt %[res], r2, r6, %[res] \n" |
169 | "smlabb %[res], r3, r7, %[res] \n" | 190 | "smlabb %[res], r3, r7, %[res] \n" |
170 | "smlatt %[res], r3, r7, %[res] \n" | 191 | "smlatt %[res], r3, r7, %[res] \n" |
192 | |||
193 | ".rept " MLA_BLOCKS "\n" | ||
171 | "ldmia %[v1]!, {r0-r3} \n" | 194 | "ldmia %[v1]!, {r0-r3} \n" |
172 | "ldmia %[v2]!, {r4-r7} \n" | 195 | "ldmia %[v2]!, {r4-r7} \n" |
173 | "smlabb %[res], r0, r4, %[res] \n" | 196 | "smlabb %[res], r0, r4, %[res] \n" |
@@ -178,19 +201,20 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
178 | "smlatt %[res], r2, r6, %[res] \n" | 201 | "smlatt %[res], r2, r6, %[res] \n" |
179 | "smlabb %[res], r3, r7, %[res] \n" | 202 | "smlabb %[res], r3, r7, %[res] \n" |
180 | "smlatt %[res], r3, r7, %[res] \n" | 203 | "smlatt %[res], r3, r7, %[res] \n" |
181 | #if ORDER > 16 | 204 | ".endr \n" |
205 | #if ORDER > 32 | ||
182 | "subs %[cnt], %[cnt], #1 \n" | 206 | "subs %[cnt], %[cnt], #1 \n" |
183 | "bne 1b \n" | 207 | "bne 1b \n" |
184 | #endif | 208 | #endif |
185 | 209 | ||
186 | "99: \n" | 210 | "99: \n" |
187 | : /* outputs */ | 211 | : /* outputs */ |
188 | #if ORDER > 16 | 212 | #if ORDER > 32 |
189 | [cnt]"+r"(cnt), | 213 | [cnt]"+r"(cnt), |
190 | #endif | 214 | #endif |
191 | [v1] "+r"(v1), | 215 | [v1] "+r"(v1), |
192 | [v2] "+r"(v2), | 216 | [v2] "+r"(v2), |
193 | [res]"+r"(res) | 217 | [res]"=r"(res) |
194 | : /* inputs */ | 218 | : /* inputs */ |
195 | : /* clobbers */ | 219 | : /* clobbers */ |
196 | "r0", "r1", "r2", "r3", | 220 | "r0", "r1", "r2", "r3", |
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv6.h b/apps/codecs/demac/libdemac/vector_math16_armv6.h index 49fa2ceb7d..cd27b271af 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv6.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv6.h | |||
@@ -29,8 +29,14 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
29 | * incorrect results (if ARM aligncheck is disabled). */ | 29 | * incorrect results (if ARM aligncheck is disabled). */ |
30 | static inline void vector_add(int16_t* v1, int16_t* v2) | 30 | static inline void vector_add(int16_t* v1, int16_t* v2) |
31 | { | 31 | { |
32 | #if ORDER > 32 | ||
33 | int cnt = ORDER>>5; | ||
34 | #endif | ||
35 | |||
32 | #if ORDER > 16 | 36 | #if ORDER > 16 |
33 | int cnt = ORDER>>4; | 37 | #define ADD_SUB_BLOCKS "4" |
38 | #else | ||
39 | #define ADD_SUB_BLOCKS "2" | ||
34 | #endif | 40 | #endif |
35 | 41 | ||
36 | asm volatile ( | 42 | asm volatile ( |
@@ -42,6 +48,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
42 | "ldr r5, [%[v2]], #4 \n" | 48 | "ldr r5, [%[v2]], #4 \n" |
43 | "mov r4, r4, lsl #16 \n" | 49 | "mov r4, r4, lsl #16 \n" |
44 | "1: \n" | 50 | "1: \n" |
51 | ".rept " ADD_SUB_BLOCKS "\n" | ||
45 | "ldmia %[v2]!, {r6-r7} \n" | 52 | "ldmia %[v2]!, {r6-r7} \n" |
46 | "ldmia %[v1], {r0-r3} \n" | 53 | "ldmia %[v1], {r0-r3} \n" |
47 | "mov r5, r5, ror #16 \n" | 54 | "mov r5, r5, ror #16 \n" |
@@ -56,21 +63,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
56 | "pkhbt r7, r7, r4, lsl #16 \n" | 63 | "pkhbt r7, r7, r4, lsl #16 \n" |
57 | "sadd16 r3, r3, r7 \n" | 64 | "sadd16 r3, r3, r7 \n" |
58 | "stmia %[v1]!, {r0-r3} \n" | 65 | "stmia %[v1]!, {r0-r3} \n" |
59 | "ldmia %[v2]!, {r6-r7} \n" | 66 | ".endr \n" |
60 | "ldmia %[v1], {r0-r3} \n" | 67 | #if ORDER > 32 |
61 | "mov r5, r5, ror #16 \n" | ||
62 | "pkhtb r4, r5, r4, asr #16 \n" | ||
63 | "sadd16 r0, r0, r4 \n" | ||
64 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
65 | "sadd16 r1, r1, r5 \n" | ||
66 | "ldmia %[v2]!, {r4-r5} \n" | ||
67 | "mov r7, r7, ror #16 \n" | ||
68 | "pkhtb r6, r7, r6, asr #16 \n" | ||
69 | "sadd16 r2, r2, r6 \n" | ||
70 | "pkhbt r7, r7, r4, lsl #16 \n" | ||
71 | "sadd16 r3, r3, r7 \n" | ||
72 | "stmia %[v1]!, {r0-r3} \n" | ||
73 | #if ORDER > 16 | ||
74 | "subs %[cnt], %[cnt], #1 \n" | 68 | "subs %[cnt], %[cnt], #1 \n" |
75 | "bne 1b \n" | 69 | "bne 1b \n" |
76 | #endif | 70 | #endif |
@@ -78,6 +72,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
78 | 72 | ||
79 | "20: \n" | 73 | "20: \n" |
80 | "1: \n" | 74 | "1: \n" |
75 | ".rept " ADD_SUB_BLOCKS "\n" | ||
81 | "ldmia %[v2]!, {r4-r7} \n" | 76 | "ldmia %[v2]!, {r4-r7} \n" |
82 | "ldmia %[v1], {r0-r3} \n" | 77 | "ldmia %[v1], {r0-r3} \n" |
83 | "sadd16 r0, r0, r4 \n" | 78 | "sadd16 r0, r0, r4 \n" |
@@ -85,21 +80,15 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
85 | "sadd16 r2, r2, r6 \n" | 80 | "sadd16 r2, r2, r6 \n" |
86 | "sadd16 r3, r3, r7 \n" | 81 | "sadd16 r3, r3, r7 \n" |
87 | "stmia %[v1]!, {r0-r3} \n" | 82 | "stmia %[v1]!, {r0-r3} \n" |
88 | "ldmia %[v2]!, {r4-r7} \n" | 83 | ".endr \n" |
89 | "ldmia %[v1], {r0-r3} \n" | 84 | #if ORDER > 32 |
90 | "sadd16 r0, r0, r4 \n" | ||
91 | "sadd16 r1, r1, r5 \n" | ||
92 | "sadd16 r2, r2, r6 \n" | ||
93 | "sadd16 r3, r3, r7 \n" | ||
94 | "stmia %[v1]!, {r0-r3} \n" | ||
95 | #if ORDER > 16 | ||
96 | "subs %[cnt], %[cnt], #1 \n" | 85 | "subs %[cnt], %[cnt], #1 \n" |
97 | "bne 1b \n" | 86 | "bne 1b \n" |
98 | #endif | 87 | #endif |
99 | 88 | ||
100 | "99: \n" | 89 | "99: \n" |
101 | : /* outputs */ | 90 | : /* outputs */ |
102 | #if ORDER > 16 | 91 | #if ORDER > 32 |
103 | [cnt]"+r"(cnt), | 92 | [cnt]"+r"(cnt), |
104 | #endif | 93 | #endif |
105 | [v1] "+r"(v1), | 94 | [v1] "+r"(v1), |
@@ -116,8 +105,8 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
116 | * incorrect results (if ARM aligncheck is disabled). */ | 105 | * incorrect results (if ARM aligncheck is disabled). */ |
117 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 106 | static inline void vector_sub(int16_t* v1, int16_t* v2) |
118 | { | 107 | { |
119 | #if ORDER > 16 | 108 | #if ORDER > 32 |
120 | int cnt = ORDER>>4; | 109 | int cnt = ORDER>>5; |
121 | #endif | 110 | #endif |
122 | 111 | ||
123 | asm volatile ( | 112 | asm volatile ( |
@@ -129,6 +118,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
129 | "ldr r5, [%[v2]], #4 \n" | 118 | "ldr r5, [%[v2]], #4 \n" |
130 | "mov r4, r4, lsl #16 \n" | 119 | "mov r4, r4, lsl #16 \n" |
131 | "1: \n" | 120 | "1: \n" |
121 | ".rept " ADD_SUB_BLOCKS "\n" | ||
132 | "ldmia %[v2]!, {r6-r7} \n" | 122 | "ldmia %[v2]!, {r6-r7} \n" |
133 | "ldmia %[v1], {r0-r3} \n" | 123 | "ldmia %[v1], {r0-r3} \n" |
134 | "mov r5, r5, ror #16 \n" | 124 | "mov r5, r5, ror #16 \n" |
@@ -143,21 +133,8 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
143 | "pkhbt r7, r7, r4, lsl #16 \n" | 133 | "pkhbt r7, r7, r4, lsl #16 \n" |
144 | "ssub16 r3, r3, r7 \n" | 134 | "ssub16 r3, r3, r7 \n" |
145 | "stmia %[v1]!, {r0-r3} \n" | 135 | "stmia %[v1]!, {r0-r3} \n" |
146 | "ldmia %[v2]!, {r6-r7} \n" | 136 | ".endr \n" |
147 | "ldmia %[v1], {r0-r3} \n" | 137 | #if ORDER > 32 |
148 | "mov r5, r5, ror #16 \n" | ||
149 | "pkhtb r4, r5, r4, asr #16 \n" | ||
150 | "ssub16 r0, r0, r4 \n" | ||
151 | "pkhbt r5, r5, r6, lsl #16 \n" | ||
152 | "ssub16 r1, r1, r5 \n" | ||
153 | "ldmia %[v2]!, {r4-r5} \n" | ||
154 | "mov r7, r7, ror #16 \n" | ||
155 | "pkhtb r6, r7, r6, asr #16 \n" | ||
156 | "ssub16 r2, r2, r6 \n" | ||
157 | "pkhbt r7, r7, r4, lsl #16 \n" | ||
158 | "ssub16 r3, r3, r7 \n" | ||
159 | "stmia %[v1]!, {r0-r3} \n" | ||
160 | #if ORDER > 16 | ||
161 | "subs %[cnt], %[cnt], #1 \n" | 138 | "subs %[cnt], %[cnt], #1 \n" |
162 | "bne 1b \n" | 139 | "bne 1b \n" |
163 | #endif | 140 | #endif |
@@ -165,6 +142,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
165 | 142 | ||
166 | "20: \n" | 143 | "20: \n" |
167 | "1: \n" | 144 | "1: \n" |
145 | ".rept " ADD_SUB_BLOCKS "\n" | ||
168 | "ldmia %[v2]!, {r4-r7} \n" | 146 | "ldmia %[v2]!, {r4-r7} \n" |
169 | "ldmia %[v1], {r0-r3} \n" | 147 | "ldmia %[v1], {r0-r3} \n" |
170 | "ssub16 r0, r0, r4 \n" | 148 | "ssub16 r0, r0, r4 \n" |
@@ -172,21 +150,15 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
172 | "ssub16 r2, r2, r6 \n" | 150 | "ssub16 r2, r2, r6 \n" |
173 | "ssub16 r3, r3, r7 \n" | 151 | "ssub16 r3, r3, r7 \n" |
174 | "stmia %[v1]!, {r0-r3} \n" | 152 | "stmia %[v1]!, {r0-r3} \n" |
175 | "ldmia %[v2]!, {r4-r7} \n" | 153 | ".endr \n" |
176 | "ldmia %[v1], {r0-r3} \n" | 154 | #if ORDER > 32 |
177 | "ssub16 r0, r0, r4 \n" | ||
178 | "ssub16 r1, r1, r5 \n" | ||
179 | "ssub16 r2, r2, r6 \n" | ||
180 | "ssub16 r3, r3, r7 \n" | ||
181 | "stmia %[v1]!, {r0-r3} \n" | ||
182 | #if ORDER > 16 | ||
183 | "subs %[cnt], %[cnt], #1 \n" | 155 | "subs %[cnt], %[cnt], #1 \n" |
184 | "bne 1b \n" | 156 | "bne 1b \n" |
185 | #endif | 157 | #endif |
186 | 158 | ||
187 | "99: \n" | 159 | "99: \n" |
188 | : /* outputs */ | 160 | : /* outputs */ |
189 | #if ORDER > 16 | 161 | #if ORDER > 32 |
190 | [cnt]"+r"(cnt), | 162 | [cnt]"+r"(cnt), |
191 | #endif | 163 | #endif |
192 | [v1] "+r"(v1), | 164 | [v1] "+r"(v1), |
@@ -203,12 +175,21 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
203 | * incorrect results (if ARM aligncheck is disabled). */ | 175 | * incorrect results (if ARM aligncheck is disabled). */ |
204 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 176 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
205 | { | 177 | { |
206 | int res = 0; | 178 | int res; |
179 | #if ORDER > 32 | ||
180 | int cnt = ORDER>>5; | ||
181 | #endif | ||
182 | |||
207 | #if ORDER > 16 | 183 | #if ORDER > 16 |
208 | int cnt = ORDER>>4; | 184 | #define MLA_BLOCKS "3" |
185 | #else | ||
186 | #define MLA_BLOCKS "1" | ||
209 | #endif | 187 | #endif |
210 | 188 | ||
211 | asm volatile ( | 189 | asm volatile ( |
190 | #if ORDER > 32 | ||
191 | "mov %[res], #0 \n" | ||
192 | #endif | ||
212 | "tst %[v2], #2 \n" | 193 | "tst %[v2], #2 \n" |
213 | "beq 20f \n" | 194 | "beq 20f \n" |
214 | 195 | ||
@@ -216,11 +197,18 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
216 | "ldrh r7, [%[v2]], #2 \n" | 197 | "ldrh r7, [%[v2]], #2 \n" |
217 | "ldmia %[v2]!, {r4-r5} \n" | 198 | "ldmia %[v2]!, {r4-r5} \n" |
218 | "ldmia %[v1]!, {r0-r1} \n" | 199 | "ldmia %[v1]!, {r0-r1} \n" |
200 | #if ORDER > 32 | ||
219 | "mov r7, r7, lsl #16 \n" | 201 | "mov r7, r7, lsl #16 \n" |
220 | "1: \n" | 202 | "1: \n" |
221 | "pkhbt r8, r4, r7 \n" | 203 | "pkhbt r8, r4, r7 \n" |
222 | "ldmia %[v2]!, {r6-r7} \n" | 204 | "ldmia %[v2]!, {r6-r7} \n" |
223 | "smladx %[res], r0, r8, %[res] \n" | 205 | "smladx %[res], r0, r8, %[res] \n" |
206 | #else | ||
207 | "pkhbt r8, r4, r7, lsl #16 \n" | ||
208 | "ldmia %[v2]!, {r6-r7} \n" | ||
209 | "smuadx %[res], r0, r8 \n" | ||
210 | #endif | ||
211 | ".rept " MLA_BLOCKS "\n" | ||
224 | "pkhbt r8, r5, r4 \n" | 212 | "pkhbt r8, r5, r4 \n" |
225 | "ldmia %[v1]!, {r2-r3} \n" | 213 | "ldmia %[v1]!, {r2-r3} \n" |
226 | "smladx %[res], r1, r8, %[res] \n" | 214 | "smladx %[res], r1, r8, %[res] \n" |
@@ -233,11 +221,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
233 | "pkhbt r8, r4, r7 \n" | 221 | "pkhbt r8, r4, r7 \n" |
234 | "ldmia %[v2]!, {r6-r7} \n" | 222 | "ldmia %[v2]!, {r6-r7} \n" |
235 | "smladx %[res], r0, r8, %[res] \n" | 223 | "smladx %[res], r0, r8, %[res] \n" |
224 | ".endr \n" | ||
225 | |||
236 | "pkhbt r8, r5, r4 \n" | 226 | "pkhbt r8, r5, r4 \n" |
237 | "ldmia %[v1]!, {r2-r3} \n" | 227 | "ldmia %[v1]!, {r2-r3} \n" |
238 | "smladx %[res], r1, r8, %[res] \n" | 228 | "smladx %[res], r1, r8, %[res] \n" |
239 | "pkhbt r8, r6, r5 \n" | 229 | "pkhbt r8, r6, r5 \n" |
240 | #if ORDER > 16 | 230 | #if ORDER > 32 |
241 | "subs %[cnt], %[cnt], #1 \n" | 231 | "subs %[cnt], %[cnt], #1 \n" |
242 | "ldmneia %[v2]!, {r4-r5} \n" | 232 | "ldmneia %[v2]!, {r4-r5} \n" |
243 | "smladx %[res], r2, r8, %[res] \n" | 233 | "smladx %[res], r2, r8, %[res] \n" |
@@ -257,7 +247,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
257 | "ldmia %[v2]!, {r5-r7} \n" | 247 | "ldmia %[v2]!, {r5-r7} \n" |
258 | "1: \n" | 248 | "1: \n" |
259 | "ldmia %[v1]!, {r2-r3} \n" | 249 | "ldmia %[v1]!, {r2-r3} \n" |
250 | #if ORDER > 32 | ||
260 | "smlad %[res], r0, r5, %[res] \n" | 251 | "smlad %[res], r0, r5, %[res] \n" |
252 | #else | ||
253 | "smuad %[res], r0, r5 \n" | ||
254 | #endif | ||
255 | ".rept " MLA_BLOCKS "\n" | ||
261 | "ldmia %[v2]!, {r4-r5} \n" | 256 | "ldmia %[v2]!, {r4-r5} \n" |
262 | "smlad %[res], r1, r6, %[res] \n" | 257 | "smlad %[res], r1, r6, %[res] \n" |
263 | "ldmia %[v1]!, {r0-r1} \n" | 258 | "ldmia %[v1]!, {r0-r1} \n" |
@@ -266,9 +261,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
266 | "smlad %[res], r3, r4, %[res] \n" | 261 | "smlad %[res], r3, r4, %[res] \n" |
267 | "ldmia %[v1]!, {r2-r3} \n" | 262 | "ldmia %[v1]!, {r2-r3} \n" |
268 | "smlad %[res], r0, r5, %[res] \n" | 263 | "smlad %[res], r0, r5, %[res] \n" |
264 | ".endr \n" | ||
265 | |||
269 | "ldmia %[v2]!, {r4-r5} \n" | 266 | "ldmia %[v2]!, {r4-r5} \n" |
270 | "smlad %[res], r1, r6, %[res] \n" | 267 | "smlad %[res], r1, r6, %[res] \n" |
271 | #if ORDER > 16 | 268 | #if ORDER > 32 |
272 | "subs %[cnt], %[cnt], #1 \n" | 269 | "subs %[cnt], %[cnt], #1 \n" |
273 | "ldmneia %[v1]!, {r0-r1} \n" | 270 | "ldmneia %[v1]!, {r0-r1} \n" |
274 | "smlad %[res], r2, r7, %[res] \n" | 271 | "smlad %[res], r2, r7, %[res] \n" |
@@ -282,12 +279,12 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
282 | 279 | ||
283 | "99: \n" | 280 | "99: \n" |
284 | : /* outputs */ | 281 | : /* outputs */ |
285 | #if ORDER > 16 | 282 | #if ORDER > 32 |
286 | [cnt]"+r"(cnt), | 283 | [cnt]"+r"(cnt), |
287 | #endif | 284 | #endif |
288 | [v1] "+r"(v1), | 285 | [v1] "+r"(v1), |
289 | [v2] "+r"(v2), | 286 | [v2] "+r"(v2), |
290 | [res]"+r"(res) | 287 | [res]"=r"(res) |
291 | : /* inputs */ | 288 | : /* inputs */ |
292 | : /* clobbers */ | 289 | : /* clobbers */ |
293 | "r0", "r1", "r2", "r3", "r4", | 290 | "r0", "r1", "r2", "r3", "r4", |
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 0c3aaca223..11e7f07adf 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -67,7 +67,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
67 | "move.l %%d3, (%[v1])+ \n" | 67 | "move.l %%d3, (%[v1])+ \n" |
68 | "lea.l (16, %[v2]), %[v2] \n" | 68 | "lea.l (16, %[v2]), %[v2] \n" |
69 | "move.l %%d4, %%d0 \n" | 69 | "move.l %%d4, %%d0 \n" |
70 | 70 | ||
71 | "movem.l (%[v1]), %%a0-%%a3 \n" | 71 | "movem.l (%[v1]), %%a0-%%a3 \n" |
72 | "movem.l (%[v2]), %%d1-%%d4 \n" | 72 | "movem.l (%[v2]), %%d1-%%d4 \n" |
73 | ADDHALFXREGS(%%a0, %%d1, %%d0) | 73 | ADDHALFXREGS(%%a0, %%d1, %%d0) |
@@ -175,7 +175,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
175 | "move.l %%d3, (%[v1])+ \n" | 175 | "move.l %%d3, (%[v1])+ \n" |
176 | "lea.l (16, %[v2]), %[v2] \n" | 176 | "lea.l (16, %[v2]), %[v2] \n" |
177 | "move.l %%d4, %%d0 \n" | 177 | "move.l %%d4, %%d0 \n" |
178 | 178 | ||
179 | "movem.l (%[v2]), %%d1-%%d4 \n" | 179 | "movem.l (%[v2]), %%d1-%%d4 \n" |
180 | "movem.l (%[v1]), %%a0-%%a3 \n" | 180 | "movem.l (%[v1]), %%a0-%%a3 \n" |
181 | SUBHALFXREGS(%%a0, %%d1, %%d0) | 181 | SUBHALFXREGS(%%a0, %%d1, %%d0) |
@@ -207,7 +207,6 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
207 | "move.l %%d2, (%[v1])+ \n" | 207 | "move.l %%d2, (%[v1])+ \n" |
208 | SUBHALFREGS(%%a3, %%d4, %%d3) | 208 | SUBHALFREGS(%%a3, %%d4, %%d3) |
209 | "move.l %%d3, (%[v1])+ \n" | 209 | "move.l %%d3, (%[v1])+ \n" |
210 | |||
211 | "lea.l (16, %[v2]), %[v2] \n" | 210 | "lea.l (16, %[v2]), %[v2] \n" |
212 | 211 | ||
213 | "movem.l (%[v2]), %%d1-%%d4 \n" | 212 | "movem.l (%[v2]), %%d1-%%d4 \n" |
@@ -248,22 +247,16 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
248 | * in signed integer mode - call above macro before use. */ | 247 | * in signed integer mode - call above macro before use. */ |
249 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 248 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
250 | { | 249 | { |
251 | int res = 0; | 250 | int res; |
252 | #if ORDER > 32 | 251 | #if ORDER > 32 |
253 | int cnt = ORDER>>5; | 252 | int cnt = ORDER>>5; |
254 | #endif | 253 | #endif |
255 | 254 | ||
256 | #define MACBLOCK4 \ | 255 | #if ORDER > 16 |
257 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ | 256 | #define MAC_BLOCKS "7" |
258 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | 257 | #else |
259 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ | 258 | #define MAC_BLOCKS "3" |
260 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 259 | #endif |
261 | |||
262 | #define MACBLOCK4_U2 \ | ||
263 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
264 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" \ | ||
265 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" \ | ||
266 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
267 | 260 | ||
268 | asm volatile ( | 261 | asm volatile ( |
269 | "move.l %[v2], %%d0 \n" | 262 | "move.l %[v2], %%d0 \n" |
@@ -274,15 +267,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
274 | "move.l (%[v1])+, %%d0 \n" | 267 | "move.l (%[v1])+, %%d0 \n" |
275 | "move.w (%[v2])+, %%d1 \n" | 268 | "move.w (%[v2])+, %%d1 \n" |
276 | "1: \n" | 269 | "1: \n" |
277 | #if ORDER > 16 | 270 | ".rept " MAC_BLOCKS "\n" |
278 | MACBLOCK4_U2 | 271 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
279 | MACBLOCK4_U2 | 272 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
280 | MACBLOCK4_U2 | 273 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
281 | MACBLOCK4_U2 | 274 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
282 | #endif | 275 | ".endr \n" |
283 | MACBLOCK4_U2 | 276 | |
284 | MACBLOCK4_U2 | ||
285 | MACBLOCK4_U2 | ||
286 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 277 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
287 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | 278 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
288 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 279 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
@@ -299,15 +290,13 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
299 | "move.l (%[v1])+, %%d0 \n" | 290 | "move.l (%[v1])+, %%d0 \n" |
300 | "move.l (%[v2])+, %%d1 \n" | 291 | "move.l (%[v2])+, %%d1 \n" |
301 | "1: \n" | 292 | "1: \n" |
302 | #if ORDER > 16 | 293 | ".rept " MAC_BLOCKS "\n" |
303 | MACBLOCK4 | 294 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
304 | MACBLOCK4 | 295 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
305 | MACBLOCK4 | 296 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
306 | MACBLOCK4 | 297 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
307 | #endif | 298 | ".endr \n" |
308 | MACBLOCK4 | 299 | |
309 | MACBLOCK4 | ||
310 | MACBLOCK4 | ||
311 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | 300 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
312 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 301 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
313 | #if ORDER > 32 | 302 | #if ORDER > 32 |
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index b729bd3a0a..89b24f2b06 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h | |||
@@ -30,27 +30,23 @@ static inline void vector_add(int32_t* v1, int32_t* v2) | |||
30 | int cnt = ORDER>>5; | 30 | int cnt = ORDER>>5; |
31 | #endif | 31 | #endif |
32 | 32 | ||
33 | #define ADDBLOCK4 \ | 33 | #if ORDER > 16 |
34 | "ldmia %[v1], {r0-r3} \n" \ | 34 | #define ADD_SUB_BLOCKS "8" |
35 | "ldmia %[v2]!, {r4-r7} \n" \ | 35 | #else |
36 | "add r0, r0, r4 \n" \ | 36 | #define ADD_SUB_BLOCKS "4" |
37 | "add r1, r1, r5 \n" \ | 37 | #endif |
38 | "add r2, r2, r6 \n" \ | ||
39 | "add r3, r3, r7 \n" \ | ||
40 | "stmia %[v1]!, {r0-r3} \n" | ||
41 | 38 | ||
42 | asm volatile ( | 39 | asm volatile ( |
43 | "1: \n" | 40 | "1: \n" |
44 | ADDBLOCK4 | 41 | ".rept " ADD_SUB_BLOCKS "\n" |
45 | ADDBLOCK4 | 42 | "ldmia %[v1], {r0-r3} \n" |
46 | ADDBLOCK4 | 43 | "ldmia %[v2]!, {r4-r7} \n" |
47 | ADDBLOCK4 | 44 | "add r0, r0, r4 \n" |
48 | #if ORDER > 16 | 45 | "add r1, r1, r5 \n" |
49 | ADDBLOCK4 | 46 | "add r2, r2, r6 \n" |
50 | ADDBLOCK4 | 47 | "add r3, r3, r7 \n" |
51 | ADDBLOCK4 | 48 | "stmia %[v1]!, {r0-r3} \n" |
52 | ADDBLOCK4 | 49 | ".endr \n" |
53 | #endif | ||
54 | #if ORDER > 32 | 50 | #if ORDER > 32 |
55 | "subs %[cnt], %[cnt], #1 \n" | 51 | "subs %[cnt], %[cnt], #1 \n" |
56 | "bne 1b \n" | 52 | "bne 1b \n" |
@@ -74,27 +70,17 @@ static inline void vector_sub(int32_t* v1, int32_t* v2) | |||
74 | int cnt = ORDER>>5; | 70 | int cnt = ORDER>>5; |
75 | #endif | 71 | #endif |
76 | 72 | ||
77 | #define SUBBLOCK4 \ | ||
78 | "ldmia %[v1], {r0-r3} \n" \ | ||
79 | "ldmia %[v2]!, {r4-r7} \n" \ | ||
80 | "sub r0, r0, r4 \n" \ | ||
81 | "sub r1, r1, r5 \n" \ | ||
82 | "sub r2, r2, r6 \n" \ | ||
83 | "sub r3, r3, r7 \n" \ | ||
84 | "stmia %[v1]!, {r0-r3} \n" | ||
85 | |||
86 | asm volatile ( | 73 | asm volatile ( |
87 | "1: \n" | 74 | "1: \n" |
88 | SUBBLOCK4 | 75 | ".rept " ADD_SUB_BLOCKS "\n" |
89 | SUBBLOCK4 | 76 | "ldmia %[v1], {r0-r3} \n" |
90 | SUBBLOCK4 | 77 | "ldmia %[v2]!, {r4-r7} \n" |
91 | SUBBLOCK4 | 78 | "sub r0, r0, r4 \n" |
92 | #if ORDER > 16 | 79 | "sub r1, r1, r5 \n" |
93 | SUBBLOCK4 | 80 | "sub r2, r2, r6 \n" |
94 | SUBBLOCK4 | 81 | "sub r3, r3, r7 \n" |
95 | SUBBLOCK4 | 82 | "stmia %[v1]!, {r0-r3} \n" |
96 | SUBBLOCK4 | 83 | ".endr \n" |
97 | #endif | ||
98 | #if ORDER > 32 | 84 | #if ORDER > 32 |
99 | "subs %[cnt], %[cnt], #1 \n" | 85 | "subs %[cnt], %[cnt], #1 \n" |
100 | "bne 1b \n" | 86 | "bne 1b \n" |
@@ -114,17 +100,24 @@ static inline void vector_sub(int32_t* v1, int32_t* v2) | |||
114 | 100 | ||
115 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | 101 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) |
116 | { | 102 | { |
117 | int res = 0; | 103 | int res; |
118 | #if ORDER > 32 | 104 | #if ORDER > 32 |
119 | int cnt = ORDER>>5; | 105 | int cnt = ORDER>>5; |
120 | #endif | 106 | #endif |
121 | 107 | ||
122 | asm volatile ( | 108 | asm volatile ( |
123 | #if ORDER > 16 | 109 | #if ORDER > 16 |
110 | #if ORDER > 32 | ||
111 | "mov %[res], #0 \n" | ||
112 | #endif | ||
124 | "ldmia %[v2]!, {r6-r7} \n" | 113 | "ldmia %[v2]!, {r6-r7} \n" |
125 | "1: \n" | 114 | "1: \n" |
126 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" | 115 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" |
116 | #if ORDER > 32 | ||
127 | "mla %[res], r6, r0, %[res] \n" | 117 | "mla %[res], r6, r0, %[res] \n" |
118 | #else | ||
119 | "mul %[res], r6, r0 \n" | ||
120 | #endif | ||
128 | "mla %[res], r7, r1, %[res] \n" | 121 | "mla %[res], r7, r1, %[res] \n" |
129 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" | 122 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" |
130 | "mla %[res], r0, r3, %[res] \n" | 123 | "mla %[res], r0, r3, %[res] \n" |
@@ -177,19 +170,21 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
177 | #endif | 170 | #endif |
178 | 171 | ||
179 | #else /* ORDER <= 16 */ | 172 | #else /* ORDER <= 16 */ |
180 | 173 | "ldmia %[v1]!, {r0-r3} \n" | |
181 | #define MLABLOCK4 \ | 174 | "ldmia %[v2]!, {r4-r7} \n" |
182 | "ldmia %[v1]!, {r0-r3} \n" \ | 175 | "mul %[res], r4, r0 \n" |
183 | "ldmia %[v2]!, {r4-r7} \n" \ | 176 | "mla %[res], r5, r1, %[res] \n" |
184 | "mla %[res], r4, r0, %[res] \n" \ | 177 | "mla %[res], r6, r2, %[res] \n" |
185 | "mla %[res], r5, r1, %[res] \n" \ | ||
186 | "mla %[res], r6, r2, %[res] \n" \ | ||
187 | "mla %[res], r7, r3, %[res] \n" | 178 | "mla %[res], r7, r3, %[res] \n" |
188 | 179 | ||
189 | MLABLOCK4 | 180 | ".rept 3 \n" |
190 | MLABLOCK4 | 181 | "ldmia %[v1]!, {r0-r3} \n" |
191 | MLABLOCK4 | 182 | "ldmia %[v2]!, {r4-r7} \n" |
192 | MLABLOCK4 | 183 | "mla %[res], r4, r0, %[res] \n" |
184 | "mla %[res], r5, r1, %[res] \n" | ||
185 | "mla %[res], r6, r2, %[res] \n" | ||
186 | "mla %[res], r7, r3, %[res] \n" | ||
187 | ".endr \n" | ||
193 | #endif /* ORDER <= 16 */ | 188 | #endif /* ORDER <= 16 */ |
194 | : /* outputs */ | 189 | : /* outputs */ |
195 | #if ORDER > 32 | 190 | #if ORDER > 32 |
@@ -197,7 +192,7 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
197 | #endif | 192 | #endif |
198 | [v1] "+r"(v1), | 193 | [v1] "+r"(v1), |
199 | [v2] "+r"(v2), | 194 | [v2] "+r"(v2), |
200 | [res]"+r"(res) | 195 | [res]"=r"(res) |
201 | : /* inputs */ | 196 | : /* inputs */ |
202 | : /* clobbers */ | 197 | : /* clobbers */ |
203 | "r0", "r1", "r2", "r3", | 198 | "r0", "r1", "r2", "r3", |
diff --git a/apps/codecs/demac/libdemac/vector_math_generic.h b/apps/codecs/demac/libdemac/vector_math_generic.h index 7b61db77be..00bf07a007 100644 --- a/apps/codecs/demac/libdemac/vector_math_generic.h +++ b/apps/codecs/demac/libdemac/vector_math_generic.h | |||
@@ -116,8 +116,8 @@ static inline int32_t scalarproduct(filter_int* v1, filter_int* v2) | |||
116 | { | 116 | { |
117 | int res = 0; | 117 | int res = 0; |
118 | 118 | ||
119 | #if ORDER > 16 | 119 | #if ORDER > 32 |
120 | int order = (ORDER >> 4); | 120 | int order = (ORDER >> 5); |
121 | while (order--) | 121 | while (order--) |
122 | #endif | 122 | #endif |
123 | { | 123 | { |
@@ -137,6 +137,24 @@ static inline int32_t scalarproduct(filter_int* v1, filter_int* v2) | |||
137 | res += *v1++ * *v2++; | 137 | res += *v1++ * *v2++; |
138 | res += *v1++ * *v2++; | 138 | res += *v1++ * *v2++; |
139 | res += *v1++ * *v2++; | 139 | res += *v1++ * *v2++; |
140 | #if ORDER > 16 | ||
141 | res += *v1++ * *v2++; | ||
142 | res += *v1++ * *v2++; | ||
143 | res += *v1++ * *v2++; | ||
144 | res += *v1++ * *v2++; | ||
145 | res += *v1++ * *v2++; | ||
146 | res += *v1++ * *v2++; | ||
147 | res += *v1++ * *v2++; | ||
148 | res += *v1++ * *v2++; | ||
149 | res += *v1++ * *v2++; | ||
150 | res += *v1++ * *v2++; | ||
151 | res += *v1++ * *v2++; | ||
152 | res += *v1++ * *v2++; | ||
153 | res += *v1++ * *v2++; | ||
154 | res += *v1++ * *v2++; | ||
155 | res += *v1++ * *v2++; | ||
156 | res += *v1++ * *v2++; | ||
157 | #endif | ||
140 | } | 158 | } |
141 | return res; | 159 | return res; |
142 | } | 160 | } |