diff options
Diffstat (limited to 'apps/codecs/demac/libdemac/vector_math32_armv4.h')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math32_armv4.h | 194 |
1 files changed, 95 insertions, 99 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index 89b24f2b06..207fca3038 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h | |||
@@ -24,78 +24,134 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | static inline void vector_add(int32_t* v1, int32_t* v2) | 27 | #define FUSED_VECTOR_MATH |
28 | |||
29 | #if ORDER > 32 | ||
30 | #define BLOCK_REPEAT "8" | ||
31 | #elif ORDER > 16 | ||
32 | #define BLOCK_REPEAT "7" | ||
33 | #else | ||
34 | #define BLOCK_REPEAT "3" | ||
35 | #endif | ||
36 | |||
37 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) */ | ||
38 | static inline int32_t vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2) | ||
28 | { | 39 | { |
40 | int res; | ||
29 | #if ORDER > 32 | 41 | #if ORDER > 32 |
30 | int cnt = ORDER>>5; | 42 | int cnt = ORDER>>5; |
31 | #endif | 43 | #endif |
32 | 44 | ||
33 | #if ORDER > 16 | 45 | asm volatile ( |
34 | #define ADD_SUB_BLOCKS "8" | 46 | #if ORDER > 32 |
47 | "mov %[res], #0 \n" | ||
48 | "1: \n" | ||
35 | #else | 49 | #else |
36 | #define ADD_SUB_BLOCKS "4" | 50 | "ldmia %[v1], {r0-r3} \n" |
51 | "ldmia %[f2]!, {r4-r7} \n" | ||
52 | "mul %[res], r4, r0 \n" | ||
53 | "mla %[res], r5, r1, %[res] \n" | ||
54 | "mla %[res], r6, r2, %[res] \n" | ||
55 | "mla %[res], r7, r3, %[res] \n" | ||
56 | "ldmia %[s2]!, {r4-r7} \n" | ||
57 | "add r0, r0, r4 \n" | ||
58 | "add r1, r1, r5 \n" | ||
59 | "add r2, r2, r6 \n" | ||
60 | "add r3, r3, r7 \n" | ||
61 | "stmia %[v1]!, {r0-r3} \n" | ||
37 | #endif | 62 | #endif |
38 | 63 | ".rept " BLOCK_REPEAT "\n" | |
39 | asm volatile ( | 64 | "ldmia %[v1], {r0-r3} \n" |
40 | "1: \n" | 65 | "ldmia %[f2]!, {r4-r7} \n" |
41 | ".rept " ADD_SUB_BLOCKS "\n" | 66 | "mla %[res], r4, r0, %[res] \n" |
42 | "ldmia %[v1], {r0-r3} \n" | 67 | "mla %[res], r5, r1, %[res] \n" |
43 | "ldmia %[v2]!, {r4-r7} \n" | 68 | "mla %[res], r6, r2, %[res] \n" |
44 | "add r0, r0, r4 \n" | 69 | "mla %[res], r7, r3, %[res] \n" |
45 | "add r1, r1, r5 \n" | 70 | "ldmia %[s2]!, {r4-r7} \n" |
46 | "add r2, r2, r6 \n" | 71 | "add r0, r0, r4 \n" |
47 | "add r3, r3, r7 \n" | 72 | "add r1, r1, r5 \n" |
48 | "stmia %[v1]!, {r0-r3} \n" | 73 | "add r2, r2, r6 \n" |
49 | ".endr \n" | 74 | "add r3, r3, r7 \n" |
75 | "stmia %[v1]!, {r0-r3} \n" | ||
76 | ".endr \n" | ||
50 | #if ORDER > 32 | 77 | #if ORDER > 32 |
51 | "subs %[cnt], %[cnt], #1 \n" | 78 | "subs %[cnt], %[cnt], #1 \n" |
52 | "bne 1b \n" | 79 | "bne 1b \n" |
53 | #endif | 80 | #endif |
54 | : /* outputs */ | 81 | : /* outputs */ |
55 | #if ORDER > 32 | 82 | #if ORDER > 32 |
56 | [cnt]"+r"(cnt), | 83 | [cnt]"+r"(cnt), |
57 | #endif | 84 | #endif |
58 | [v1] "+r"(v1), | 85 | [v1] "+r"(v1), |
59 | [v2] "+r"(v2) | 86 | [f2] "+r"(f2), |
87 | [s2] "+r"(s2), | ||
88 | [res]"=r"(res) | ||
60 | : /* inputs */ | 89 | : /* inputs */ |
61 | : /* clobbers */ | 90 | : /* clobbers */ |
62 | "r0", "r1", "r2", "r3", "r4", | 91 | "r0", "r1", "r2", "r3", "r4", |
63 | "r5", "r6", "r7", "memory" | 92 | "r5", "r6", "r7", "memory" |
64 | ); | 93 | ); |
94 | return res; | ||
65 | } | 95 | } |
66 | 96 | ||
67 | static inline void vector_sub(int32_t* v1, int32_t* v2) | 97 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */ |
98 | static inline int32_t vector_sp_sub(int32_t* v1, int32_t* f2, int32_t* s2) | ||
68 | { | 99 | { |
100 | int res; | ||
69 | #if ORDER > 32 | 101 | #if ORDER > 32 |
70 | int cnt = ORDER>>5; | 102 | int cnt = ORDER>>5; |
71 | #endif | 103 | #endif |
72 | 104 | ||
73 | asm volatile ( | 105 | asm volatile ( |
74 | "1: \n" | ||
75 | ".rept " ADD_SUB_BLOCKS "\n" | ||
76 | "ldmia %[v1], {r0-r3} \n" | ||
77 | "ldmia %[v2]!, {r4-r7} \n" | ||
78 | "sub r0, r0, r4 \n" | ||
79 | "sub r1, r1, r5 \n" | ||
80 | "sub r2, r2, r6 \n" | ||
81 | "sub r3, r3, r7 \n" | ||
82 | "stmia %[v1]!, {r0-r3} \n" | ||
83 | ".endr \n" | ||
84 | #if ORDER > 32 | 106 | #if ORDER > 32 |
85 | "subs %[cnt], %[cnt], #1 \n" | 107 | "mov %[res], #0 \n" |
86 | "bne 1b \n" | 108 | "1: \n" |
109 | #else | ||
110 | "ldmia %[v1], {r0-r3} \n" | ||
111 | "ldmia %[f2]!, {r4-r7} \n" | ||
112 | "mul %[res], r4, r0 \n" | ||
113 | "mla %[res], r5, r1, %[res] \n" | ||
114 | "mla %[res], r6, r2, %[res] \n" | ||
115 | "mla %[res], r7, r3, %[res] \n" | ||
116 | "ldmia %[s2]!, {r4-r7} \n" | ||
117 | "sub r0, r0, r4 \n" | ||
118 | "sub r1, r1, r5 \n" | ||
119 | "sub r2, r2, r6 \n" | ||
120 | "sub r3, r3, r7 \n" | ||
121 | "stmia %[v1]!, {r0-r3} \n" | ||
122 | #endif | ||
123 | ".rept " BLOCK_REPEAT "\n" | ||
124 | "ldmia %[v1], {r0-r3} \n" | ||
125 | "ldmia %[f2]!, {r4-r7} \n" | ||
126 | "mla %[res], r4, r0, %[res] \n" | ||
127 | "mla %[res], r5, r1, %[res] \n" | ||
128 | "mla %[res], r6, r2, %[res] \n" | ||
129 | "mla %[res], r7, r3, %[res] \n" | ||
130 | "ldmia %[s2]!, {r4-r7} \n" | ||
131 | "sub r0, r0, r4 \n" | ||
132 | "sub r1, r1, r5 \n" | ||
133 | "sub r2, r2, r6 \n" | ||
134 | "sub r3, r3, r7 \n" | ||
135 | "stmia %[v1]!, {r0-r3} \n" | ||
136 | ".endr \n" | ||
137 | #if ORDER > 32 | ||
138 | "subs %[cnt], %[cnt], #1 \n" | ||
139 | "bne 1b \n" | ||
87 | #endif | 140 | #endif |
88 | : /* outputs */ | 141 | : /* outputs */ |
89 | #if ORDER > 32 | 142 | #if ORDER > 32 |
90 | [cnt]"+r"(cnt), | 143 | [cnt]"+r"(cnt), |
91 | #endif | 144 | #endif |
92 | [v1] "+r"(v1), | 145 | [v1] "+r"(v1), |
93 | [v2] "+r"(v2) | 146 | [f2] "+r"(f2), |
147 | [s2] "+r"(s2), | ||
148 | [res]"=r"(res) | ||
94 | : /* inputs */ | 149 | : /* inputs */ |
95 | : /* clobbers */ | 150 | : /* clobbers */ |
96 | "r0", "r1", "r2", "r3", "r4", | 151 | "r0", "r1", "r2", "r3", "r4", |
97 | "r5", "r6", "r7", "memory" | 152 | "r5", "r6", "r7", "memory" |
98 | ); | 153 | ); |
154 | return res; | ||
99 | } | 155 | } |
100 | 156 | ||
101 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | 157 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) |
@@ -106,78 +162,18 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
106 | #endif | 162 | #endif |
107 | 163 | ||
108 | asm volatile ( | 164 | asm volatile ( |
109 | #if ORDER > 16 | ||
110 | #if ORDER > 32 | 165 | #if ORDER > 32 |
111 | "mov %[res], #0 \n" | 166 | "mov %[res], #0 \n" |
112 | #endif | ||
113 | "ldmia %[v2]!, {r6-r7} \n" | ||
114 | "1: \n" | 167 | "1: \n" |
115 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" | ||
116 | #if ORDER > 32 | ||
117 | "mla %[res], r6, r0, %[res] \n" | ||
118 | #else | 168 | #else |
119 | "mul %[res], r6, r0 \n" | ||
120 | #endif | ||
121 | "mla %[res], r7, r1, %[res] \n" | ||
122 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" | ||
123 | "mla %[res], r0, r3, %[res] \n" | ||
124 | "mla %[res], r1, r4, %[res] \n" | ||
125 | "mla %[res], r2, r5, %[res] \n" | ||
126 | "ldmia %[v1]!, {r0-r4} \n" | ||
127 | "mla %[res], r6, r0, %[res] \n" | ||
128 | "mla %[res], r7, r1, %[res] \n" | ||
129 | "mla %[res], r8, r2, %[res] \n" | ||
130 | "ldmia %[v2]!, {r0,r1,r6-r8} \n" | ||
131 | "mla %[res], r0, r3, %[res] \n" | ||
132 | "mla %[res], r1, r4, %[res] \n" | ||
133 | "ldmia %[v1]!, {r0-r5} \n" | ||
134 | "mla %[res], r6, r0, %[res] \n" | ||
135 | "mla %[res], r7, r1, %[res] \n" | ||
136 | "mla %[res], r8, r2, %[res] \n" | ||
137 | "ldmia %[v2]!, {r0-r2,r6,r7} \n" | ||
138 | "mla %[res], r0, r3, %[res] \n" | ||
139 | "mla %[res], r1, r4, %[res] \n" | ||
140 | "mla %[res], r2, r5, %[res] \n" | ||
141 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" | ||
142 | "mla %[res], r6, r0, %[res] \n" | ||
143 | "mla %[res], r7, r1, %[res] \n" | ||
144 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" | ||
145 | "mla %[res], r0, r3, %[res] \n" | ||
146 | "mla %[res], r1, r4, %[res] \n" | ||
147 | "mla %[res], r2, r5, %[res] \n" | ||
148 | "ldmia %[v1]!, {r0-r4} \n" | ||
149 | "mla %[res], r6, r0, %[res] \n" | ||
150 | "mla %[res], r7, r1, %[res] \n" | ||
151 | "mla %[res], r8, r2, %[res] \n" | ||
152 | "ldmia %[v2]!, {r0,r1,r6-r8} \n" | ||
153 | "mla %[res], r0, r3, %[res] \n" | ||
154 | "mla %[res], r1, r4, %[res] \n" | ||
155 | "ldmia %[v1]!, {r0-r5} \n" | ||
156 | "mla %[res], r6, r0, %[res] \n" | ||
157 | "mla %[res], r7, r1, %[res] \n" | ||
158 | "mla %[res], r8, r2, %[res] \n" | ||
159 | #if ORDER > 32 | ||
160 | "ldmia %[v2]!, {r0-r2,r6,r7} \n" | ||
161 | #else | ||
162 | "ldmia %[v2]!, {r0-r2} \n" | ||
163 | #endif | ||
164 | "mla %[res], r0, r3, %[res] \n" | ||
165 | "mla %[res], r1, r4, %[res] \n" | ||
166 | "mla %[res], r2, r5, %[res] \n" | ||
167 | #if ORDER > 32 | ||
168 | "subs %[cnt], %[cnt], #1 \n" | ||
169 | "bne 1b \n" | ||
170 | #endif | ||
171 | |||
172 | #else /* ORDER <= 16 */ | ||
173 | "ldmia %[v1]!, {r0-r3} \n" | 169 | "ldmia %[v1]!, {r0-r3} \n" |
174 | "ldmia %[v2]!, {r4-r7} \n" | 170 | "ldmia %[v2]!, {r4-r7} \n" |
175 | "mul %[res], r4, r0 \n" | 171 | "mul %[res], r4, r0 \n" |
176 | "mla %[res], r5, r1, %[res] \n" | 172 | "mla %[res], r5, r1, %[res] \n" |
177 | "mla %[res], r6, r2, %[res] \n" | 173 | "mla %[res], r6, r2, %[res] \n" |
178 | "mla %[res], r7, r3, %[res] \n" | 174 | "mla %[res], r7, r3, %[res] \n" |
179 | 175 | #endif | |
180 | ".rept 3 \n" | 176 | ".rept " BLOCK_REPEAT "\n" |
181 | "ldmia %[v1]!, {r0-r3} \n" | 177 | "ldmia %[v1]!, {r0-r3} \n" |
182 | "ldmia %[v2]!, {r4-r7} \n" | 178 | "ldmia %[v2]!, {r4-r7} \n" |
183 | "mla %[res], r4, r0, %[res] \n" | 179 | "mla %[res], r4, r0, %[res] \n" |
@@ -185,7 +181,10 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
185 | "mla %[res], r6, r2, %[res] \n" | 181 | "mla %[res], r6, r2, %[res] \n" |
186 | "mla %[res], r7, r3, %[res] \n" | 182 | "mla %[res], r7, r3, %[res] \n" |
187 | ".endr \n" | 183 | ".endr \n" |
188 | #endif /* ORDER <= 16 */ | 184 | #if ORDER > 32 |
185 | "subs %[cnt], %[cnt], #1 \n" | ||
186 | "bne 1b \n" | ||
187 | #endif | ||
189 | : /* outputs */ | 188 | : /* outputs */ |
190 | #if ORDER > 32 | 189 | #if ORDER > 32 |
191 | [cnt]"+r"(cnt), | 190 | [cnt]"+r"(cnt), |
@@ -197,9 +196,6 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
197 | : /* clobbers */ | 196 | : /* clobbers */ |
198 | "r0", "r1", "r2", "r3", | 197 | "r0", "r1", "r2", "r3", |
199 | "r4", "r5", "r6", "r7" | 198 | "r4", "r5", "r6", "r7" |
200 | #if ORDER > 16 | ||
201 | ,"r8" | ||
202 | #endif | ||
203 | ); | 199 | ); |
204 | return res; | 200 | return res; |
205 | } | 201 | } |