diff options
author | Jens Arnold <amiconn@rockbox.org> | 2010-02-02 22:50:21 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2010-02-02 22:50:21 +0000 |
commit | 9f6586698a411186750d2b1ff3c2f3d7a8e117a8 (patch) | |
tree | d9f73e9c50f90af02d0a0f875c6711070ca8c456 /apps/codecs | |
parent | 24de239d3d0f48f9f67721c716e6880cbbf95bae (diff) | |
download | rockbox-9f6586698a411186750d2b1ff3c2f3d7a8e117a8.tar.gz rockbox-9f6586698a411186750d2b1ff3c2f3d7a8e117a8.zip |
APE codec: Speed up decoding of -c2000 and higher on ARMv4 and ColdFire by fusing the filters' vector math: the scalar product and the subsequent vector add/subtract are now computed together in a single pass. Speedup is roughly 3.5% for -c2000, 8% for -c3000, and 12% for -c4000. To be extended to other architectures.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24473 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r-- | apps/codecs/demac/libdemac/filter.c | 28 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_cf.h | 388 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math32_armv4.h | 194 | ||||
-rw-r--r-- | apps/codecs/lib/udiv32_arm.S | 2 |
4 files changed, 335 insertions, 277 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index 93edf39cb2..ed6f3c8dc6 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c | |||
@@ -134,6 +134,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, | |||
134 | 134 | ||
135 | while(LIKELY(count--)) | 135 | while(LIKELY(count--)) |
136 | { | 136 | { |
137 | #ifdef FUSED_VECTOR_MATH | ||
138 | if (LIKELY(*data != 0)) { | ||
139 | if (*data < 0) | ||
140 | res = vector_sp_add(f->coeffs, f->delay - ORDER, | ||
141 | f->adaptcoeffs - ORDER); | ||
142 | else | ||
143 | res = vector_sp_sub(f->coeffs, f->delay - ORDER, | ||
144 | f->adaptcoeffs - ORDER); | ||
145 | } else { | ||
146 | res = scalarproduct(f->coeffs, f->delay - ORDER); | ||
147 | } | ||
148 | res = FP_TO_INT(res); | ||
149 | #else | ||
137 | res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); | 150 | res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); |
138 | 151 | ||
139 | if (LIKELY(*data != 0)) { | 152 | if (LIKELY(*data != 0)) { |
@@ -142,6 +155,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, | |||
142 | else | 155 | else |
143 | vector_sub(f->coeffs, f->adaptcoeffs - ORDER); | 156 | vector_sub(f->coeffs, f->adaptcoeffs - ORDER); |
144 | } | 157 | } |
158 | #endif | ||
145 | 159 | ||
146 | res += *data; | 160 | res += *data; |
147 | 161 | ||
@@ -193,6 +207,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, | |||
193 | 207 | ||
194 | while(LIKELY(count--)) | 208 | while(LIKELY(count--)) |
195 | { | 209 | { |
210 | #ifdef FUSED_VECTOR_MATH | ||
211 | if (LIKELY(*data != 0)) { | ||
212 | if (*data < 0) | ||
213 | res = vector_sp_add(f->coeffs, f->delay - ORDER, | ||
214 | f->adaptcoeffs - ORDER); | ||
215 | else | ||
216 | res = vector_sp_sub(f->coeffs, f->delay - ORDER, | ||
217 | f->adaptcoeffs - ORDER); | ||
218 | } else { | ||
219 | res = scalarproduct(f->coeffs, f->delay - ORDER); | ||
220 | } | ||
221 | res = FP_TO_INT(res); | ||
222 | #else | ||
196 | res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); | 223 | res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); |
197 | 224 | ||
198 | if (LIKELY(*data != 0)) { | 225 | if (LIKELY(*data != 0)) { |
@@ -201,6 +228,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, | |||
201 | else | 228 | else |
202 | vector_sub(f->coeffs, f->adaptcoeffs - ORDER); | 229 | vector_sub(f->coeffs, f->adaptcoeffs - ORDER); |
203 | } | 230 | } |
231 | #endif | ||
204 | 232 | ||
205 | /* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an | 233 | /* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an |
206 | integer (rounding to nearest) and add the input value to | 234 | integer (rounding to nearest) and add the input value to |
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 11e7f07adf..6e8216c9cc 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -24,19 +24,27 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | 27 | #define FUSED_VECTOR_MATH |
28 | * 32 bit aligned, otherwise performance will suffer. */ | 28 | |
29 | static inline void vector_add(int16_t* v1, int16_t* v2) | 29 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ |
30 | |||
31 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) | ||
32 | * This version fetches data as 32 bit words, and *recommends* v1 to be | ||
33 | * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit | ||
34 | * aligned or both unaligned. Performance will suffer if either condition | ||
35 | * isn't met. It also needs EMAC in signed integer mode. */ | ||
36 | static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) | ||
30 | { | 37 | { |
38 | int res; | ||
31 | #if ORDER > 16 | 39 | #if ORDER > 16 |
32 | int cnt = ORDER>>4; | 40 | int cnt = ORDER>>4; |
33 | #endif | 41 | #endif |
34 | 42 | ||
35 | #define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \ | 43 | #define ADDHALFREGS(s1, s2, sum) /* Add register halves straight. */ \ |
36 | "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \ | 44 | "move.l " #s1 ", " #sum "\n" /* 's1' and 's2' can be A or D */ \ |
37 | "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \ | 45 | "add.l " #s2 ", " #s1 "\n" /* regs, 'sum' must be a D reg. */ \ |
38 | "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \ | 46 | "clr.w " #sum " \n" /* 's1' is clobbered! */ \ |
39 | "add.l %%d4 , " #sum "\n" \ | 47 | "add.l " #s2 ", " #sum "\n" \ |
40 | "move.w " #s1 ", " #sum "\n" | 48 | "move.w " #s1 ", " #sum "\n" |
41 | 49 | ||
42 | #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \ | 50 | #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \ |
@@ -47,94 +55,115 @@ static inline void vector_add(int16_t* v1, int16_t* v2) | |||
47 | "move.w " #s1 ", " #sum "\n" | 55 | "move.w " #s1 ", " #sum "\n" |
48 | 56 | ||
49 | asm volatile ( | 57 | asm volatile ( |
50 | "move.l %[v2], %%d0 \n" | 58 | "move.l %[f2], %%d0 \n" |
51 | "and.l #2, %%d0 \n" | 59 | "and.l #2, %%d0 \n" |
52 | "jeq 20f \n" | 60 | "jeq 20f \n" |
53 | 61 | ||
54 | "10: \n" | 62 | "10: \n" |
55 | "move.w (%[v2])+, %%d0 \n" | 63 | "move.w (%[f2])+, %%d0 \n" |
56 | "swap %%d0 \n" | 64 | "move.w (%[s2])+, %%d1 \n" |
57 | "1: \n" | 65 | "swap %%d1 \n" |
58 | "movem.l (%[v1]), %%a0-%%a3 \n" | 66 | "1: \n" |
59 | "movem.l (%[v2]), %%d1-%%d4 \n" | 67 | ".rept 2 \n" |
60 | ADDHALFXREGS(%%a0, %%d1, %%d0) | 68 | "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" |
61 | "move.l %%d0, (%[v1])+ \n" | 69 | "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" |
62 | ADDHALFXREGS(%%a1, %%d2, %%d1) | 70 | "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n" |
63 | "move.l %%d1, (%[v1])+ \n" | 71 | ADDHALFXREGS(%%d6, %%d2, %%d1) |
64 | ADDHALFXREGS(%%a2, %%d3, %%d2) | 72 | "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n" |
65 | "move.l %%d2, (%[v1])+ \n" | 73 | "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n" |
66 | ADDHALFXREGS(%%a3, %%d4, %%d3) | 74 | "move.l %%d1, (%[v1])+ \n" |
67 | "move.l %%d3, (%[v1])+ \n" | 75 | ADDHALFXREGS(%%d7, %%d6, %%d2) |
68 | "lea.l (16, %[v2]), %[v2] \n" | 76 | "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n" |
69 | "move.l %%d4, %%d0 \n" | 77 | "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n" |
70 | 78 | "move.l %%d2, (%[v1])+ \n" | |
71 | "movem.l (%[v1]), %%a0-%%a3 \n" | 79 | ADDHALFXREGS(%%a0, %%d7, %%d6) |
72 | "movem.l (%[v2]), %%d1-%%d4 \n" | 80 | "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n" |
73 | ADDHALFXREGS(%%a0, %%d1, %%d0) | 81 | "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n" |
74 | "move.l %%d0, (%[v1])+ \n" | 82 | "move.l %%d6, (%[v1])+ \n" |
75 | ADDHALFXREGS(%%a1, %%d2, %%d1) | 83 | ADDHALFXREGS(%%a1, %%d1, %%d7) |
76 | "move.l %%d1, (%[v1])+ \n" | 84 | "move.l %%d7, (%[v1])+ \n" |
77 | ADDHALFXREGS(%%a2, %%d3, %%d2) | 85 | ".endr \n" |
78 | "move.l %%d2, (%[v1])+ \n" | 86 | |
79 | ADDHALFXREGS(%%a3, %%d4, %%d3) | ||
80 | "move.l %%d3, (%[v1])+ \n" | ||
81 | #if ORDER > 16 | 87 | #if ORDER > 16 |
82 | "lea.l (16, %[v2]), %[v2] \n" | 88 | "subq.l #1, %[res] \n" |
83 | "move.l %%d4, %%d0 \n" | 89 | "bne.w 1b \n" |
90 | #endif | ||
91 | "jra 99f \n" | ||
84 | 92 | ||
85 | "subq.l #1, %[cnt] \n" | 93 | "20: \n" |
86 | "jne 1b \n" | 94 | "move.l (%[f2])+, %%d0 \n" |
95 | "1: \n" | ||
96 | "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" | ||
97 | "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" | ||
98 | "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" | ||
99 | ADDHALFREGS(%%d6, %%d1, %%d2) | ||
100 | "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" | ||
101 | "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" | ||
102 | "move.l %%d2, (%[v1])+ \n" | ||
103 | ADDHALFREGS(%%d7, %%d1, %%d2) | ||
104 | "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" | ||
105 | "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" | ||
106 | "move.l %%d2, (%[v1])+ \n" | ||
107 | ADDHALFREGS(%%a0, %%d1, %%d2) | ||
108 | "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" | ||
109 | "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" | ||
110 | "move.l %%d2, (%[v1])+ \n" | ||
111 | ADDHALFREGS(%%a1, %%d1, %%d2) | ||
112 | "move.l %%d2, (%[v1])+ \n" | ||
113 | |||
114 | "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" | ||
115 | "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" | ||
116 | "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" | ||
117 | ADDHALFREGS(%%d6, %%d1, %%d2) | ||
118 | "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" | ||
119 | "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" | ||
120 | "move.l %%d2, (%[v1])+ \n" | ||
121 | ADDHALFREGS(%%d7, %%d1, %%d2) | ||
122 | "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" | ||
123 | "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" | ||
124 | "move.l %%d2, (%[v1])+ \n" | ||
125 | ADDHALFREGS(%%a0, %%d1, %%d2) | ||
126 | "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" | ||
127 | #if ORDER > 16 | ||
128 | "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" | ||
129 | #else | ||
130 | "mac.w %%d0l, %%a1l, %%acc0 \n" | ||
87 | #endif | 131 | #endif |
88 | "jra 99f \n" | 132 | "move.l %%d2, (%[v1])+ \n" |
89 | 133 | ADDHALFREGS(%%a1, %%d1, %%d2) | |
90 | "20: \n" | 134 | "move.l %%d2, (%[v1])+ \n" |
91 | "1: \n" | ||
92 | "movem.l (%[v2]), %%a0-%%a3 \n" | ||
93 | "movem.l (%[v1]), %%d0-%%d3 \n" | ||
94 | ADDHALFREGS(%%a0, %%d0) | ||
95 | "move.l %%d0, (%[v1])+ \n" | ||
96 | ADDHALFREGS(%%a1, %%d1) | ||
97 | "move.l %%d1, (%[v1])+ \n" | ||
98 | ADDHALFREGS(%%a2, %%d2) | ||
99 | "move.l %%d2, (%[v1])+ \n" | ||
100 | ADDHALFREGS(%%a3, %%d3) | ||
101 | "move.l %%d3, (%[v1])+ \n" | ||
102 | "lea.l (16, %[v2]), %[v2] \n" | ||
103 | |||
104 | "movem.l (%[v2]), %%a0-%%a3 \n" | ||
105 | "movem.l (%[v1]), %%d0-%%d3 \n" | ||
106 | ADDHALFREGS(%%a0, %%d0) | ||
107 | "move.l %%d0, (%[v1])+ \n" | ||
108 | ADDHALFREGS(%%a1, %%d1) | ||
109 | "move.l %%d1, (%[v1])+ \n" | ||
110 | ADDHALFREGS(%%a2, %%d2) | ||
111 | "move.l %%d2, (%[v1])+ \n" | ||
112 | ADDHALFREGS(%%a3, %%d3) | ||
113 | "move.l %%d3, (%[v1])+ \n" | ||
114 | #if ORDER > 16 | 135 | #if ORDER > 16 |
115 | "lea.l (16, %[v2]), %[v2] \n" | 136 | "subq.l #1, %[res] \n" |
116 | 137 | "bne.w 1b \n" | |
117 | "subq.l #1, %[cnt] \n" | ||
118 | "jne 1b \n" | ||
119 | #endif | 138 | #endif |
120 | "99: \n" | 139 | |
140 | "99: \n" | ||
141 | "movclr.l %%acc0, %[res] \n" | ||
121 | : /* outputs */ | 142 | : /* outputs */ |
143 | [v1]"+a"(v1), | ||
144 | [f2]"+a"(f2), | ||
145 | [s2]"+a"(s2), | ||
146 | [res]"=d"(res) | ||
147 | : /* inputs */ | ||
122 | #if ORDER > 16 | 148 | #if ORDER > 16 |
123 | [cnt]"+d"(cnt), | 149 | [cnt]"[res]"(cnt) |
124 | #endif | 150 | #endif |
125 | [v1] "+a"(v1), | ||
126 | [v2] "+a"(v2) | ||
127 | : /* inputs */ | ||
128 | : /* clobbers */ | 151 | : /* clobbers */ |
129 | "d0", "d1", "d2", "d3", "d4", | 152 | "d0", "d1", "d2", "d6", "d7", |
130 | "a0", "a1", "a2", "a3", "memory" | 153 | "a0", "a1", "memory" |
154 | |||
131 | ); | 155 | ); |
156 | return res; | ||
132 | } | 157 | } |
133 | 158 | ||
134 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | 159 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) |
135 | * 32 bit aligned, otherwise performance will suffer. */ | 160 | * This version fetches data as 32 bit words, and *recommends* v1 to be |
136 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 161 | * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit |
162 | * aligned or both unaligned. Performance will suffer if either condition | ||
163 | * isn't met. It also needs EMAC in signed integer mode. */ | ||
164 | static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) | ||
137 | { | 165 | { |
166 | int res; | ||
138 | #if ORDER > 16 | 167 | #if ORDER > 16 |
139 | int cnt = ORDER>>4; | 168 | int cnt = ORDER>>4; |
140 | #endif | 169 | #endif |
@@ -155,107 +184,116 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) | |||
155 | "move.w " #min ", " #s1d "\n" | 184 | "move.w " #min ", " #s1d "\n" |
156 | 185 | ||
157 | asm volatile ( | 186 | asm volatile ( |
158 | "move.l %[v2], %%d0 \n" | 187 | "move.l %[f2], %%d0 \n" |
159 | "and.l #2, %%d0 \n" | 188 | "and.l #2, %%d0 \n" |
160 | "jeq 20f \n" | 189 | "jeq 20f \n" |
161 | 190 | ||
162 | "10: \n" | 191 | "10: \n" |
163 | "move.w (%[v2])+, %%d0 \n" | 192 | "move.w (%[f2])+, %%d0 \n" |
164 | "swap %%d0 \n" | 193 | "move.w (%[s2])+, %%d1 \n" |
165 | "1: \n" | 194 | "swap %%d1 \n" |
166 | "movem.l (%[v2]), %%d1-%%d4 \n" | 195 | "1: \n" |
167 | "movem.l (%[v1]), %%a0-%%a3 \n" | 196 | ".rept 2 \n" |
168 | SUBHALFXREGS(%%a0, %%d1, %%d0) | 197 | "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" |
169 | "move.l %%d0, (%[v1])+ \n" | 198 | "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" |
170 | SUBHALFXREGS(%%a1, %%d2, %%d1) | 199 | "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n" |
171 | "move.l %%d1, (%[v1])+ \n" | 200 | SUBHALFXREGS(%%d6, %%d2, %%d1) |
172 | SUBHALFXREGS(%%a2, %%d3, %%d2) | 201 | "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n" |
173 | "move.l %%d2, (%[v1])+ \n" | 202 | "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n" |
174 | SUBHALFXREGS(%%a3, %%d4, %%d3) | 203 | "move.l %%d1, (%[v1])+ \n" |
175 | "move.l %%d3, (%[v1])+ \n" | 204 | SUBHALFXREGS(%%d7, %%d6, %%d2) |
176 | "lea.l (16, %[v2]), %[v2] \n" | 205 | "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n" |
177 | "move.l %%d4, %%d0 \n" | 206 | "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n" |
178 | 207 | "move.l %%d2, (%[v1])+ \n" | |
179 | "movem.l (%[v2]), %%d1-%%d4 \n" | 208 | SUBHALFXREGS(%%a0, %%d7, %%d6) |
180 | "movem.l (%[v1]), %%a0-%%a3 \n" | 209 | "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n" |
181 | SUBHALFXREGS(%%a0, %%d1, %%d0) | 210 | "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n" |
182 | "move.l %%d0, (%[v1])+ \n" | 211 | "move.l %%d6, (%[v1])+ \n" |
183 | SUBHALFXREGS(%%a1, %%d2, %%d1) | 212 | SUBHALFXREGS(%%a1, %%d1, %%d7) |
184 | "move.l %%d1, (%[v1])+ \n" | 213 | "move.l %%d7, (%[v1])+ \n" |
185 | SUBHALFXREGS(%%a2, %%d3, %%d2) | 214 | ".endr \n" |
186 | "move.l %%d2, (%[v1])+ \n" | 215 | |
187 | SUBHALFXREGS(%%a3, %%d4, %%d3) | ||
188 | "move.l %%d3, (%[v1])+ \n" | ||
189 | #if ORDER > 16 | 216 | #if ORDER > 16 |
190 | "lea.l (16, %[v2]), %[v2] \n" | 217 | "subq.l #1, %[res] \n" |
191 | "move.l %%d4, %%d0 \n" | 218 | "bne.w 1b \n" |
192 | |||
193 | "subq.l #1, %[cnt] \n" | ||
194 | "bne.w 1b \n" | ||
195 | #endif | 219 | #endif |
196 | "jra 99f \n" | ||
197 | |||
198 | "20: \n" | ||
199 | "1: \n" | ||
200 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
201 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
202 | SUBHALFREGS(%%a0, %%d1, %%d0) | ||
203 | "move.l %%d0, (%[v1])+ \n" | ||
204 | SUBHALFREGS(%%a1, %%d2, %%d1) | ||
205 | "move.l %%d1, (%[v1])+ \n" | ||
206 | SUBHALFREGS(%%a2, %%d3, %%d2) | ||
207 | "move.l %%d2, (%[v1])+ \n" | ||
208 | SUBHALFREGS(%%a3, %%d4, %%d3) | ||
209 | "move.l %%d3, (%[v1])+ \n" | ||
210 | "lea.l (16, %[v2]), %[v2] \n" | ||
211 | |||
212 | "movem.l (%[v2]), %%d1-%%d4 \n" | ||
213 | "movem.l (%[v1]), %%a0-%%a3 \n" | ||
214 | SUBHALFREGS(%%a0, %%d1, %%d0) | ||
215 | "move.l %%d0, (%[v1])+ \n" | ||
216 | SUBHALFREGS(%%a1, %%d2, %%d1) | ||
217 | "move.l %%d1, (%[v1])+ \n" | ||
218 | SUBHALFREGS(%%a2, %%d3, %%d2) | ||
219 | "move.l %%d2, (%[v1])+ \n" | ||
220 | SUBHALFREGS(%%a3, %%d4, %%d3) | ||
221 | "move.l %%d3, (%[v1])+ \n" | ||
222 | #if ORDER > 16 | ||
223 | "lea.l (16, %[v2]), %[v2] \n" | ||
224 | 220 | ||
225 | "subq.l #1, %[cnt] \n" | 221 | "jra 99f \n" |
226 | "bne.w 1b \n" | 222 | |
223 | "20: \n" | ||
224 | "move.l (%[f2])+, %%d0 \n" | ||
225 | "1: \n" | ||
226 | "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" | ||
227 | "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" | ||
228 | "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" | ||
229 | SUBHALFREGS(%%d6, %%d1, %%d2) | ||
230 | "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" | ||
231 | "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" | ||
232 | "move.l %%d2, (%[v1])+ \n" | ||
233 | SUBHALFREGS(%%d7, %%d1, %%d2) | ||
234 | "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" | ||
235 | "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" | ||
236 | "move.l %%d2, (%[v1])+ \n" | ||
237 | SUBHALFREGS(%%a0, %%d1, %%d2) | ||
238 | "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" | ||
239 | "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" | ||
240 | "move.l %%d2, (%[v1])+ \n" | ||
241 | SUBHALFREGS(%%a1, %%d1, %%d2) | ||
242 | "move.l %%d2, (%[v1])+ \n" | ||
243 | |||
244 | "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" | ||
245 | "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" | ||
246 | "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" | ||
247 | SUBHALFREGS(%%d6, %%d1, %%d2) | ||
248 | "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" | ||
249 | "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" | ||
250 | "move.l %%d2, (%[v1])+ \n" | ||
251 | SUBHALFREGS(%%d7, %%d1, %%d2) | ||
252 | "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" | ||
253 | "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" | ||
254 | "move.l %%d2, (%[v1])+ \n" | ||
255 | SUBHALFREGS(%%a0, %%d1, %%d2) | ||
256 | "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" | ||
257 | #if ORDER > 16 | ||
258 | "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" | ||
259 | #else | ||
260 | "mac.w %%d0l, %%a1l, %%acc0 \n" | ||
261 | #endif | ||
262 | "move.l %%d2, (%[v1])+ \n" | ||
263 | SUBHALFREGS(%%a1, %%d1, %%d2) | ||
264 | "move.l %%d2, (%[v1])+ \n" | ||
265 | #if ORDER > 16 | ||
266 | "subq.l #1, %[res] \n" | ||
267 | "bne.w 1b \n" | ||
227 | #endif | 268 | #endif |
228 | 269 | ||
229 | "99: \n" | 270 | "99: \n" |
271 | "movclr.l %%acc0, %[res] \n" | ||
230 | : /* outputs */ | 272 | : /* outputs */ |
273 | [v1]"+a"(v1), | ||
274 | [f2]"+a"(f2), | ||
275 | [s2]"+a"(s2), | ||
276 | [res]"=d"(res) | ||
277 | : /* inputs */ | ||
231 | #if ORDER > 16 | 278 | #if ORDER > 16 |
232 | [cnt]"+d"(cnt), | 279 | [cnt]"[res]"(cnt) |
233 | #endif | 280 | #endif |
234 | [v1] "+a"(v1), | ||
235 | [v2] "+a"(v2) | ||
236 | : /* inputs */ | ||
237 | : /* clobbers */ | 281 | : /* clobbers */ |
238 | "d0", "d1", "d2", "d3", "d4", | 282 | "d0", "d1", "d2", "d6", "d7", |
239 | "a0", "a1", "a2", "a3", "memory" | 283 | "a0", "a1", "memory" |
284 | |||
240 | ); | 285 | ); |
286 | return res; | ||
241 | } | 287 | } |
242 | 288 | ||
243 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ | ||
244 | |||
245 | /* This version fetches data as 32 bit words, and *recommends* v1 to be | 289 | /* This version fetches data as 32 bit words, and *recommends* v1 to be |
246 | * 32 bit aligned, otherwise performance will suffer. It also needs EMAC | 290 | * 32 bit aligned, otherwise performance will suffer. It also needs EMAC |
247 | * in signed integer mode - call above macro before use. */ | 291 | * in signed integer mode. */ |
248 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | 292 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) |
249 | { | 293 | { |
250 | int res; | 294 | int res; |
251 | #if ORDER > 32 | ||
252 | int cnt = ORDER>>5; | ||
253 | #endif | ||
254 | |||
255 | #if ORDER > 16 | 295 | #if ORDER > 16 |
256 | #define MAC_BLOCKS "7" | 296 | int cnt = ORDER>>4; |
257 | #else | ||
258 | #define MAC_BLOCKS "3" | ||
259 | #endif | 297 | #endif |
260 | 298 | ||
261 | asm volatile ( | 299 | asm volatile ( |
@@ -267,20 +305,16 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
267 | "move.l (%[v1])+, %%d0 \n" | 305 | "move.l (%[v1])+, %%d0 \n" |
268 | "move.w (%[v2])+, %%d1 \n" | 306 | "move.w (%[v2])+, %%d1 \n" |
269 | "1: \n" | 307 | "1: \n" |
270 | ".rept " MAC_BLOCKS "\n" | 308 | ".rept 7 \n" |
271 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
272 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | ||
273 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 309 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
274 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | 310 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
275 | ".endr \n" | 311 | ".endr \n" |
276 | 312 | ||
277 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 313 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
278 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | 314 | #if ORDER > 16 |
279 | "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" | ||
280 | #if ORDER > 32 | ||
281 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" | 315 | "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
282 | "subq.l #1, %[res] \n" | 316 | "subq.l #1, %[res] \n" |
283 | "bne.w 1b \n" | 317 | "bne.b 1b \n" |
284 | #else | 318 | #else |
285 | "mac.w %%d0l, %%d1u, %%acc0 \n" | 319 | "mac.w %%d0l, %%d1u, %%acc0 \n" |
286 | #endif | 320 | #endif |
@@ -290,7 +324,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
290 | "move.l (%[v1])+, %%d0 \n" | 324 | "move.l (%[v1])+, %%d0 \n" |
291 | "move.l (%[v2])+, %%d1 \n" | 325 | "move.l (%[v2])+, %%d1 \n" |
292 | "1: \n" | 326 | "1: \n" |
293 | ".rept " MAC_BLOCKS "\n" | 327 | ".rept 3 \n" |
294 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | 328 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
295 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 329 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
296 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" | 330 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
@@ -299,11 +333,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
299 | 333 | ||
300 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | 334 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" |
301 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 335 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
302 | #if ORDER > 32 | 336 | #if ORDER > 16 |
303 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" | 337 | "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" |
304 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" | 338 | "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" |
305 | "subq.l #1, %[res] \n" | 339 | "subq.l #1, %[res] \n" |
306 | "bne.w 1b \n" | 340 | "bne.b 1b \n" |
307 | #else | 341 | #else |
308 | "mac.w %%d2u, %%d1u, %%acc0 \n" | 342 | "mac.w %%d2u, %%d1u, %%acc0 \n" |
309 | "mac.w %%d2l, %%d1l, %%acc0 \n" | 343 | "mac.w %%d2l, %%d1l, %%acc0 \n" |
@@ -316,7 +350,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | |||
316 | [v2]"+a"(v2), | 350 | [v2]"+a"(v2), |
317 | [res]"=d"(res) | 351 | [res]"=d"(res) |
318 | : /* inputs */ | 352 | : /* inputs */ |
319 | #if ORDER > 32 | 353 | #if ORDER > 16 |
320 | [cnt]"[res]"(cnt) | 354 | [cnt]"[res]"(cnt) |
321 | #endif | 355 | #endif |
322 | : /* clobbers */ | 356 | : /* clobbers */ |
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index 89b24f2b06..207fca3038 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h | |||
@@ -24,78 +24,134 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | static inline void vector_add(int32_t* v1, int32_t* v2) | 27 | #define FUSED_VECTOR_MATH |
28 | |||
29 | #if ORDER > 32 | ||
30 | #define BLOCK_REPEAT "8" | ||
31 | #elif ORDER > 16 | ||
32 | #define BLOCK_REPEAT "7" | ||
33 | #else | ||
34 | #define BLOCK_REPEAT "3" | ||
35 | #endif | ||
36 | |||
37 | /* Calculate scalarproduct, then add a 2nd vector (fused for performance) */ | ||
38 | static inline int32_t vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2) | ||
28 | { | 39 | { |
40 | int res; | ||
29 | #if ORDER > 32 | 41 | #if ORDER > 32 |
30 | int cnt = ORDER>>5; | 42 | int cnt = ORDER>>5; |
31 | #endif | 43 | #endif |
32 | 44 | ||
33 | #if ORDER > 16 | 45 | asm volatile ( |
34 | #define ADD_SUB_BLOCKS "8" | 46 | #if ORDER > 32 |
47 | "mov %[res], #0 \n" | ||
48 | "1: \n" | ||
35 | #else | 49 | #else |
36 | #define ADD_SUB_BLOCKS "4" | 50 | "ldmia %[v1], {r0-r3} \n" |
51 | "ldmia %[f2]!, {r4-r7} \n" | ||
52 | "mul %[res], r4, r0 \n" | ||
53 | "mla %[res], r5, r1, %[res] \n" | ||
54 | "mla %[res], r6, r2, %[res] \n" | ||
55 | "mla %[res], r7, r3, %[res] \n" | ||
56 | "ldmia %[s2]!, {r4-r7} \n" | ||
57 | "add r0, r0, r4 \n" | ||
58 | "add r1, r1, r5 \n" | ||
59 | "add r2, r2, r6 \n" | ||
60 | "add r3, r3, r7 \n" | ||
61 | "stmia %[v1]!, {r0-r3} \n" | ||
37 | #endif | 62 | #endif |
38 | 63 | ".rept " BLOCK_REPEAT "\n" | |
39 | asm volatile ( | 64 | "ldmia %[v1], {r0-r3} \n" |
40 | "1: \n" | 65 | "ldmia %[f2]!, {r4-r7} \n" |
41 | ".rept " ADD_SUB_BLOCKS "\n" | 66 | "mla %[res], r4, r0, %[res] \n" |
42 | "ldmia %[v1], {r0-r3} \n" | 67 | "mla %[res], r5, r1, %[res] \n" |
43 | "ldmia %[v2]!, {r4-r7} \n" | 68 | "mla %[res], r6, r2, %[res] \n" |
44 | "add r0, r0, r4 \n" | 69 | "mla %[res], r7, r3, %[res] \n" |
45 | "add r1, r1, r5 \n" | 70 | "ldmia %[s2]!, {r4-r7} \n" |
46 | "add r2, r2, r6 \n" | 71 | "add r0, r0, r4 \n" |
47 | "add r3, r3, r7 \n" | 72 | "add r1, r1, r5 \n" |
48 | "stmia %[v1]!, {r0-r3} \n" | 73 | "add r2, r2, r6 \n" |
49 | ".endr \n" | 74 | "add r3, r3, r7 \n" |
75 | "stmia %[v1]!, {r0-r3} \n" | ||
76 | ".endr \n" | ||
50 | #if ORDER > 32 | 77 | #if ORDER > 32 |
51 | "subs %[cnt], %[cnt], #1 \n" | 78 | "subs %[cnt], %[cnt], #1 \n" |
52 | "bne 1b \n" | 79 | "bne 1b \n" |
53 | #endif | 80 | #endif |
54 | : /* outputs */ | 81 | : /* outputs */ |
55 | #if ORDER > 32 | 82 | #if ORDER > 32 |
56 | [cnt]"+r"(cnt), | 83 | [cnt]"+r"(cnt), |
57 | #endif | 84 | #endif |
58 | [v1] "+r"(v1), | 85 | [v1] "+r"(v1), |
59 | [v2] "+r"(v2) | 86 | [f2] "+r"(f2), |
87 | [s2] "+r"(s2), | ||
88 | [res]"=r"(res) | ||
60 | : /* inputs */ | 89 | : /* inputs */ |
61 | : /* clobbers */ | 90 | : /* clobbers */ |
62 | "r0", "r1", "r2", "r3", "r4", | 91 | "r0", "r1", "r2", "r3", "r4", |
63 | "r5", "r6", "r7", "memory" | 92 | "r5", "r6", "r7", "memory" |
64 | ); | 93 | ); |
94 | return res; | ||
65 | } | 95 | } |
66 | 96 | ||
67 | static inline void vector_sub(int32_t* v1, int32_t* v2) | 97 | /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */ |
98 | static inline int32_t vector_sp_sub(int32_t* v1, int32_t* f2, int32_t* s2) | ||
68 | { | 99 | { |
100 | int res; | ||
69 | #if ORDER > 32 | 101 | #if ORDER > 32 |
70 | int cnt = ORDER>>5; | 102 | int cnt = ORDER>>5; |
71 | #endif | 103 | #endif |
72 | 104 | ||
73 | asm volatile ( | 105 | asm volatile ( |
74 | "1: \n" | ||
75 | ".rept " ADD_SUB_BLOCKS "\n" | ||
76 | "ldmia %[v1], {r0-r3} \n" | ||
77 | "ldmia %[v2]!, {r4-r7} \n" | ||
78 | "sub r0, r0, r4 \n" | ||
79 | "sub r1, r1, r5 \n" | ||
80 | "sub r2, r2, r6 \n" | ||
81 | "sub r3, r3, r7 \n" | ||
82 | "stmia %[v1]!, {r0-r3} \n" | ||
83 | ".endr \n" | ||
84 | #if ORDER > 32 | 106 | #if ORDER > 32 |
85 | "subs %[cnt], %[cnt], #1 \n" | 107 | "mov %[res], #0 \n" |
86 | "bne 1b \n" | 108 | "1: \n" |
109 | #else | ||
110 | "ldmia %[v1], {r0-r3} \n" | ||
111 | "ldmia %[f2]!, {r4-r7} \n" | ||
112 | "mul %[res], r4, r0 \n" | ||
113 | "mla %[res], r5, r1, %[res] \n" | ||
114 | "mla %[res], r6, r2, %[res] \n" | ||
115 | "mla %[res], r7, r3, %[res] \n" | ||
116 | "ldmia %[s2]!, {r4-r7} \n" | ||
117 | "sub r0, r0, r4 \n" | ||
118 | "sub r1, r1, r5 \n" | ||
119 | "sub r2, r2, r6 \n" | ||
120 | "sub r3, r3, r7 \n" | ||
121 | "stmia %[v1]!, {r0-r3} \n" | ||
122 | #endif | ||
123 | ".rept " BLOCK_REPEAT "\n" | ||
124 | "ldmia %[v1], {r0-r3} \n" | ||
125 | "ldmia %[f2]!, {r4-r7} \n" | ||
126 | "mla %[res], r4, r0, %[res] \n" | ||
127 | "mla %[res], r5, r1, %[res] \n" | ||
128 | "mla %[res], r6, r2, %[res] \n" | ||
129 | "mla %[res], r7, r3, %[res] \n" | ||
130 | "ldmia %[s2]!, {r4-r7} \n" | ||
131 | "sub r0, r0, r4 \n" | ||
132 | "sub r1, r1, r5 \n" | ||
133 | "sub r2, r2, r6 \n" | ||
134 | "sub r3, r3, r7 \n" | ||
135 | "stmia %[v1]!, {r0-r3} \n" | ||
136 | ".endr \n" | ||
137 | #if ORDER > 32 | ||
138 | "subs %[cnt], %[cnt], #1 \n" | ||
139 | "bne 1b \n" | ||
87 | #endif | 140 | #endif |
88 | : /* outputs */ | 141 | : /* outputs */ |
89 | #if ORDER > 32 | 142 | #if ORDER > 32 |
90 | [cnt]"+r"(cnt), | 143 | [cnt]"+r"(cnt), |
91 | #endif | 144 | #endif |
92 | [v1] "+r"(v1), | 145 | [v1] "+r"(v1), |
93 | [v2] "+r"(v2) | 146 | [f2] "+r"(f2), |
147 | [s2] "+r"(s2), | ||
148 | [res]"=r"(res) | ||
94 | : /* inputs */ | 149 | : /* inputs */ |
95 | : /* clobbers */ | 150 | : /* clobbers */ |
96 | "r0", "r1", "r2", "r3", "r4", | 151 | "r0", "r1", "r2", "r3", "r4", |
97 | "r5", "r6", "r7", "memory" | 152 | "r5", "r6", "r7", "memory" |
98 | ); | 153 | ); |
154 | return res; | ||
99 | } | 155 | } |
100 | 156 | ||
101 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | 157 | static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) |
@@ -106,78 +162,18 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
106 | #endif | 162 | #endif |
107 | 163 | ||
108 | asm volatile ( | 164 | asm volatile ( |
109 | #if ORDER > 16 | ||
110 | #if ORDER > 32 | 165 | #if ORDER > 32 |
111 | "mov %[res], #0 \n" | 166 | "mov %[res], #0 \n" |
112 | #endif | ||
113 | "ldmia %[v2]!, {r6-r7} \n" | ||
114 | "1: \n" | 167 | "1: \n" |
115 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" | ||
116 | #if ORDER > 32 | ||
117 | "mla %[res], r6, r0, %[res] \n" | ||
118 | #else | 168 | #else |
119 | "mul %[res], r6, r0 \n" | ||
120 | #endif | ||
121 | "mla %[res], r7, r1, %[res] \n" | ||
122 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" | ||
123 | "mla %[res], r0, r3, %[res] \n" | ||
124 | "mla %[res], r1, r4, %[res] \n" | ||
125 | "mla %[res], r2, r5, %[res] \n" | ||
126 | "ldmia %[v1]!, {r0-r4} \n" | ||
127 | "mla %[res], r6, r0, %[res] \n" | ||
128 | "mla %[res], r7, r1, %[res] \n" | ||
129 | "mla %[res], r8, r2, %[res] \n" | ||
130 | "ldmia %[v2]!, {r0,r1,r6-r8} \n" | ||
131 | "mla %[res], r0, r3, %[res] \n" | ||
132 | "mla %[res], r1, r4, %[res] \n" | ||
133 | "ldmia %[v1]!, {r0-r5} \n" | ||
134 | "mla %[res], r6, r0, %[res] \n" | ||
135 | "mla %[res], r7, r1, %[res] \n" | ||
136 | "mla %[res], r8, r2, %[res] \n" | ||
137 | "ldmia %[v2]!, {r0-r2,r6,r7} \n" | ||
138 | "mla %[res], r0, r3, %[res] \n" | ||
139 | "mla %[res], r1, r4, %[res] \n" | ||
140 | "mla %[res], r2, r5, %[res] \n" | ||
141 | "ldmia %[v1]!, {r0,r1,r3-r5} \n" | ||
142 | "mla %[res], r6, r0, %[res] \n" | ||
143 | "mla %[res], r7, r1, %[res] \n" | ||
144 | "ldmia %[v2]!, {r0-r2,r6-r8} \n" | ||
145 | "mla %[res], r0, r3, %[res] \n" | ||
146 | "mla %[res], r1, r4, %[res] \n" | ||
147 | "mla %[res], r2, r5, %[res] \n" | ||
148 | "ldmia %[v1]!, {r0-r4} \n" | ||
149 | "mla %[res], r6, r0, %[res] \n" | ||
150 | "mla %[res], r7, r1, %[res] \n" | ||
151 | "mla %[res], r8, r2, %[res] \n" | ||
152 | "ldmia %[v2]!, {r0,r1,r6-r8} \n" | ||
153 | "mla %[res], r0, r3, %[res] \n" | ||
154 | "mla %[res], r1, r4, %[res] \n" | ||
155 | "ldmia %[v1]!, {r0-r5} \n" | ||
156 | "mla %[res], r6, r0, %[res] \n" | ||
157 | "mla %[res], r7, r1, %[res] \n" | ||
158 | "mla %[res], r8, r2, %[res] \n" | ||
159 | #if ORDER > 32 | ||
160 | "ldmia %[v2]!, {r0-r2,r6,r7} \n" | ||
161 | #else | ||
162 | "ldmia %[v2]!, {r0-r2} \n" | ||
163 | #endif | ||
164 | "mla %[res], r0, r3, %[res] \n" | ||
165 | "mla %[res], r1, r4, %[res] \n" | ||
166 | "mla %[res], r2, r5, %[res] \n" | ||
167 | #if ORDER > 32 | ||
168 | "subs %[cnt], %[cnt], #1 \n" | ||
169 | "bne 1b \n" | ||
170 | #endif | ||
171 | |||
172 | #else /* ORDER <= 16 */ | ||
173 | "ldmia %[v1]!, {r0-r3} \n" | 169 | "ldmia %[v1]!, {r0-r3} \n" |
174 | "ldmia %[v2]!, {r4-r7} \n" | 170 | "ldmia %[v2]!, {r4-r7} \n" |
175 | "mul %[res], r4, r0 \n" | 171 | "mul %[res], r4, r0 \n" |
176 | "mla %[res], r5, r1, %[res] \n" | 172 | "mla %[res], r5, r1, %[res] \n" |
177 | "mla %[res], r6, r2, %[res] \n" | 173 | "mla %[res], r6, r2, %[res] \n" |
178 | "mla %[res], r7, r3, %[res] \n" | 174 | "mla %[res], r7, r3, %[res] \n" |
179 | 175 | #endif | |
180 | ".rept 3 \n" | 176 | ".rept " BLOCK_REPEAT "\n" |
181 | "ldmia %[v1]!, {r0-r3} \n" | 177 | "ldmia %[v1]!, {r0-r3} \n" |
182 | "ldmia %[v2]!, {r4-r7} \n" | 178 | "ldmia %[v2]!, {r4-r7} \n" |
183 | "mla %[res], r4, r0, %[res] \n" | 179 | "mla %[res], r4, r0, %[res] \n" |
@@ -185,7 +181,10 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
185 | "mla %[res], r6, r2, %[res] \n" | 181 | "mla %[res], r6, r2, %[res] \n" |
186 | "mla %[res], r7, r3, %[res] \n" | 182 | "mla %[res], r7, r3, %[res] \n" |
187 | ".endr \n" | 183 | ".endr \n" |
188 | #endif /* ORDER <= 16 */ | 184 | #if ORDER > 32 |
185 | "subs %[cnt], %[cnt], #1 \n" | ||
186 | "bne 1b \n" | ||
187 | #endif | ||
189 | : /* outputs */ | 188 | : /* outputs */ |
190 | #if ORDER > 32 | 189 | #if ORDER > 32 |
191 | [cnt]"+r"(cnt), | 190 | [cnt]"+r"(cnt), |
@@ -197,9 +196,6 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) | |||
197 | : /* clobbers */ | 196 | : /* clobbers */ |
198 | "r0", "r1", "r2", "r3", | 197 | "r0", "r1", "r2", "r3", |
199 | "r4", "r5", "r6", "r7" | 198 | "r4", "r5", "r6", "r7" |
200 | #if ORDER > 16 | ||
201 | ,"r8" | ||
202 | #endif | ||
203 | ); | 199 | ); |
204 | return res; | 200 | return res; |
205 | } | 201 | } |
diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S index 8efc92c2e6..117b1789b1 100644 --- a/apps/codecs/lib/udiv32_arm.S +++ b/apps/codecs/lib/udiv32_arm.S | |||
@@ -92,7 +92,7 @@ | |||
92 | #if CONFIG_CPU == PP5020 | 92 | #if CONFIG_CPU == PP5020 |
93 | .set recip_max, 8384 | 93 | .set recip_max, 8384 |
94 | #elif CONFIG_CPU == PP5002 | 94 | #elif CONFIG_CPU == PP5002 |
95 | .set recip_max, 4992 | 95 | .set recip_max, 4608 |
96 | #else | 96 | #else |
97 | .set recip_max, 16384 | 97 | .set recip_max, 16384 |
98 | #endif | 98 | #endif |