author     Jens Arnold <amiconn@rockbox.org>  2010-02-02 22:50:21 +0000
committer  Jens Arnold <amiconn@rockbox.org>  2010-02-02 22:50:21 +0000
commit     9f6586698a411186750d2b1ff3c2f3d7a8e117a8 (patch)
tree       d9f73e9c50f90af02d0a0f875c6711070ca8c456 /apps/codecs/demac
parent     24de239d3d0f48f9f67721c716e6880cbbf95bae (diff)
download   rockbox-9f6586698a411186750d2b1ff3c2f3d7a8e117a8.tar.gz
           rockbox-9f6586698a411186750d2b1ff3c2f3d7a8e117a8.zip
APE codec: Speed up decoding of -c2000 and higher on ARMv4 and coldfire by fusing vector math for the filters. Speedup is roughly 3.5% for -c2000, 8% for -c3000 and 12% for -c4000. To be extended to other architectures.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24473 a1c6a512-1295-4272-9138-f99709370657
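The fusion combines what used to be two passes over the filter state (a scalar product for the prediction, then a sign-dependent vector add or subtract for the adaptation) into a single pass, so the vectors are streamed through memory only once per sample. A generic C sketch of the contract the new vector_sp_add() primitive has to satisfy (names and the int16_t/ORDER conventions follow libdemac; the loop is an illustration of the semantics, not the hand-tuned asm below):

/* Returns scalarproduct(v1, f2) computed with the pre-update contents
 * of v1, while adding s2 to v1 element-wise in the same pass. */
static inline int32_t vector_sp_add_c(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* prediction term, old v1[i] */
        v1[i] += s2[i];                /* fused adaptation step */
    }
    return res;
}

vector_sp_sub() is identical except that it subtracts s2 from v1.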
Diffstat (limited to 'apps/codecs/demac')
-rw-r--r--  apps/codecs/demac/libdemac/filter.c                28
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h     388
-rw-r--r--  apps/codecs/demac/libdemac/vector_math32_armv4.h  194
3 files changed, 334 insertions(+), 276 deletions(-)
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 93edf39cb2..ed6f3c8dc6 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -134,6 +134,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
 
     while(LIKELY(count--))
     {
+#ifdef FUSED_VECTOR_MATH
+        if (LIKELY(*data != 0)) {
+            if (*data < 0)
+                res = vector_sp_add(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+            else
+                res = vector_sp_sub(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+        } else {
+            res = scalarproduct(f->coeffs, f->delay - ORDER);
+        }
+        res = FP_TO_INT(res);
+#else
         res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
 
         if (LIKELY(*data != 0)) {
@@ -142,6 +155,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
             else
                 vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
         }
+#endif
 
         res += *data;
 
@@ -193,6 +207,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
 
     while(LIKELY(count--))
     {
+#ifdef FUSED_VECTOR_MATH
+        if (LIKELY(*data != 0)) {
+            if (*data < 0)
+                res = vector_sp_add(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+            else
+                res = vector_sp_sub(f->coeffs, f->delay - ORDER,
+                                    f->adaptcoeffs - ORDER);
+        } else {
+            res = scalarproduct(f->coeffs, f->delay - ORDER);
+        }
+        res = FP_TO_INT(res);
+#else
         res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
 
         if (LIKELY(*data != 0)) {
@@ -201,6 +228,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
             else
                 vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
         }
+#endif
 
         /* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an
            integer (rounding to nearest) and add the input value to
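filter.c only takes the new path when the architecture's vector-math header defines FUSED_VECTOR_MATH and supplies vector_sp_add()/vector_sp_sub(); every other target still compiles the old scalarproduct() plus vector_add()/vector_sub() sequence from the #else branch, which is why the fused and unfused paths must stay bit-exact. A matching generic sketch of the subtract flavour, under the same assumptions as the sketch above (illustration only, not this commit's asm):

static inline int32_t vector_sp_sub_c(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* same product as the add flavour */
        v1[i] -= s2[i];                /* adaptation subtracts instead */
    }
    return res;
}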
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 11e7f07adf..6e8216c9cc 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -24,19 +24,27 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 */
 
-/* This version fetches data as 32 bit words, and *recommends* v1 to be
- * 32 bit aligned, otherwise performance will suffer. */
-static inline void vector_add(int16_t* v1, int16_t* v2)
+#define FUSED_VECTOR_MATH
+
+#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
+
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. Performance will suffer if either condition
+ * isn't met. It also needs EMAC in signed integer mode. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
 {
+    int res;
 #if ORDER > 16
     int cnt = ORDER>>4;
 #endif
 
-#define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \
-    "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \
-    "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \
-    "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \
-    "add.l %%d4 , " #sum "\n" \
+#define ADDHALFREGS(s1, s2, sum) /* Add register halves straight. */ \
+    "move.l " #s1 ", " #sum "\n" /* 's1' and 's2' can be A or D */ \
+    "add.l " #s2 ", " #s1 "\n" /* regs, 'sum' must be a D reg. */ \
+    "clr.w " #sum " \n" /* 's1' is clobbered! */ \
+    "add.l " #s2 ", " #sum "\n" \
     "move.w " #s1 ", " #sum "\n"
 
 #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \
@@ -47,94 +55,115 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
     "move.w " #s1 ", " #sum "\n"
 
     asm volatile (
-        "move.l %[v2], %%d0 \n"
+        "move.l %[f2], %%d0 \n"
         "and.l #2, %%d0 \n"
         "jeq 20f \n"
 
     "10: \n"
-        "move.w (%[v2])+, %%d0 \n"
-        "swap %%d0 \n"
-    "1: \n"
-        "movem.l (%[v1]), %%a0-%%a3 \n"
-        "movem.l (%[v2]), %%d1-%%d4 \n"
-        ADDHALFXREGS(%%a0, %%d1, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        ADDHALFXREGS(%%a1, %%d2, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        ADDHALFXREGS(%%a2, %%d3, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        ADDHALFXREGS(%%a3, %%d4, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
-        "lea.l (16, %[v2]), %[v2] \n"
-        "move.l %%d4, %%d0 \n"
-
-        "movem.l (%[v1]), %%a0-%%a3 \n"
-        "movem.l (%[v2]), %%d1-%%d4 \n"
-        ADDHALFXREGS(%%a0, %%d1, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        ADDHALFXREGS(%%a1, %%d2, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        ADDHALFXREGS(%%a2, %%d3, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        ADDHALFXREGS(%%a3, %%d4, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
+        "move.w (%[f2])+, %%d0 \n"
+        "move.w (%[s2])+, %%d1 \n"
+        "swap %%d1 \n"
+    "1: \n"
+        ".rept 2 \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+        "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
+        ADDHALFXREGS(%%d6, %%d2, %%d1)
+        "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
+        "move.l %%d1, (%[v1])+ \n"
+        ADDHALFXREGS(%%d7, %%d6, %%d2)
+        "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFXREGS(%%a0, %%d7, %%d6)
+        "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
+        "move.l %%d6, (%[v1])+ \n"
+        ADDHALFXREGS(%%a1, %%d1, %%d7)
+        "move.l %%d7, (%[v1])+ \n"
+        ".endr \n"
+
 #if ORDER > 16
-        "lea.l (16, %[v2]), %[v2] \n"
-        "move.l %%d4, %%d0 \n"
+        "subq.l #1, %[res] \n"
+        "bne.w 1b \n"
+#endif
+        "jra 99f \n"
 
-        "subq.l #1, %[cnt] \n"
-        "jne 1b \n"
+    "20: \n"
+        "move.l (%[f2])+, %%d0 \n"
+    "1: \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+        ADDHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFREGS(%%a1, %%d1, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+        ADDHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+#if ORDER > 16
+        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+#else
+        "mac.w %%d0l, %%a1l, %%acc0 \n"
 #endif
-        "jra 99f \n"
-
-    "20: \n"
-    "1: \n"
-        "movem.l (%[v2]), %%a0-%%a3 \n"
-        "movem.l (%[v1]), %%d0-%%d3 \n"
-        ADDHALFREGS(%%a0, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        ADDHALFREGS(%%a1, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        ADDHALFREGS(%%a2, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        ADDHALFREGS(%%a3, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
-        "lea.l (16, %[v2]), %[v2] \n"
-
-        "movem.l (%[v2]), %%a0-%%a3 \n"
-        "movem.l (%[v1]), %%d0-%%d3 \n"
-        ADDHALFREGS(%%a0, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        ADDHALFREGS(%%a1, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        ADDHALFREGS(%%a2, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        ADDHALFREGS(%%a3, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
+        "move.l %%d2, (%[v1])+ \n"
+        ADDHALFREGS(%%a1, %%d1, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
 #if ORDER > 16
-        "lea.l (16, %[v2]), %[v2] \n"
-
-        "subq.l #1, %[cnt] \n"
-        "jne 1b \n"
+        "subq.l #1, %[res] \n"
+        "bne.w 1b \n"
 #endif
-    "99: \n"
+
+    "99: \n"
+        "movclr.l %%acc0, %[res] \n"
         : /* outputs */
+        [v1]"+a"(v1),
+        [f2]"+a"(f2),
+        [s2]"+a"(s2),
+        [res]"=d"(res)
+        : /* inputs */
 #if ORDER > 16
-        [cnt]"+d"(cnt),
+        [cnt]"[res]"(cnt)
 #endif
-        [v1] "+a"(v1),
-        [v2] "+a"(v2)
-        : /* inputs */
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4",
-        "a0", "a1", "a2", "a3", "memory"
+        "d0", "d1", "d2", "d6", "d7",
+        "a0", "a1", "memory"
+
     );
+    return res;
 }
 
-/* This version fetches data as 32 bit words, and *recommends* v1 to be
- * 32 bit aligned, otherwise performance will suffer. */
-static inline void vector_sub(int16_t* v1, int16_t* v2)
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. Performance will suffer if either condition
+ * isn't met. It also needs EMAC in signed integer mode. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
 {
+    int res;
 #if ORDER > 16
     int cnt = ORDER>>4;
 #endif
@@ -155,107 +184,116 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
     "move.w " #min ", " #s1d "\n"
 
     asm volatile (
-        "move.l %[v2], %%d0 \n"
+        "move.l %[f2], %%d0 \n"
         "and.l #2, %%d0 \n"
        "jeq 20f \n"
 
     "10: \n"
-        "move.w (%[v2])+, %%d0 \n"
-        "swap %%d0 \n"
-    "1: \n"
-        "movem.l (%[v2]), %%d1-%%d4 \n"
-        "movem.l (%[v1]), %%a0-%%a3 \n"
-        SUBHALFXREGS(%%a0, %%d1, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        SUBHALFXREGS(%%a1, %%d2, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        SUBHALFXREGS(%%a2, %%d3, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        SUBHALFXREGS(%%a3, %%d4, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
-        "lea.l (16, %[v2]), %[v2] \n"
-        "move.l %%d4, %%d0 \n"
-
-        "movem.l (%[v2]), %%d1-%%d4 \n"
-        "movem.l (%[v1]), %%a0-%%a3 \n"
-        SUBHALFXREGS(%%a0, %%d1, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        SUBHALFXREGS(%%a1, %%d2, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        SUBHALFXREGS(%%a2, %%d3, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        SUBHALFXREGS(%%a3, %%d4, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
+        "move.w (%[f2])+, %%d0 \n"
+        "move.w (%[s2])+, %%d1 \n"
+        "swap %%d1 \n"
+    "1: \n"
+        ".rept 2 \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+        "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
+        SUBHALFXREGS(%%d6, %%d2, %%d1)
+        "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
+        "move.l %%d1, (%[v1])+ \n"
+        SUBHALFXREGS(%%d7, %%d6, %%d2)
+        "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFXREGS(%%a0, %%d7, %%d6)
+        "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
+        "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
+        "move.l %%d6, (%[v1])+ \n"
+        SUBHALFXREGS(%%a1, %%d1, %%d7)
+        "move.l %%d7, (%[v1])+ \n"
+        ".endr \n"
+
 #if ORDER > 16
-        "lea.l (16, %[v2]), %[v2] \n"
-        "move.l %%d4, %%d0 \n"
-
-        "subq.l #1, %[cnt] \n"
-        "bne.w 1b \n"
+        "subq.l #1, %[res] \n"
+        "bne.w 1b \n"
 #endif
-        "jra 99f \n"
-
-    "20: \n"
-    "1: \n"
-        "movem.l (%[v2]), %%d1-%%d4 \n"
-        "movem.l (%[v1]), %%a0-%%a3 \n"
-        SUBHALFREGS(%%a0, %%d1, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        SUBHALFREGS(%%a1, %%d2, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        SUBHALFREGS(%%a2, %%d3, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        SUBHALFREGS(%%a3, %%d4, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
-        "lea.l (16, %[v2]), %[v2] \n"
-
-        "movem.l (%[v2]), %%d1-%%d4 \n"
-        "movem.l (%[v1]), %%a0-%%a3 \n"
-        SUBHALFREGS(%%a0, %%d1, %%d0)
-        "move.l %%d0, (%[v1])+ \n"
-        SUBHALFREGS(%%a1, %%d2, %%d1)
-        "move.l %%d1, (%[v1])+ \n"
-        SUBHALFREGS(%%a2, %%d3, %%d2)
-        "move.l %%d2, (%[v1])+ \n"
-        SUBHALFREGS(%%a3, %%d4, %%d3)
-        "move.l %%d3, (%[v1])+ \n"
-#if ORDER > 16
-        "lea.l (16, %[v2]), %[v2] \n"
 
-        "subq.l #1, %[cnt] \n"
-        "bne.w 1b \n"
+        "jra 99f \n"
+
+    "20: \n"
+        "move.l (%[f2])+, %%d0 \n"
+    "1: \n"
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+        SUBHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFREGS(%%a1, %%d1, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+
+        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+        "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+        SUBHALFREGS(%%d6, %%d1, %%d2)
+        "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFREGS(%%d7, %%d1, %%d2)
+        "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+        "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFREGS(%%a0, %%d1, %%d2)
+        "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+#if ORDER > 16
+        "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+#else
+        "mac.w %%d0l, %%a1l, %%acc0 \n"
+#endif
+        "move.l %%d2, (%[v1])+ \n"
+        SUBHALFREGS(%%a1, %%d1, %%d2)
+        "move.l %%d2, (%[v1])+ \n"
+#if ORDER > 16
+        "subq.l #1, %[res] \n"
+        "bne.w 1b \n"
 #endif
 
     "99: \n"
+        "movclr.l %%acc0, %[res] \n"
         : /* outputs */
+        [v1]"+a"(v1),
+        [f2]"+a"(f2),
+        [s2]"+a"(s2),
+        [res]"=d"(res)
+        : /* inputs */
 #if ORDER > 16
-        [cnt]"+d"(cnt),
+        [cnt]"[res]"(cnt)
 #endif
-        [v1] "+a"(v1),
-        [v2] "+a"(v2)
-        : /* inputs */
         : /* clobbers */
-        "d0", "d1", "d2", "d3", "d4",
-        "a0", "a1", "a2", "a3", "memory"
+        "d0", "d1", "d2", "d6", "d7",
+        "a0", "a1", "memory"
+
     );
+    return res;
 }
 
-#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
-
 /* This version fetches data as 32 bit words, and *recommends* v1 to be
  * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
- * in signed integer mode - call above macro before use. */
+ * in signed integer mode. */
 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 {
     int res;
-#if ORDER > 32
-    int cnt = ORDER>>5;
-#endif
-
 #if ORDER > 16
-#define MAC_BLOCKS "7"
-#else
-#define MAC_BLOCKS "3"
+    int cnt = ORDER>>4;
 #endif
 
     asm volatile (
@@ -267,20 +305,16 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "move.l (%[v1])+, %%d0 \n"
         "move.w (%[v2])+, %%d1 \n"
     "1: \n"
-        ".rept " MAC_BLOCKS "\n"
-        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+        ".rept 7 \n"
         "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
         ".endr \n"
 
         "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-        "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
-        "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-#if ORDER > 32
+#if ORDER > 16
         "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
         "subq.l #1, %[res] \n"
-        "bne.w 1b \n"
+        "bne.b 1b \n"
 #else
         "mac.w %%d0l, %%d1u, %%acc0 \n"
 #endif
@@ -290,7 +324,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         "move.l (%[v1])+, %%d0 \n"
         "move.l (%[v2])+, %%d1 \n"
     "1: \n"
-        ".rept " MAC_BLOCKS "\n"
+        ".rept 3 \n"
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
         "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
@@ -299,11 +333,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 
         "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
         "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-#if ORDER > 32
+#if ORDER > 16
         "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
         "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
         "subq.l #1, %[res] \n"
-        "bne.w 1b \n"
+        "bne.b 1b \n"
 #else
         "mac.w %%d2u, %%d1u, %%acc0 \n"
         "mac.w %%d2l, %%d1l, %%acc0 \n"
@@ -316,7 +350,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
         [v2]"+a"(v2),
         [res]"=d"(res)
         : /* inputs */
-#if ORDER > 32
+#if ORDER > 16
         [cnt]"[res]"(cnt)
 #endif
         : /* clobbers */
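All the EMAC-based routines in this header, fused or not, assume the ColdFire MAC unit is in signed integer mode, which is why PREPARE_SCALARPRODUCT now sits at the top of the file: callers must invoke it before the first scalar product. A hypothetical call-site shape; apply_filter_generic() and its body are stand-ins, the real caller is libdemac's filter code:

static void apply_filter_generic(struct filter_t* f, int32_t* data, int count)
{
    PREPARE_SCALARPRODUCT /* expands to coldfire_set_macsr(0); on this target */
    while (count--) {
        /* per-sample work using vector_sp_add()/vector_sp_sub() or
           scalarproduct(), as in the filter.c hunks above */
        data++;
    }
}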
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
index 89b24f2b06..207fca3038 100644
--- a/apps/codecs/demac/libdemac/vector_math32_armv4.h
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -24,78 +24,134 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 */
 
-static inline void vector_add(int32_t* v1, int32_t* v2)
+#define FUSED_VECTOR_MATH
+
+#if ORDER > 32
+#define BLOCK_REPEAT "8"
+#elif ORDER > 16
+#define BLOCK_REPEAT "7"
+#else
+#define BLOCK_REPEAT "3"
+#endif
+
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2)
 {
+    int res;
 #if ORDER > 32
     int cnt = ORDER>>5;
 #endif
 
-#if ORDER > 16
-#define ADD_SUB_BLOCKS "8"
+    asm volatile (
+#if ORDER > 32
+        "mov %[res], #0 \n"
+    "1: \n"
 #else
-#define ADD_SUB_BLOCKS "4"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[f2]!, {r4-r7} \n"
+        "mul %[res], r4, r0 \n"
+        "mla %[res], r5, r1, %[res] \n"
+        "mla %[res], r6, r2, %[res] \n"
+        "mla %[res], r7, r3, %[res] \n"
+        "ldmia %[s2]!, {r4-r7} \n"
+        "add r0, r0, r4 \n"
+        "add r1, r1, r5 \n"
+        "add r2, r2, r6 \n"
+        "add r3, r3, r7 \n"
+        "stmia %[v1]!, {r0-r3} \n"
 #endif
-
-    asm volatile (
-    "1: \n"
-    ".rept " ADD_SUB_BLOCKS "\n"
-        "ldmia %[v1], {r0-r3} \n"
-        "ldmia %[v2]!, {r4-r7} \n"
-        "add r0, r0, r4 \n"
-        "add r1, r1, r5 \n"
-        "add r2, r2, r6 \n"
-        "add r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-    ".endr \n"
+        ".rept " BLOCK_REPEAT "\n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[f2]!, {r4-r7} \n"
+        "mla %[res], r4, r0, %[res] \n"
+        "mla %[res], r5, r1, %[res] \n"
+        "mla %[res], r6, r2, %[res] \n"
+        "mla %[res], r7, r3, %[res] \n"
+        "ldmia %[s2]!, {r4-r7} \n"
+        "add r0, r0, r4 \n"
+        "add r1, r1, r5 \n"
+        "add r2, r2, r6 \n"
+        "add r3, r3, r7 \n"
+        "stmia %[v1]!, {r0-r3} \n"
+        ".endr \n"
 #if ORDER > 32
         "subs %[cnt], %[cnt], #1 \n"
         "bne 1b \n"
 #endif
         : /* outputs */
 #if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
-        [v2] "+r"(v2)
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
         : /* inputs */
         : /* clobbers */
         "r0", "r1", "r2", "r3", "r4",
         "r5", "r6", "r7", "memory"
     );
+    return res;
 }
 
-static inline void vector_sub(int32_t* v1, int32_t* v2)
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_sub(int32_t* v1, int32_t* f2, int32_t* s2)
 {
+    int res;
 #if ORDER > 32
     int cnt = ORDER>>5;
 #endif
 
     asm volatile (
-    "1: \n"
-    ".rept " ADD_SUB_BLOCKS "\n"
-        "ldmia %[v1], {r0-r3} \n"
-        "ldmia %[v2]!, {r4-r7} \n"
-        "sub r0, r0, r4 \n"
-        "sub r1, r1, r5 \n"
-        "sub r2, r2, r6 \n"
-        "sub r3, r3, r7 \n"
-        "stmia %[v1]!, {r0-r3} \n"
-    ".endr \n"
 #if ORDER > 32
-        "subs %[cnt], %[cnt], #1 \n"
-        "bne 1b \n"
+        "mov %[res], #0 \n"
+    "1: \n"
+#else
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[f2]!, {r4-r7} \n"
+        "mul %[res], r4, r0 \n"
+        "mla %[res], r5, r1, %[res] \n"
+        "mla %[res], r6, r2, %[res] \n"
+        "mla %[res], r7, r3, %[res] \n"
+        "ldmia %[s2]!, {r4-r7} \n"
+        "sub r0, r0, r4 \n"
+        "sub r1, r1, r5 \n"
+        "sub r2, r2, r6 \n"
+        "sub r3, r3, r7 \n"
+        "stmia %[v1]!, {r0-r3} \n"
+#endif
+        ".rept " BLOCK_REPEAT "\n"
+        "ldmia %[v1], {r0-r3} \n"
+        "ldmia %[f2]!, {r4-r7} \n"
+        "mla %[res], r4, r0, %[res] \n"
+        "mla %[res], r5, r1, %[res] \n"
+        "mla %[res], r6, r2, %[res] \n"
+        "mla %[res], r7, r3, %[res] \n"
+        "ldmia %[s2]!, {r4-r7} \n"
+        "sub r0, r0, r4 \n"
+        "sub r1, r1, r5 \n"
+        "sub r2, r2, r6 \n"
+        "sub r3, r3, r7 \n"
+        "stmia %[v1]!, {r0-r3} \n"
+        ".endr \n"
+#if ORDER > 32
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
 #endif
         : /* outputs */
 #if ORDER > 32
         [cnt]"+r"(cnt),
 #endif
         [v1] "+r"(v1),
-        [v2] "+r"(v2)
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
         : /* inputs */
         : /* clobbers */
         "r0", "r1", "r2", "r3", "r4",
         "r5", "r6", "r7", "memory"
     );
+    return res;
 }
 
 static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
@@ -106,78 +162,18 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
 #endif
 
     asm volatile (
-#if ORDER > 16
 #if ORDER > 32
         "mov %[res], #0 \n"
-#endif
-        "ldmia %[v2]!, {r6-r7} \n"
     "1: \n"
-        "ldmia %[v1]!, {r0,r1,r3-r5} \n"
-#if ORDER > 32
-        "mla %[res], r6, r0, %[res] \n"
 #else
-        "mul %[res], r6, r0 \n"
-#endif
-        "mla %[res], r7, r1, %[res] \n"
-        "ldmia %[v2]!, {r0-r2,r6-r8} \n"
-        "mla %[res], r0, r3, %[res] \n"
-        "mla %[res], r1, r4, %[res] \n"
-        "mla %[res], r2, r5, %[res] \n"
-        "ldmia %[v1]!, {r0-r4} \n"
-        "mla %[res], r6, r0, %[res] \n"
-        "mla %[res], r7, r1, %[res] \n"
-        "mla %[res], r8, r2, %[res] \n"
-        "ldmia %[v2]!, {r0,r1,r6-r8} \n"
-        "mla %[res], r0, r3, %[res] \n"
-        "mla %[res], r1, r4, %[res] \n"
-        "ldmia %[v1]!, {r0-r5} \n"
-        "mla %[res], r6, r0, %[res] \n"
-        "mla %[res], r7, r1, %[res] \n"
-        "mla %[res], r8, r2, %[res] \n"
-        "ldmia %[v2]!, {r0-r2,r6,r7} \n"
-        "mla %[res], r0, r3, %[res] \n"
-        "mla %[res], r1, r4, %[res] \n"
-        "mla %[res], r2, r5, %[res] \n"
-        "ldmia %[v1]!, {r0,r1,r3-r5} \n"
-        "mla %[res], r6, r0, %[res] \n"
-        "mla %[res], r7, r1, %[res] \n"
-        "ldmia %[v2]!, {r0-r2,r6-r8} \n"
-        "mla %[res], r0, r3, %[res] \n"
-        "mla %[res], r1, r4, %[res] \n"
-        "mla %[res], r2, r5, %[res] \n"
-        "ldmia %[v1]!, {r0-r4} \n"
-        "mla %[res], r6, r0, %[res] \n"
-        "mla %[res], r7, r1, %[res] \n"
-        "mla %[res], r8, r2, %[res] \n"
-        "ldmia %[v2]!, {r0,r1,r6-r8} \n"
-        "mla %[res], r0, r3, %[res] \n"
-        "mla %[res], r1, r4, %[res] \n"
-        "ldmia %[v1]!, {r0-r5} \n"
-        "mla %[res], r6, r0, %[res] \n"
-        "mla %[res], r7, r1, %[res] \n"
-        "mla %[res], r8, r2, %[res] \n"
-#if ORDER > 32
-        "ldmia %[v2]!, {r0-r2,r6,r7} \n"
-#else
-        "ldmia %[v2]!, {r0-r2} \n"
-#endif
-        "mla %[res], r0, r3, %[res] \n"
-        "mla %[res], r1, r4, %[res] \n"
-        "mla %[res], r2, r5, %[res] \n"
-#if ORDER > 32
-        "subs %[cnt], %[cnt], #1 \n"
-        "bne 1b \n"
-#endif
-
-#else /* ORDER <= 16 */
         "ldmia %[v1]!, {r0-r3} \n"
         "ldmia %[v2]!, {r4-r7} \n"
         "mul %[res], r4, r0 \n"
         "mla %[res], r5, r1, %[res] \n"
         "mla %[res], r6, r2, %[res] \n"
         "mla %[res], r7, r3, %[res] \n"
-
-        ".rept 3 \n"
+#endif
+        ".rept " BLOCK_REPEAT "\n"
         "ldmia %[v1]!, {r0-r3} \n"
         "ldmia %[v2]!, {r4-r7} \n"
         "mla %[res], r4, r0, %[res] \n"
@@ -185,7 +181,10 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
         "mla %[res], r6, r2, %[res] \n"
         "mla %[res], r7, r3, %[res] \n"
         ".endr \n"
-#endif /* ORDER <= 16 */
+#if ORDER > 32
+        "subs %[cnt], %[cnt], #1 \n"
+        "bne 1b \n"
+#endif
         : /* outputs */
 #if ORDER > 32
         [cnt]"+r"(cnt),
@@ -197,9 +196,6 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
         : /* clobbers */
         "r0", "r1", "r2", "r3",
         "r4", "r5", "r6", "r7"
-#if ORDER > 16
-        ,"r8"
-#endif
     );
     return res;
 }
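Since every port's fused routines must stay bit-exact with the unfused pair they replace, the contract is easy to check on a host using generic C stand-ins. A minimal self-contained sketch (ORDER fixed at 16 and int32_t vectors as in the ARMv4 header; none of this is part of the commit):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define ORDER 16

/* plain reference versions of the old, unfused primitives */
static int32_t ref_scalarproduct(int32_t* v1, int32_t* v2)
{
    int32_t sum = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        sum += v1[i] * v2[i];
    return sum;
}

static void ref_vector_add(int32_t* v1, int32_t* v2)
{
    int i;
    for (i = 0; i < ORDER; i++)
        v1[i] += v2[i];
}

/* generic stand-in for the fused asm routine under test */
static int32_t fused_vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2)
{
    int32_t sum = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        sum += v1[i] * f2[i]; /* product uses pre-update v1[i] */
        v1[i] += s2[i];
    }
    return sum;
}

int main(void)
{
    int32_t a[ORDER], b[ORDER], f2[ORDER], s2[ORDER];
    int32_t r1, r2;
    int i;

    for (i = 0; i < ORDER; i++) {
        a[i] = i * 37 - 11;
        f2[i] = 5 - i;
        s2[i] = i * i;
    }
    memcpy(b, a, sizeof(a));

    r1 = ref_scalarproduct(a, f2); /* old order: product first... */
    ref_vector_add(a, s2);         /* ...then adapt the coefficients */
    r2 = fused_vector_sp_add(b, f2, s2);

    assert(r1 == r2);                     /* same scalar product */
    assert(memcmp(a, b, sizeof(a)) == 0); /* same updated vector */
    return 0;
}

The same harness with int16_t vectors and the subtract flavour covers the ColdFire variants.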