diff options
-rwxr-xr-x | apps/codecs/libmusepack/synth_filter_arm.S | 204 |
1 files changed, 203 insertions, 1 deletions
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index 41bfda740b..83867086aa 100755 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S | |||
@@ -99,12 +99,15 @@ mpc_decoder_windowing_D: | |||
99 | .align 2 | 99 | .align 2 |
100 | .global mpc_decoder_windowing_D | 100 | .global mpc_decoder_windowing_D |
101 | .type mpc_decoder_windowing_D, %function | 101 | .type mpc_decoder_windowing_D, %function |
102 | #if 0 | ||
102 | mpc_decoder_windowing_D: | 103 | mpc_decoder_windowing_D: |
103 | /* r0 = Data[] */ | 104 | /* r0 = Data[] */ |
104 | /* r1 = V[] */ | 105 | /* r1 = V[] */ |
105 | /* r2 = D[] */ | 106 | /* r2 = D[] */ |
106 | /* lr = counter */ | 107 | /* lr = counter */ |
107 | 108 | /************************************************************************ | |
109 | * Reference implementation. | ||
110 | ***********************************************************************/ | ||
108 | stmfd sp!, {r4-r9, lr} | 111 | stmfd sp!, {r4-r9, lr} |
109 | 112 | ||
110 | mov lr, #32 | 113 | mov lr, #32 |
@@ -154,6 +157,205 @@ mpc_decoder_windowing_D: | |||
154 | bgt .loop32 | 157 | bgt .loop32 |
155 | 158 | ||
156 | ldmfd sp!, {r4-r9, pc} | 159 | ldmfd sp!, {r4-r9, pc} |
160 | #else | ||
161 | mpc_decoder_windowing_D: | ||
162 | /* r0 = Data[] (output, 32 words are stored: rows 00, 01..15, 31..17, 16) */ | ||
163 | /* r1 = V[] (only read) */ | ||
164 | /* r2 = D[] (window coefficients, only read) */ | ||
165 | /* lr = loop counter for the row pairs 01..15 / 31..17 */ | ||
166 | /************************************************************************ | ||
167 | * Further speed up through making use of symmetries within D[]-window. | ||
168 | * The row V[00] can be extracted as it has symmetries within this single | ||
169 | * row. 8 smull/smlal and 8 ldr's can be saved at the cost of 2 add's. | ||
170 | * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be | ||
171 | * saved at the cost of 15 x 4 + 1 add's. | ||
172 | * The row V[16] can be extracted as it has symmetries within this single | ||
173 | * row. 8 smull/smlal and 8 ldr's can be saved. | ||
174 | ***********************************************************************/ | ||
175 | stmfd sp!, {r4-r12, lr} /* save work registers and return address */ | ||
176 | |||
177 | /****************************************** | ||
178 | * row 0 with internal symmetry | ||
179 | * r3-r6 = D[], r7/r10 = V[], r8 = lo, r9 = hi | ||
180 | *****************************************/ | ||
180 | add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ | ||
181 | ldmia r2!, { r3-r6 } /* load D[01..04] */ | ||
182 | ldr r7 , [r1, #96*4] /* 1 */ | ||
183 | ldr r10, [r1, #992*4] /* 15 */ | ||
184 | rsb r10, r10, r7 /* V[01] - V[15] */ | ||
185 | smull r8, r9, r10, r3 | ||
186 | ldr r7 , [r1, #128*4] /* 2 */ | ||
187 | ldr r10, [r1, #896*4] /* 14 */ | ||
188 | add r10, r10, r7 /* V[02] + V[14] */ | ||
189 | smlal r8, r9, r10, r4 | ||
190 | ldr r7 , [r1, #224*4] /* 3 */ | ||
191 | ldr r10, [r1, #864*4] /* 13 */ | ||
192 | rsb r10, r10, r7 /* V[03] - V[13] */ | ||
193 | smlal r8, r9, r10, r5 | ||
194 | ldr r7 , [r1, #256*4] /* 4 */ | ||
195 | ldr r10, [r1, #768*4] /* 12 */ | ||
196 | add r10, r10, r7 /* V[04] + V[12] */ | ||
197 | smlal r8, r9, r10, r6 | ||
198 | ldmia r2!, { r3-r6 } /* load D[05..08] */ | ||
199 | ldr r7 , [r1, #352*4] /* 5 */ | ||
200 | ldr r10, [r1, #736*4] /* 11 */ | ||
201 | rsb r10, r10, r7 /* V[05] - V[11] */ | ||
202 | smlal r8, r9, r10, r3 | ||
203 | ldr r7 , [r1, #384*4] /* 6 */ | ||
204 | ldr r10, [r1, #640*4] /* 10 */ | ||
205 | add r10, r10, r7 /* V[06] + V[10] */ | ||
206 | smlal r8, r9, r10, r4 | ||
207 | ldr r7 , [r1, #480*4] /* 7 */ | ||
208 | ldr r10, [r1, #608*4] /* 9 */ | ||
209 | rsb r10, r10, r7 /* V[07] - V[09] */ | ||
210 | smlal r8, r9, r10, r5 | ||
211 | ldr r10, [r1, #512*4] /* 8 */ | ||
212 | smlal r8, r9, r10, r6 | ||
213 | mov r8, r8, lsr #16 | ||
214 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
215 | str r8, [r0], #4 /* store Data[00], r0 = Data[01] */ | ||
216 | add r1, r1, #4 /* V+=1, r1 = V[01] */ | ||
217 | add r2, r2, #7*4 /* D+=7, r2 = D[16] */ | ||
218 | |||
219 | /****************************************** | ||
220 | * rows 01..15 are symmetric to rows 31..17 | ||
221 | * r8 = lo, r9 = hi of 01..15 | ||
222 | * r1 = V[01..15] | ||
223 | * r10 = lo, r11 = hi of 31..17 | ||
224 | * r12 = V[31..17] | ||
225 | *****************************************/ | ||
226 | mov lr, #15 /* 15 row pairs */ | ||
227 | add r12, r1, #30*4 /* r12 = V[31] */ | ||
228 | .loop15: | ||
229 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
230 | ldr r7, [r12, #768*4] /* 12 */ | ||
231 | smull r10, r11, r7, r6 | ||
232 | ldr r7, [r12, #864*4] /* 13 */ | ||
233 | smlal r10, r11, r7, r5 | ||
234 | ldr r7, [r12, #896*4] /* 14 */ | ||
235 | smlal r10, r11, r7, r4 | ||
236 | ldr r7, [r12, #992*4] /* 15 */ | ||
237 | smlal r10, r11, r7, r3 | ||
238 | ldr r7, [r1] /* 0 */ | ||
239 | smull r8, r9, r7, r3 | ||
240 | ldr r7, [r1, #96*4] /* 1 */ | ||
241 | smlal r8, r9, r7, r4 | ||
242 | ldr r7, [r1, #128*4] /* 2 */ | ||
243 | smlal r8, r9, r7, r5 | ||
244 | ldr r7, [r1, #224*4] /* 3 */ | ||
245 | smlal r8, r9, r7, r6 | ||
246 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
247 | ldr r7, [r1, #256*4] /* 4 */ | ||
248 | smlal r8, r9, r7, r3 | ||
249 | ldr r7, [r1, #352*4] /* 5 */ | ||
250 | smlal r8, r9, r7, r4 | ||
251 | ldr r7, [r1, #384*4] /* 6 */ | ||
252 | smlal r8, r9, r7, r5 | ||
253 | ldr r7, [r1, #480*4] /* 7 */ | ||
254 | smlal r8, r9, r7, r6 | ||
255 | ldr r7, [r12, #512*4] /* 8 */ | ||
256 | smlal r10, r11, r7, r6 | ||
257 | ldr r7, [r12, #608*4] /* 9 */ | ||
258 | smlal r10, r11, r7, r5 | ||
259 | ldr r7, [r12, #640*4] /* 10 */ | ||
260 | smlal r10, r11, r7, r4 | ||
261 | ldr r7, [r12, #736*4] /* 11 */ | ||
262 | smlal r10, r11, r7, r3 | ||
263 | ldmia r2!, { r3-r6 } /* load D[08..11] */ | ||
264 | ldr r7, [r12, #256*4] /* 4 */ | ||
265 | smlal r10, r11, r7, r6 | ||
266 | ldr r7, [r12, #352*4] /* 5 */ | ||
267 | smlal r10, r11, r7, r5 | ||
268 | ldr r7, [r12, #384*4] /* 6 */ | ||
269 | smlal r10, r11, r7, r4 | ||
270 | ldr r7, [r12, #480*4] /* 7 */ | ||
271 | smlal r10, r11, r7, r3 | ||
272 | ldr r7, [r1, #512*4] /* 8 */ | ||
273 | smlal r8, r9, r7, r3 | ||
274 | ldr r7, [r1, #608*4] /* 9 */ | ||
275 | smlal r8, r9, r7, r4 | ||
276 | ldr r7, [r1, #640*4] /* 10 */ | ||
277 | smlal r8, r9, r7, r5 | ||
278 | ldr r7, [r1, #736*4] /* 11 */ | ||
279 | smlal r8, r9, r7, r6 | ||
280 | ldmia r2!, { r3-r6 } /* load D[12..15] */ | ||
281 | ldr r7, [r1, #768*4] /* 12 */ | ||
282 | smlal r8, r9, r7, r3 | ||
283 | ldr r7, [r1, #864*4] /* 13 */ | ||
284 | smlal r8, r9, r7, r4 | ||
285 | ldr r7, [r1, #896*4] /* 14 */ | ||
286 | smlal r8, r9, r7, r5 | ||
287 | ldr r7, [r1, #992*4] /* 15 */ | ||
288 | smlal r8, r9, r7, r6 | ||
289 | ldr r7, [r12] /* 0 */ | ||
290 | smlal r10, r11, r7, r6 | ||
291 | ldr r7, [r12, #96*4] /* 1 */ | ||
292 | smlal r10, r11, r7, r5 | ||
293 | ldr r7, [r12, #128*4] /* 2 */ | ||
294 | smlal r10, r11, r7, r4 | ||
295 | ldr r7, [r12, #224*4] /* 3 */ | ||
296 | smlal r10, r11, r7, r3 | ||
297 | /* store Data[01..15] */ | ||
298 | mov r8, r8, lsr #16 | ||
299 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
300 | str r8, [r0] /* store Data */ | ||
301 | /* store Data[31..17] */ | ||
302 | add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */ | ||
303 | mov r10, r10, lsr #16 | ||
304 | orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
305 | rsb r10, r10, #0 /* r10 = -r10 */ | ||
306 | str r10, [r0], #4 /* store Data, post-increment by one word */ | ||
307 | sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words], i.e. next lower-half Data word */ | ||
308 | /* adjust addresses for next loop */ | ||
309 | sub r12, r12, #4 /* r12 = V-- */ | ||
310 | add r1, r1, #4 /* r1 = V++ */ | ||
311 | /* next loop */ | ||
312 | subs lr, lr, #1 | ||
313 | bgt .loop15 | ||
314 | |||
315 | /****************************************** | ||
316 | * row 16 (V[16]) with internal symmetry | ||
317 | *****************************************/ | ||
318 | ldmia r2!, { r3-r6 } /* load D[00..03] */ | ||
319 | ldr r7 , [r1] /* 0 */ | ||
320 | ldr r10, [r1, #992*4] /* 15 */ | ||
321 | rsb r10, r10, r7 /* V[00] - V[15] */ | ||
322 | smull r8, r9, r10, r3 | ||
323 | ldr r7 , [r1, #96*4] /* 1 */ | ||
324 | ldr r10, [r1, #896*4] /* 14 */ | ||
325 | rsb r10, r10, r7 /* V[01] - V[14] */ | ||
326 | smlal r8, r9, r10, r4 | ||
327 | ldr r7 , [r1, #128*4] /* 2 */ | ||
328 | ldr r10, [r1, #864*4] /* 13 */ | ||
329 | rsb r10, r10, r7 /* V[02] - V[13] */ | ||
330 | smlal r8, r9, r10, r5 | ||
331 | ldr r7 , [r1, #224*4] /* 3 */ | ||
332 | ldr r10, [r1, #768*4] /* 12 */ | ||
333 | rsb r10, r10, r7 /* V[03] - V[12] */ | ||
334 | smlal r8, r9, r10, r6 | ||
335 | ldmia r2!, { r3-r6 } /* load D[04..07] */ | ||
336 | ldr r7 , [r1, #256*4] /* 4 */ | ||
337 | ldr r10, [r1, #736*4] /* 11 */ | ||
338 | rsb r10, r10, r7 /* V[04] - V[11] */ | ||
339 | smlal r8, r9, r10, r3 | ||
340 | ldr r7 , [r1, #352*4] /* 5 */ | ||
341 | ldr r10, [r1, #640*4] /* 10 */ | ||
342 | rsb r10, r10, r7 /* V[05] - V[10] */ | ||
343 | smlal r8, r9, r10, r4 | ||
344 | ldr r7 , [r1, #384*4] /* 6 */ | ||
345 | ldr r10, [r1, #608*4] /* 9 */ | ||
346 | rsb r10, r10, r7 /* V[06] - V[09] */ | ||
347 | smlal r8, r9, r10, r5 | ||
348 | ldr r7 , [r1, #480*4] /* 7 */ | ||
349 | ldr r10, [r1, #512*4] /* 8 */ | ||
350 | rsb r10, r10, r7 /* V[07] - V[08] */ | ||
351 | smlal r8, r9, r10, r6 | ||
352 | mov r8, r8, lsr #16 | ||
353 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */ | ||
354 | str r8, [r0], #4 /* store Data[16] */ | ||
355 | add r1, r1, #4 /* V++ (r1 is not read again before the return) */ | ||
356 | |||
357 | ldmfd sp!, {r4-r12, pc} /* restore work registers and return */ | ||
358 | #endif | ||
157 | .mpc_dewindowing_end: | 359 | .mpc_dewindowing_end: |
158 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D | 360 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D |
159 | #endif | 361 | #endif |