summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libwavpack/arml.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libwavpack/arml.S')
-rw-r--r--lib/rbcodec/codecs/libwavpack/arml.S506
1 files changed, 506 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libwavpack/arml.S b/lib/rbcodec/codecs/libwavpack/arml.S
new file mode 100644
index 0000000000..60818aa1e6
--- /dev/null
+++ b/lib/rbcodec/codecs/libwavpack/arml.S
@@ -0,0 +1,506 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2006 by David Bryant
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22/* This is an assembly optimized version of the following WavPack function:
23 *
24 * void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
25 * long *buffer, long sample_count);
26 *
27 * It performs a single pass of stereo decorrelation on the provided buffer.
28 * Note that this version of the function requires that the 8 previous stereo
29 * samples are visible and correct. In other words, it ignores the "samples_*"
30 * fields in the decorr_pass structure and gets the history data directly
31 * from the buffer. It does, however, return the appropriate history samples
32 * to the decorr_pass structure before returning.
33 *
34 * This is written to work on an ARM7TDMI processor. This version uses the
35 * 64-bit multiply-accumulate instruction and so can be used with all
36 * WavPack files. However, for optimum performance with 16-bit WavPack
37 * files, there is a faster version that only uses the 32-bit MLA
38 * instruction.
39 */
40
41#include "config.h"
42
        .text                           @ assemble the routine into the code section
        .align                          @ NOTE(review): no operand given, so this
                                        @ relies on the assembler default for ARM;
                                        @ confirm it yields word alignment
        .global decorr_stereo_pass_cont_arml    @ export the entry point to C callers
46
47/*
48 * on entry:
49 *
50 * r0 = struct decorr_pass *dpp
51 * r1 = long *buffer
52 * r2 = long sample_count
53 */
54
/*
 * Entry, setup and dispatch on dpp->term.
 *
 * Register state handed to every term handler:
 *
 *   r0 = dpp->weight_B << 18        r5 = dpp
 *   r1 = buffer (running pointer)   r6 = dpp->delta << 18
 *   r4 = dpp->weight_A << 18        r7 = buffer + 8 * sample_count (end)
 *
 * Positive terms additionally get the last two stereo pairs in front of
 * the buffer: lr / r8 are the 2nd previous / previous sample of the first
 * (left) channel, r10 / r3 the same for the second (right) channel.
 * For terms 2 and 17 lr and r10 are pre-scaled by << 4.
 * Negative terms get r10 = -(1024 << 18) as the lower weight clip limit.
 *
 * Every path leaves through common_exit, which shifts r0 and r4 right by 18
 * and stores them back as the weights. The weights are therefore scaled
 * BEFORE the sample_count test, so that an empty request writes back the
 * values it loaded (as long as they fit in 14 signed bits, which the << 18
 * scaling assumes anyway) instead of weights shifted down to 0 / -1.
 */

decorr_stereo_pass_cont_arml:

        stmfd   sp!, {r4 - r8, r10, r11, lr}
        mov     r5, r0                  @ r5 = dpp
        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B

        mov     r0, r0, lsl #18         @ for 64-bit math we use weights << 18
        mov     r4, r4, lsl #18
        cmp     r2, #0                  @ exit if no samples to process; a
        ble     common_exit             @ negative count is treated the same way

        mov     r6, r6, lsl #18         @ delta is scaled like the weights
        add     r7, r1, r2, lsl #3      @ r7 = buffer ending position
        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
        cmp     r2, #0
        blt     minus_term

        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
        ldr     r10, [r1, #-12]         @ for terms 2, 17, and 18
        ldr     r8, [r1, #-8]
        ldr     r3, [r1, #-4]

        cmp     r2, #18
        beq     term_18_loop            @ term 18 wants the history unscaled
        mov     lr, lr, lsl #4          @ terms 2 and 17 keep the 2nd previous
        mov     r10, r10, lsl #4        @ samples scaled by << 4
        cmp     r2, #2
        beq     term_2_loop
        cmp     r2, #17
        beq     term_17_loop
        b       term_default_loop       @ every other non-negative term

minus_term:
        mov     r10, #(1024 << 18)      @ r10 = -1024 << 18 for weight clipping
        rsb     r10, r10, #0            @ (only used for negative terms)
        cmn     r2, #1
        beq     term_minus_1
        cmn     r2, #2
        beq     term_minus_2
        cmn     r2, #3
        beq     term_minus_3
        b       common_exit             @ any other negative term: no processing

/*
 ******************************************************************************
 * Loop to handle term = 17 condition
 *
 * r0 = dpp->weight_B << 18     r8  = previous left sample
 * r1 = bptr                    r9  =
 * r2 = current sample          r10 = second previous right sample << 4
 * r3 = previous right sample   r11 = lo accumulator (for rounding)
 * r4 = dpp->weight_A << 18     ip  = current decorrelation value (<< 4)
 * r5 = dpp                     sp  =
 * r6 = dpp->delta << 18        lr  = second previous left sample << 4
 * r7 = eptr                    pc  =
 *
 * Fixed point: (weight << 18) * (value << 4) is weight * value << 22. With
 * 0x80000000 preloaded in the low word, the high word of the 64-bit
 * multiply-accumulate adds (weight * value + 512) >> 10 to the sample that
 * was preloaded into the high register.
 *******************************************************************************
 */

term_17_loop:
        rsbs    ip, lr, r8, lsl #5      @ decorr value = (2 * prev) - 2nd prev
        mov     lr, r8, lsl #4          @ previous becomes 2nd previous
        ldr     r2, [r1], #4            @ get sample & update pointer
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     r8, r2                  @ high word starts as the sample
        smlalne r11, r8, r4, ip         @ skipped when the decorr value is zero
        strne   r8, [r1, #-4]           @ if change possible, store sample back
        cmpne   r2, #0
        beq     .Lt17_right             @ decorr value or sample zero: keep weight
        teq     ip, r2                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight -= delta
        addpl   r4, r4, r6              @ signs agree:  weight += delta

.Lt17_right:
        rsbs    ip, r10, r3, lsl #5     @ do same thing for right channel
        mov     r10, r3, lsl #4
        ldr     r2, [r1], #4
        mov     r11, #0x80000000
        mov     r3, r2
        smlalne r11, r3, r0, ip
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lt17_next
        teq     ip, r2
        submi   r0, r0, r6
        addpl   r0, r0, r6

.Lt17_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_17_loop
        mov     lr, lr, asr #4          @ undo the << 4 on the 2nd previous
        mov     r10, r10, asr #4        @ samples before they are saved
        b       store_1718              @ common exit for terms 17 & 18

/*
 ******************************************************************************
 * Loop to handle term = 18 condition
 *
 * r0 = dpp->weight_B << 18     r8  = previous left sample
 * r1 = bptr                    r9  =
 * r2 = current sample          r10 = second previous right sample
 * r3 = previous right sample   r11 = lo accumulator (for rounding)
 * r4 = dpp->weight_A << 18     ip  = decorrelation value (<< 4)
 * r5 = dpp                     sp  =
 * r6 = dpp->delta << 18        lr  = second previous left sample
 * r7 = eptr                    pc  =
 *
 * Unlike terms 2 and 17 the history registers are kept unscaled here; the
 * << 4 is applied to the finished decorrelation value instead.
 *******************************************************************************
 */

term_18_loop:
        rsb     ip, lr, r8              @ decorr value =
        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1,
        add     ip, lr, ip, asr #1      @  formed as prev + ((prev - 2nd) >> 1)
        movs    ip, ip, lsl #4          @ scale and test for zero
        ldr     r2, [r1], #4            @ get sample & update pointer
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     r8, r2                  @ high word starts as the sample
        smlalne r11, r8, r4, ip         @ skipped when the decorr value is zero
        strne   r8, [r1, #-4]           @ if change possible, store sample back
        cmpne   r2, #0
        beq     .Lt18_right             @ decorr value or sample zero: keep weight
        teq     ip, r2                  @ update weight based on signs
        submi   r4, r4, r6
        addpl   r4, r4, r6

.Lt18_right:
        rsb     ip, r10, r3             @ do same thing for right channel
        mov     r10, r3
        add     ip, r10, ip, asr #1
        movs    ip, ip, lsl #4
        ldr     r2, [r1], #4
        mov     r11, #0x80000000
        mov     r3, r2
        smlalne r11, r3, r0, ip
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lt18_next
        teq     ip, r2
        submi   r0, r0, r6
        addpl   r0, r0, r6

.Lt18_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_18_loop

/* common exit for terms 17 & 18: save the two most recent pairs in dpp */

store_1718:
        str     r8, [r5, #8]            @ previous left      -> dpp + 8
        str     lr, [r5, #12]           @ 2nd previous left  -> dpp + 12
        str     r3, [r5, #40]           @ previous right     -> dpp + 40
        str     r10, [r5, #44]          @ 2nd previous right -> dpp + 44
        b       common_exit             @ and return

/*
 ******************************************************************************
 * Loop to handle term = 2 condition
 * (note that this case can be handled by the default term handler (1-8), but
 * this special case is faster because it doesn't have to read memory twice)
 *
 * r0 = dpp->weight_B << 18     r8  = previous left sample
 * r1 = bptr                    r9  =
 * r2 = current sample          r10 = second previous right sample << 4
 * r3 = previous right sample   r11 = lo accumulator (for rounding)
 * r4 = dpp->weight_A << 18     ip  = decorrelation value (<< 4)
 * r5 = dpp                     sp  =
 * r6 = dpp->delta << 18        lr  = second previous left sample << 4
 * r7 = eptr                    pc  =
 *******************************************************************************
 */

term_2_loop:
        movs    ip, lr                  @ get decorrelation value & test
        ldr     r2, [r1], #4            @ get sample & update pointer
        mov     lr, r8, lsl #4          @ previous becomes 2nd previous
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     r8, r2                  @ high word starts as the sample
        smlalne r11, r8, r4, ip         @ skipped when the decorr value is zero
        strne   r8, [r1, #-4]           @ if change possible, store sample back
        cmpne   r2, #0
        beq     .Lt2_right              @ decorr value or sample zero: keep weight
        teq     ip, r2                  @ update weight based on signs
        submi   r4, r4, r6
        addpl   r4, r4, r6

.Lt2_right:
        movs    ip, r10                 @ do same thing for right channel
        ldr     r2, [r1], #4
        mov     r10, r3, lsl #4
        mov     r11, #0x80000000
        mov     r3, r2
        smlalne r11, r3, r0, ip
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lt2_next
        teq     ip, r2
        submi   r0, r0, r6
        addpl   r0, r0, r6

.Lt2_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_2_loop

        b       default_term_exit       @ this exit updates all dpp->samples

/*
 ******************************************************************************
 * Loop to handle default term condition
 *
 * r0 = dpp->weight_B << 18     r8  = result accumulator
 * r1 = bptr                    r9  =
 * r2 = dpp->term               r10 =
 * r3 = decorrelation value     r11 = lo accumulator (for rounding)
 * r4 = dpp->weight_A << 18     ip  = current sample
 * r5 = dpp                     sp  =
 * r6 = dpp->delta << 18        lr  =
 * r7 = eptr                    pc  =
 *
 * The decorrelation value is read back from the buffer, term stereo pairs
 * (term * 8 bytes) before the sample being processed. When the loop is done
 * execution falls through into default_term_exit.
 *******************************************************************************
 */

term_default_loop:
        ldr     r3, [r1, -r2, lsl #3]   @ get decorrelation value based on term
        ldr     ip, [r1], #4            @ get original sample and bump ptr
        movs    r3, r3, lsl #4          @ scale and test for zero
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     r8, ip                  @ high word starts as the sample
        smlalne r11, r8, r4, r3         @ skipped when the decorr value is zero
        strne   r8, [r1, #-4]           @ if possibly changed, store updated sample
        cmpne   ip, #0
        beq     .Ldef_right             @ decorr value or sample zero: keep weight
        teq     ip, r3                  @ update weight based on signs
        submi   r4, r4, r6
        addpl   r4, r4, r6

.Ldef_right:
        ldr     r3, [r1, -r2, lsl #3]   @ do the same thing for right channel
        ldr     ip, [r1], #4
        movs    r3, r3, lsl #4
        mov     r11, #0x80000000
        mov     r8, ip
        smlalne r11, r8, r0, r3
        strne   r8, [r1, #-4]
        cmpne   ip, #0
        beq     .Ldef_next
        teq     ip, r3
        submi   r0, r0, r6
        addpl   r0, r0, r6

.Ldef_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_default_loop

/*
 * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
 * structure (even if they are not all used for the given term)
 *
 * On entry r1 points just past the last processed stereo pair. The loop walks
 * backwards through the buffer one pair at a time, 8 pairs in total. The
 * newest pair goes to slot (term - 1) & 7 of the two history arrays at
 * dpp + 8 and dpp + 40, and each older pair goes to the next lower slot
 * (modulo 8).
 */

default_term_exit:
        ldrsh   r3, [r5, #0]            @ reload dpp->term
        sub     ip, r3, #1              @ ip = slot index of the newest pair
        mov     lr, #7                  @ lr counts 7..0, i.e. 8 passes

.Lsave_history:
        and     r3, ip, #7              @ wrap slot index into 0..7
        add     r3, r5, r3, lsl #2      @ r3 = dpp + 4 * slot
        ldr     r2, [r1, #-4]           @ second sample of the pair
        str     r2, [r3, #40]           @   -> history array at dpp + 40
        ldr     r2, [r1, #-8]!          @ first sample; r1 steps back one pair
        str     r2, [r3, #8]            @   -> history array at dpp + 8
        sub     ip, ip, #1
        subs    lr, lr, #1              @ goes negative after the 8th pass
        bpl     .Lsave_history
        b       common_exit

/*
 ******************************************************************************
 * Loop to handle term = -1 condition
 *
 * r0 = dpp->weight_B << 18     r8  =
 * r1 = bptr                    r9  =
 * r2 = current right sample    r10 = -1024 << 18 (for clipping)
 * r3 = previous right sample,  r11 = lo accumulator (for rounding)
 *      then updated right
 * r4 = dpp->weight_A << 18     ip  = current left sample
 * r5 = dpp                     sp  =
 * r6 = dpp->delta << 18        lr  = updated left sample
 * r7 = eptr                    pc  =
 *
 * After every update the weight is clipped to +/-(1024 << 18).
 *******************************************************************************
 */

term_minus_1:
        ldr     r3, [r1, #-4]           @ right sample of the pair before the buffer

term_minus_1_loop:
        ldr     ip, [r1], #8            @ for left channel the decorrelation value
        movs    r3, r3, lsl #4          @  is the previous right sample (in r3)
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     lr, ip                  @ high word starts as the sample
        smlalne r11, lr, r4, r3         @ skipped when the decorr value is zero
        strne   lr, [r1, #-8]
        cmpne   ip, #0
        beq     .Lm1_right              @ decorr value or sample zero: keep weight
        teq     ip, r3                  @ update weight based on signs
        submi   r4, r4, r6
        addpl   r4, r4, r6
        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
        movgt   r4, #(1024 << 18)
        cmp     r4, r10
        movlt   r4, r10

.Lm1_right:
        ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
        movs    lr, lr, lsl #4          @  is the left sample just computed (in lr)
        mov     r11, #0x80000000
        mov     r3, r2
        smlalne r11, r3, r0, lr
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lm1_next
        teq     r2, lr
        submi   r0, r0, r6
        addpl   r0, r0, r6
        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
        movgt   r0, #(1024 << 18)
        cmp     r0, r10
        movlt   r0, r10

.Lm1_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_minus_1_loop

        str     r3, [r5, #8]            @ else store right sample and exit
        b       common_exit

/*
 ******************************************************************************
 * Loop to handle term = -2 condition
 * (note that the channels are processed in the reverse order here)
 *
 * r0 = dpp->weight_B << 18     r8  =
 * r1 = bptr                    r9  =
 * r2 = current left sample     r10 = -1024 << 18 (for clipping)
 * r3 = previous left sample,   r11 = lo accumulator (for rounding)
 *      then updated left
 * r4 = dpp->weight_A << 18     ip  = current right sample
 * r5 = dpp                     sp  =
 * r6 = dpp->delta << 18        lr  = updated right sample
 * r7 = eptr                    pc  =
 *
 * After every update the weight is clipped to +/-(1024 << 18).
 *******************************************************************************
 */

term_minus_2:
        ldr     r3, [r1, #-8]           @ left sample of the pair before the buffer

term_minus_2_loop:
        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
        movs    r3, r3, lsl #4          @  is the previous left sample (in r3)
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     lr, ip                  @ high word starts as the sample
        smlalne r11, lr, r0, r3         @ skipped when the decorr value is zero
        strne   lr, [r1, #4]
        cmpne   ip, #0
        beq     .Lm2_left               @ decorr value or sample zero: keep weight
        teq     ip, r3                  @ update weight based on signs
        submi   r0, r0, r6
        addpl   r0, r0, r6
        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
        movgt   r0, #(1024 << 18)
        cmp     r0, r10
        movlt   r0, r10

.Lm2_left:
        ldr     r2, [r1], #8            @ for left channel the decorrelation value
        movs    lr, lr, lsl #4          @  is the right sample just computed (in lr)
        mov     r11, #0x80000000
        mov     r3, r2
        smlalne r11, r3, r4, lr
        strne   r3, [r1, #-8]
        cmpne   r2, #0
        beq     .Lm2_next
        teq     r2, lr
        submi   r4, r4, r6
        addpl   r4, r4, r6
        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
        movgt   r4, #(1024 << 18)
        cmp     r4, r10
        movlt   r4, r10

.Lm2_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_minus_2_loop

        str     r3, [r5, #40]           @ else store left channel and exit
        b       common_exit

/*
 ******************************************************************************
 * Loop to handle term = -3 condition
 *
 * r0 = dpp->weight_B << 18     r8  = previous left sample
 * r1 = bptr                    r9  =
 * r2 = updated left sample,    r10 = -1024 << 18 (for clipping)
 *      then current right
 * r3 = previous right sample,  r11 = lo accumulator (for rounding)
 *      then updated right
 * r4 = dpp->weight_A << 18     ip  = current left sample,
 * r5 = dpp                           then previous left << 4
 * r6 = dpp->delta << 18        sp  =
 * r7 = eptr                    lr  =
 *
 * The left channel is predicted from the previous right sample and the right
 * channel from the previous left sample. After every update the weight is
 * clipped to +/-(1024 << 18). When the loop is done the last pair is saved
 * and execution falls through into common_exit.
 *******************************************************************************
 */

term_minus_3:
        ldr     r3, [r1, #-4]           @ load previous samples
        ldr     r8, [r1, #-8]

term_minus_3_loop:
        ldr     ip, [r1], #4            @ get left sample & update pointer
        movs    r3, r3, lsl #4          @ decorr value = previous right sample
        mov     r11, #0x80000000        @ rounding bias in the low word
        mov     r2, ip                  @ high word starts as the sample
        smlalne r11, r2, r4, r3         @ skipped when the decorr value is zero
        strne   r2, [r1, #-4]
        cmpne   ip, #0
        beq     .Lm3_right              @ decorr value or sample zero: keep weight
        teq     ip, r3                  @ update weight based on signs
        submi   r4, r4, r6
        addpl   r4, r4, r6
        cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
        movgt   r4, #(1024 << 18)
        cmp     r4, r10
        movlt   r4, r10

.Lm3_right:
        movs    ip, r8, lsl #4          @ ip = previous left we use now
        mov     r8, r2                  @ r8 = current left we use next time
        ldr     r2, [r1], #4
        mov     r11, #0x80000000
        mov     r3, r2
        smlalne r11, r3, r0, ip
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lm3_next
        teq     ip, r2
        submi   r0, r0, r6
        addpl   r0, r0, r6
        cmp     r0, #(1024 << 18)       @ then clip weight to +/-1024
        movgt   r0, #(1024 << 18)
        cmp     r0, r10
        movlt   r0, r10

.Lm3_next:
        cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_minus_3_loop

        str     r8, [r5, #40]           @ else store previous samples & exit:
        str     r3, [r5, #8]            @ left -> dpp + 40, right -> dpp + 8

/*
 * Before finally exiting we must store weights back for next time.
 *
 * r4 and r0 hold the weights scaled by << 18; they are shifted back down
 * and written to the 16-bit fields at dpp + 4 (weight_A) and dpp + 6
 * (weight_B). ldmpc is a macro defined outside this file; presumably it
 * pops the registers saved on entry and returns to the caller.
 */

common_exit:
        mov     r4, r4, asr #18         @ restore weights to real magnitude
        strh    r4, [r5, #4]            @ dpp->weight_A
        mov     r0, r0, asr #18
        strh    r0, [r5, #6]            @ dpp->weight_B
        ldmpc   regs="r4-r8, r10-r11"
