summaryrefslogtreecommitdiff
path: root/lib/rbcodec/dsp/dsp_arm.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/dsp/dsp_arm.S')
-rw-r--r--lib/rbcodec/dsp/dsp_arm.S621
1 files changed, 342 insertions, 279 deletions
diff --git a/lib/rbcodec/dsp/dsp_arm.S b/lib/rbcodec/dsp/dsp_arm.S
index 685aca411c..9fd19ae108 100644
--- a/lib/rbcodec/dsp/dsp_arm.S
+++ b/lib/rbcodec/dsp/dsp_arm.S
@@ -21,20 +21,19 @@
21 #include "config.h" 21 #include "config.h"
22 22
23/**************************************************************************** 23/****************************************************************************
24 * void channels_process_sound_chan_mono(int count, int32_t *buf[]) 24 * void channel_mode_proc_mono(struct dsp_proc_entry *this,
25 * struct dsp_buffer **buf_p)
25 */ 26 */
26 27 .section .icode
27#include "config.h" 28 .global channel_mode_proc_mono
28 29 .type channel_mode_proc_mono, %function
29 .section .icode, "ax", %progbits 30channel_mode_proc_mono:
30 .align 2 31 @ input: r0 = this, r1 = buf_p
31 .global channels_process_sound_chan_mono 32 ldr r1, [r1] @ r1 = buf = *buf_p;
32 .type channels_process_sound_chan_mono, %function
33channels_process_sound_chan_mono:
34 @ input: r0 = count, r1 = buf
35 stmfd sp!, { r4, lr } @ 33 stmfd sp!, { r4, lr } @
36 @ 34 @
37 ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] 35 ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0],
36 @ r2 = buf->p32[1]
38 subs r0, r0, #1 @ odd: end at 0; even: end at -1 37 subs r0, r0, #1 @ odd: end at 0; even: end at -1
39 beq .mono_singlesample @ Zero? Only one sample! 38 beq .mono_singlesample @ Zero? Only one sample!
40 @ 39 @
@@ -61,25 +60,26 @@ channels_process_sound_chan_mono:
61 str r12, [r2] @ store Mo 60 str r12, [r2] @ store Mo
62 @ 61 @
63 ldmpc regs=r4 @ 62 ldmpc regs=r4 @
64 .size channels_process_sound_chan_mono, \ 63 .size channel_mode_proc_mono, .-channel_mode_proc_mono
65 .-channels_process_sound_chan_mono
66 64
67/**************************************************************************** 65/****************************************************************************
68 * void channels_process_sound_chan_custom(int count, int32_t *buf[]) 66 * void channel_mode_proc_custom(struct dsp_proc_entry *this,
67 * struct dsp_buffer **buf_p)
69 */ 68 */
70 .section .icode, "ax", %progbits 69 .section .icode
71 .align 2 70 .global channel_mode_proc_custom
72 .global channels_process_sound_chan_custom 71 .type channel_mode_proc_custom, %function
73 .type channels_process_sound_chan_custom, %function 72channel_mode_proc_custom:
74channels_process_sound_chan_custom: 73 @ input: r0 = this, r1 = buf_p
74 ldr r2, [r0] @ r2 = &channel_mode_data = this->data
75 ldr r1, [r1] @ r1 = buf = *buf_p;
76
75 stmfd sp!, { r4-r10, lr } 77 stmfd sp!, { r4-r10, lr }
76 78
77 ldr r3, =dsp_sw_gain 79 ldmia r2, { r3, r4 } @ r3 = sw_gain, r4 = sw_cross
78 ldr r4, =dsp_sw_cross
79 80
80 ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] 81 ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0],
81 ldr r3, [r3] @ r3 = dsp_sw_gain 82 @ r2 = buf->p32[1]
82 ldr r4, [r4] @ r4 = dsp_sw_cross
83 83
84 subs r0, r0, #1 84 subs r0, r0, #1
85 beq .custom_single_sample @ Zero? Only one sample! 85 beq .custom_single_sample @ Zero? Only one sample!
@@ -135,21 +135,22 @@ channels_process_sound_chan_custom:
135 str r7, [r2] @ Store Rc0 135 str r7, [r2] @ Store Rc0
136 136
137 ldmpc regs=r4-r10 137 ldmpc regs=r4-r10
138 .size channels_process_sound_chan_custom, \ 138 .size channel_mode_proc_custom, .-channel_mode_proc_custom
139 .-channels_process_sound_chan_custom
140 139
141/**************************************************************************** 140/****************************************************************************
142 * void channels_process_sound_chan_karaoke(int count, int32_t *buf[]) 141 * void channel_mode_proc_karaoke(struct dsp_proc_entry *this,
142 * struct dsp_buffer **buf_p)
143 */ 143 */
144 .section .icode, "ax", %progbits 144 .section .icode
145 .align 2 145 .global channel_mode_proc_karaoke
146 .global channels_process_sound_chan_karaoke 146 .type channel_mode_proc_karaoke, %function
147 .type channels_process_sound_chan_karaoke, %function 147channel_mode_proc_karaoke:
148channels_process_sound_chan_karaoke: 148 @ input: r0 = this, r1 = buf_p
149 @ input: r0 = count, r1 = buf 149 ldr r1, [r1] @ r1 = buf = *buf_p;
150 stmfd sp!, { r4, lr } @ 150 stmfd sp!, { r4, lr } @
151 @ 151 @
152 ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] 152 ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0],
153 @ r2 = buf->p32[1]
153 subs r0, r0, #1 @ odd: end at 0; even: end at -1 154 subs r0, r0, #1 @ odd: end at 0; even: end at -1
154 beq .karaoke_singlesample @ Zero? Only one sample! 155 beq .karaoke_singlesample @ Zero? Only one sample!
155 @ 156 @
@@ -179,24 +180,313 @@ channels_process_sound_chan_karaoke:
179 str r12, [r2] @ store Ro 180 str r12, [r2] @ store Ro
180 @ 181 @
181 ldmpc regs=r4 @ 182 ldmpc regs=r4 @
182 .size channels_process_sound_chan_karaoke, \ 183 .size channel_mode_proc_karaoke, .-channel_mode_proc_karaoke
183 .-channels_process_sound_chan_karaoke 184
185/****************************************************************************
186 * void crossfeed_process(struct dsp_proc_entry *this,
187 * struct dsp_buffer **buf_p)
 *
 * Runs the crossfeed filter in place over the two channel buffers of *buf_p.
 * Each output sample is the direct gain times the channel's own input plus
 * a first-order filtered sample of the opposite channel read from the
 * delay line; the unprocessed inputs are then written into the delay line.
 *
 * Layout read through this->data, as used below: 8 words (direct gain,
 * b0, b1, a1, four filter history words), then a 13*2 word delay line,
 * then one word holding the current delay line pointer.
 *
 * NOTE(review): the count is tested after the loop body, so one sample is
 * processed even if buf->remcount is 0 - presumably callers never pass an
 * empty buffer; confirm.
188 */
189 .section .text
190 .global crossfeed_process
191crossfeed_process:
192 @ input: r0 = this, r1 = buf_p
193 @ unfortunately, we ended up in a bit of a register squeeze here, and need
194 @ to keep the count on the stack :/
195 ldr r1, [r1] @ r1 = buf = *buf_p;
196 stmfd sp!, { r4-r11, lr } @ stack modified regs
197 ldr r12, [r1] @ r12 = buf->remcount
198 ldr r14, [r0] @ r14 = this->data = &crossfeed_state
199 ldmib r1, { r2-r3 } @ r2 = buf->p32[0], r3 = buf->p32[1]
200 ldmia r14!, { r4-r11 } @ load direct gain and filter data
201 add r0, r14, #13*2*4 @ calculate end of delay
202 stmfd sp!, { r0, r12 } @ stack end of delay adr and count
203 ldr r0, [r0] @ fetch current delay line address
204
205 /* Register usage in loop:
206 * r0 = &delay[index][0], r1 = accumulator high, r2 = buf->p32[0],
207 * r3 = buf->p32[1], r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
208 * r8-r11 = filter history, r12 = temp, r14 = accumulator low
209 */
210.cfloop:
211 smull r14, r1, r6, r8 @ acc = b1*dr[n - 1]
212 smlal r14, r1, r7, r9 @ acc += a1*y_l[n - 1]
213 ldr r8, [r0, #4] @ r8 = dr[n]
214 smlal r14, r1, r5, r8 @ acc += b0*dr[n]
215 mov r9, r1, lsl #1 @ fix format for filter history
216 ldr r12, [r2] @ load left input
217 smlal r14, r1, r4, r12 @ acc += gain*x_l[n]
218 mov r1, r1, lsl #1 @ fix format
219 str r1, [r2], #4 @ save result
220
221 smull r14, r1, r6, r10 @ acc = b1*dl[n - 1]
222 smlal r14, r1, r7, r11 @ acc += a1*y_r[n - 1]
223 ldr r10, [r0] @ r10 = dl[n]
224 str r12, [r0], #4 @ save left input to delay line
225 smlal r14, r1, r5, r10 @ acc += b0*dl[n]
226 mov r11, r1, lsl #1 @ fix format for filter history
227 ldr r12, [r3] @ load right input
228 smlal r14, r1, r4, r12 @ acc += gain*x_r[n]
229 str r12, [r0], #4 @ save right input to delay line
230 mov r1, r1, lsl #1 @ fix format
231 ldmia sp, { r12, r14 } @ fetch delay line end addr and count from stack
232 str r1, [r3], #4 @ save result
233
234 cmp r0, r12 @ need to wrap to start of delay?
235 subhs r0, r12, #13*2*4 @ wrap back delay line ptr to start
236
237 subs r14, r14, #1 @ are we finished?
238 strgt r14, [sp, #4] @ nope, save count back to stack
239 bgt .cfloop
240
241 @ save data back to struct
242 str r0, [r12] @ save delay line index (stored at end of delay)
243 sub r12, r12, #13*2*4 + 4*4 @ r12 = data->history
244 stmia r12, { r8-r11 } @ save filter history
245 add sp, sp, #8 @ remove temp variables from stack
246 ldmpc regs=r4-r11
247 .size crossfeed_process, .-crossfeed_process
248
249/****************************************************************************
250 * int lin_resample_resample(struct resample_data *data,
251 * struct dsp_buffer *src,
252 * struct dsp_buffer *dst)
 *
 * Linear-interpolation resampler. For each channel (highest first) it
 * writes out = last + frac*(s[pos] - last) to the destination channel
 * buffer while stepping a 16.16 fixed point phase (pos = phase >> 16,
 * frac = phase & 0xffff) by data->delta. delta >= 0x10000 takes the
 * downsampling path, anything smaller the upsampling path. At most 0x8000
 * source samples are considered per call.
 *
 * On exit: last_sample[ch] is updated for every channel, data->phase has
 * the consumed whole-sample part removed, and dst->remcount holds the
 * number of samples written (dst->bufcount - remaining space).
 * r0 still holds the final pos of the last channel processed -
 * NOTE(review): presumably this is the return value (source samples
 * consumed); confirm against the C caller.
253 */
254 .section .text
255 .global lin_resample_resample
256lin_resample_resample:
257 @input: r0 = data, r1 = src, r2 = dst
258 stmfd sp!, { r4-r11, lr } @ stack modified regs
259 ldr r4, [r0] @ r4 = data->delta
260 add r10, r0, #4 @ r10 = &data->phase
261 ldrb r3, [r1, #17] @ r3 = ch = num_channels (byte at src + 17)
262 stmfd sp!, { r1, r10 } @ stack src, &data->phase
263.lrs_channel_loop:
264 ldr r5, [r10] @ r5 = data->phase
265 ldr r6, [r1] @ r6 = srcrem = src->remcount
266 ldr r7, [r1, r3, lsl #2] @ r7 = src->p32[ch - 1] (r3 counts from 1)
267 ldr r8, [r2, r3, lsl #2] @ r8 = dst->p32[ch - 1]
268 ldr r9, [r2, #12] @ r9 = dstrem = dst->bufcount
269
270 cmp r6, #0x8000 @ srcrem = MIN(srcrem, 0x8000)
271 movgt r6, #0x8000 @
272 mov r0, r5, lsr #16 @ r0 = pos = phase >> 16
273 cmp r0, r6 @ pos = MIN(pos, srcrem)
274 movgt r0, r6 @
275 cmp r0, #0 @
276 ldrle r11, [r10, r3, lsl #2] @ pos <= 0? r11 = last = last_sample[ch]
277 addgt r12, r7, r0, lsl #2 @ pos > 0? r11 = last = s[pos - 1]
278 ldrgt r11, [r12, #-4] @ (r12 = &s[pos] is only a temp)
279 cmp r0, r6 @
280 bge .lrs_channel_done @ pos >= count? channel complete
281
282 cmp r4, #0x10000 @ delta >= 1.0?
283 ldrhs r12, [r7, r0, lsl #2] @ yes? r12 = s[pos]
284 bhs .lrs_dsstart @ yes? is downsampling
285
286 /** Upsampling **/
287 mov r5, r5, lsl #16 @ Move phase into high halfword
288 add r7, r7, r0, lsl #2 @ r7 = &s[pos]
289 sub r0, r6, r0 @ r0 = dte = srcrem - pos
290.lrs_usloop_1:
291 ldr r12, [r7], #4 @ r12 = s[pos]
292 sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
293.lrs_usloop_0:
294 mov r1, r5, lsr #16 @ r1 = frac = phase >> 16
295 @ keep frac in Rs to take advantage of multiplier early termination
296 smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
297 add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
298 add r1, r1, r10, lsl #16 @
299 str r1, [r8], #4 @ *d++ = out
300 subs r9, r9, #1 @ destination full?
301 bls .lrs_usfull @ yes? channel is done
302 adds r5, r5, r4, lsl #16 @ phase += delta << 16
303 bcc .lrs_usloop_0 @ if carry is set, pos is incremented
304 subs r0, r0, #1 @ if srcrem > 0, do another sample
305 mov r11, r12 @ r11 = last = s[pos-1] (pos changed)
306 bgt .lrs_usloop_1
307 b .lrs_usdone
308
309.lrs_usfull:
310 adds r5, r5, r4, lsl #16 @ do missed phase increment
311 subcs r0, r0, #1 @ do missed srcrem decrement
312 movcs r11, r12 @ r11 = s[pos-1] (pos changed)
313
314.lrs_usdone:
315 sub r0, r6, r0 @ r0 = pos = srcrem - dte
316 orr r5, r5, r0 @ reconstruct swapped phase
317 mov r5, r5, ror #16 @ swap pos and frac for phase
318 b .lrs_channel_done @
319
320 /** Downsampling **/
321.lrs_dsloop:
322 add r10, r7, r0, lsl #2 @ r10 = &s[pos]
323 ldmda r10, { r11, r12 } @ r11 = last, r12 = s[pos]
324.lrs_dsstart:
325 sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
326 @ keep frac in Rs to take advantage of multiplier early termination
327 bic r1, r5, r0, lsl #16 @ frac = phase & 0xffff
328 smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
329 add r5, r5, r4 @ phase += delta
330 subs r9, r9, #1 @ destination full? ...
331 mov r0, r5, lsr #16 @ pos = phase >> 16
332 add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
333 add r1, r1, r10, lsl #16 @
334 str r1, [r8], #4 @ *d++ = out
335 cmpgt r6, r0 @ ... || pos >= srcrem? ...
336 bgt .lrs_dsloop @ ... no, do more samples
337
338 cmp r0, r6 @ pos = MIN(pos, srcrem)
339 movgt r0, r6 @
340 sub r1, r0, #1 @ pos must always be > 0 since step >= 1.0
341 ldr r11, [r7, r1, lsl #2] @ r11 = s[pos - 1]
342
343.lrs_channel_done:
344 ldmia sp, { r1, r10 } @ recover src, &data->phase
345 str r11, [r10, r3, lsl #2] @ last_sample[ch] = last
346 subs r3, r3, #1 @
347 bgt .lrs_channel_loop @
348
349 ldr r6, [r2, #12] @ r6 = dst->bufcount
350 sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16)
351 str r5, [r10] @ data->phase = r5
352 sub r6, r6, r9 @ r6 = dst->bufcount - dstrem = dstcount
353 str r6, [r2] @ dst->remcount = dstcount
354 add sp, sp, #8 @ adjust stack for temp variables
355 ldmpc regs=r4-r11 @ ... and we're out
356 .size lin_resample_resample, .-lin_resample_resample
357
358/****************************************************************************
359 * void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p)
 *
 * Scales every sample of every channel of *buf_p in place by the gain
 * stored in the first word of this->data, using FRACMUL_SHL(x, gain, 8),
 * i.e. bits 23 and up of the 64-bit product. Samples are handled two at a
 * time, with a single-sample tail for odd counts.
 *
 * NOTE(review): a remcount of 0 is not special-cased - the pair loop would
 * still run once; presumably callers never pass an empty buffer; confirm.
360 */
361 .section .icode
362 .global pga_process
363 .type pga_process, %function
364pga_process:
365 @ input: r0 = this, r1 = buf_p
366 ldr r0, [r0] @ r0 = data = this->data (&pga_data)
367 ldr r1, [r1] @ r1 = buf = *buf_p;
368 stmfd sp!, { r4-r8, lr }
369
370 ldr r4, [r0] @ r4 = data->gain
371 ldr r0, [r1], #4 @ r0 = buf->remcount, r1 = buf->p32
372 ldrb r3, [r1, #13] @ r3 = buf->format.num_channels (r1 is buf + 4 here)
373
374.pga_channelloop:
375 ldr r2, [r1], #4 @ r2 = buf->p32[ch] and inc index of p32
376 subs r12, r0, #1 @ r12 = count - 1
377 beq .pga_singlesample @ Zero? Only one sample!
378
379.pga_loop:
380 ldmia r2, { r5, r6 } @ load r5, r6 from r2 (*p32[ch])
381 smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
382 smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8)
383 subs r12, r12, #2
384 mov r7, r7, lsr #23
385 mov r14, r14, lsr #23
386 orr r7, r7, r8, asl #9
387 orr r14, r14, r5, asl #9
388 stmia r2!, { r7, r14 } @ save r7, r14 to *p32[ch] and increment
389 bgt .pga_loop @ end of pga loop
390
391 blt .pga_evencount @ < 0? even count
392
393.pga_singlesample:
394 ldr r5, [r2] @ handle odd sample
395 smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
396 mov r7, r7, lsr #23
397 orr r7, r7, r8, asl #9
398 str r7, [r2]
399
400.pga_evencount:
401 subs r3, r3, #1
402 bgt .pga_channelloop @ end of channel loop
403
404 ldmpc regs=r4-r8
405 .size pga_process, .-pga_process
406
407/****************************************************************************
408 * void filter_process(struct dsp_filter *f, int32_t *buf[], int count,
409 * unsigned int channels)
410 *
 * Runs a direct form 1 biquad over "count" samples of each channel buffer
 * in place. The five coefficients are the first five words of *f, followed
 * by four history words per channel; the shift amount is the byte located
 * 32 bytes past the start of the history area.
 *
411 * define HIGH_PRECISION as '1' to make filtering calculate lower bits after
412 * shifting. without this, "shift" - 1 of the lower bits will be lost here.
413 */
414#define HIGH_PRECISION 0
415
416#if CONFIG_CPU == PP5002
417 .section .icode,"ax",%progbits
418#else
419 .text
420#endif
421 .global filter_process
422filter_process:
423 @input: r0 = f, r1 = buf, r2 = count, r3 = channels
424 stmfd sp!, { r4-r11, lr } @ save all clobbered regs
425 ldmia r0!, { r4-r8 } @ load coefs, r0 = f->history
426 sub r3, r3, #1 @ r3 = ch = channels - 1
427 stmfd sp!, { r0-r3 } @ save adjusted params: [sp] = history, [sp, #4] = buf, [sp, #8] = count, [sp, #12] = ch
428 ldrb r14, [r0, #32] @ r14 = shift
429
430 @ Channels are processed high to low while history is saved low to high
431 @ It's really no one's business how we do this
432.fp_channelloop:
433 ldmia r0, { r9-r12 } @ load history, r0 = history[channels-ch-1]
434 ldr r3, [r1, r3, lsl #2] @ r3 = buf[ch]
435
436 @ r9-r12 = history, r4-r8 = coefs, r0..r1 = accumulator,
437 @ r2 = number of samples, r3 = buf[ch], r14 = shift amount
438.fp_loop:
439 @ Direct form 1 filtering code.
440 @ y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
441 @ where y[] is output and x[] is input. This is performed out of order to
442 @ reuse registers, we're pretty short on regs.
443 smull r0, r1, r5, r9 @ acc = b1*x[i - 1]
444 smlal r0, r1, r6, r10 @ acc += b2*x[i - 2]
445 mov r10, r9 @ fix input history
446 ldr r9, [r3] @ load input and fix history
447 smlal r0, r1, r7, r11 @ acc += a1*y[i - 1]
448 smlal r0, r1, r8, r12 @ acc += a2*y[i - 2]
449 smlal r0, r1, r4, r9 @ acc += b0*x[i] /* avoid stall on arm9 */
450 mov r12, r11 @ fix output history
451 mov r11, r1, asl r14 @ get upper part of result and shift left
452#if HIGH_PRECISION
453 rsb r1, r14, #32 @ get shift amount for lower part
454 orr r11, r11, r0, lsr r1 @ then mix in correctly shifted lower part
455#endif
456 str r11, [r3], #4 @ save result
457 subs r2, r2, #1 @ are we done with this channel?
458 bgt .fp_loop @
459
460 ldr r3, [sp, #12] @ r3 = ch
461 ldr r0, [sp] @ r0 = history[channels-ch-1]
462 subs r3, r3, #1 @ all channels processed?
463 stmia r0!, { r9-r12 } @ save back history, history++
464 ldmhsib sp, { r1-r2 } @ r1 = buf, r2 = count
465 strhs r3, [sp, #12] @ store ch
466 strhs r0, [sp] @ store history[channels-ch-1]
467 bhs .fp_channelloop
468
469 add sp, sp, #16 @ compensate for temp storage
470 ldmpc regs=r4-r11
471 .size filter_process, .-filter_process
184 472
185#if ARM_ARCH < 6 473#if ARM_ARCH < 6
186/**************************************************************************** 474/****************************************************************************
187 * void sample_output_mono(int count, struct dsp_data *data, 475 * void sample_output_mono(struct sample_io_data *this,
188 * const int32_t *src[], int16_t *dst) 476 * struct dsp_buffer *src,
477 * struct dsp_buffer *dst)
189 */ 478 */
190 .section .icode, "ax", %progbits 479 .section .icode
191 .align 2
192 .global sample_output_mono 480 .global sample_output_mono
193 .type sample_output_mono, %function 481 .type sample_output_mono, %function
194sample_output_mono: 482sample_output_mono:
195 @ input: r0 = count, r1 = data, r2 = src, r3 = dst 483 @ input: r0 = this, r1 = src, r2 = dst
196 stmfd sp!, { r4-r6, lr } 484 stmfd sp!, { r4-r6, lr }
197 485
198 ldr r1, [r1] @ lr = data->output_scale 486 ldr r0, [r0] @ r0 = this->outcount
199 ldr r2, [r2] @ r2 = src[0] 487 ldr r3, [r2, #4] @ r2 = dst->p16out
488 ldr r2, [r1, #4] @ r1 = src->p32[0]
489 ldrb r1, [r1, #19] @ r2 = src->format.output_scale
200 490
201 mov r4, #1 491 mov r4, #1
202 mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) 492 mov r4, r4, lsl r1 @ r4 = 1 << (scale-1)
@@ -246,19 +536,21 @@ sample_output_mono:
246 .size sample_output_mono, .-sample_output_mono 536 .size sample_output_mono, .-sample_output_mono
247 537
248/**************************************************************************** 538/****************************************************************************
249 * void sample_output_stereo(int count, struct dsp_data *data, 539 * void sample_output_stereo(struct sample_io_data *this,
250 * const int32_t *src[], int16_t *dst) 540 * struct dsp_buffer *src,
541 * struct dsp_buffer *dst)
251 */ 542 */
252 .section .icode, "ax", %progbits 543 .section .icode
253 .align 2
254 .global sample_output_stereo 544 .global sample_output_stereo
255 .type sample_output_stereo, %function 545 .type sample_output_stereo, %function
256sample_output_stereo: 546sample_output_stereo:
257 @ input: r0 = count, r1 = data, r2 = src, r3 = dst 547 @ input: r0 = this, r1 = src, r2 = dst
258 stmfd sp!, { r4-r9, lr } 548 stmfd sp!, { r4-r9, lr }
259 549
260 ldr r1, [r1] @ r1 = data->output_scale 550 ldr r0, [r0] @ r0 = this->outcount
261 ldmia r2, { r2, r5 } @ r2 = src[0], r5 = src[1] 551 ldr r3, [r2, #4] @ r3 = dsp->p16out
552 ldmib r1, { r2, r5 } @ r2 = src->p32[0], r5 = src->p32[1]
553 ldrb r1, [r1, #19] @ r1 = src->format.output_scale
262 554
263 mov r4, #1 555 mov r4, #1
264 mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) 556 mov r4, r4, lsl r1 @ r4 = 1 << (scale-1)
@@ -330,232 +622,3 @@ sample_output_stereo:
330 ldmpc regs=r4-r9 622 ldmpc regs=r4-r9
331 .size sample_output_stereo, .-sample_output_stereo 623 .size sample_output_stereo, .-sample_output_stereo
332#endif /* ARM_ARCH < 6 */ 624#endif /* ARM_ARCH < 6 */
333
334/****************************************************************************
335 * void apply_crossfeed(int count, int32_t* src[])
 *
 * Runs the crossfeed filter in place over src[0] and src[1] for "count"
 * samples, using the global crossfeed_data block: 8 words (direct gain,
 * b0, b1, a1, four filter history words), a 13*2 word delay line, then
 * one word holding the current delay line pointer.
 *
 * NOTE(review): the loop only exits when the decremented count is exactly
 * zero, so count must be >= 1 on entry.
336 */
337 .section .text
338 .global apply_crossfeed
339apply_crossfeed:
340 @ unfortunately, we ended up in a bit of a register squeeze here, and need
341 @ to keep the count on the stack :/
342 stmdb sp!, { r4-r11, lr } @ stack modified regs
343 ldmia r1, { r2-r3 } @ r2 = src[0], r3 = src[1]
344
345 ldr r1, =crossfeed_data
346 ldmia r1!, { r4-r11 } @ load direct gain and filter data
347 mov r12, r0 @ better to ldm delay + count later
348 add r0, r1, #13*4*2 @ calculate end of delay
349 stmdb sp!, { r0, r12 } @ stack end of delay adr and count
350 ldr r0, [r1, #13*4*2] @ fetch current delay line address
351
352 /* Register usage in loop:
353 * r0 = &delay[index][0], r1 = accumulator high, r2 = src[0], r3 = src[1],
354 * r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
355 * r8-r11 = filter history, r12 = temp, r14 = accumulator low
356 */
357.cfloop:
358 smull r14, r1, r6, r8 @ acc = b1*dr[n - 1]
359 smlal r14, r1, r7, r9 @ acc += a1*y_l[n - 1]
360 ldr r8, [r0, #4] @ r8 = dr[n]
361 smlal r14, r1, r5, r8 @ acc += b0*dr[n]
362 mov r9, r1, lsl #1 @ fix format for filter history
363 ldr r12, [r2] @ load left input
364 smlal r14, r1, r4, r12 @ acc += gain*x_l[n]
365 mov r1, r1, lsl #1 @ fix format
366 str r1, [r2], #4 @ save result
367
368 smull r14, r1, r6, r10 @ acc = b1*dl[n - 1]
369 smlal r14, r1, r7, r11 @ acc += a1*y_r[n - 1]
370 ldr r10, [r0] @ r10 = dl[n]
371 str r12, [r0], #4 @ save left input to delay line
372 smlal r14, r1, r5, r10 @ acc += b0*dl[n]
373 mov r11, r1, lsl #1 @ fix format for filter history
374 ldr r12, [r3] @ load right input
375 smlal r14, r1, r4, r12 @ acc += gain*x_r[n]
376 str r12, [r0], #4 @ save right input to delay line
377 mov r1, r1, lsl #1 @ fix format
378 ldmia sp, { r12, r14 } @ fetch delay line end addr and count from stack
379 str r1, [r3], #4 @ save result
380
381 cmp r0, r12 @ need to wrap to start of delay?
382 subeq r0, r0, #13*4*2 @ wrap back delay line ptr to start
383
384 subs r14, r14, #1 @ are we finished?
385 strne r14, [sp, #4] @ nope, save count back to stack
386 bne .cfloop
387
388 @ save data back to struct
389 ldr r12, =crossfeed_data + 4*4
390 stmia r12, { r8-r11 } @ save filter history
391 str r0, [r12, #30*4] @ save delay line index
392 add sp, sp, #8 @ remove temp variables from stack
393 ldmpc regs=r4-r11
394 .size apply_crossfeed, .-apply_crossfeed
395
396/****************************************************************************
397 * int dsp_downsample(int count, struct dsp_data *data,
398 * int32_t *src[], int32_t *dst[])
 *
 * Linear-interpolation resampler for delta >= 1.0 in 16.16 fixed point.
 * For each channel (highest first) it writes
 * out = s[pos - 1] + frac*(s[pos] - s[pos - 1]) while pos < count, taking
 * s[-1] from last_sample[ch], and saves s[count - 1] as the next frame's
 * last_sample[ch]. The phase is stored back with count << 16 subtracted.
 *
 * Returns (r0) the number of samples written to dst[0].
399 */
400 .section .text
401 .global dsp_downsample
402dsp_downsample:
403 stmdb sp!, { r4-r11, lr } @ stack modified regs
404 ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta
405 sub r5, r5, #1 @ pre-decrement num_channels for use
406 add r4, r1, #12 @ r4 = &resample_data.phase
407 mov r12, #0xff
408 orr r12, r12, #0xff00 @ r12 = 0xffff
409.dschannel_loop:
410 ldr r1, [r4] @ r1 = resample_data.phase
411 ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1]
412 ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1]
413 add r9, r4, #4 @ r9 = &last_sample[0]
414 ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1]
415 sub r11, r0, #1
416 ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ...
417 str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample
418 movs r9, r1, lsr #16 @ r9 = pos = phase >> 16
419 ldreq r11, [r7] @ if pos = 0, load src[0] and jump into loop
420 beq .dsuse_last_start
421 cmp r9, r0 @ if pos >= count, we're already done
422 bge .dsloop_skip
423
424 @ Register usage in loop:
425 @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
426 @ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos]
427.dsloop:
428 add r9, r7, r9, lsl #2 @ r9 = &s[pos]
429 ldmda r9, { r10, r11 } @ r10 = s[pos - 1], r11 = s[pos]
430.dsuse_last_start:
431 sub r11, r11, r10 @ r11 = diff = s[pos] - s[pos - 1]
432 @ keep frac in lower bits to take advantage of multiplier early termination
433 and r9, r1, r12 @ frac = phase & 0xffff
434 smull r9, r14, r11, r9
435 add r1, r1, r6 @ phase += delta
436 add r10, r10, r9, lsr #16 @ r10 = out = s[pos - 1] + frac*diff
437 add r10, r10, r14, lsl #16
438 str r10, [r8], #4 @ *d++ = out
439 mov r9, r1, lsr #16 @ pos = phase >> 16
440 cmp r9, r0 @ pos < count?
441 blt .dsloop @ yup, do more samples
442.dsloop_skip:
443 subs r5, r5, #1
444 bpl .dschannel_loop @ if (--ch) >= 0, do another channel
445 sub r1, r1, r0, lsl #16 @ wrap phase back to start
446 str r1, [r4] @ store back
447 ldr r1, [r3] @ r1 = dst[0]
448 sub r8, r8, r1 @ d - dst[0] = bytes written to channel 0
449 mov r0, r8, lsr #2 @ convert bytes->samples
450 ldmpc regs=r4-r11 @ ... and we're out
451 .size dsp_downsample, .-dsp_downsample
452
453/****************************************************************************
454 * int dsp_upsample(int count, struct dsp_data *dsp,
455 * int32_t *src[], int32_t *dst[])
 *
 * Linear-interpolation resampler for delta < 1.0 in 16.16 fixed point.
 * The fractional phase and delta are kept in the top halfword so that the
 * carry out of "phase += delta" signals a step to the next source sample.
 * For each channel (highest first) it writes
 * out = s[pos - 1] + frac*(s[pos] - s[pos - 1]), taking s[-1] from
 * last_sample[ch], and saves s[count - 1] as the next frame's
 * last_sample[ch]. Only the fractional phase is stored back.
 *
 * Returns (r0) the number of samples written to dst[0].
456 */
457 .section .text
458 .global dsp_upsample
459dsp_upsample:
460 stmfd sp!, { r4-r11, lr } @ stack modified regs
461 ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta
462 sub r5, r5, #1 @ pre-decrement num_channels for use
463 add r4, r1, #12 @ r4 = &resample_data.phase
464 mov r6, r6, lsl #16 @ we'll use carry to detect pos increments
465 stmfd sp!, { r0, r4 } @ stack count and &resample_data.phase
466.uschannel_loop:
467 ldr r12, [r4] @ r12 = resample_data.phase
468 ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1]
469 ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1]
470 add r9, r4, #4 @ r9 = &last_sample[0]
471 mov r1, r12, lsl #16 @ we'll use carry to detect pos increments
472 sub r11, r0, #1
473 ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ...
474 ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1]
475 str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample
476 movs r14, r12, lsr #16 @ pos = resample_data.phase >> 16
477 beq .usstart_0 @ pos = 0
478 cmp r14, r0 @ if pos >= count, we're already done
479 bge .usloop_skip
480 add r7, r7, r14, lsl #2 @ r7 = &s[pos]
481 ldr r10, [r7, #-4] @ r10 = s[pos - 1]
482 b .usstart_0
483
484 @ Register usage in loop:
485 @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
486 @ r6 = delta, r7 = s, r8 = d, r9 = diff, r10 = s[pos - 1], r11 = s[pos]
487.usloop_1:
488 mov r10, r11 @ r10 = previous sample
489.usstart_0:
490 ldr r11, [r7], #4 @ r11 = next sample
491 mov r4, r1, lsr #16 @ r4 = frac = phase >> 16
492 sub r9, r11, r10 @ r9 = diff = s[pos] - s[pos - 1]
493.usloop_0:
494 smull r12, r14, r4, r9
495 adds r1, r1, r6 @ phase += delta << 16
496 mov r4, r1, lsr #16 @ r4 = frac = phase >> 16
497 add r14, r10, r14, lsl #16
498 add r14, r14, r12, lsr #16 @ r14 = out = s[pos - 1] + frac*diff
499 str r14, [r8], #4 @ *d++ = out
500 bcc .usloop_0 @ if carry is set, pos is incremented
501 subs r0, r0, #1 @ if count > 0, do another sample
502 bgt .usloop_1
503.usloop_skip:
504 subs r5, r5, #1
505 ldmfd sp, { r0, r4 } @ reload count and &resample_data.phase
506 bpl .uschannel_loop @ if (--ch) >= 0, do another channel
507 mov r1, r1, lsr #16 @ wrap phase back to start of next frame
508 ldr r2, [r3] @ r2 = dst[0]
509 str r1, [r4] @ store phase
510 sub r8, r8, r2 @ d - dst[0] = bytes written to channel 0
511 mov r0, r8, lsr #2 @ convert bytes->samples
512 add sp, sp, #8 @ adjust stack for temp variables
513 ldmpc regs=r4-r11 @ ... and we're out
514 .size dsp_upsample, .-dsp_upsample
515
516/****************************************************************************
517 * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
 *
 * Scales "count" samples of each of data->num_channels buffers in place by
 * data->gain using FRACMUL_SHL(x, gain, 8), i.e. bits 23 and up of the
 * 64-bit product. Samples are handled two at a time, with a single-sample
 * tail for odd counts.
 *
 * NOTE(review): a count of 0 is not special-cased - the pair loop would
 * still run once; presumably callers never pass 0; confirm.
518 */
519 .section .icode, "ax", %progbits
520 .align 2
521 .global dsp_apply_gain
522 .type dsp_apply_gain, %function
523dsp_apply_gain:
524 @ input: r0 = count, r1 = data, r2 = buf[]
525 stmfd sp!, { r4-r8, lr }
526
527 ldr r3, [r1, #4] @ r3 = data->num_channels
528 ldr r4, [r1, #32] @ r4 = data->gain
529
530.dag_outerloop:
531 ldr r1, [r2], #4 @ r1 = buf[0] and increment index of buf[]
532 subs r12, r0, #1 @ r12 = count - 1 (r0 is left untouched)
533 beq .dag_singlesample @ Zero? Only one sample!
534
535.dag_innerloop:
536 ldmia r1, { r5, r6 } @ load r5, r6 from r1
537 smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
538 smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8)
539 subs r12, r12, #2
540 mov r7, r7, lsr #23
541 mov r14, r14, lsr #23
542 orr r7, r7, r8, asl #9
543 orr r14, r14, r5, asl #9
544 stmia r1!, { r7, r14 } @ save r7, r14 to [r1] and increment r1
545 bgt .dag_innerloop @ end of inner loop
546
547 blt .dag_evencount @ < 0? even count
548
549.dag_singlesample:
550 ldr r5, [r1] @ handle odd sample
551 smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
552 mov r7, r7, lsr #23
553 orr r7, r7, r8, asl #9
554 str r7, [r1]
555
556.dag_evencount:
557 subs r3, r3, #1
558 bgt .dag_outerloop @ end of outer loop
559
560 ldmpc regs=r4-r8
561 .size dsp_apply_gain, .-dsp_apply_gain