diff options
Diffstat (limited to 'lib/rbcodec/dsp/dsp_arm.S')
-rw-r--r-- | lib/rbcodec/dsp/dsp_arm.S | 621 |
1 files changed, 342 insertions, 279 deletions
diff --git a/lib/rbcodec/dsp/dsp_arm.S b/lib/rbcodec/dsp/dsp_arm.S index 685aca411c..9fd19ae108 100644 --- a/lib/rbcodec/dsp/dsp_arm.S +++ b/lib/rbcodec/dsp/dsp_arm.S | |||
@@ -21,20 +21,19 @@ | |||
21 | #include "config.h" | 21 | #include "config.h" |
22 | 22 | ||
23 | /**************************************************************************** | 23 | /**************************************************************************** |
24 | * void channels_process_sound_chan_mono(int count, int32_t *buf[]) | 24 | * void channel_mode_proc_mono(struct dsp_proc_entry *this, |
25 | * struct dsp_buffer **buf_p) | ||
25 | */ | 26 | */ |
26 | 27 | .section .icode | |
27 | #include "config.h" | 28 | .global channel_mode_proc_mono |
28 | 29 | .type channel_mode_proc_mono, %function | |
29 | .section .icode, "ax", %progbits | 30 | channel_mode_proc_mono: |
30 | .align 2 | 31 | @ input: r0 = this, r1 = buf_p |
31 | .global channels_process_sound_chan_mono | 32 | ldr r1, [r1] @ r1 = buf = *buf_p; |
32 | .type channels_process_sound_chan_mono, %function | ||
33 | channels_process_sound_chan_mono: | ||
34 | @ input: r0 = count, r1 = buf | ||
35 | stmfd sp!, { r4, lr } @ | 33 | stmfd sp!, { r4, lr } @ |
36 | @ | 34 | @ |
37 | ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] | 35 | ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0], |
36 | @ r2 = buf->p32[1] | ||
38 | subs r0, r0, #1 @ odd: end at 0; even: end at -1 | 37 | subs r0, r0, #1 @ odd: end at 0; even: end at -1 |
39 | beq .mono_singlesample @ Zero? Only one sample! | 38 | beq .mono_singlesample @ Zero? Only one sample! |
40 | @ | 39 | @ |
@@ -61,25 +60,26 @@ channels_process_sound_chan_mono: | |||
61 | str r12, [r2] @ store Mo | 60 | str r12, [r2] @ store Mo |
62 | @ | 61 | @ |
63 | ldmpc regs=r4 @ | 62 | ldmpc regs=r4 @ |
64 | .size channels_process_sound_chan_mono, \ | 63 | .size channel_mode_proc_mono, .-channel_mode_proc_mono |
65 | .-channels_process_sound_chan_mono | ||
66 | 64 | ||
67 | /**************************************************************************** | 65 | /**************************************************************************** |
68 | * void channels_process_sound_chan_custom(int count, int32_t *buf[]) | 66 | * void channel_mode_proc_custom(struct dsp_proc_entry *this, |
67 | * struct dsp_buffer **buf_p) | ||
69 | */ | 68 | */ |
70 | .section .icode, "ax", %progbits | 69 | .section .icode |
71 | .align 2 | 70 | .global channel_mode_proc_custom |
72 | .global channels_process_sound_chan_custom | 71 | .type channel_mode_proc_custom, %function |
73 | .type channels_process_sound_chan_custom, %function | 72 | channel_mode_proc_custom: |
74 | channels_process_sound_chan_custom: | 73 | @ input: r0 = this, r1 = buf_p |
74 | ldr r2, [r0] @ r2 = &channel_mode_data = this->data | ||
75 | ldr r1, [r1] @ r1 = buf = *buf_p; | ||
76 | |||
75 | stmfd sp!, { r4-r10, lr } | 77 | stmfd sp!, { r4-r10, lr } |
76 | 78 | ||
77 | ldr r3, =dsp_sw_gain | 79 | ldmia r2, { r3, r4 } @ r3 = sw_gain, r4 = sw_cross |
78 | ldr r4, =dsp_sw_cross | ||
79 | 80 | ||
80 | ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] | 81 | ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0], |
81 | ldr r3, [r3] @ r3 = dsp_sw_gain | 82 | @ r2 = buf->p32[1] |
82 | ldr r4, [r4] @ r4 = dsp_sw_cross | ||
83 | 83 | ||
84 | subs r0, r0, #1 | 84 | subs r0, r0, #1 |
85 | beq .custom_single_sample @ Zero? Only one sample! | 85 | beq .custom_single_sample @ Zero? Only one sample! |
@@ -135,21 +135,22 @@ channels_process_sound_chan_custom: | |||
135 | str r7, [r2] @ Store Rc0 | 135 | str r7, [r2] @ Store Rc0 |
136 | 136 | ||
137 | ldmpc regs=r4-r10 | 137 | ldmpc regs=r4-r10 |
138 | .size channels_process_sound_chan_custom, \ | 138 | .size channel_mode_proc_custom, .-channel_mode_proc_custom |
139 | .-channels_process_sound_chan_custom | ||
140 | 139 | ||
141 | /**************************************************************************** | 140 | /**************************************************************************** |
142 | * void channels_process_sound_chan_karaoke(int count, int32_t *buf[]) | 141 | * void channel_mode_proc_karaoke(struct dsp_proc_entry *this, |
142 | * struct dsp_buffer **buf_p) | ||
143 | */ | 143 | */ |
144 | .section .icode, "ax", %progbits | 144 | .section .icode |
145 | .align 2 | 145 | .global channel_mode_proc_karaoke |
146 | .global channels_process_sound_chan_karaoke | 146 | .type channel_mode_proc_karaoke, %function |
147 | .type channels_process_sound_chan_karaoke, %function | 147 | channel_mode_proc_karaoke: |
148 | channels_process_sound_chan_karaoke: | 148 | @ input: r0 = this, r1 = buf_p |
149 | @ input: r0 = count, r1 = buf | 149 | ldr r1, [r1] @ r1 = buf = *buf_p; |
150 | stmfd sp!, { r4, lr } @ | 150 | stmfd sp!, { r4, lr } @ |
151 | @ | 151 | @ |
152 | ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] | 152 | ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0], |
153 | @ r2 = buf->p32[1] | ||
153 | subs r0, r0, #1 @ odd: end at 0; even: end at -1 | 154 | subs r0, r0, #1 @ odd: end at 0; even: end at -1 |
154 | beq .karaoke_singlesample @ Zero? Only one sample! | 155 | beq .karaoke_singlesample @ Zero? Only one sample! |
155 | @ | 156 | @ |
@@ -179,24 +180,313 @@ channels_process_sound_chan_karaoke: | |||
179 | str r12, [r2] @ store Ro | 180 | str r12, [r2] @ store Ro |
180 | @ | 181 | @ |
181 | ldmpc regs=r4 @ | 182 | ldmpc regs=r4 @ |
182 | .size channels_process_sound_chan_karaoke, \ | 183 | .size channel_mode_proc_karaoke, .-channel_mode_proc_karaoke |
183 | .-channels_process_sound_chan_karaoke | 184 | |
185 | /**************************************************************************** | ||
186 | * void crossfeed_process(struct dsp_proc_entry *this, | ||
187 | * struct dsp_buffer **buf_p) | ||
188 | */ | ||
189 | .section .text | ||
190 | .global crossfeed_process | ||
191 | crossfeed_process: | ||
192 | @ input: r0 = this, r1 = buf_p | ||
193 | @ unfortunately, we ended up in a bit of a register squeeze here, and need | ||
194 | @ to keep the count on the stack :/ | ||
195 | ldr r1, [r1] @ r1 = buf = *buf_p; | ||
196 | stmfd sp!, { r4-r11, lr } @ stack modified regs | ||
197 | ldr r12, [r1] @ r12 = buf->remcount | ||
198 | ldr r14, [r0] @ r14 = this->data = &crossfeed_state | ||
199 | ldmib r1, { r2-r3 } @ r2 = buf->p32[0], r3 = buf->p32[1] | ||
200 | ldmia r14!, { r4-r11 } @ load direct gain and filter data | ||
201 | add r0, r14, #13*2*4 @ calculate end of delay | ||
202 | stmfd sp!, { r0, r12 } @ stack end of delay adr and count | ||
203 | ldr r0, [r0] @ fetch current delay line address | ||
204 | |||
205 | /* Register usage in loop: | ||
206 | * r0 = &delay[index][0], r1 = accumulator high, r2 = buf->p32[0], | ||
207 | * r3 = buf->p32[1], r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs), | ||
208 | * r8-r11 = filter history, r12 = temp, r14 = accumulator low | ||
209 | */ | ||
210 | .cfloop: | ||
211 | smull r14, r1, r6, r8 @ acc = b1*dr[n - 1] | ||
212 | smlal r14, r1, r7, r9 @ acc += a1*y_l[n - 1] | ||
213 | ldr r8, [r0, #4] @ r8 = dr[n] | ||
214 | smlal r14, r1, r5, r8 @ acc += b0*dr[n] | ||
215 | mov r9, r1, lsl #1 @ fix format for filter history | ||
216 | ldr r12, [r2] @ load left input | ||
217 | smlal r14, r1, r4, r12 @ acc += gain*x_l[n] | ||
218 | mov r1, r1, lsl #1 @ fix format | ||
219 | str r1, [r2], #4 @ save result | ||
220 | |||
221 | smull r14, r1, r6, r10 @ acc = b1*dl[n - 1] | ||
222 | smlal r14, r1, r7, r11 @ acc += a1*y_r[n - 1] | ||
223 | ldr r10, [r0] @ r10 = dl[n] | ||
224 | str r12, [r0], #4 @ save left input to delay line | ||
225 | smlal r14, r1, r5, r10 @ acc += b0*dl[n] | ||
226 | mov r11, r1, lsl #1 @ fix format for filter history | ||
227 | ldr r12, [r3] @ load right input | ||
228 | smlal r14, r1, r4, r12 @ acc += gain*x_r[n] | ||
229 | str r12, [r0], #4 @ save right input to delay line | ||
230 | mov r1, r1, lsl #1 @ fix format | ||
231 | ldmia sp, { r12, r14 } @ fetch delay line end addr and count from stack | ||
232 | str r1, [r3], #4 @ save result | ||
233 | |||
234 | cmp r0, r12 @ need to wrap to start of delay? | ||
235 | subhs r0, r12, #13*2*4 @ wrap back delay line ptr to start | ||
236 | |||
237 | subs r14, r14, #1 @ are we finished? | ||
238 | strgt r14, [sp, #4] @ nope, save count back to stack | ||
239 | bgt .cfloop | ||
240 | |||
241 | @ save data back to struct | ||
242 | str r0, [r12] @ save delay line index | ||
243 | sub r12, r12, #13*2*4 + 4*4 @ r12 = data->history | ||
244 | stmia r12, { r8-r11 } @ save filter history | ||
245 | add sp, sp, #8 @ remove temp variables from stack | ||
246 | ldmpc regs=r4-r11 | ||
247 | .size crossfeed_process, .-crossfeed_process | ||
248 | |||
249 | /**************************************************************************** | ||
250 | * int lin_resample_resample(struct resample_data *data, | ||
251 | * struct dsp_buffer *src, | ||
252 | * struct dsp_buffer *dst) | ||
253 | */ | ||
254 | .section .text | ||
255 | .global lin_resample_resample | ||
256 | lin_resample_resample: | ||
257 | @input: r0 = data, r1 = src, r2 = dst | ||
258 | stmfd sp!, { r4-r11, lr } @ stack modified regs | ||
259 | ldr r4, [r0] @ r4 = data->delta | ||
260 | add r10, r0, #4 @ r10 = &data->phase | ||
261 | ldrb r3, [r1, #17] @ r3 = src->format.num_channels | ||
262 | stmfd sp!, { r1, r10 } @ stack src, &data->phase | ||
263 | .lrs_channel_loop: | ||
264 | ldr r5, [r10] @ r5 = data->phase | ||
265 | ldr r6, [r1] @ r6 = srcrem = src->remcount | ||
266 | ldr r7, [r1, r3, lsl #2] @ r7 = src->p32[ch] | ||
267 | ldr r8, [r2, r3, lsl #2] @ r8 = dst->p32[ch] | ||
268 | ldr r9, [r2, #12] @ r9 = dstrem = dst->bufcount | ||
269 | |||
270 | cmp r6, #0x8000 @ srcrem = MIN(srcrem, 0x8000) | ||
271 | movgt r6, #0x8000 @ | ||
272 | mov r0, r5, lsr #16 @ pos = MIN(pos, srcrem) | ||
273 | cmp r0, r6 @ | ||
274 | movgt r0, r6 @ r0 = pos = phase >> 16 | ||
275 | cmp r0, #0 @ | ||
276 | ldrle r11, [r10, r3, lsl #2] @ pos <= 0? r11 = last = last_sample[ch] | ||
277 | addgt r12, r7, r0, lsl #2 @ pos > 0? r11 = last = s[pos - 1] | ||
278 | ldrgt r11, [r12, #-4] @ | ||
279 | cmp r0, r6 @ | ||
280 | bge .lrs_channel_done @ pos >= count? channel complete | ||
281 | |||
282 | cmp r4, #0x10000 @ delta >= 1.0? | ||
283 | ldrhs r12, [r7, r0, lsl #2] @ yes? r12 = s[pos] | ||
284 | bhs .lrs_dsstart @ yes? is downsampling | ||
285 | |||
286 | /** Upsampling **/ | ||
287 | mov r5, r5, lsl #16 @ Move phase into high halfword | ||
288 | add r7, r7, r0, lsl #2 @ r7 = &s[pos] | ||
289 | sub r0, r6, r0 @ r0 = dte = srcrem - pos | ||
290 | .lrs_usloop_1: | ||
291 | ldr r12, [r7], #4 @ r12 = s[pos] | ||
292 | sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1] | ||
293 | .lrs_usloop_0: | ||
294 | mov r1, r5, lsr #16 @ r1 = frac = phase >> 16 | ||
295 | @ keep frac in Rs to take advantage of multiplier early termination | ||
296 | smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi) | ||
297 | add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff | ||
298 | add r1, r1, r10, lsl #16 @ | ||
299 | str r1, [r8], #4 @ *d++ = out | ||
300 | subs r9, r9, #1 @ destination full? | ||
301 | bls .lrs_usfull @ yes? channel is done | ||
302 | adds r5, r5, r4, lsl #16 @ phase += delta << 16 | ||
303 | bcc .lrs_usloop_0 @ if carry is set, pos is incremented | ||
304 | subs r0, r0, #1 @ if srcrem > 0, do another sample | ||
305 | mov r11, r12 @ r11 = last = s[pos-1] (pos changed) | ||
306 | bgt .lrs_usloop_1 | ||
307 | b .lrs_usdone | ||
308 | |||
309 | .lrs_usfull: | ||
310 | adds r5, r5, r4, lsl #16 @ do missed phase increment | ||
311 | subcs r0, r0, #1 @ do missed srcrem decrement | ||
312 | movcs r11, r12 @ r11 = s[pos-1] (pos changed) | ||
313 | |||
314 | .lrs_usdone: | ||
315 | sub r0, r6, r0 @ r0 = pos = srcrem - dte | ||
316 | orr r5, r5, r0 @ reconstruct swapped phase | ||
317 | mov r5, r5, ror #16 @ swap pos and frac for phase | ||
318 | b .lrs_channel_done @ | ||
319 | |||
320 | /** Downsampling **/ | ||
321 | .lrs_dsloop: | ||
322 | add r10, r7, r0, lsl #2 @ r10 = &s[pos] | ||
323 | ldmda r10, { r11, r12 } @ r11 = last, r12 = s[pos] | ||
324 | .lrs_dsstart: | ||
325 | sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1] | ||
326 | @ keep frac in Rs to take advantage of multiplier early termination | ||
327 | bic r1, r5, r0, lsl #16 @ frac = phase & 0xffff | ||
328 | smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi) | ||
329 | add r5, r5, r4 @ phase += delta | ||
330 | subs r9, r9, #1 @ destination full? ... | ||
331 | mov r0, r5, lsr #16 @ pos = phase >> 16 | ||
332 | add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff | ||
333 | add r1, r1, r10, lsl #16 @ | ||
334 | str r1, [r8], #4 @ *d++ = out | ||
335 | cmpgt r6, r0 @ ... || pos >= srcrem? ... | ||
336 | bgt .lrs_dsloop @ ... no, do more samples | ||
337 | |||
338 | cmp r0, r6 @ pos = MIN(pos, srcrem) | ||
339 | movgt r0, r6 @ | ||
340 | sub r1, r0, #1 @ pos must always be > 0 since step >= 1.0 | ||
341 | ldr r11, [r7, r1, lsl #2] @ r11 = s[pos - 1] | ||
342 | |||
343 | .lrs_channel_done: | ||
344 | ldmia sp, { r1, r10 } @ recover src, &data->phase | ||
345 | str r11, [r10, r3, lsl #2] @ last_sample[ch] = last | ||
346 | subs r3, r3, #1 @ | ||
347 | bgt .lrs_channel_loop @ | ||
348 | |||
349 | ldr r6, [r2, #12] @ r6 = dst->bufcount | ||
350 | sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16) | ||
351 | str r5, [r10] @ data->phase = r5 | ||
352 | sub r6, r6, r9 @ r6 = dst->bufcount - dstrem = dstcount | ||
353 | str r6, [r2] @ dst->remcount = dstcount | ||
354 | add sp, sp, #8 @ adjust stack for temp variables | ||
355 | ldmpc regs=r4-r11 @ ... and we're out | ||
356 | .size lin_resample_resample, .-lin_resample_resample | ||
357 | |||
358 | /**************************************************************************** | ||
359 | * void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p) | ||
360 | */ | ||
361 | .section .icode | ||
362 | .global pga_process | ||
363 | .type pga_process, %function | ||
364 | pga_process: | ||
365 | @ input: r0 = this, r1 = buf_p | ||
366 | ldr r0, [r0] @ r0 = data = this->data (&pga_data) | ||
367 | ldr r1, [r1] @ r1 = buf = *buf_p; | ||
368 | stmfd sp!, { r4-r8, lr } | ||
369 | |||
370 | ldr r4, [r0] @ r4 = data->gain | ||
371 | ldr r0, [r1], #4 @ r0 = buf->remcount, r1 = buf->p32 | ||
372 | ldrb r3, [r1, #13] @ r3 = buf->format.num_channels | ||
373 | |||
374 | .pga_channelloop: | ||
375 | ldr r2, [r1], #4 @ r2 = buf->p32[ch] and inc index of p32 | ||
376 | subs r12, r0, #1 @ r12 = count - 1 | ||
377 | beq .pga_singlesample @ Zero? Only one sample! | ||
378 | |||
379 | .pga_loop: | ||
380 | ldmia r2, { r5, r6 } @ load r5, r6 from r2 (*p32[ch]) | ||
381 | smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8) | ||
382 | smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8) | ||
383 | subs r12, r12, #2 | ||
384 | mov r7, r7, lsr #23 | ||
385 | mov r14, r14, lsr #23 | ||
386 | orr r7, r7, r8, asl #9 | ||
387 | orr r14, r14, r5, asl #9 | ||
388 | stmia r2!, { r7, r14 } @ save r7, r14 to *p32[ch] and increment | ||
389 | bgt .pga_loop @ end of pga loop | ||
390 | |||
391 | blt .pga_evencount @ < 0? even count | ||
392 | |||
393 | .pga_singlesample: | ||
394 | ldr r5, [r2] @ handle odd sample | ||
395 | smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8) | ||
396 | mov r7, r7, lsr #23 | ||
397 | orr r7, r7, r8, asl #9 | ||
398 | str r7, [r2] | ||
399 | |||
400 | .pga_evencount: | ||
401 | subs r3, r3, #1 | ||
402 | bgt .pga_channelloop @ end of channel loop | ||
403 | |||
404 | ldmpc regs=r4-r8 | ||
405 | .size pga_process, .-pga_process | ||
406 | |||
407 | /**************************************************************************** | ||
408 | * void filter_process(struct dsp_filter *f, int32_t *buf[], int count, | ||
409 | * unsigned int channels) | ||
410 | * | ||
411 | * define HIGH_PRECISION as '1' to make filtering calculate lower bits after | ||
412 | * shifting. without this, "shift" - 1 of the lower bits will be lost here. | ||
413 | */ | ||
414 | #define HIGH_PRECISION 0 | ||
415 | |||
416 | #if CONFIG_CPU == PP5002 | ||
417 | .section .icode,"ax",%progbits | ||
418 | #else | ||
419 | .text | ||
420 | #endif | ||
421 | .global filter_process | ||
422 | filter_process: | ||
423 | @input: r0 = f, r1 = buf, r2 = count, r3 = channels | ||
424 | stmfd sp!, { r4-r11, lr } @ save all clobbered regs | ||
425 | ldmia r0!, { r4-r8 } @ load coefs, r0 = f->history | ||
426 | sub r3, r3, #1 @ r3 = ch = channels - 1 | ||
427 | stmfd sp!, { r0-r3 } @ save adjusted params | ||
428 | ldrb r14, [r0, #32] @ r14 = shift | ||
429 | |||
430 | @ Channels are processed high to low while history is saved low to high | ||
431 | @ It's really no one's business how we do this | ||
432 | .fp_channelloop: | ||
433 | ldmia r0, { r9-r12 } @ load history, r0 = history[channels-ch-1] | ||
434 | ldr r3, [r1, r3, lsl #2] @ r3 = buf[ch] | ||
435 | |||
436 | @ r9-r12 = history, r4-r8 = coefs, r0..r1 = accumulator, | ||
437 | @ r2 = number of samples, r3 = buf[ch], r14 = shift amount | ||
438 | .fp_loop: | ||
439 | @ Direct form 1 filtering code. | ||
440 | @ y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], | ||
441 | @ where y[] is output and x[] is input. This is performed out of order to | ||
442 | @ reuse registers, we're pretty short on regs. | ||
443 | smull r0, r1, r5, r9 @ acc = b1*x[i - 1] | ||
444 | smlal r0, r1, r6, r10 @ acc += b2*x[i - 2] | ||
445 | mov r10, r9 @ fix input history | ||
446 | ldr r9, [r3] @ load input and fix history | ||
447 | smlal r0, r1, r7, r11 @ acc += a1*y[i - 1] | ||
448 | smlal r0, r1, r8, r12 @ acc += a2*y[i - 2] | ||
449 | smlal r0, r1, r4, r9 @ acc += b0*x[i] /* avoid stall on arm9 */ | ||
450 | mov r12, r11 @ fix output history | ||
451 | mov r11, r1, asl r14 @ get upper part of result and shift left | ||
452 | #if HIGH_PRECISION | ||
453 | rsb r1, r14, #32 @ get shift amount for lower part | ||
454 | orr r11, r11, r0, lsr r1 @ then mix in correctly shifted lower part | ||
455 | #endif | ||
456 | str r11, [r3], #4 @ save result | ||
457 | subs r2, r2, #1 @ are we done with this channel? | ||
458 | bgt .fp_loop @ | ||
459 | |||
460 | ldr r3, [sp, #12] @ r3 = ch | ||
461 | ldr r0, [sp] @ r0 = history[channels-ch-1] | ||
462 | subs r3, r3, #1 @ all channels processed? | ||
463 | stmia r0!, { r9-r12 } @ save back history, history++ | ||
464 | ldmhsib sp, { r1-r2 } @ r1 = buf, r2 = count | ||
465 | strhs r3, [sp, #12] @ store ch | ||
466 | strhs r0, [sp] @ store history[channels-ch-1] | ||
467 | bhs .fp_channelloop | ||
468 | |||
469 | add sp, sp, #16 @ compensate for temp storage | ||
470 | ldmpc regs=r4-r11 | ||
471 | .size filter_process, .-filter_process | ||
184 | 472 | ||
185 | #if ARM_ARCH < 6 | 473 | #if ARM_ARCH < 6 |
186 | /**************************************************************************** | 474 | /**************************************************************************** |
187 | * void sample_output_mono(int count, struct dsp_data *data, | 475 | * void sample_output_mono(struct sample_io_data *this, |
188 | * const int32_t *src[], int16_t *dst) | 476 | * struct dsp_buffer *src, |
477 | * struct dsp_buffer *dst) | ||
189 | */ | 478 | */ |
190 | .section .icode, "ax", %progbits | 479 | .section .icode |
191 | .align 2 | ||
192 | .global sample_output_mono | 480 | .global sample_output_mono |
193 | .type sample_output_mono, %function | 481 | .type sample_output_mono, %function |
194 | sample_output_mono: | 482 | sample_output_mono: |
195 | @ input: r0 = count, r1 = data, r2 = src, r3 = dst | 483 | @ input: r0 = this, r1 = src, r2 = dst |
196 | stmfd sp!, { r4-r6, lr } | 484 | stmfd sp!, { r4-r6, lr } |
197 | 485 | ||
198 | ldr r1, [r1] @ r1 = data->output_scale | 486 | ldr r0, [r0] @ r0 = this->outcount |
199 | ldr r2, [r2] @ r2 = src[0] | 487 | ldr r3, [r2, #4] @ r3 = dst->p16out |
488 | ldr r2, [r1, #4] @ r2 = src->p32[0] | ||
489 | ldrb r1, [r1, #19] @ r1 = src->format.output_scale | ||
200 | 490 | ||
201 | mov r4, #1 | 491 | mov r4, #1 |
202 | mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) | 492 | mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) |
@@ -246,19 +536,21 @@ sample_output_mono: | |||
246 | .size sample_output_mono, .-sample_output_mono | 536 | .size sample_output_mono, .-sample_output_mono |
247 | 537 | ||
248 | /**************************************************************************** | 538 | /**************************************************************************** |
249 | * void sample_output_stereo(int count, struct dsp_data *data, | 539 | * void sample_output_stereo(struct sample_io_data *this, |
250 | * const int32_t *src[], int16_t *dst) | 540 | * struct dsp_buffer *src, |
541 | * struct dsp_buffer *dst) | ||
251 | */ | 542 | */ |
252 | .section .icode, "ax", %progbits | 543 | .section .icode |
253 | .align 2 | ||
254 | .global sample_output_stereo | 544 | .global sample_output_stereo |
255 | .type sample_output_stereo, %function | 545 | .type sample_output_stereo, %function |
256 | sample_output_stereo: | 546 | sample_output_stereo: |
257 | @ input: r0 = count, r1 = data, r2 = src, r3 = dst | 547 | @ input: r0 = this, r1 = src, r2 = dst |
258 | stmfd sp!, { r4-r9, lr } | 548 | stmfd sp!, { r4-r9, lr } |
259 | 549 | ||
260 | ldr r1, [r1] @ r1 = data->output_scale | 550 | ldr r0, [r0] @ r0 = this->outcount |
261 | ldmia r2, { r2, r5 } @ r2 = src[0], r5 = src[1] | 551 | ldr r3, [r2, #4] @ r3 = dst->p16out |
552 | ldmib r1, { r2, r5 } @ r2 = src->p32[0], r5 = src->p32[1] | ||
553 | ldrb r1, [r1, #19] @ r1 = src->format.output_scale | ||
262 | 554 | ||
263 | mov r4, #1 | 555 | mov r4, #1 |
264 | mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) | 556 | mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) |
@@ -330,232 +622,3 @@ sample_output_stereo: | |||
330 | ldmpc regs=r4-r9 | 622 | ldmpc regs=r4-r9 |
331 | .size sample_output_stereo, .-sample_output_stereo | 623 | .size sample_output_stereo, .-sample_output_stereo |
332 | #endif /* ARM_ARCH < 6 */ | 624 | #endif /* ARM_ARCH < 6 */ |
333 | |||
334 | /**************************************************************************** | ||
335 | * void apply_crossfeed(int count, int32_t* src[]) | ||
336 | */ | ||
337 | .section .text | ||
338 | .global apply_crossfeed | ||
339 | apply_crossfeed: | ||
340 | @ unfortunately, we ended up in a bit of a register squeeze here, and need | ||
341 | @ to keep the count on the stack :/ | ||
342 | stmdb sp!, { r4-r11, lr } @ stack modified regs | ||
343 | ldmia r1, { r2-r3 } @ r2 = src[0], r3 = src[1] | ||
344 | |||
345 | ldr r1, =crossfeed_data | ||
346 | ldmia r1!, { r4-r11 } @ load direct gain and filter data | ||
347 | mov r12, r0 @ better to ldm delay + count later | ||
348 | add r0, r1, #13*4*2 @ calculate end of delay | ||
349 | stmdb sp!, { r0, r12 } @ stack end of delay adr and count | ||
350 | ldr r0, [r1, #13*4*2] @ fetch current delay line address | ||
351 | |||
352 | /* Register usage in loop: | ||
353 | * r0 = &delay[index][0], r1 = accumulator high, r2 = src[0], r3 = src[1], | ||
354 | * r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs), | ||
355 | * r8-r11 = filter history, r12 = temp, r14 = accumulator low | ||
356 | */ | ||
357 | .cfloop: | ||
358 | smull r14, r1, r6, r8 @ acc = b1*dr[n - 1] | ||
359 | smlal r14, r1, r7, r9 @ acc += a1*y_l[n - 1] | ||
360 | ldr r8, [r0, #4] @ r8 = dr[n] | ||
361 | smlal r14, r1, r5, r8 @ acc += b0*dr[n] | ||
362 | mov r9, r1, lsl #1 @ fix format for filter history | ||
363 | ldr r12, [r2] @ load left input | ||
364 | smlal r14, r1, r4, r12 @ acc += gain*x_l[n] | ||
365 | mov r1, r1, lsl #1 @ fix format | ||
366 | str r1, [r2], #4 @ save result | ||
367 | |||
368 | smull r14, r1, r6, r10 @ acc = b1*dl[n - 1] | ||
369 | smlal r14, r1, r7, r11 @ acc += a1*y_r[n - 1] | ||
370 | ldr r10, [r0] @ r10 = dl[n] | ||
371 | str r12, [r0], #4 @ save left input to delay line | ||
372 | smlal r14, r1, r5, r10 @ acc += b0*dl[n] | ||
373 | mov r11, r1, lsl #1 @ fix format for filter history | ||
374 | ldr r12, [r3] @ load right input | ||
375 | smlal r14, r1, r4, r12 @ acc += gain*x_r[n] | ||
376 | str r12, [r0], #4 @ save right input to delay line | ||
377 | mov r1, r1, lsl #1 @ fix format | ||
378 | ldmia sp, { r12, r14 } @ fetch delay line end addr and count from stack | ||
379 | str r1, [r3], #4 @ save result | ||
380 | |||
381 | cmp r0, r12 @ need to wrap to start of delay? | ||
382 | subeq r0, r0, #13*4*2 @ wrap back delay line ptr to start | ||
383 | |||
384 | subs r14, r14, #1 @ are we finished? | ||
385 | strne r14, [sp, #4] @ nope, save count back to stack | ||
386 | bne .cfloop | ||
387 | |||
388 | @ save data back to struct | ||
389 | ldr r12, =crossfeed_data + 4*4 | ||
390 | stmia r12, { r8-r11 } @ save filter history | ||
391 | str r0, [r12, #30*4] @ save delay line index | ||
392 | add sp, sp, #8 @ remove temp variables from stack | ||
393 | ldmpc regs=r4-r11 | ||
394 | .size apply_crossfeed, .-apply_crossfeed | ||
395 | |||
396 | /**************************************************************************** | ||
397 | * int dsp_downsample(int count, struct dsp_data *data, | ||
398 | * int32_t *src[], int32_t *dst[]) | ||
399 | */ | ||
400 | .section .text | ||
401 | .global dsp_downsample | ||
402 | dsp_downsample: | ||
403 | stmdb sp!, { r4-r11, lr } @ stack modified regs | ||
404 | ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta | ||
405 | sub r5, r5, #1 @ pre-decrement num_channels for use | ||
406 | add r4, r1, #12 @ r4 = &resample_data.phase | ||
407 | mov r12, #0xff | ||
408 | orr r12, r12, #0xff00 @ r12 = 0xffff | ||
409 | .dschannel_loop: | ||
410 | ldr r1, [r4] @ r1 = resample_data.phase | ||
411 | ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1] | ||
412 | ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1] | ||
413 | add r9, r4, #4 @ r9 = &last_sample[0] | ||
414 | ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1] | ||
415 | sub r11, r0, #1 | ||
416 | ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ... | ||
417 | str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample | ||
418 | movs r9, r1, lsr #16 @ r9 = pos = phase >> 16 | ||
419 | ldreq r11, [r7] @ if pos = 0, load src[0] and jump into loop | ||
420 | beq .dsuse_last_start | ||
421 | cmp r9, r0 @ if pos >= count, we're already done | ||
422 | bge .dsloop_skip | ||
423 | |||
424 | @ Register usage in loop: | ||
425 | @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel, | ||
426 | @ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos] | ||
427 | .dsloop: | ||
428 | add r9, r7, r9, lsl #2 @ r9 = &s[pos] | ||
429 | ldmda r9, { r10, r11 } @ r10 = s[pos - 1], r11 = s[pos] | ||
430 | .dsuse_last_start: | ||
431 | sub r11, r11, r10 @ r11 = diff = s[pos] - s[pos - 1] | ||
432 | @ keep frac in lower bits to take advantage of multiplier early termination | ||
433 | and r9, r1, r12 @ frac = phase & 0xffff | ||
434 | smull r9, r14, r11, r9 | ||
435 | add r1, r1, r6 @ phase += delta | ||
436 | add r10, r10, r9, lsr #16 @ r10 = out = s[pos - 1] + frac*diff | ||
437 | add r10, r10, r14, lsl #16 | ||
438 | str r10, [r8], #4 @ *d++ = out | ||
439 | mov r9, r1, lsr #16 @ pos = phase >> 16 | ||
440 | cmp r9, r0 @ pos < count? | ||
441 | blt .dsloop @ yup, do more samples | ||
442 | .dsloop_skip: | ||
443 | subs r5, r5, #1 | ||
444 | bpl .dschannel_loop @ if (--ch) >= 0, do another channel | ||
445 | sub r1, r1, r0, lsl #16 @ wrap phase back to start | ||
446 | str r1, [r4] @ store back | ||
447 | ldr r1, [r3] @ r1 = dst[0] | ||
448 | sub r8, r8, r1 @ d - dst[0] | ||
449 | mov r0, r8, lsr #2 @ convert bytes->samples | ||
450 | ldmpc regs=r4-r11 @ ... and we're out | ||
451 | .size dsp_downsample, .-dsp_downsample | ||
452 | |||
453 | /**************************************************************************** | ||
454 | * int dsp_upsample(int count, struct dsp_data *dsp, | ||
455 | * int32_t *src[], int32_t *dst[]) | ||
456 | */ | ||
457 | .section .text | ||
458 | .global dsp_upsample | ||
459 | dsp_upsample: | ||
460 | stmfd sp!, { r4-r11, lr } @ stack modified regs | ||
461 | ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta | ||
462 | sub r5, r5, #1 @ pre-decrement num_channels for use | ||
463 | add r4, r1, #12 @ r4 = &resample_data.phase | ||
464 | mov r6, r6, lsl #16 @ we'll use carry to detect pos increments | ||
465 | stmfd sp!, { r0, r4 } @ stack count and &resample_data.phase | ||
466 | .uschannel_loop: | ||
467 | ldr r12, [r4] @ r12 = resample_data.phase | ||
468 | ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1] | ||
469 | ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1] | ||
470 | add r9, r4, #4 @ r9 = &last_sample[0] | ||
471 | mov r1, r12, lsl #16 @ we'll use carry to detect pos increments | ||
472 | sub r11, r0, #1 | ||
473 | ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ... | ||
474 | ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1] | ||
475 | str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample | ||
476 | movs r14, r12, lsr #16 @ pos = resample_data.phase >> 16 | ||
477 | beq .usstart_0 @ pos = 0 | ||
478 | cmp r14, r0 @ if pos >= count, we're already done | ||
479 | bge .usloop_skip | ||
480 | add r7, r7, r14, lsl #2 @ r7 = &s[pos] | ||
481 | ldr r10, [r7, #-4] @ r10 = s[pos - 1] | ||
482 | b .usstart_0 | ||
483 | |||
484 | @ Register usage in loop: | ||
485 | @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel, | ||
486 | @ r6 = delta, r7 = s, r8 = d, r9 = diff, r10 = s[pos - 1], r11 = s[pos] | ||
487 | .usloop_1: | ||
488 | mov r10, r11 @ r10 = previous sample | ||
489 | .usstart_0: | ||
490 | ldr r11, [r7], #4 @ r11 = next sample | ||
491 | mov r4, r1, lsr #16 @ r4 = frac = phase >> 16 | ||
492 | sub r9, r11, r10 @ r9 = diff = s[pos] - s[pos - 1] | ||
493 | .usloop_0: | ||
494 | smull r12, r14, r4, r9 | ||
495 | adds r1, r1, r6 @ phase += delta << 16 | ||
496 | mov r4, r1, lsr #16 @ r4 = frac = phase >> 16 | ||
497 | add r14, r10, r14, lsl #16 | ||
498 | add r14, r14, r12, lsr #16 @ r14 = out = s[pos - 1] + frac*diff | ||
499 | str r14, [r8], #4 @ *d++ = out | ||
500 | bcc .usloop_0 @ if carry is set, pos is incremented | ||
501 | subs r0, r0, #1 @ if count > 0, do another sample | ||
502 | bgt .usloop_1 | ||
503 | .usloop_skip: | ||
504 | subs r5, r5, #1 | ||
505 | ldmfd sp, { r0, r4 } @ reload count and &resample_data.phase | ||
506 | bpl .uschannel_loop @ if (--ch) >= 0, do another channel | ||
507 | mov r1, r1, lsr #16 @ wrap phase back to start of next frame | ||
508 | ldr r2, [r3] @ r2 = dst[0] | ||
509 | str r1, [r4] @ store phase | ||
510 | sub r8, r8, r2 @ d - dst[0] | ||
511 | mov r0, r8, lsr #2 @ convert bytes->samples | ||
512 | add sp, sp, #8 @ adjust stack for temp variables | ||
513 | ldmpc regs=r4-r11 @ ... and we're out | ||
514 | .size dsp_upsample, .-dsp_upsample | ||
515 | |||
516 | /**************************************************************************** | ||
517 | * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]) | ||
518 | */ | ||
519 | .section .icode, "ax", %progbits | ||
520 | .align 2 | ||
521 | .global dsp_apply_gain | ||
522 | .type dsp_apply_gain, %function | ||
523 | dsp_apply_gain: | ||
524 | @ input: r0 = count, r1 = data, r2 = buf[] | ||
525 | stmfd sp!, { r4-r8, lr } | ||
526 | |||
527 | ldr r3, [r1, #4] @ r3 = data->num_channels | ||
528 | ldr r4, [r1, #32] @ r4 = data->gain | ||
529 | |||
530 | .dag_outerloop: | ||
531 | ldr r1, [r2], #4 @ r1 = buf[0] and increment index of buf[] | ||
532 | subs r12, r0, #1 @ r12 = r0 = count - 1 | ||
533 | beq .dag_singlesample @ Zero? Only one sample! | ||
534 | |||
535 | .dag_innerloop: | ||
536 | ldmia r1, { r5, r6 } @ load r5, r6 from r1 | ||
537 | smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8) | ||
538 | smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8) | ||
539 | subs r12, r12, #2 | ||
540 | mov r7, r7, lsr #23 | ||
541 | mov r14, r14, lsr #23 | ||
542 | orr r7, r7, r8, asl #9 | ||
543 | orr r14, r14, r5, asl #9 | ||
544 | stmia r1!, { r7, r14 } @ save r7, r14 to [r1] and increment r1 | ||
545 | bgt .dag_innerloop @ end of inner loop | ||
546 | |||
547 | blt .dag_evencount @ < 0? even count | ||
548 | |||
549 | .dag_singlesample: | ||
550 | ldr r5, [r1] @ handle odd sample | ||
551 | smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8) | ||
552 | mov r7, r7, lsr #23 | ||
553 | orr r7, r7, r8, asl #9 | ||
554 | str r7, [r1] | ||
555 | |||
556 | .dag_evencount: | ||
557 | subs r3, r3, #1 | ||
558 | bgt .dag_outerloop @ end of outer loop | ||
559 | |||
560 | ldmpc regs=r4-r8 | ||
561 | .size dsp_apply_gain, .-dsp_apply_gain | ||