summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libatrac/atrac3_armv5e.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libatrac/atrac3_armv5e.S')
-rw-r--r--lib/rbcodec/codecs/libatrac/atrac3_armv5e.S163
1 files changed, 163 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libatrac/atrac3_armv5e.S b/lib/rbcodec/codecs/libatrac/atrac3_armv5e.S
new file mode 100644
index 0000000000..1d9d35a5da
--- /dev/null
+++ b/lib/rbcodec/codecs/libatrac/atrac3_armv5e.S
@@ -0,0 +1,163 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id:
9 *
10 * Copyright (C) 2010 by Michael Giacomelli
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22#include "config.h"
23
24 .section .text, "ax", %progbits
25
26
/****************************************************************************
 * atrac3_iqmf_dewindowing_armv5e(int32_t *out,
 *                                int32_t *in,
 *                                int32_t *win,
 *                                unsigned int nIn);
 *
 * Dewindowing step within iqmf of atrac3 synthesis using 16 bit filter
 * coefficients and armv5e packed multiply instructions. Uses 2.5 cycles
 * per filter coefficient (ideal). Benchmarked 3.54 per coefficient (Clip+).
 *
 * Reference implementation:
 *
 * for (j = nIn; j != 0; j--) {
 *     s1 = fixmul32(in[0], win[0]);
 *     s2 = fixmul32(in[1], win[1]);
 *     for (i = 2; i < 48; i += 2) {
 *         s1 += fixmul32(in[i  ], win[i  ]);
 *         s2 += fixmul32(in[i+1], win[i+1]);
 *     }
 *     out[0] = s2 << 1;
 *     out[1] = s1 << 1;
 *     in  += 2;
 *     out += 2;
 * }
 *
 * Window layout as read by this routine: 48 coefficients of 16 bits each,
 * packed two per 32-bit word (24 words = 96 bytes). The bottom halfword of
 * a word is multiplied with the even-indexed sample (smulwb/smlawb), the
 * top halfword with the odd-indexed sample (smulwt/smlawt).
 *
 * Each outer iteration reads in[0..47] (48 words), writes out[0..1] and
 * then advances both in and out by 2 words.
 *
 * nIn == 0 returns immediately without touching memory, matching the
 * reference loop. The loop counter is tested with a signed condition (bgt),
 * so counts are expected to be below 2^31.
 *
 * Register usage:
 *   r0        = out (post-incremented by 8 bytes per iteration)
 *   r1        = in
 *   r2        = win
 *   r3        = remaining outer iterations
 *   r4,r5,r8,r9   = four window words (eight packed coefficients)
 *   r6,r7,r10,r11 = four input samples
 *   lr        = s1 (even taps), r12 = s2 (odd taps)
 *
 * Note: r12 is a scratch register and can be used without being saved
 * and restored.
 ****************************************************************************/
    .align  2
    .global atrac3_iqmf_dewindowing_armv5e
    .type   atrac3_iqmf_dewindowing_armv5e, %function

atrac3_iqmf_dewindowing_armv5e:
    /* r0 = dest */
    /* r1 = input samples */
    /* r2 = window coefficients */
    /* r3 = counter */
    cmp     r3, #0                      /* nothing to do for nIn == 0 ... */
    bxeq    lr                          /* ... return before touching the stack */
    stmfd   sp!, {r4-r11, lr}           /* save non-scratch registers */

.iqmf_dewindow_outer_loop:              /* outer loop 0...counter-1 */
    /* 0.. 7 */
    ldmia   r2!, {r4, r5, r8, r9}       /* load win[0..7] */
    ldmia   r1!, {r6, r7, r10, r11}     /* load in[0..3] to avoid stall on arm11 */
    smulwb  lr,  r6,  r4                /* s1  = (in[ 0] * win[ 0]) >> 16 */
    smulwt  r12, r7,  r4                /* s2  = (in[ 1] * win[ 1]) >> 16 */
    smlawb  lr,  r10, r5, lr            /* s1 += (in[ 2] * win[ 2]) >> 16 */
    smlawt  r12, r11, r5, r12           /* s2 += (in[ 3] * win[ 3]) >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}     /* load in[4..7] */
    smlawb  lr,  r6,  r8, lr            /* s1 += (in[ 4] * win[ 4]) >> 16 */
    smlawt  r12, r7,  r8, r12           /* s2 += (in[ 5] * win[ 5]) >> 16 */
    smlawb  lr,  r10, r9, lr            /* s1 += (in[ 6] * win[ 6]) >> 16 */
    smlawt  r12, r11, r9, r12           /* s2 += (in[ 7] * win[ 7]) >> 16 */

    /* 8..15 */
    ldmia   r2!, {r4, r5, r8, r9}       /* load win[8..15] */
    ldmia   r1!, {r6, r7, r10, r11}     /* load in[8..11] */
    smlawb  lr,  r6,  r4, lr            /* s1 += (in[ 8] * win[ 8]) >> 16 */
    smlawt  r12, r7,  r4, r12           /* s2 += (in[ 9] * win[ 9]) >> 16 */
    smlawb  lr,  r10, r5, lr            /* s1 += (in[10] * win[10]) >> 16 */
    smlawt  r12, r11, r5, r12           /* s2 += (in[11] * win[11]) >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}     /* load in[12..15] */
    smlawb  lr,  r6,  r8, lr            /* s1 += (in[12] * win[12]) >> 16 */
    smlawt  r12, r7,  r8, r12           /* s2 += (in[13] * win[13]) >> 16 */
    smlawb  lr,  r10, r9, lr            /* s1 += (in[14] * win[14]) >> 16 */
    smlawt  r12, r11, r9, r12           /* s2 += (in[15] * win[15]) >> 16 */

    /* 16..23 */
    ldmia   r2!, {r4, r5, r8, r9}       /* load win[16..23] */
    ldmia   r1!, {r6, r7, r10, r11}     /* load in[16..19] */
    smlawb  lr,  r6,  r4, lr            /* s1 += (in[16] * win[16]) >> 16 */
    smlawt  r12, r7,  r4, r12           /* s2 += (in[17] * win[17]) >> 16 */
    smlawb  lr,  r10, r5, lr            /* s1 += (in[18] * win[18]) >> 16 */
    smlawt  r12, r11, r5, r12           /* s2 += (in[19] * win[19]) >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}     /* load in[20..23] */
    smlawb  lr,  r6,  r8, lr            /* s1 += (in[20] * win[20]) >> 16 */
    smlawt  r12, r7,  r8, r12           /* s2 += (in[21] * win[21]) >> 16 */
    smlawb  lr,  r10, r9, lr            /* s1 += (in[22] * win[22]) >> 16 */
    smlawt  r12, r11, r9, r12           /* s2 += (in[23] * win[23]) >> 16 */

    /* 24..31 */
    ldmia   r2!, {r4, r5, r8, r9}       /* load win[24..31] */
    ldmia   r1!, {r6, r7, r10, r11}     /* load in[24..27] */
    smlawb  lr,  r6,  r4, lr            /* s1 += (in[24] * win[24]) >> 16 */
    smlawt  r12, r7,  r4, r12           /* s2 += (in[25] * win[25]) >> 16 */
    smlawb  lr,  r10, r5, lr            /* s1 += (in[26] * win[26]) >> 16 */
    smlawt  r12, r11, r5, r12           /* s2 += (in[27] * win[27]) >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}     /* load in[28..31] */
    smlawb  lr,  r6,  r8, lr            /* s1 += (in[28] * win[28]) >> 16 */
    smlawt  r12, r7,  r8, r12           /* s2 += (in[29] * win[29]) >> 16 */
    smlawb  lr,  r10, r9, lr            /* s1 += (in[30] * win[30]) >> 16 */
    smlawt  r12, r11, r9, r12           /* s2 += (in[31] * win[31]) >> 16 */

    /* 32..39 */
    ldmia   r2!, {r4, r5, r8, r9}       /* load win[32..39] */
    ldmia   r1!, {r6, r7, r10, r11}     /* load in[32..35] */
    smlawb  lr,  r6,  r4, lr            /* s1 += (in[32] * win[32]) >> 16 */
    smlawt  r12, r7,  r4, r12           /* s2 += (in[33] * win[33]) >> 16 */
    smlawb  lr,  r10, r5, lr            /* s1 += (in[34] * win[34]) >> 16 */
    smlawt  r12, r11, r5, r12           /* s2 += (in[35] * win[35]) >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}     /* load in[36..39] */
    smlawb  lr,  r6,  r8, lr            /* s1 += (in[36] * win[36]) >> 16 */
    smlawt  r12, r7,  r8, r12           /* s2 += (in[37] * win[37]) >> 16 */
    smlawb  lr,  r10, r9, lr            /* s1 += (in[38] * win[38]) >> 16 */
    smlawt  r12, r11, r9, r12           /* s2 += (in[39] * win[39]) >> 16 */

    /* 40..47 */
    ldmia   r2!, {r4, r5, r8, r9}       /* load win[40..47] */
    ldmia   r1!, {r6, r7, r10, r11}     /* load in[40..43] */
    smlawb  lr,  r6,  r4, lr            /* s1 += (in[40] * win[40]) >> 16 */
    smlawt  r12, r7,  r4, r12           /* s2 += (in[41] * win[41]) >> 16 */
    smlawb  lr,  r10, r5, lr            /* s1 += (in[42] * win[42]) >> 16 */
    smlawt  r12, r11, r5, r12           /* s2 += (in[43] * win[43]) >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}     /* load in[44..47] */
    smlawb  lr,  r6,  r8, lr            /* s1 += (in[44] * win[44]) >> 16 */
    smlawt  r12, r7,  r8, r12           /* s2 += (in[45] * win[45]) >> 16 */
    smlawb  lr,  r10, r9, lr            /* s1 += (in[46] * win[46]) >> 16 */
    smlawt  r12, r11, r9, r12           /* s2 += (in[47] * win[47]) >> 16 */


    mov     lr,  lr,  lsl #1            /* s1 <<= 1 */
    mov     r12, r12, lsl #1            /* s2 <<= 1 */

    stmia   r0!, {r12, lr}              /* store result out[0]=s2, out[1]=s1; out += 2 */
    sub     r1, r1, #184                /* in was advanced by 48 words (192 bytes): */
                                        /* roll back 46 words = 184 bytes => in += 2 */
    sub     r2, r2, #96                 /* roll back 48 entries * 2 bytes = 96 bytes = win[0] */

    subs    r3, r3, #1                  /* outer loop -= 1 */
    bgt     .iqmf_dewindow_outer_loop   /* signed test: loop while counter > 0 */

    ldmpc   regs=r4-r11                 /* restore registers and return (ldmpc is a */
                                        /* macro not defined here, presumably config.h) */

.atrac3_iqmf_dewindowing_armv5e_end:
    .size   atrac3_iqmf_dewindowing_armv5e,.atrac3_iqmf_dewindowing_armv5e_end-atrac3_iqmf_dewindowing_armv5e