diff options
Diffstat (limited to 'apps/codecs/libffmpegFLAC/arm.S')
-rw-r--r-- | apps/codecs/libffmpegFLAC/arm.S | 265 |
1 files changed, 265 insertions, 0 deletions
diff --git a/apps/codecs/libffmpegFLAC/arm.S b/apps/codecs/libffmpegFLAC/arm.S new file mode 100644 index 0000000000..eba2251908 --- /dev/null +++ b/apps/codecs/libffmpegFLAC/arm.S | |||
@@ -0,0 +1,265 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2006 by Thom Johansen | ||
11 | * | ||
12 | * All files in this archive are subject to the GNU General Public License. | ||
13 | * See the file COPYING in the source tree root for full license agreement. | ||
14 | * | ||
15 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
16 | * KIND, either express or implied. | ||
17 | * | ||
18 | ****************************************************************************/ | ||
19 | |||
20 | /* The following is an assembler optimised version of the LPC filtering | ||
21 | routines needed for FLAC decoding. It is optimised for use with ARM | ||
22 | processors. | ||
23 | All LPC filtering up to order 9 is done in specially optimised unrolled | ||
24 | loops, while every order above this is handled by a slower default routine. | ||
25 | */ | ||
/* lpc_decode_arm -- in-place LPC synthesis filter for FLAC decoding.
 *
 * Register arguments on entry:
 *   r0 = blocksize   number of samples to reconstruct
 *   r1 = qlevel      right shift applied to every prediction sum
 *   r2 = pred_order  filter order
 *   r3 = data        first residual word; each residual is overwritten in
 *                    place by its decoded sample
 * The coeffs pointer is fetched from the caller's stack into r4.
 *
 * r4-r11 and lr are pushed here and popped again at .exit.
 */
    .section .icode,"ax",%progbits
    .global lpc_decode_arm
lpc_decode_arm:
    stmfd   sp!, { r4-r11, lr }     @ nine registers = 36 bytes pushed
    ldr     r4, [sp, #36]           @ r4 = coeffs (word just above the saved registers)

    /* The oldest sample the filter reads lies 'pred_order' words below the
     * data pointer. Every order has its own loop with that distance hard
     * coded, so only this history pointer is kept and the data pointer
     * itself needs no register.
     */
    sub     r3, r3, r2, lsl #2      @ r3 = history = data - 4 * pred_order
    cmp     r0, #0
    beq     .exit                   @ blocksize == 0: nothing to decode
    cmp     r2, #9                  @ unrolled loops exist for orders 0..9 only
    /* pc reads as the address of the addls plus 8, so index 0 selects the
     * SECOND branch below. The first branch is reached only by falling
     * through when the addls is not executed (order > 9). Index 9 selects the
     * instruction directly after the table.
     */
    addls   pc, pc, r2, lsl #2
    b       .default                @ order > 9: generic routine
    b       .exit                   @ order 0: no filter to run
    b       .order1
    b       .order2
    b       .order3
    b       .order4
    b       .order5
    b       .order6
    b       .order7
    b       .order8
55 | |||
@ Order 9 has no entry of its own in the dispatch table: index 9 lands on the
@ first instruction after the table, so this routine has to stay right here.
.order9:
    ldmia   r4, { r5-r12, r14 }     @ r5..r12, r14 = coeffs[0..8]; r4 becomes scratch
.loop9:
    @ r3 points at the oldest of the nine history samples. With all coefficient
    @ registers occupied, samples are fetched one at a time through r4 and the
    @ prediction sum is built up in r2.
    ldr     r4, [r3], #4
    mul     r2, r4, r14             @ oldest sample * coeffs[8]
    ldr     r4, [r3], #4
    mla     r2, r4, r12, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r11, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r10, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r9, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r8, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r7, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r6, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r5, r2          @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ r3 has advanced nine words: this is the residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-8*4         @ overwrite residual, rewind 8 words (net +1 sample)
    subs    r0, r0, #1              @ one sample fewer to go
    bne     .loop9
    b       .exit
84 | |||
@ Order 8: coefficients fill r5-r12, leaving r4 and r14 free so the history
@ can be read two words at a time.
.order8:
    ldmia   r4, { r5-r12 }          @ r5..r12 = coeffs[0..7]
.loop8:
    @ r3 points at the oldest of the eight history samples; r2 collects the sum.
    ldmia   r3!, { r4, r14 }        @ r4 = older word, r14 = the following one
    mul     r2, r4, r12             @ oldest sample * coeffs[7]
    mla     r2, r14, r11, r2
    ldmia   r3!, { r4, r14 }
    mla     r2, r4, r10, r2
    mla     r2, r14, r9, r2
    ldmia   r3!, { r4, r14 }
    mla     r2, r4, r8, r2
    mla     r2, r14, r7, r2
    ldmia   r3!, { r4, r14 }
    mla     r2, r4, r6, r2
    mla     r2, r14, r5, r2         @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ residual sits right after the eight samples
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-7*4         @ store, rewind 7 words (net +1 sample)
    subs    r0, r0, #1
    bne     .loop8
    b       .exit
107 | |||
@ Order 7: coefficients in r5-r11; r4, r12 and r14 take history samples three
@ at a time.
.order7:
    ldmia   r4, { r5-r11 }          @ r5..r11 = coeffs[0..6]
.loop7:
    @ r3 points at the oldest of the seven history samples; r2 collects the sum.
    ldmia   r3!, { r4, r12, r14 }   @ three consecutive samples, oldest in r4
    mul     r2, r4, r11             @ oldest sample * coeffs[6]
    mla     r2, r12, r10, r2
    mla     r2, r14, r9, r2
    ldmia   r3!, { r4, r12, r14 }
    mla     r2, r4, r8, r2
    mla     r2, r12, r7, r2
    mla     r2, r14, r6, r2
    ldr     r4, [r3], #4            @ seventh (newest) sample
    mla     r2, r4, r5, r2          @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-6*4         @ store, rewind 6 words (net +1 sample)
    subs    r0, r0, #1
    bne     .loop7
    b       .exit
127 | |||
@ Order 6: coefficients in r5-r10; r4, r11, r12 and r14 are free for history.
.order6:
    ldmia   r4, { r5-r10 }          @ r5..r10 = coeffs[0..5]
.loop6:
    @ r3 points at the oldest of the six history samples; r2 collects the sum.
    ldmia   r3!, { r4, r11-r12, r14 }   @ four oldest samples, oldest in r4
    mul     r2, r4, r10             @ oldest sample * coeffs[5]
    mla     r2, r11, r9, r2
    mla     r2, r12, r8, r2
    mla     r2, r14, r7, r2
    ldmia   r3!, { r4, r11 }        @ remaining two samples
    mla     r2, r4, r6, r2
    mla     r2, r11, r5, r2         @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-5*4         @ store, rewind 5 words (net +1 sample)
    subs    r0, r0, #1
    bne     .loop6
    b       .exit
145 | |||
@ Order 5: coefficients in r5-r9; all five history samples fit in one block load.
.order5:
    ldmia   r4, { r5-r9 }           @ r5..r9 = coeffs[0..4]
.loop5:
    @ r3 points at the oldest of the five history samples; r2 collects the sum.
    ldmia   r3!, { r4, r10-r12, r14 }   @ oldest in r4 ... newest in r14
    mul     r2, r4, r9              @ oldest sample * coeffs[4]
    mla     r2, r10, r8, r2
    mla     r2, r11, r7, r2
    mla     r2, r12, r6, r2
    mla     r2, r14, r5, r2         @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-4*4         @ store, rewind 4 words (net +1 sample)
    subs    r0, r0, #1
    bne     .loop5
    b       .exit
161 | |||
@ Order 4: coefficients in r5-r8; one block load fetches the whole history.
.order4:
    ldmia   r4, { r5-r8 }           @ r5..r8 = coeffs[0..3]
.loop4:
    @ r3 points at the oldest of the four history samples; r2 collects the sum.
    ldmia   r3!, { r4, r11-r12, r14 }   @ oldest in r4 ... newest in r14
    mul     r2, r4, r8              @ oldest sample * coeffs[3]
    mla     r2, r11, r7, r2
    mla     r2, r12, r6, r2
    mla     r2, r14, r5, r2         @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-3*4         @ store, rewind 3 words (net +1 sample)
    subs    r0, r0, #1
    bne     .loop4
    b       .exit
176 | |||
@ Order 3: coefficients in r5-r7; history read in a single three-word load.
.order3:
    ldmia   r4, { r5-r7 }           @ r5..r7 = coeffs[0..2]
.loop3:
    @ r3 points at the oldest of the three history samples; r2 collects the sum.
    ldmia   r3!, { r4, r12, r14 }   @ oldest in r4, newest in r14
    mul     r2, r4, r7              @ oldest sample * coeffs[2]
    mla     r2, r12, r6, r2
    mla     r2, r14, r5, r2         @ newest sample * coeffs[0]
    ldr     r4, [r3]                @ residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-2*4         @ store, rewind 2 words (net +1 sample)
    subs    r0, r0, #1
    bne     .loop3
    b       .exit
190 | |||
@ Order 2: coefficients in r5 and r6; both history samples come from one load.
.order2:
    ldmia   r4, { r5-r6 }           @ r5 = coeffs[0], r6 = coeffs[1]
.loop2:
    @ r3 points at the older of the two history samples; r2 collects the sum.
    ldmia   r3!, { r4, r14 }        @ r4 = older sample, r14 = newer sample
    mul     r2, r4, r6              @ older sample * coeffs[1]
    mla     r2, r14, r5, r2         @ newer sample * coeffs[0]
    ldr     r4, [r3]                @ residual
    add     r2, r4, r2, asr r1      @ sample = residual + (sum >> qlevel)
    str     r2, [r3], #-1*4         @ store, rewind 1 word (net +1 sample)
    subs    r0, r0, #1
    bne     .loop2
    b       .exit
203 | |||
/* Order 1: each sample is predicted from the single previously decoded
 * sample, which is carried from one iteration to the next in r4 rather than
 * being re-read from memory.
 *
 * The decoded sample is therefore accumulated into r4 itself. Leaving it in
 * another register would make the next multiply use the raw residual that r4
 * was loaded with, not the decoded value.
 */
.order1:
    ldr     r5, [r4]                @ r5 = coeffs[0]
    ldr     r4, [r3], #4            @ r4 = the one history sample; r3 -> first residual
.loop1:
    mul     r2, r4, r5              @ r2 = previous decoded sample * coeffs[0]
    ldr     r4, [r3]                @ r4 = residual
    add     r4, r4, r2, asr r1      @ r4 = residual + (prediction >> qlevel)
    str     r4, [r3], #4            @ overwrite residual; r4 is now the next history sample
    subs    r0, r0, #1              @ one sample fewer to go
    bne     .loop1
    b       .exit
215 | |||
/* Generic routine for orders above 9, producing one decoded sample per pass.
 * The prediction sum is formed four taps at a time; the 0-3 taps left over
 * are handled by jumping into the tail of a short multiply-accumulate chain.
 *
 * Preserved across passes: r0 = samples left, r1 = qlevel, r2 = pred_order,
 *                          r3 = history pointer, r4 = coeffs.
 * Rebuilt on every pass:   r5 = coefficient cursor (walks downwards),
 *                          r6 = history cursor (walks upwards),
 *                          r7 = group counter / tail index, r14 = sum.
 */
.default:
    mov     r14, #0                 @ clear the prediction sum
    mov     r6, r3                  @ history cursor starts at the oldest sample
    add     r5, r4, r2, lsl #2      @ r5 = one past the last coefficient
    mov     r7, r2, lsr #2          @ number of complete groups of four taps
.dloop1:
    ldmdb   r5!, { r8-r11 }         @ next four coefficients downwards; r11 = highest index
    ldr     r12, [r6], #4
    mla     r14, r12, r11, r14      @ oldest remaining sample * highest remaining coef
    ldr     r12, [r6], #4
    mla     r14, r12, r10, r14
    ldr     r12, [r6], #4
    mla     r14, r12, r9, r14
    ldr     r12, [r6], #4
    mla     r14, r12, r8, r14
    subs    r7, r7, #1
    bne     .dloop1

    and     r7, r2, #3              @ taps still to do: pred_order mod 4
    /* pc reads as the address of the add plus 8, so index 0 selects the second
     * branch below and index 3 selects the first instruction after the table.
     * The first branch is never selected; it only keeps the table aligned
     * with that pc offset.
     */
    add     pc, pc, r7, lsl #2
    b       .dsave                  @ padding slot
    b       .dsave                  @ 0 taps left
    b       .oneleft                @ 1 tap left
    b       .twoleft                @ 2 taps left
    @ 3 taps left: fall into the top of the chain
    ldr     r12, [r5, #-4]!         @ r12 = next coefficient downwards
    ldr     r8, [r6], #4            @ r8 = next history sample upwards
    mla     r14, r12, r8, r14
.twoleft:
    ldr     r12, [r5, #-4]!
    ldr     r8, [r6], #4
    mla     r14, r12, r8, r14
.oneleft:
    ldr     r12, [r5, #-4]!
    ldr     r8, [r6], #4
    mla     r14, r12, r8, r14

.dsave:
    ldr     r12, [r6]               @ r6 has passed every history sample: this is the residual
    add     r14, r12, r14, asr r1   @ sample = residual + (sum >> qlevel)
    str     r14, [r6]               @ overwrite the residual with the decoded sample
    add     r3, r3, #4              @ slide the history window forward one sample
    subs    r0, r0, #1
    bne     .default                @ more samples: rebuild the cursors and go again
263 | |||
@ Common exit: pop the registers pushed on entry; the saved lr value is loaded
@ straight into pc, which performs the return.
.exit:
    ldmfd   sp!, { r4-r11, pc }