summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThom Johansen <thomj@rockbox.org>2006-03-06 03:07:00 +0000
committerThom Johansen <thomj@rockbox.org>2006-03-06 03:07:00 +0000
commit10decf883a7fbd40823f31fe42b398aecf950acd (patch)
tree8a27289e178e507a26fae0a20a87e8c356cb6a10
parent4a301c327503003d7ace07a7fdd0332edd8d7407 (diff)
downloadrockbox-10decf883a7fbd40823f31fe42b398aecf950acd.tar.gz
rockbox-10decf883a7fbd40823f31fe42b398aecf950acd.zip
ARM assembler optimised LPC decode routine for FLAC (not yet enabled).
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8927 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libffmpegFLAC/arm.S265
-rw-r--r--apps/codecs/libffmpegFLAC/arm.h8
2 files changed, 273 insertions, 0 deletions
diff --git a/apps/codecs/libffmpegFLAC/arm.S b/apps/codecs/libffmpegFLAC/arm.S
new file mode 100644
index 0000000000..eba2251908
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/arm.S
@@ -0,0 +1,265 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2006 by Thom Johansen
11 *
12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement.
14 *
15 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
16 * KIND, either express or implied.
17 *
18 ****************************************************************************/
19
20/* The following is an assembler optimised version of the LPC filtering
21 routines needed for FLAC decoding. It is optimised for use with ARM
22 processors.
23 All LPC filtering up to order 9 is done in specially optimised unrolled
24 loops, while every order above this is handled by a slower default routine.
25 */
26 .section .icode,"ax",%progbits
27 .global lpc_decode_arm
28lpc_decode_arm:
29 stmdb sp!, { r4-r11, lr } @ push 9 registers (36 bytes); popped again at .exit
30 ldr r4, [sp, #36] @ fifth argument sits just above the 36 bytes pushed
31 /* r0 = blocksize, r1 = qlevel, r2 = pred_order
32 r3 = data, r4 = coeffs
33 */
34
35 /* the history pointer always lags behind the data pointer by 'pred_order'
36 samples. since we have one loop for each order, we can hard code this
37 and free a register by keeping only the history pointer.
38 */
39 sub r3, r3, r2, lsl #2 @ r3 = history = data - pred_order words
40 cmp r0, #0 @ no samples to process
41 beq .exit
42 cmp r2, #9 @ check if order is too high for unrolled loops
43 addls pc, pc, r2, lsl #2 @ order <= 9 (unsigned): pc reads as this insn + 8, so order 0 lands on the second entry
44@ jumptable:
45 b .default @ reached only when the addls is skipped (order > 9)
46 b .exit @ zero order filter isn't possible, exit function
47 b .order1
48 b .order2
49 b .order3
50 b .order4
51 b .order5
52 b .order6
53 b .order7
54 b .order8
55
56@ the order 9 entry would be a branch to the very next instruction, so it is left out
57.order9: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
58 ldmia r4, { r5-r12, r14 } @ fetch coefs: r5 = coeffs[0] ... r12 = coeffs[7], r14 = coeffs[8]
59.loop9:
60 ldr r4, [r3], #4 @ load first (oldest) history sample, r4 is reused as sample scratch
61 mul r2, r4, r14 @ multiply with last coef
62 ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2
63 mla r2, r4, r12, r2
64 ldr r4, [r3], #4
65 mla r2, r4, r11, r2
66 ldr r4, [r3], #4
67 mla r2, r4, r10, r2
68 ldr r4, [r3], #4
69 mla r2, r4, r9, r2
70 ldr r4, [r3], #4
71 mla r2, r4, r8, r2
72 ldr r4, [r3], #4
73 mla r2, r4, r7, r2
74 ldr r4, [r3], #4
75 mla r2, r4, r6, r2
76 ldr r4, [r3], #4 @ newest history sample pairs with the first coef
77 mla r2, r4, r5, r2
78 ldr r4, [r3] @ r4 = residual
79 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
80 str r2, [r3], #-8*4 @ save result, then step back 8 words: net advance of one sample per pass
81 subs r0, r0, #1 @ check if we're done
82 bne .loop9 @ nope, jump back
83 b .exit
84
85.order8: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
86 ldmia r4, { r5-r12 } @ r5 = coeffs[0] ... r12 = coeffs[7]
87.loop8:
88 @ we have more registers to spare here, so start block reading
89 ldmia r3!, { r4, r14 } @ two oldest history samples, r3 advances by 2 words
90 mul r2, r4, r12 @ oldest sample * last coef starts the sum
91 mla r2, r14, r11, r2
92 ldmia r3!, { r4, r14 }
93 mla r2, r4, r10, r2
94 mla r2, r14, r9, r2
95 ldmia r3!, { r4, r14 }
96 mla r2, r4, r8, r2
97 mla r2, r14, r7, r2
98 ldmia r3!, { r4, r14 }
99 mla r2, r4, r6, r2
100 mla r2, r14, r5, r2 @ newest sample * first coef
101 ldr r4, [r3] @ r4 = residual
102 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
103 str r2, [r3], #-7*4 @ save result, then step back 7 words: net advance of one sample
104 subs r0, r0, #1 @ one sample fewer to decode
105 bne .loop8
106 b .exit
107
108.order7: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
109 ldmia r4, { r5-r11 } @ r5 = coeffs[0] ... r11 = coeffs[6]
110.loop7:
111 ldmia r3!, { r4, r12, r14 } @ three oldest history samples
112 mul r2, r4, r11 @ oldest sample * last coef starts the sum
113 mla r2, r12, r10, r2
114 mla r2, r14, r9, r2
115 ldmia r3!, { r4, r12, r14 } @ next three samples
116 mla r2, r4, r8, r2
117 mla r2, r12, r7, r2
118 mla r2, r14, r6, r2
119 ldr r4, [r3], #4 @ seventh (newest) sample
120 mla r2, r4, r5, r2
121 ldr r4, [r3] @ r4 = residual
122 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
123 str r2, [r3], #-6*4 @ save result, then step back 6 words: net advance of one sample
124 subs r0, r0, #1 @ one sample fewer to decode
125 bne .loop7
126 b .exit
127
128.order6: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
129 ldmia r4, { r5-r10 } @ r5 = coeffs[0] ... r10 = coeffs[5]
130.loop6:
131 ldmia r3!, { r4, r11-r12, r14 } @ four oldest history samples
132 mul r2, r4, r10 @ oldest sample * last coef starts the sum
133 mla r2, r11, r9, r2
134 mla r2, r12, r8, r2
135 mla r2, r14, r7, r2
136 ldmia r3!, { r4, r11 } @ remaining two samples
137 mla r2, r4, r6, r2
138 mla r2, r11, r5, r2 @ newest sample * first coef
139 ldr r4, [r3] @ r4 = residual
140 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
141 str r2, [r3], #-5*4 @ save result, then step back 5 words: net advance of one sample
142 subs r0, r0, #1 @ one sample fewer to decode
143 bne .loop6
144 b .exit
145
146.order5: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
147 ldmia r4, { r5-r9 } @ r5 = coeffs[0] ... r9 = coeffs[4]
148.loop5:
149 ldmia r3!, { r4, r10-r12, r14 } @ all five history samples in one block read
150 mul r2, r4, r9 @ oldest sample * last coef starts the sum
151 mla r2, r10, r8, r2
152 mla r2, r11, r7, r2
153 mla r2, r12, r6, r2
154 mla r2, r14, r5, r2 @ newest sample * first coef
155 ldr r4, [r3] @ r4 = residual
156 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
157 str r2, [r3], #-4*4 @ save result, then step back 4 words: net advance of one sample
158 subs r0, r0, #1 @ one sample fewer to decode
159 bne .loop5
160 b .exit
161
162.order4: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
163 ldmia r4, { r5-r8 } @ r5 = coeffs[0] ... r8 = coeffs[3]
164.loop4:
165 ldmia r3!, { r4, r11-r12, r14 } @ all four history samples in one block read
166 mul r2, r4, r8 @ oldest sample * last coef starts the sum
167 mla r2, r11, r7, r2
168 mla r2, r12, r6, r2
169 mla r2, r14, r5, r2 @ newest sample * first coef
170 ldr r4, [r3] @ r4 = residual
171 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
172 str r2, [r3], #-3*4 @ save result, then step back 3 words: net advance of one sample
173 subs r0, r0, #1 @ one sample fewer to decode
174 bne .loop4
175 b .exit
176
177.order3: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
178 ldmia r4, { r5-r7 } @ r5 = coeffs[0], r6 = coeffs[1], r7 = coeffs[2]
179.loop3:
180 ldmia r3!, { r4, r12, r14 } @ all three history samples in one block read
181 mul r2, r4, r7 @ oldest sample * last coef starts the sum
182 mla r2, r12, r6, r2
183 mla r2, r14, r5, r2 @ newest sample * first coef
184 ldr r4, [r3] @ r4 = residual
185 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
186 str r2, [r3], #-2*4 @ save result, then step back 2 words: net advance of one sample
187 subs r0, r0, #1 @ one sample fewer to decode
188 bne .loop3
189 b .exit
190
191.order2: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
192 ldmia r4, { r5-r6 } @ r5 = coeffs[0], r6 = coeffs[1]
193.loop2:
194 ldmia r3!, { r4, r14 } @ both history samples in one block read
195 mul r2, r4, r6 @ older sample * last coef starts the sum
196 mla r2, r14, r5, r2 @ newer sample * first coef
197 ldr r4, [r3] @ r4 = residual
198 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
199 str r2, [r3], #-1*4 @ save result, then step back 1 word: net advance of one sample
200 subs r0, r0, #1 @ one sample fewer to decode
201 bne .loop2
202 b .exit
203
204.order1: @ r0 = samples left, r1 = qlevel, r3 = history, r4 = coeffs
205 ldr r5, [r4] @ r5 = the single coefficient
206 ldr r4, [r3], #4 @ r4 = warm-up sample; r3 now points at the first residual
207.loop1: @ invariant: r4 = previous decoded sample, r3 = slot to decode
208 mul r2, r4, r5 @ prediction = previous sample * coef
209 ldr r4, [r3] @ r4 = residual
210 add r4, r4, r2, asr r1 @ decoded sample = residual + (prediction >> qlevel), kept in r4 as next history
211 str r4, [r3], #4 @ save result and advance to the next residual
212 subs r0, r0, #1 @ one sample fewer to decode
213 bne .loop1
214 b .exit
215
216.default: @ per-sample entry: r0 = samples left, r2 = pred_order, r3 = history, r4 = coeffs
217 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
218 do the rest by jump table. the dispatch only sends orders above 9 here,
219 so the unrolled loop always has at least two passes to make. */
219 add r5, r4, r2, lsl #2 @ need to start in the other end of coefs (one past the last)
220 mov r6, r3 @ working copy of history pointer
221 mov r7, r2, lsr #2 @ r7 = coefs/4
222 mov r14, #0 @ init accumulator
223.dloop1:
224 ldmdb r5!, { r8-r11 } @ step back 4 coefs: r8 = lowest address ... r11 = highest
225 ldr r12, [r6], #4 @ walk samples forward while coefs are consumed backward
226 mla r14, r12, r11, r14
227 ldr r12, [r6], #4
228 mla r14, r12, r10, r14
229 ldr r12, [r6], #4
230 mla r14, r12, r9, r14
231 ldr r12, [r6], #4
232 mla r14, r12, r8, r14
233 subs r7, r7, #1 @ one group of four fewer
234 bne .dloop1
235
236 and r7, r2, #3 @ get remaining samples to be filtered
237 add pc, pc, r7, lsl #2 @ jump into accumulator chain (pc reads as this insn + 8)
238@ jumptable:
239 b .dsave @ padding, the add above can never land here
240 b .dsave @ no coefs left
241 b .oneleft
242 b .twoleft
243@ implicit .threeleft
244 ldr r12, [r5, #-4]! @ next coef, still walking backward
245 ldr r8, [r6], #4 @ matching sample, walking forward
246 mla r14, r12, r8, r14
247.twoleft:
248 ldr r12, [r5, #-4]!
249 ldr r8, [r6], #4
250 mla r14, r12, r8, r14
251.oneleft:
252 ldr r12, [r5, #-4]!
253 ldr r8, [r6], #4
254 mla r14, r12, r8, r14
255
256.dsave:
257 ldr r12, [r6] @ load residual
258 add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
259 str r14, [r6] @ store result
260 add r3, r3, #4 @ increment history pointer
261 subs r0, r0, #1 @ are we done?
262 bne .default @ no, prepare for next sample
263
264.exit: @ common return point for every filter order
265 ldmia sp!, { r4-r11, pc } @ restore r4-r11 and return by popping the saved lr into pc
diff --git a/apps/codecs/libffmpegFLAC/arm.h b/apps/codecs/libffmpegFLAC/arm.h
new file mode 100644
index 0000000000..39080d7f75
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/arm.h
@@ -0,0 +1,8 @@
1#ifndef _FLAC_ARM_H
2#define _FLAC_ARM_H
3
4#include "bitstream.h"
5
6void lpc_decode_arm(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs); /* implemented in arm.S: decodes blocksize residuals in place starting at data, reading the pred_order samples stored just before data as initial history */
7
8#endif /* _FLAC_ARM_H */