summaryrefslogtreecommitdiff
path: root/apps
diff options
context:
space:
mode:
Diffstat (limited to 'apps')
-rw-r--r--apps/codecs/libffmpegFLAC/SOURCES2
-rw-r--r--apps/codecs/libffmpegFLAC/arm.S271
-rw-r--r--apps/codecs/libffmpegFLAC/arm.h8
-rw-r--r--apps/codecs/libffmpegFLAC/decoder.c6
4 files changed, 287 insertions, 0 deletions
diff --git a/apps/codecs/libffmpegFLAC/SOURCES b/apps/codecs/libffmpegFLAC/SOURCES
index 1bd92e8be7..deed19bcec 100644
--- a/apps/codecs/libffmpegFLAC/SOURCES
+++ b/apps/codecs/libffmpegFLAC/SOURCES
@@ -3,4 +3,6 @@ decoder.c
3shndec.c 3shndec.c
4#if defined(CPU_COLDFIRE) 4#if defined(CPU_COLDFIRE)
5coldfire.S 5coldfire.S
6#elif defined(CPU_ARM)
7arm.S
6#endif 8#endif
diff --git a/apps/codecs/libffmpegFLAC/arm.S b/apps/codecs/libffmpegFLAC/arm.S
new file mode 100644
index 0000000000..2a2746eefa
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/arm.S
@@ -0,0 +1,271 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2006 by Thom Johansen
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22#include "config.h"
23
24/* The following is an assembler optimised version of the LPC filtering
25 routines needed for FLAC decoding. It is optimised for use with ARM
26 processors.
27 All LPC filtering up to order 9 is done in specially optimised unrolled
28 loops, while every order above this is handled by a slower default routine.
29 */
30#ifdef USE_IRAM
31 .section .icode,"ax",%progbits
32#else
33 .text
34#endif
35 .global lpc_decode_arm
36lpc_decode_arm:
37 stmdb sp!, { r4-r11, lr } @ 9 registers pushed = 36 bytes of stack
38 ldr r4, [sp, #36] @ fifth argument (coeffs) lies just above the saved registers
39 /* Register use: r0 = blocksize (samples still to reconstruct),
40 r1 = qlevel (right shift applied to every prediction sum),
41 r2 = pred_order, r3 = data, r4 = coeffs.
42 Every sample is rebuilt in place as residual + (prediction >> qlevel).
43 The history pointer always trails the data pointer by 'pred_order'
44 samples. Since we have one loop for each order, we can hard code this
45 and free a register by not keeping a separate data pointer.
46 */
47 sub r3, r3, r2, lsl #2 @ r3 = history = data - pred_order samples
48 cmp r0, #0 @ nothing to do for a zero or negative sample count
49 ble .exit @ signed test: a negative count must never reach the count-down loops
50 cmp r2, #9 @ check if order is too high for unrolled loops
51 addls pc, pc, r2, lsl #2 @ jump to our unrolled decode loop if it exists
52@ jumptable: pc reads as the address of the addls + 8, so order 0 selects the second entry
53 b .default @ order too high, go to default routine
54 b .exit @ zero order filter isn't possible, exit function
55 b .order1
56 b .order2
57 b .order3
58 b .order4
59 b .order5
60 b .order6
61 b .order7
62 b .order8
63
64@ last jump table entry coincides with target, so leave it out
65.order9:
66 ldmia r4, { r5-r12, r14 } @ fetch coefs: r5 = coeffs[0] ... r14 = coeffs[8]
67.loop9:
68 ldr r4, [r3], #4 @ load first (oldest) history sample
69 mul r2, r4, r14 @ multiply with last coef
70 ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2
71 mla r2, r4, r12, r2
72 ldr r4, [r3], #4
73 mla r2, r4, r11, r2
74 ldr r4, [r3], #4
75 mla r2, r4, r10, r2
76 ldr r4, [r3], #4
77 mla r2, r4, r9, r2
78 ldr r4, [r3], #4
79 mla r2, r4, r8, r2
80 ldr r4, [r3], #4
81 mla r2, r4, r7, r2
82 ldr r4, [r3], #4
83 mla r2, r4, r6, r2
84 ldr r4, [r3], #4
85 mla r2, r4, r5, r2
86 ldr r4, [r3] @ r4 = residual
87 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
88 str r2, [r3], #-8*4 @ save result and wrap history pointer back
89 subs r0, r0, #1 @ check if we're done
90 bne .loop9 @ nope, jump back
91 b .exit
92
93.order8:
94 ldmia r4, { r5-r12 } @ r5 = coeffs[0] ... r12 = coeffs[7]
95.loop8:
96 @ we have more registers to spare here, so start block reading
97 ldmia r3!, { r4, r14 } @ two history samples per load, oldest first
98 mul r2, r4, r12 @ oldest sample pairs with the last coef
99 mla r2, r14, r11, r2
100 ldmia r3!, { r4, r14 }
101 mla r2, r4, r10, r2
102 mla r2, r14, r9, r2
103 ldmia r3!, { r4, r14 }
104 mla r2, r4, r8, r2
105 mla r2, r14, r7, r2
106 ldmia r3!, { r4, r14 }
107 mla r2, r4, r6, r2
108 mla r2, r14, r5, r2
109 ldr r4, [r3] @ r4 = residual
110 add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
111 str r2, [r3], #-7*4 @ save result, rewind to the next window start
112 subs r0, r0, #1
113 bne .loop8
114 b .exit
115
116.order7:
117 ldmia r4, { r5-r11 } @ r5 = coeffs[0] ... r11 = coeffs[6]
118.loop7:
119 ldmia r3!, { r4, r12, r14 } @ three history samples per load
120 mul r2, r4, r11
121 mla r2, r12, r10, r2
122 mla r2, r14, r9, r2
123 ldmia r3!, { r4, r12, r14 }
124 mla r2, r4, r8, r2
125 mla r2, r12, r7, r2
126 mla r2, r14, r6, r2
127 ldr r4, [r3], #4 @ seventh (newest) history sample
128 mla r2, r4, r5, r2
129 ldr r4, [r3] @ r4 = residual
130 add r2, r4, r2, asr r1
131 str r2, [r3], #-6*4 @ save result, rewind to the next window start
132 subs r0, r0, #1
133 bne .loop7
134 b .exit
135
136.order6:
137 ldmia r4, { r5-r10 } @ r5 = coeffs[0] ... r10 = coeffs[5]
138.loop6:
139 ldmia r3!, { r4, r11-r12, r14 } @ four oldest history samples
140 mul r2, r4, r10
141 mla r2, r11, r9, r2
142 mla r2, r12, r8, r2
143 mla r2, r14, r7, r2
144 ldmia r3!, { r4, r11 } @ two newest history samples
145 mla r2, r4, r6, r2
146 mla r2, r11, r5, r2
147 ldr r4, [r3] @ r4 = residual
148 add r2, r4, r2, asr r1
149 str r2, [r3], #-5*4 @ save result, rewind to the next window start
150 subs r0, r0, #1
151 bne .loop6
152 b .exit
153
154.order5:
155 ldmia r4, { r5-r9 } @ r5 = coeffs[0] ... r9 = coeffs[4]
156.loop5:
157 ldmia r3!, { r4, r10-r12, r14 } @ whole history window in one load
158 mul r2, r4, r9
159 mla r2, r10, r8, r2
160 mla r2, r11, r7, r2
161 mla r2, r12, r6, r2
162 mla r2, r14, r5, r2
163 ldr r4, [r3] @ r4 = residual
164 add r2, r4, r2, asr r1
165 str r2, [r3], #-4*4 @ save result, rewind to the next window start
166 subs r0, r0, #1
167 bne .loop5
168 b .exit
169
170.order4:
171 ldmia r4, { r5-r8 } @ r5 = coeffs[0] ... r8 = coeffs[3]
172.loop4:
173 ldmia r3!, { r4, r11-r12, r14 } @ whole history window in one load
174 mul r2, r4, r8
175 mla r2, r11, r7, r2
176 mla r2, r12, r6, r2
177 mla r2, r14, r5, r2
178 ldr r4, [r3] @ r4 = residual
179 add r2, r4, r2, asr r1
180 str r2, [r3], #-3*4 @ save result, rewind to the next window start
181 subs r0, r0, #1
182 bne .loop4
183 b .exit
184
185.order3:
186 ldmia r4, { r5-r7 } @ r5 = coeffs[0] ... r7 = coeffs[2]
187.loop3:
188 ldmia r3!, { r4, r12, r14 } @ whole history window in one load
189 mul r2, r4, r7
190 mla r2, r12, r6, r2
191 mla r2, r14, r5, r2
192 ldr r4, [r3] @ r4 = residual
193 add r2, r4, r2, asr r1
194 str r2, [r3], #-2*4 @ save result, rewind to the next window start
195 subs r0, r0, #1
196 bne .loop3
197 b .exit
198
199.order2:
200 ldmia r4, { r5-r6 } @ r5 = coeffs[0], r6 = coeffs[1]
201.loop2:
202 ldmia r3!, { r4, r14 } @ both history samples in one load
203 mul r2, r4, r6
204 mla r2, r14, r5, r2
205 ldr r4, [r3] @ r4 = residual
206 add r2, r4, r2, asr r1
207 str r2, [r3], #-1*4 @ save result, rewind to the next window start
208 subs r0, r0, #1
209 bne .loop2
210 b .exit
211
212.order1:
213 ldr r5, [r4] @ load the one coef we need
214 ldr r4, [r3], #4 @ load one history sample, r3 now points to residual
215.loop1:
216 mul r2, r4, r5 @ multiply coef by history sample
217 ldr r4, [r3] @ load residual
218 add r4, r4, r2, asr r1 @ add result to residual
219 str r4, [r3], #4 @ place r3 at next residual, we already have
220 subs r0, r0, #1 @ the current sample in r4 for the next iteration
221 bne .loop1
222 b .exit
223
224.default:
225 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
226 do the rest by jump table. this path only sees orders above 9, so the
227 unrolled loop below always runs at least twice per sample. */
227 add r5, r4, r2, lsl #2 @ need to start in the other end of coefs
228 mov r7, r2, lsr #2 @ r7 = coefs/4
229 mov r14, #0 @ init accumulator
230.dloop1:
231 ldmdb r5!, { r8-r11 } @ next four coefs, walking down from the last one
232 ldmia r3!, { r6, r12 } @ history samples, walking up from the oldest
233 mla r14, r6, r11, r14
234 mla r14, r12, r10, r14
235 ldmia r3!, { r6, r12 }
236 mla r14, r6, r9, r14
237 mla r14, r12, r8, r14
238 subs r7, r7, #1
239 bne .dloop1
240
241 and r7, r2, #3 @ get remaining coefs to apply (pred_order mod 4)
242 add pc, pc, r7, lsl #2 @ jump into accumulator chain
243@ jumptable: pc reads as the address of the add + 8, hence the padding entry
244 b .dsave @ padding
245 b .dsave
246 b .oneleft
247 b .twoleft
248@ implicit .threeleft
249 ldr r12, [r5, #-4]! @ step down to the next coef
250 ldr r8, [r3], #4 @ and up to the next history sample
251 mla r14, r12, r8, r14
252.twoleft:
253 ldr r12, [r5, #-4]!
254 ldr r8, [r3], #4
255 mla r14, r12, r8, r14
256.oneleft:
257 ldr r12, [r5, #-4]!
258 ldr r8, [r3], #4
259 mla r14, r12, r8, r14
260
261.dsave:
262 ldr r12, [r3] @ load residual
263 add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
264 str r14, [r3], #4 @ store result
265 sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos
266 subs r0, r0, #1 @ are we done?
267 bne .default @ no, prepare for next sample
268
269.exit:
270 ldmia sp!, { r4-r11, pc } @ restore saved registers and return
271
diff --git a/apps/codecs/libffmpegFLAC/arm.h b/apps/codecs/libffmpegFLAC/arm.h
new file mode 100644
index 0000000000..39080d7f75
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/arm.h
@@ -0,0 +1,8 @@
1#ifndef _FLAC_ARM_H
2#define _FLAC_ARM_H
3
4#include "bitstream.h"
5/* ARM assembler LPC filter (arm.S): rebuilds 'blocksize' samples in place starting at 'data', using the 'pred_order' samples stored just before 'data' as initial history; each sample becomes its residual plus the coefficient-weighted sum of the preceding 'pred_order' samples, arithmetically shifted right by 'qlevel'. 'coeffs' holds 'pred_order' coefficients, coeffs[0] applying to the most recent sample. */
6void lpc_decode_arm(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
7
8#endif /* _FLAC_ARM_H */
diff --git a/apps/codecs/libffmpegFLAC/decoder.c b/apps/codecs/libffmpegFLAC/decoder.c
index e5c4b426d5..ed175548f2 100644
--- a/apps/codecs/libffmpegFLAC/decoder.c
+++ b/apps/codecs/libffmpegFLAC/decoder.c
@@ -44,6 +44,8 @@
44 44
45#if defined(CPU_COLDFIRE) 45#if defined(CPU_COLDFIRE)
46#include "coldfire.h" 46#include "coldfire.h"
47#elif defined(CPU_ARM)
48#include "arm.h"
47#endif 49#endif
48 50
49#define FFMAX(a,b) ((a) > (b) ? (a) : (b)) 51#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
@@ -262,6 +264,10 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order)
262 (void)sum; 264 (void)sum;
263 lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order, 265 lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order,
264 decoded + pred_order, coeffs); 266 decoded + pred_order, coeffs);
267 #elif defined(CPU_ARM)
268 (void)sum;
269 lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order,
270 decoded + pred_order, coeffs);
265 #else 271 #else
266 for (i = pred_order; i < s->blocksize; i++) 272 for (i = pred_order; i < s->blocksize; i++)
267 { 273 {