diff options
author | Jens Arnold <amiconn@rockbox.org> | 2009-06-20 14:05:15 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2009-06-20 14:05:15 +0000 |
commit | f289b9f591746e5ff85835194bf7eae8ff088e6b (patch) | |
tree | fb6bb36f7167a71b6bf486fd45c0d30ec6ee6a9c /apps/plugins | |
parent | e7c4cd90768cadcdc2f5202378b77c55513f4eef (diff) | |
download | rockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.tar.gz rockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.zip |
Faster idct for ARMv6. Overall mpegplayer speedup is quite minimal though.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21392 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins')
-rw-r--r-- | apps/plugins/mpegplayer/SOURCES | 4 | ||||
-rw-r--r-- | apps/plugins/mpegplayer/decode.c | 2 | ||||
-rw-r--r-- | apps/plugins/mpegplayer/idct_armv6.S | 337 |
3 files changed, 342 insertions, 1 deletions
diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES index 5b3360cc5a..5ca0fcd86e 100644 --- a/apps/plugins/mpegplayer/SOURCES +++ b/apps/plugins/mpegplayer/SOURCES | |||
@@ -9,7 +9,11 @@ idct_coldfire.S | |||
9 | motion_comp_coldfire_c.c | 9 | motion_comp_coldfire_c.c |
10 | motion_comp_coldfire_s.S | 10 | motion_comp_coldfire_s.S |
11 | #elif defined CPU_ARM | 11 | #elif defined CPU_ARM |
12 | #if ARM_ARCH >= 6 | ||
13 | idct_armv6.S | ||
14 | #else | ||
12 | idct_arm.S | 15 | idct_arm.S |
16 | #endif | ||
13 | motion_comp_arm_c.c | 17 | motion_comp_arm_c.c |
14 | motion_comp_arm_s.S | 18 | motion_comp_arm_s.S |
15 | #else /* other CPU or SIM */ | 19 | #else /* other CPU or SIM */ |
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c index a19b929bef..91251206e9 100644 --- a/apps/plugins/mpegplayer/decode.c +++ b/apps/plugins/mpegplayer/decode.c | |||
@@ -35,7 +35,7 @@ | |||
35 | 35 | ||
36 | #define BUFFER_SIZE (1194 * 1024) | 36 | #define BUFFER_SIZE (1194 * 1024) |
37 | 37 | ||
38 | #ifdef CPU_COLDFIRE | 38 | #if defined(CPU_COLDFIRE) || (defined(CPU_ARM) && ARM_ARCH >= 6) |
39 | /* twice as large as on other targets because coldfire uses | 39 | /* twice as large as on other targets because coldfire uses |
40 | * a secondary, transposed buffer for optimisation */ | 40 | * a secondary, transposed buffer for optimisation */ |
41 | static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16); | 41 | static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16); |
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S new file mode 100644 index 0000000000..73feed4785 --- /dev/null +++ b/apps/plugins/mpegplayer/idct_armv6.S | |||
@@ -0,0 +1,337 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2009 by Jens Arnold | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version 2 | ||
15 | * of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
18 | * KIND, either express or implied. | ||
19 | * | ||
20 | ****************************************************************************/ | ||
21 | |||
22 | |||
23 | .global mpeg2_idct_copy | ||
24 | .type mpeg2_idct_copy, %function | ||
25 | .global mpeg2_idct_add | ||
26 | .type mpeg2_idct_add, %function | ||
27 | |||
28 | /* Custom calling convention, used by the two entry points in this file: | ||
29 | * r0 contains the block pointer and is restored before returning | ||
30 | * r1-r12 are clobbered; callers save/restore the non-volatile C registers | ||
31 | */ | ||
32 | .idct: | ||
33 | str lr, [sp, #-4]! @ push lr: r14 is reused as the loop counter | ||
34 | add r1, r0, #128 @ r1 = secondary, transposed temp buffer (block + 128) | ||
35 | mov r14, #8 @ loop counter: 8 passes of .row_loop | ||
36 | |||
37 | .row_loop: | ||
38 | ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7 (packed halfword pairs) | ||
39 | ldrd r4, L_W1357 @ load W1, W3, W5, W7 into r4, r5 | ||
40 | |||
41 | smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 | ||
42 | smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7 | ||
43 | |||
44 | smultt r7, r5, r10 @ b1 = -W7 * f3 | ||
45 | smlabb r7, r4, r11, r7 @ + -W1 * f5 | ||
46 | smlabt r7, r5, r11, r7 @ + -W5 * f7 | ||
47 | rsb r7, r7, #0 @ negate the three terms summed so far | ||
48 | smlatb r7, r4, r10, r7 @ + W3 * f1 | ||
49 | |||
50 | smulbt r8, r4, r10 @ b2 = -W1 * f3 | ||
51 | rsb r8, r8, #0 @ negate the first term | ||
52 | smlabb r8, r5, r10, r8 @ + W5 * f1 | ||
53 | smlatb r8, r5, r11, r8 @ + W7 * f5 | ||
54 | smlatt r8, r4, r11, r8 @ + W3 * f7 | ||
55 | |||
56 | smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 | ||
57 | smlsdx r9, r11, r4, r9 @ + f5 * W3 - f7 * W1 | ||
58 | |||
59 | ldrd r4, L_W0246 @ load W0, W2, W4, W6 (W1357 no longer needed) | ||
60 | add r2, r2, #1 @ f0 += 1: adds W0 = 2048, i.e. 0.5 before the >> 12. NOTE(review): 32-bit add, so f0 == -1 carries into f2 - confirm acceptable | ||
61 | |||
62 | smulbb r10, r4, r2 @ a0' = W0 * f0 | ||
63 | smlabb r10, r5, r3, r10 @ + W4 * f4 | ||
64 | smultt r12, r4, r2 @ a3' = W2 * f2 | ||
65 | smlatt r12, r5, r3, r12 @ + W6 * f6 | ||
66 | add r10, r10, r12 @ a0 = a0' + a3' | ||
67 | sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' | ||
68 | |||
69 | smulbb r11, r5, r3 @ a1' = -W4 * f4 | ||
70 | rsb r11, r11, #0 @ negate the first term | ||
71 | smlabb r11, r4, r2, r11 @ + W0 * f0 | ||
72 | smultt r3, r4, r3 @ a2' = -W2 * f6 | ||
73 | rsb r3, r3, #0 @ negate the first term | ||
74 | smlatt r3, r5, r2, r3 @ + W6 * f2 | ||
75 | add r11, r11, r3 @ a1 = a1' + a2' | ||
76 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' | ||
77 | |||
78 | sub r2, r10, r6 @ block[7] = (a0 - b0) | ||
79 | mov r2, r2, asr #12 @ >> 12 | ||
80 | strh r2, [r1, #7*16] @ outputs go 16 bytes apart: transposed store into temp | ||
81 | sub r2, r11, r7 @ block[6] = (a1 - b1) | ||
82 | mov r2, r2, asr #12 @ >> 12 | ||
83 | strh r2, [r1, #6*16] | ||
84 | sub r2, r3, r8 @ block[5] = (a2 - b2) | ||
85 | mov r2, r2, asr #12 @ >> 12 | ||
86 | strh r2, [r1, #5*16] | ||
87 | sub r2, r12, r9 @ block[4] = (a3 - b3) | ||
88 | mov r2, r2, asr #12 @ >> 12 | ||
89 | strh r2, [r1, #4*16] | ||
90 | add r2, r12, r9 @ block[3] = (a3 + b3) | ||
91 | mov r2, r2, asr #12 @ >> 12 | ||
92 | strh r2, [r1, #3*16] | ||
93 | add r2, r3, r8 @ block[2] = (a2 + b2) | ||
94 | mov r2, r2, asr #12 @ >> 12 | ||
95 | strh r2, [r1, #2*16] | ||
96 | add r2, r11, r7 @ block[1] = (a1 + b1) | ||
97 | mov r2, r2, asr #12 @ >> 12 | ||
98 | strh r2, [r1, #1*16] | ||
99 | add r2, r10, r6 @ block[0] = (a0 + b0) | ||
100 | mov r2, r2, asr #12 @ >> 12 | ||
101 | strh r2, [r1], #2 @ advance to next temp column | ||
102 | |||
103 | subs r14, r14, #1 @ one row done | ||
104 | bne .row_loop | ||
105 | b .col_start @ jump over the constant tables | ||
106 | |||
107 | @ coefficient tables, placed here (between the two loops) because of ldrd's offset limit | ||
108 | L_W1357: @ NOTE(review): no .align directive precedes these tables; confirm ldrd's alignment requirement is met | ||
109 | .short 2841 @ W1 | ||
110 | .short 2408 @ W3 | ||
111 | .short 1609 @ W5 | ||
112 | .short 565 @ W7 | ||
113 | |||
114 | L_W0246: | ||
115 | .short 2048 @ W0 | ||
116 | .short 2676 @ W2 | ||
117 | .short 2048 @ W4 | ||
118 | .short 1108 @ W6 | ||
119 | |||
120 | .col_start: | ||
121 | @ r0 now points to the temp buffer, where we need it. | ||
122 | sub r1, r1, #128+16 @ r1 is 8 * 2 bytes past block + 128; point it back to the input block | ||
123 | mov r14, #8 @ loop counter: 8 passes of .col_loop | ||
124 | |||
125 | .col_loop: | ||
126 | ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7 (from the temp buffer) | ||
127 | ldrd r4, L_W1357 @ load W1, W3, W5, W7 into r4, r5 | ||
128 | |||
129 | smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 | ||
130 | smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7 | ||
131 | |||
132 | smultt r7, r5, r10 @ b1 = -W7 * f3 | ||
133 | smlabb r7, r4, r11, r7 @ + -W1 * f5 | ||
134 | smlabt r7, r5, r11, r7 @ + -W5 * f7 | ||
135 | rsb r7, r7, #0 @ negate the three terms summed so far | ||
136 | smlatb r7, r4, r10, r7 @ + W3 * f1 | ||
137 | |||
138 | smulbt r8, r4, r10 @ b2 = -W1 * f3 | ||
139 | rsb r8, r8, #0 @ negate the first term | ||
140 | smlabb r8, r5, r10, r8 @ + W5 * f1 | ||
141 | smlatb r8, r5, r11, r8 @ + W7 * f5 | ||
142 | smlatt r8, r4, r11, r8 @ + W3 * f7 | ||
143 | |||
144 | smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 | ||
145 | smlsdx r9, r11, r4, r9 @ + f5 * W3 - f7 * W1 | ||
146 | |||
147 | ldrd r4, L_W0246 @ load W0, W2, W4, W6 (W1357 no longer needed) | ||
148 | add r2, r2, #32 @ DC offset: 0.5 (W0 * 32 = 1 << 16, half of the >> 17). NOTE(review): 32-bit add, so f0 in -32..-1 carries into f2 - confirm acceptable | ||
149 | |||
150 | smulbb r10, r4, r2 @ a0' = W0 * f0 | ||
151 | smlabb r10, r5, r3, r10 @ + W4 * f4 | ||
152 | smultt r12, r4, r2 @ a3' = W2 * f2 | ||
153 | smlatt r12, r5, r3, r12 @ + W6 * f6 | ||
154 | add r10, r10, r12 @ a0 = a0' + a3' | ||
155 | sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' | ||
156 | |||
157 | smulbb r11, r5, r3 @ a1' = -W4 * f4 | ||
158 | rsb r11, r11, #0 @ negate the first term | ||
159 | smlabb r11, r4, r2, r11 @ + W0 * f0 | ||
160 | smultt r3, r4, r3 @ a2' = -W2 * f6 | ||
161 | rsb r3, r3, #0 @ negate the first term | ||
162 | smlatt r3, r5, r2, r3 @ + W6 * f2 | ||
163 | add r11, r11, r3 @ a1 = a1' + a2' | ||
164 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' | ||
165 | |||
166 | sub r2, r10, r6 @ block[7] = (a0 - b0) | ||
167 | mov r2, r2, asr #17 @ >> 17 | ||
168 | strh r2, [r1, #7*16] @ outputs go 16 bytes apart: transposed store back into the block | ||
169 | sub r2, r11, r7 @ block[6] = (a1 - b1) | ||
170 | mov r2, r2, asr #17 @ >> 17 | ||
171 | strh r2, [r1, #6*16] | ||
172 | sub r2, r3, r8 @ block[5] = (a2 - b2) | ||
173 | mov r2, r2, asr #17 @ >> 17 | ||
174 | strh r2, [r1, #5*16] | ||
175 | sub r2, r12, r9 @ block[4] = (a3 - b3) | ||
176 | mov r2, r2, asr #17 @ >> 17 | ||
177 | strh r2, [r1, #4*16] | ||
178 | add r2, r12, r9 @ block[3] = (a3 + b3) | ||
179 | mov r2, r2, asr #17 @ >> 17 | ||
180 | strh r2, [r1, #3*16] | ||
181 | add r2, r3, r8 @ block[2] = (a2 + b2) | ||
182 | mov r2, r2, asr #17 @ >> 17 | ||
183 | strh r2, [r1, #2*16] | ||
184 | add r2, r11, r7 @ block[1] = (a1 + b1) | ||
185 | mov r2, r2, asr #17 @ >> 17 | ||
186 | strh r2, [r1, #1*16] | ||
187 | add r2, r10, r6 @ block[0] = (a0 + b0) | ||
188 | mov r2, r2, asr #17 @ >> 17 | ||
189 | strh r2, [r1], #2 @ advance to next column | ||
190 | |||
191 | subs r14, r14, #1 @ one column done | ||
192 | bne .col_loop | ||
193 | |||
194 | sub r0, r0, #256 @ point r0 back to the input block | ||
195 | ldr pc, [sp], #4 @ return: pop the lr pushed on entry straight into pc | ||
196 | |||
197 | |||
198 | mpeg2_idct_copy: @ in: r0 = coefficient block, r1 = output pointer, r2 = output line stride | ||
199 | stmfd sp!, {r1-r2, r4-r12, lr} @ save the r1/r2 arguments and r4-r12, lr (.idct clobbers r1-r12) | ||
200 | bl .idct @ transform the block at r0 in place | ||
201 | ldmfd sp!, {r1-r2} @ get the r1/r2 arguments back | ||
202 | |||
203 | add r12, r0, #128 @ r12 = end of the block (loop limit) | ||
204 | ldrd r4, [r0] @ preload values 0-3 of the first row | ||
205 | mov r8, #0 @ r8-r11 = 0, used to clear the block | ||
206 | mov r9, #0 | ||
207 | mov r10, #0 | ||
208 | mov r11, #0 | ||
209 | 1: | ||
210 | ldrd r6, [r0, #8] @ values 4-7 of this row | ||
211 | usat16 r4, #8, r4 @ clamp both halfwords to 0..255 | ||
212 | strb r4, [r1, #0] | ||
213 | mov r4, r4, lsr #16 @ move the upper halfword down for the byte store | ||
214 | strb r4, [r1, #1] | ||
215 | usat16 r5, #8, r5 | ||
216 | strb r5, [r1, #2] | ||
217 | mov r5, r5, lsr #16 | ||
218 | strb r5, [r1, #3] | ||
219 | ldrd r4, [r0, #16] @ preload values 0-3 of the next row (on the last pass this reads just past the block) | ||
220 | usat16 r6, #8, r6 | ||
221 | strb r6, [r1, #4] | ||
222 | mov r6, r6, lsr #16 | ||
223 | strb r6, [r1, #5] | ||
224 | usat16 r7, #8, r7 | ||
225 | strb r7, [r1, #6] | ||
226 | mov r7, r7, lsr #16 | ||
227 | strb r7, [r1, #7] | ||
228 | stmia r0!, {r8-r11} @ zero this row of the block and advance to the next | ||
229 | add r1, r1, r2 @ next output line | ||
230 | cmp r0, r12 | ||
231 | blo 1b | ||
232 | |||
233 | ldmfd sp!, {r4-r12, pc} @ restore saved registers and return | ||
234 | |||
235 | mpeg2_idct_add: @ in: r0 = value tested against 129, r1 = coefficient block, r2 = output pointer, r3 = output line stride | ||
236 | cmp r0, #129 | ||
237 | mov r0, r1 @ r0 = block pointer from here on | ||
238 | ldreqsh r1, [r0, #0] @ only when r0 was 129: r1 = block[0] | ||
239 | bne 1f @ r0 != 129: full IDCT | ||
240 | and r1, r1, #0x70 | ||
241 | cmp r1, #0x40 | ||
242 | bne 3f @ (block[0] & 0x70) != 0x40: DC-only shortcut | ||
243 | 1: | ||
244 | stmfd sp!, {r2-r12, lr} @ save the r2/r3 arguments and r4-r12, lr (.idct clobbers r1-r12) | ||
245 | bl .idct @ transform the block at r0 in place | ||
246 | ldmfd sp!, {r1-r2} @ r1 = output pointer, r2 = stride (saved r2, r3) | ||
247 | mov r11, #0 @ zero, used to clear the block | ||
248 | add r12, r0, #128 @ r12 = end of the block (loop limit) | ||
249 | 2: | ||
250 | ldmia r0, {r3-r6} @ one row: 8 halfword values | ||
251 | ldrb r7, [r1, #0] | ||
252 | ldrb r8, [r1, #1] | ||
253 | ldrb r9, [r1, #2] | ||
254 | ldrb r10, [r1, #3] | ||
255 | str r11, [r0], #4 @ clear the values just loaded | ||
256 | orr r7, r7, r8, lsl #16 @ pack two output bytes as halfwords | ||
257 | sadd16 r3, r3, r7 @ add the IDCT values pairwise | ||
258 | usat16 r3, #8, r3 @ clamp both halfwords to 0..255 | ||
259 | strb r3, [r1, #0] | ||
260 | mov r3, r3, lsr #16 | ||
261 | strb r3, [r1, #1] | ||
262 | str r11, [r0], #4 | ||
263 | orr r9, r9, r10, lsl #16 | ||
264 | sadd16 r4, r4, r9 | ||
265 | usat16 r4, #8, r4 | ||
266 | strb r4, [r1, #2] | ||
267 | mov r4, r4, lsr #16 | ||
268 | strb r4, [r1, #3] | ||
269 | ldrb r7, [r1, #4] | ||
270 | ldrb r8, [r1, #5] | ||
271 | ldrb r9, [r1, #6] | ||
272 | ldrb r10, [r1, #7] | ||
273 | str r11, [r0], #4 | ||
274 | orr r7, r7, r8, lsl #16 | ||
275 | sadd16 r5, r5, r7 | ||
276 | usat16 r5, #8, r5 | ||
277 | strb r5, [r1, #4] | ||
278 | mov r5, r5, lsr #16 | ||
279 | strb r5, [r1, #5] | ||
280 | str r11, [r0], #4 | ||
281 | orr r9, r9, r10, lsl #16 | ||
282 | sadd16 r6, r6, r9 | ||
283 | usat16 r6, #8, r6 | ||
284 | strb r6, [r1, #6] | ||
285 | mov r6, r6, lsr #16 | ||
286 | strb r6, [r1, #7] | ||
287 | add r1, r1, r2 @ next output line | ||
288 | cmp r0, r12 | ||
289 | blo 2b | ||
290 | ldmfd sp!, {r4-r12, pc} @ restore saved registers and return | ||
291 | |||
292 | 3: | ||
293 | stmfd sp!, {r4-r7} /* only r4-r7 are saved on this path */ | ||
294 | ldrsh r1, [r0, #0] /* r1 = block[0] */ | ||
295 | mov r4, #0 /* zero in r4 (saved above, reloaded in the loop); r11 is not saved here */ | ||
296 | strh r4, [r0, #0] /* block[0] = 0 */ | ||
297 | strh r4, [r0, #126] /* block[63] = 0 */ | ||
298 | add r1, r1, #64 /* r1 = block[0] + 64, rounding for the asr #7 below */ | ||
299 | add r0, r2, r3, asl #3 /* r0 = output pointer + 8 * stride (loop limit) */ | ||
300 | 4: | ||
301 | ldrb r4, [r2, #0] | ||
302 | ldrb r5, [r2, #1] | ||
303 | ldrb r6, [r2, #2] | ||
304 | ldrb r7, [r2, #3] | ||
305 | add r4, r4, r1, asr #7 /* add the rounded DC value */ | ||
306 | usat r4, #8, r4 /* clamp to 0..255 */ | ||
307 | strb r4, [r2, #0] | ||
308 | add r5, r5, r1, asr #7 | ||
309 | usat r5, #8, r5 | ||
310 | strb r5, [r2, #1] | ||
311 | add r6, r6, r1, asr #7 | ||
312 | usat r6, #8, r6 | ||
313 | strb r6, [r2, #2] | ||
314 | add r7, r7, r1, asr #7 | ||
315 | usat r7, #8, r7 | ||
316 | strb r7, [r2, #3] | ||
317 | ldrb r4, [r2, #4] | ||
318 | ldrb r5, [r2, #5] | ||
319 | ldrb r6, [r2, #6] | ||
320 | ldrb r7, [r2, #7] | ||
321 | add r4, r4, r1, asr #7 | ||
322 | usat r4, #8, r4 | ||
323 | strb r4, [r2, #4] | ||
324 | add r5, r5, r1, asr #7 | ||
325 | usat r5, #8, r5 | ||
326 | strb r5, [r2, #5] | ||
327 | add r6, r6, r1, asr #7 | ||
328 | usat r6, #8, r6 | ||
329 | strb r6, [r2, #6] | ||
330 | add r7, r7, r1, asr #7 | ||
331 | usat r7, #8, r7 | ||
332 | strb r7, [r2, #7] | ||
333 | add r2, r2, r3 /* next output line */ | ||
334 | cmp r2, r0 | ||
335 | blo 4b | ||
336 | ldmfd sp!, {r4-r7} | ||
337 | bx lr | ||