summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2009-06-20 14:05:15 +0000
committerJens Arnold <amiconn@rockbox.org>2009-06-20 14:05:15 +0000
commitf289b9f591746e5ff85835194bf7eae8ff088e6b (patch)
treefb6bb36f7167a71b6bf486fd45c0d30ec6ee6a9c
parente7c4cd90768cadcdc2f5202378b77c55513f4eef (diff)
downloadrockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.tar.gz
rockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.zip
Faster idct for ARMv6. Overall mpegplayer speedup is quite minimal though.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21392 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/plugins/mpegplayer/SOURCES4
-rw-r--r--apps/plugins/mpegplayer/decode.c2
-rw-r--r--apps/plugins/mpegplayer/idct_armv6.S337
3 files changed, 342 insertions, 1 deletions
diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES
index 5b3360cc5a..5ca0fcd86e 100644
--- a/apps/plugins/mpegplayer/SOURCES
+++ b/apps/plugins/mpegplayer/SOURCES
@@ -9,7 +9,11 @@ idct_coldfire.S
9motion_comp_coldfire_c.c 9motion_comp_coldfire_c.c
10motion_comp_coldfire_s.S 10motion_comp_coldfire_s.S
11#elif defined CPU_ARM 11#elif defined CPU_ARM
12#if ARM_ARCH >= 6
13idct_armv6.S
14#else
12idct_arm.S 15idct_arm.S
16#endif
13motion_comp_arm_c.c 17motion_comp_arm_c.c
14motion_comp_arm_s.S 18motion_comp_arm_s.S
15#else /* other CPU or SIM */ 19#else /* other CPU or SIM */
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c
index a19b929bef..91251206e9 100644
--- a/apps/plugins/mpegplayer/decode.c
+++ b/apps/plugins/mpegplayer/decode.c
@@ -35,7 +35,7 @@
35 35
36#define BUFFER_SIZE (1194 * 1024) 36#define BUFFER_SIZE (1194 * 1024)
37 37
38#ifdef CPU_COLDFIRE 38#if defined(CPU_COLDFIRE) || (defined(CPU_ARM) && ARM_ARCH >= 6)
39/* twice as large as on other targets because coldfire uses 39/* twice as large as on other targets because coldfire uses
40 * a secondary, transposed buffer for optimisation */ 40 * a secondary, transposed buffer for optimisation */
41static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16); 41static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S
new file mode 100644
index 0000000000..73feed4785
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_armv6.S
@@ -0,0 +1,337 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2009 by Jens Arnold
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22
23 .global mpeg2_idct_copy
24 .type mpeg2_idct_copy, %function
25 .global mpeg2_idct_add
26 .type mpeg2_idct_add, %function
27
/*
 * .idct -- 8x8 inverse DCT core shared by mpeg2_idct_copy and mpeg2_idct_add.
 *
 * Custom calling convention (NOT callable from C):
 *   in:   r0 = block pointer
 *   out:  r0 = block pointer (restored before returning)
 *   r1-r12 and the flags are clobbered. Only lr is saved here; the public
 *   entry points save and restore the non-volatile C context on behalf of
 *   this routine.
 *
 * Memory use:
 *   The row pass reads the eight 16-byte rows at r0 and writes its results
 *   transposed into a secondary buffer at r0 + 128. The column pass reads
 *   that buffer and writes the final values, transposed again, back to the
 *   block at r0. The caller therefore has to provide 256 bytes at r0.
 *
 * Data layout:
 *   Each 16-byte row is consumed in the order f0, f2, f4, f6, f1, f3, f5, f7
 *   so that the dual 16-bit multiplies can work on packed pairs.
 *
 * Rounding:
 *   The rounding term is injected by adding a small constant to f0 before it
 *   is multiplied by W0 (1 * 2048 for the >> 12 row pass, 32 * 2048 for the
 *   >> 17 column pass). The add is done on the packed f0:f2 word, so a carry
 *   out of the f0 halfword propagates into f2.
 */
.idct:
    str     lr, [sp, #-4]!          @ lr is used (r14 is the loop counter)
    add     r1, r0, #128            @ secondary, transposed temp buffer
    mov     r14, #8                 @ loop counter

.row_loop:
    ldmia   r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
    ldrd    r4, L_W1357             @ load  W1, W3, W5, W7

    smuad   r6, r4, r10             @ b0 = W1 * f1 + W3 * f3
    smlad   r6, r5, r11, r6         @    + W5 * f5 + W7 * f7

    smultt  r7, r5, r10             @ b1 = -W7 * f3
    smlabb  r7, r4, r11, r7         @    + -W1 * f5
    smlabt  r7, r5, r11, r7         @    + -W5 * f7
    rsb     r7, r7, #0              @ negate the three terms summed so far
    smlatb  r7, r4, r10, r7         @    + W3 * f1

    smulbt  r8, r4, r10             @ b2 = -W1 * f3
    rsb     r8, r8, #0
    smlabb  r8, r5, r10, r8         @    + W5 * f1
    smlatb  r8, r5, r11, r8         @    + W7 * f5
    smlatt  r8, r4, r11, r8         @    + W3 * f7

    smusdx  r9, r10, r5             @ b3 = f1 * W7 - f3 * W5
    smlsdx  r9, r11, r4, r9         @    + f5 * W3 - f7 * W1

    ldrd    r4, L_W0246             @ load  W0, W2, W4, W6
    add     r2, r2, #1              @ f0 += 1 (rounding, scaled by W0 below)

    smulbb  r10, r4, r2             @ a0' = W0 * f0
    smlabb  r10, r5, r3, r10        @     + W4 * f4
    smultt  r12, r4, r2             @ a3' = W2 * f2
    smlatt  r12, r5, r3, r12        @     + W6 * f6
    add     r10, r10, r12           @ a0  = a0' + a3'
    sub     r12, r10, r12, lsl #1   @ a3  = a0 - 2 * a3'

    smulbb  r11, r5, r3             @ a1' = -W4 * f4
    rsb     r11, r11, #0
    smlabb  r11, r4, r2, r11        @     + W0 * f0
    smultt  r3, r4, r3              @ a2' = -W2 * f6
    rsb     r3, r3, #0
    smlatt  r3, r5, r2, r3          @     + W6 * f2
    add     r11, r11, r3            @ a1  = a1' + a2'
    sub     r3, r11, r3, lsl #1     @ a2  = a1 - 2 * a2'

    @ Outputs go to the temp buffer with a 16-byte stride, i.e. transposed.
    sub     r2, r10, r6             @ block[7] = (a0 - b0)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1, #7*16]
    sub     r2, r11, r7             @ block[6] = (a1 - b1)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1, #6*16]
    sub     r2, r3, r8              @ block[5] = (a2 - b2)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1, #5*16]
    sub     r2, r12, r9             @ block[4] = (a3 - b3)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1, #4*16]
    add     r2, r12, r9             @ block[3] = (a3 + b3)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1, #3*16]
    add     r2, r3, r8              @ block[2] = (a2 + b2)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1, #2*16]
    add     r2, r11, r7             @ block[1] = (a1 + b1)
    mov     r2, r2, asr #12         @            >> 12
    strh    r2, [r1], #2            @ advance to next temp column

    subs    r14, r14, #1
    bne     .row_loop
    b       .col_start

    @ Placed here because of the offset limit of ldrd: both loops must be
    @ able to reach the tables with a pc-relative doubleword load.
    @ ldrd wants a doubleword-aligned address, so align explicitly instead
    @ of relying on where the preceding code happens to end. The padding is
    @ never executed because it follows an unconditional branch.
    .balign 8
L_W1357:
    .short  2841
    .short  2408
    .short  1609
    .short   565

L_W0246:
    .short  2048
    .short  2676
    .short  2048
    .short  1108

.col_start:
    @ r0 now points to the temp buffer, where we need it.
    sub     r1, r1, #128+16         @ point r1 back to the input block
    mov     r14, #8                 @ loop counter

.col_loop:
    ldmia   r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
    ldrd    r4, L_W1357             @ load  W1, W3, W5, W7

    smuad   r6, r4, r10             @ b0 = W1 * f1 + W3 * f3
    smlad   r6, r5, r11, r6         @    + W5 * f5 + W7 * f7

    smultt  r7, r5, r10             @ b1 = -W7 * f3
    smlabb  r7, r4, r11, r7         @    + -W1 * f5
    smlabt  r7, r5, r11, r7         @    + -W5 * f7
    rsb     r7, r7, #0              @ negate the three terms summed so far
    smlatb  r7, r4, r10, r7         @    + W3 * f1

    smulbt  r8, r4, r10             @ b2 = -W1 * f3
    rsb     r8, r8, #0
    smlabb  r8, r5, r10, r8         @    + W5 * f1
    smlatb  r8, r5, r11, r8         @    + W7 * f5
    smlatt  r8, r4, r11, r8         @    + W3 * f7

    smusdx  r9, r10, r5             @ b3 = f1 * W7 - f3 * W5
    smlsdx  r9, r11, r4, r9         @    + f5 * W3 - f7 * W1

    ldrd    r4, L_W0246             @ load  W0, W2, W4, W6
    add     r2, r2, #32             @ DC offset: 0.5

    smulbb  r10, r4, r2             @ a0' = W0 * f0
    smlabb  r10, r5, r3, r10        @     + W4 * f4
    smultt  r12, r4, r2             @ a3' = W2 * f2
    smlatt  r12, r5, r3, r12        @     + W6 * f6
    add     r10, r10, r12           @ a0  = a0' + a3'
    sub     r12, r10, r12, lsl #1   @ a3  = a0 - 2 * a3'

    smulbb  r11, r5, r3             @ a1' = -W4 * f4
    rsb     r11, r11, #0
    smlabb  r11, r4, r2, r11        @     + W0 * f0
    smultt  r3, r4, r3              @ a2' = -W2 * f6
    rsb     r3, r3, #0
    smlatt  r3, r5, r2, r3          @     + W6 * f2
    add     r11, r11, r3            @ a1  = a1' + a2'
    sub     r3, r11, r3, lsl #1     @ a2  = a1 - 2 * a2'

    @ Final values go back to the block, again with a 16-byte stride.
    sub     r2, r10, r6             @ block[7] = (a0 - b0)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #7*16]
    sub     r2, r11, r7             @ block[6] = (a1 - b1)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #6*16]
    sub     r2, r3, r8              @ block[5] = (a2 - b2)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #5*16]
    sub     r2, r12, r9             @ block[4] = (a3 - b3)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #4*16]
    add     r2, r12, r9             @ block[3] = (a3 + b3)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #3*16]
    add     r2, r3, r8              @ block[2] = (a2 + b2)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #2*16]
    add     r2, r11, r7             @ block[1] = (a1 + b1)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1, #1*16]
    add     r2, r10, r6             @ block[0] = (a0 + b0)
    mov     r2, r2, asr #17         @            >> 17
    strh    r2, [r1], #2            @ advance to next column

    subs    r14, r14, #1
    bne     .col_loop

    sub     r0, r0, #256            @ point r0 back to the input block
    ldr     pc, [sp], #4            @ return through the saved lr
196
197
/*
 * mpeg2_idct_copy
 *
 * Runs the inverse DCT on a block, writes the result as saturated 8-bit
 * samples to the destination and clears the first 128 bytes of the block.
 *
 *   r0 = block pointer (handed to .idct, which needs 256 bytes there)
 *   r1 = destination; 8 bytes are stored per row
 *   r2 = byte distance between destination rows
 *
 * Presumably matches a C prototype of the form
 * (int16_t *block, uint8_t *dest, int stride) -- confirm against the caller.
 * r4-r12 are saved on entry and restored on exit.
 */
mpeg2_idct_copy:
    stmfd   sp!, {r1-r2, r4-r12, lr}
    bl      .idct                   @ clobbers r1-r12, returns r0 = block
    ldmfd   sp!, {r1-r2}            @ get dest and stride back

    mov     r11, #0                 @ r8-r11 = 16 zero bytes used to wipe
    mov     r10, #0                 @ one block row per iteration
    mov     r9, #0
    mov     r8, #0
    ldrd    r4, [r0]                @ preload samples 0..3 of the first row
    add     r12, r0, #128           @ r12 = end of the 8 result rows

.Lidct_copy_row:
    ldrd    r6, [r0, #8]            @ samples 4..7 of the current row
    usat16  r4, #8, r4              @ clamp both halfwords to 0..255
    strb    r4, [r1, #0]
    mov     r4, r4, lsr #16
    strb    r4, [r1, #1]
    usat16  r5, #8, r5
    strb    r5, [r1, #2]
    mov     r5, r5, lsr #16
    strb    r5, [r1, #3]
    ldrd    r4, [r0, #16]           @ preload samples 0..3 of the next row
    usat16  r6, #8, r6
    strb    r6, [r1, #4]
    mov     r6, r6, lsr #16
    strb    r6, [r1, #5]
    usat16  r7, #8, r7
    strb    r7, [r1, #6]
    mov     r7, r7, lsr #16
    strb    r7, [r1, #7]
    stmia   r0!, {r8-r11}           @ zero the row just consumed, advance
    add     r1, r1, r2              @ next destination row
    cmp     r0, r12
    blo     .Lidct_copy_row

    ldmfd   sp!, {r4-r12, pc}
234
/*
 * mpeg2_idct_add
 *
 * Adds the inverse DCT of a block to the 8-bit samples already present at
 * the destination, saturating each result to 0..255.
 *
 *   r0 = "last" value; 129 selects the DC-only shortcut test
 *   r1 = block pointer
 *   r2 = destination; 8 bytes are updated per row
 *   r3 = byte distance between destination rows
 *
 * Presumably matches a C prototype of the form
 * (int last, int16_t *block, uint8_t *dest, int stride) -- confirm against
 * the caller.
 *
 * Full path:    taken when last != 129 or (block[0] & 0x70) == 0x40.
 *               Runs .idct and clears the first 128 bytes of the block.
 * DC-only path: adds (block[0] + 64) >> 7 to all 64 samples and clears
 *               block[0] and block[63] only.
 *
 * All callee-saved registers are preserved on both paths.
 */
mpeg2_idct_add:
    cmp     r0, #129
    mov     r0, r1                  @ r0 = block from here on
    ldreqsh r1, [r0, #0]            @ only inspect block[0] if last == 129
    bne     1f
    and     r1, r1, #0x70
    cmp     r1, #0x40
    bne     3f                      @ DC-only shortcut
1:
    stmfd   sp!, {r2-r12, lr}       @ r2/r3 are pushed to survive .idct
    bl      .idct                   @ clobbers r1-r12, returns r0 = block
    ldmfd   sp!, {r1-r2}            @ r1 = dest, r2 = stride
    mov     r11, #0                 @ zero word for clearing the block
    add     r12, r0, #128           @ r12 = end of the 8 result rows
2:
    ldmia   r0, {r3-r6}             @ 8 idct results of this row
    ldrb    r7, [r1, #0]
    ldrb    r8, [r1, #1]
    ldrb    r9, [r1, #2]
    ldrb    r10, [r1, #3]
    str     r11, [r0], #4           @ clear results 0..1
    orr     r7, r7, r8, lsl #16     @ pack two dest bytes as two halfwords
    sadd16  r3, r3, r7
    usat16  r3, #8, r3              @ clamp both halfwords to 0..255
    strb    r3, [r1, #0]
    mov     r3, r3, lsr #16
    strb    r3, [r1, #1]
    str     r11, [r0], #4           @ clear results 2..3
    orr     r9, r9, r10, lsl #16
    sadd16  r4, r4, r9
    usat16  r4, #8, r4
    strb    r4, [r1, #2]
    mov     r4, r4, lsr #16
    strb    r4, [r1, #3]
    ldrb    r7, [r1, #4]
    ldrb    r8, [r1, #5]
    ldrb    r9, [r1, #6]
    ldrb    r10, [r1, #7]
    str     r11, [r0], #4           @ clear results 4..5
    orr     r7, r7, r8, lsl #16
    sadd16  r5, r5, r7
    usat16  r5, #8, r5
    strb    r5, [r1, #4]
    mov     r5, r5, lsr #16
    strb    r5, [r1, #5]
    str     r11, [r0], #4           @ clear results 6..7
    orr     r9, r9, r10, lsl #16
    sadd16  r6, r6, r9
    usat16  r6, #8, r6
    strb    r6, [r1, #6]
    mov     r6, r6, lsr #16
    strb    r6, [r1, #7]
    add     r1, r1, r2              @ next destination row
    cmp     r0, r12
    blo     2b
    ldmfd   sp!, {r4-r12, pc}

3:
    stmfd   sp!, {r4-r7}
    ldrsh   r1, [r0, #0]            /* r1 = block[0] */
    mov     r12, #0                 /* r12 is scratch; r11 is callee-saved
                                     * and is not part of the save list */
    strh    r12, [r0, #0]           /* block[0] = 0 */
    strh    r12, [r0, #126]         /* block[63] = 0 */
    add     r1, r1, #64             /* rounding for the >> 7 applied below */
    add     r0, r2, r3, asl #3      /* r0 = dest + 8 * stride = end marker */
4:
    ldrb    r4, [r2, #0]
    ldrb    r5, [r2, #1]
    ldrb    r6, [r2, #2]
    ldrb    r7, [r2, #3]
    add     r4, r4, r1, asr #7      /* sample + DC */
    usat    r4, #8, r4              /* clamp to 0..255 */
    strb    r4, [r2, #0]
    add     r5, r5, r1, asr #7
    usat    r5, #8, r5
    strb    r5, [r2, #1]
    add     r6, r6, r1, asr #7
    usat    r6, #8, r6
    strb    r6, [r2, #2]
    add     r7, r7, r1, asr #7
    usat    r7, #8, r7
    strb    r7, [r2, #3]
    ldrb    r4, [r2, #4]
    ldrb    r5, [r2, #5]
    ldrb    r6, [r2, #6]
    ldrb    r7, [r2, #7]
    add     r4, r4, r1, asr #7
    usat    r4, #8, r4
    strb    r4, [r2, #4]
    add     r5, r5, r1, asr #7
    usat    r5, #8, r5
    strb    r5, [r2, #5]
    add     r6, r6, r1, asr #7
    usat    r6, #8, r6
    strb    r6, [r2, #6]
    add     r7, r7, r1, asr #7
    usat    r7, #8, r7
    strb    r7, [r2, #7]
    add     r2, r2, r3              /* next destination row */
    cmp     r2, r0
    blo     4b
    ldmfd   sp!, {r4-r7}
    bx      lr