diff options
Diffstat (limited to 'apps/plugins/mpegplayer/libmpeg2/idct_armv6.S')
-rw-r--r--  apps/plugins/mpegplayer/libmpeg2/idct_armv6.S  297
1 file changed, 297 insertions(+), 0 deletions(-)
diff --git a/apps/plugins/mpegplayer/libmpeg2/idct_armv6.S b/apps/plugins/mpegplayer/libmpeg2/idct_armv6.S
new file mode 100644
index 0000000000..dc53cbd7bd
--- /dev/null
+++ b/apps/plugins/mpegplayer/libmpeg2/idct_armv6.S
@@ -0,0 +1,297 @@
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2009 by Jens Arnold
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
22 | |||
23 | .global mpeg2_idct_copy | ||
24 | .type mpeg2_idct_copy, %function | ||
25 | .global mpeg2_idct_add | ||
26 | .type mpeg2_idct_add, %function | ||
27 | |||
28 | /* Custom calling convention: | ||
29 | * r0 contains block pointer and is non-volatile | ||
30 | * all non-volatile c context saved and restored on its behalf | ||
31 | */ | ||
.idct:
    @ 8x8 inverse DCT core. In: r0 = 8x8 block of int16 coefficients,
    @ interleaved as f0,f2,f4,f6,f1,f3,f5,f7 per row. Two passes: rows are
    @ transposed into a temp buffer at r0+128, then columns are written back.
    str     lr, [sp, #-4]!          @ lr is used for the loop counter below
    add     r1, r0, #128            @ r1 -> secondary, transposed temp buffer
    mov     r14, #8                 @ row loop counter

.row_loop:
    ldmia   r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
    ldrd    r4, L_W1357             @ r4, r5 = packed W1:W3, W5:W7

    @ odd part: b0..b3 from f1, f3, f5, f7
    smuad   r6, r4, r10             @ b0  = W1 * f1 + W3 * f3
    smultt  r7, r5, r10             @ -b1 = W7 * f3
    smulbt  r8, r4, r10             @ -b2 = W1 * f3

    smusdx  r9, r10, r5             @ b3  = f1 * W7 - f3 * W5
    smlabb  r7, r4, r11, r7         @ -b1 += W1 * f5
    rsb     r8, r8, #0              @ b2  = -b2
    smlabb  r8, r5, r10, r8         @ b2 += W5 * f1

    smlad   r6, r5, r11, r6         @ b0 += W5 * f5 + W7 * f7
    smlabt  r7, r5, r11, r7         @ -b1 += W5 * f7
    smlatb  r8, r5, r11, r8         @ b2 += W7 * f5

    smlsdx  r9, r11, r4, r9         @ b3 += f5 * W3 - f7 * W1
    rsb     r7, r7, #0              @ b1  = -b1
    smlatb  r7, r4, r10, r7         @ b1 += W3 * f1
    smlatt  r8, r4, r11, r8         @ b2 += W3 * f7

    ldrd    r4, L_W0246             @ r4, r5 = packed W0:W2, W4:W6
    add     r2, r2, #1              @ f0 += 1 (rounding bias for >> 12)

    @ even part: a0..a3 from f0, f2, f4, f6
    smulbb  r10, r5, r3             @ a0'  = W4 * f4
    smultt  r12, r5, r3             @ a3'  = W6 * f6
    smultt  r3, r4, r3              @ -a2' = W2 * f6

    rsb     r11, r10, #0            @ a1' = -W4 * f4
    smlabb  r10, r4, r2, r10        @ a0' += W0 * f0
    smlabb  r11, r4, r2, r11        @ a1' += W0 * f0
    smlatt  r12, r4, r2, r12        @ a3' += W2 * f2
    rsb     r3, r3, #0              @ a2' = -a2'
    smlatt  r3, r5, r2, r3          @ a2' += W6 * f2

    add     r10, r10, r12           @ a0 = a0' + a3'
    sub     r12, r10, r12, lsl #1   @ a3 = a0 - 2 * a3'
    add     r11, r11, r3            @ a1 = a1' + a2'
    sub     r3, r11, r3, lsl #1     @ a2 = a1 - 2 * a2'

    subs    r14, r14, #1            @ decrease loop count

    @ Special store order for making the column pass calculate columns in
    @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages.
    sub     r2, r10, r6             @ block[7] = (a0 - b0)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #7*16]
    sub     r2, r11, r7             @ block[6] = (a1 - b1)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #5*16]
    sub     r2, r3, r8              @ block[5] = (a2 - b2)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #6*16]
    sub     r2, r12, r9             @ block[4] = (a3 - b3)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #4*16]
    add     r2, r12, r9             @ block[3] = (a3 + b3)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #3*16]
    add     r2, r3, r8              @ block[2] = (a2 + b2)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #1*16]
    add     r2, r11, r7             @ block[1] = (a1 + b1)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1, #2*16]
    add     r2, r10, r6             @ block[0] = (a0 + b0)
    mov     r2, r2, asr #12         @ >> 12
    strh    r2, [r1], #2            @ advance to next temp column

    bne     .row_loop
    b       .col_start

@ Coefficient tables live between the passes because of ldrd's offset limit.
L_W1357:
    .short  2841                    @ W1
    .short  2408                    @ W3
    .short  1609                    @ W5
    .short  565                     @ W7

L_W0246:
    .short  2048                    @ W0
    .short  2676                    @ W2
    .short  2048                    @ W4
    .short  1108                    @ W6

.col_start:
    @ r0 now points to the temp buffer, where we need it.
    sub     r1, r1, #128+16         @ point r1 back to the input block
    mov     r14, #8                 @ column loop counter

.col_loop:
    ldmia   r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
    ldrd    r4, L_W1357             @ r4, r5 = packed W1:W3, W5:W7

    @ odd part, identical to the row pass
    smuad   r6, r4, r10             @ b0  = W1 * f1 + W3 * f3
    smultt  r7, r5, r10             @ -b1 = W7 * f3
    smulbt  r8, r4, r10             @ -b2 = W1 * f3

    smusdx  r9, r10, r5             @ b3  = f1 * W7 - f3 * W5
    smlabb  r7, r4, r11, r7         @ -b1 += W1 * f5
    rsb     r8, r8, #0              @ b2  = -b2
    smlabb  r8, r5, r10, r8         @ b2 += W5 * f1

    smlad   r6, r5, r11, r6         @ b0 += W5 * f5 + W7 * f7
    smlabt  r7, r5, r11, r7         @ -b1 += W5 * f7
    smlatb  r8, r5, r11, r8         @ b2 += W7 * f5

    smlsdx  r9, r11, r4, r9         @ b3 += f5 * W3 - f7 * W1
    rsb     r7, r7, #0              @ b1  = -b1
    smlatb  r7, r4, r10, r7         @ b1 += W3 * f1
    smlatt  r8, r4, r11, r8         @ b2 += W3 * f7

    ldrd    r4, L_W0246             @ r4, r5 = packed W0:W2, W4:W6
    add     r2, r2, #32             @ DC offset: 0.5 (rounding for >> 17)

    @ even part, identical to the row pass
    smulbb  r10, r5, r3             @ a0'  = W4 * f4
    smultt  r12, r5, r3             @ a3'  = W6 * f6
    smultt  r3, r4, r3              @ -a2' = W2 * f6

    rsb     r11, r10, #0            @ a1' = -W4 * f4
    smlabb  r10, r4, r2, r10        @ a0' += W0 * f0
    smlabb  r11, r4, r2, r11        @ a1' += W0 * f0
    smlatt  r12, r4, r2, r12        @ a3' += W2 * f2
    rsb     r3, r3, #0              @ a2' = -a2'
    smlatt  r3, r5, r2, r3          @ a2' += W6 * f2

    add     r10, r10, r12           @ a0 = a0' + a3'
    sub     r12, r10, r12, lsl #1   @ a3 = a0 - 2 * a3'
    add     r11, r11, r3            @ a1 = a1' + a2'
    sub     r3, r11, r3, lsl #1     @ a2 = a1 - 2 * a2'

    subs    r14, r14, #1            @ decrease loop count

    @ natural store order this time; the temp pass already reordered columns
    sub     r2, r10, r6             @ block[7] = (a0 - b0)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #7*16]
    sub     r2, r11, r7             @ block[6] = (a1 - b1)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #6*16]
    sub     r2, r3, r8              @ block[5] = (a2 - b2)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #5*16]
    sub     r2, r12, r9             @ block[4] = (a3 - b3)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #4*16]
    add     r2, r12, r9             @ block[3] = (a3 + b3)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #3*16]
    add     r2, r3, r8              @ block[2] = (a2 + b2)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #2*16]
    add     r2, r11, r7             @ block[1] = (a1 + b1)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1, #1*16]
    add     r2, r10, r6             @ block[0] = (a0 + b0)
    mov     r2, r2, asr #17         @ >> 17
    strh    r2, [r1], #2            @ advance to next column

    bne     .col_loop

    sub     r0, r0, #256            @ point r0 back to the input block
    ldr     pc, [sp], #4            @ return
200 | |||
201 | |||
/* mpeg2_idct_copy
 * In:  r0 = coefficient block (int16, cleared to zero on the way out),
 *      r1 = destination pixel pointer, r2 = destination row stride.
 * Runs the IDCT, then saturates each result to u8 and stores 8 bytes per
 * row while zero-filling the block for the next macroblock.
 * NOTE(review): destination appears to be 8-bit pixels (usat16 #8 + packing)
 * — confirm against the C prototype.
 */
mpeg2_idct_copy:
    stmfd   sp!, {r1-r2, r4-r11, lr}
    bl      .idct
    ldmfd   sp!, {r1-r2}            @ restore dest pointer and stride

    add     r3, r0, #128            @ r3 = end of block
    mov     r8, #0                  @ r8-r11 = zero pattern for clearing
    mov     r9, #0
    mov     r10, #0
    mov     r11, #0
1:                                  @ idct data is in order 0-2-1-3-4-6-5-7,
    ldmia   r0, {r4-r7}             @ see .idct store order
    stmia   r0!, {r8-r11}           @ clear the block as we consume it
    usat16  r4, #8, r4              @ clamp each halfword to 0..255
    usat16  r5, #8, r5
    orr     r4, r4, r5, lsl #8      @ interleave: bytes 0,1,2,3
    usat16  r6, #8, r6
    usat16  r7, #8, r7
    orr     r5, r6, r7, lsl #8      @ interleave: bytes 4,5,6,7
    strd    r4, [r1]                @ store r4, r5 (one 8-pixel row)
    add     r1, r1, r2              @ next destination row
    cmp     r0, r3
    blo     1b

    ldmfd   sp!, {r4-r11, pc}
227 | |||
/* mpeg2_idct_add
 * In:  r0 = last (DC-only shortcut test value), r1 = coefficient block,
 *      r2 = destination pixel pointer, r3 = destination row stride.
 * If last == 129 and the low bits of block[0] select the shortcut, only the
 * DC term is added to the destination (label 3). Otherwise the full IDCT
 * runs and each result is added to the existing pixels with saturation.
 */
mpeg2_idct_add:
    cmp     r0, #129                @ DC-only candidate?
    mov     r0, r1                  @ r0 = block pointer from here on
    ldreqsh r1, [r0, #0]            @ peek block[0] only when last == 129
    bne     1f
    and     r1, r1, #0x70
    cmp     r1, #0x40
    bne     3f                      @ take the DC-only fast path
1:
    stmfd   sp!, {r2-r11, lr}
    bl      .idct
    ldmfd   sp!, {r1-r2}            @ r1 = dest, r2 = stride (saved r2, r3)

    add     r3, r0, #128            @ r3 = end of block
    mov     r10, #0                 @ r10-r12, lr = zero pattern for clearing
    mov     r11, #0
    mov     r12, #0
    mov     lr, #0
    ldrd    r8, [r1]                @ preload dest row into r8, r9
2:                                  @ idct data is in order 0-2-1-3-4-6-5-7,
    ldmia   r0, {r4-r7}             @ see .idct store order
    stmia   r0!, {r10-r12, lr}      @ clear the block as we consume it
    uxtab16 r4, r4, r8              @ add even dest bytes to results 0,2
    uxtab16 r5, r5, r8, ror #8      @ add odd dest bytes to results 1,3
    usat16  r4, #8, r4              @ clamp to 0..255
    usat16  r5, #8, r5
    orr     r4, r4, r5, lsl #8      @ interleave bytes 0..3
    uxtab16 r6, r6, r9              @ same for results 4,6
    uxtab16 r7, r7, r9, ror #8      @ ... and 5,7
    usat16  r6, #8, r6
    usat16  r7, #8, r7
    orr     r5, r6, r7, lsl #8      @ interleave bytes 4..7
    strd    r4, [r1]                @ store r4, r5 (one 8-pixel row)
    add     r1, r1, r2              @ next destination row
    cmp     r0, r3
    ldrlod  r8, [r1]                @ preload next row (r8, r9) if not done
    blo     2b

    ldmfd   sp!, {r4-r11, pc}

3:                                  @ DC-only path
    stmfd   sp!, {r4, lr}
    ldrsh   r4, [r0, #0]            @ r4 = block[0]
    mov     r12, #0
    strh    r12, [r0, #0]           @ block[0] = 0
    strh    r12, [r0, #126]         @ block[63] = 0
    add     r4, r4, #64             @ rounding bias
    mov     r4, r4, asr #7          @ r4 = DC
    mov     r4, r4, lsl #16         @ spread DC to both halfwords
    orr     r4, r4, r4, lsr #16
    ldrd    r0, [r2]                @ preload dest row into r0, r1
    add     r12, r2, r3, asl #3     @ r12 = dest end (8 rows)
4:
    uxtab16 lr, r4, r0, ror #8      @ DC + odd dest bytes
    uxtab16 r0, r4, r0              @ DC + even dest bytes
    usat16  lr, #8, lr              @ clamp to 0..255
    usat16  r0, #8, r0
    orr     r0, r0, lr, lsl #8      @ interleave bytes 0..3
    uxtab16 lr, r4, r1, ror #8      @ same for bytes 4..7
    uxtab16 r1, r4, r1
    usat16  lr, #8, lr
    usat16  r1, #8, r1
    orr     r1, r1, lr, lsl #8
    strd    r0, [r2]                @ store r0, r1 (one 8-pixel row)
    add     r2, r2, r3              @ next destination row
    cmp     r2, r12
    ldrlod  r0, [r2]                @ preload next row (r0, r1) if not done
    blo     4b

    ldmfd   sp!, {r4, pc}