diff options
Diffstat (limited to 'apps/plugins/sdl/src/hermes/mmxp2_32.asm')
-rw-r--r-- | apps/plugins/sdl/src/hermes/mmxp2_32.asm | 405 |
1 files changed, 0 insertions, 405 deletions
diff --git a/apps/plugins/sdl/src/hermes/mmxp2_32.asm b/apps/plugins/sdl/src/hermes/mmxp2_32.asm deleted file mode 100644 index 20c3277a26..0000000000 --- a/apps/plugins/sdl/src/hermes/mmxp2_32.asm +++ /dev/null | |||
@@ -1,405 +0,0 @@ | |||
1 | ; | ||
2 | ; pII-optimised MMX format converters for HERMES | ||
3 | ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) | ||
4 | ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) | ||
5 | ; This source code is licensed under the GNU LGPL | ||
6 | ; | ||
7 | ; Please refer to the file COPYING.LIB contained in the distribution for | ||
8 | ; licensing conditions | ||
9 | ; | ||
10 | ; COPYRIGHT NOTICE | ||
11 | ; | ||
12 | ; This file partly contains code that is (c) Intel Corporation, specifically | ||
13 | ; the mode detection routine, and the converter to 15 bit (8 pixel | ||
14 | ; conversion routine from the mmx programming tutorial pages). | ||
15 | ; | ||
16 | ; | ||
17 | ; These routines aren't exactly pII optimised - it's just that as they | ||
18 | ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to | ||
19 | ; optimise them for p5 MMXs.. | ||
20 | |||
21 | BITS 32 | ||
22 | |||
23 | %include "common.inc" | ||
24 | |||
25 | SDL_FUNC _ConvertMMXpII32_24RGB888 | ||
26 | SDL_FUNC _ConvertMMXpII32_16RGB565 | ||
27 | SDL_FUNC _ConvertMMXpII32_16BGR565 | ||
28 | SDL_FUNC _ConvertMMXpII32_16RGB555 | ||
29 | SDL_FUNC _ConvertMMXpII32_16BGR555 | ||
30 | |||
;; Macros for conversion routines
;;
;; The converters build their 64-bit MMX constants on the stack: the same
;; 32-bit value is pushed twice and the resulting quadword at [esp] is used
;; as a memory operand.  Every load_immq / pand_immq therefore leaves
;; 8 bytes on the stack, which must later be released with
;; CLEANUP_IMMQ_LOADS(n), where n is the number of load_immq / pand_immq
;; expansions since the previous cleanup.

; _push_immq_mask imm32
;   Pushes imm32 twice, so that the quadword at [esp] is imm32:imm32.
%macro _push_immq_mask 1
        push    dword %1
        push    dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32:imm32.  Leaves 8 bytes on the stack.
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32:imm32.  Leaves 8 bytes on the stack.
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(num)
;   Releases the stack space left behind by num load_immq / pand_immq
;   expansions.  The argument is parenthesised so that an expression
;   argument (e.g. 1+1) is scaled as a whole.  Expands to an ADD, so the
;   flags are modified.
%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)

; Per-dword masks.  The source pixel is treated as a little-endian dword
; with blue in byte 0, green in byte 1, red in byte 2 and a fourth byte
; that the converters discard.
%define mmx32_rgb888_mask 00ffffffh     ; keep bytes 0-2, clear byte 3
%define mmx32_rgb565_b    000000f8h     ; top 5 bits of byte 0
%define mmx32_rgb565_g    0000fc00h     ; top 6 bits of byte 1
%define mmx32_rgb565_r    00f80000h     ; top 5 bits of byte 2

%define mmx32_rgb555_rb   00f800f8h     ; top 5 bits of bytes 0 and 2
%define mmx32_rgb555_g    0000f800h     ; top 5 bits of byte 1
; pmaddwd multipliers (low word applies to byte 0, high word to byte 2):
%define mmx32_rgb555_mul  20000008h     ; byte0 * 8    + byte2 * 2000h
%define mmx32_bgr555_mul  00082000h     ; byte0 * 2000h + byte2 * 8
60 | |||
61 | SECTION .text | ||
62 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;
; Copies the low three bytes of every 4-byte source pixel to a packed
; 3-byte destination pixel.
;
; In:    esi = source      (advanced by 4 bytes per pixel)
;        edi = destination (advanced by 3 bytes per pixel)
;        ecx = pixel count
; Clobb: al, bl, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
; Note:  no EMMS is issued here.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        load_immq mm6, mmx32_rgb888_mask        ; mm6 = 00ffffff:00ffffff
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7                        ; mm7 = 0

        mov     edx, ecx                        ; edx = full pixel count
        and     ecx, 0fffffffch                 ; ecx = count rounded down to a multiple of 4
        jz      .tail                           ; fewer than four pixels

        ; Four pixels p0..p3 (16 bytes) in, 12 bytes out, per iteration.
.quad_loop:
        movq    mm0, [esi]                      ; mm0 = p1:p0
        movq    mm1, [esi+8]                    ; mm1 = p3:p2
        pand    mm0, mm6                        ; clear byte 3 of p0 and p1
        pand    mm1, mm6                        ; clear byte 3 of p2 and p3

        movq    mm2, mm0
        movq    mm3, mm1
        movq    mm4, mm1

        punpckldq mm0, mm7                      ; mm0 = p0 in bits 0-23
        punpckhdq mm2, mm7                      ; mm2 = p1 in bits 0-23
        psllq   mm3, 48                         ; mm3 = low two bytes of p2 in bits 48-63
        psllq   mm2, 24                         ; mm2 = p1 in bits 24-47
        por     mm0, mm2
        por     mm0, mm3                        ; mm0 = output bytes 0-7

        punpckhdq mm4, mm7                      ; mm4 = p3 in bits 0-23
        punpckldq mm1, mm7                      ; mm1 = p2 in bits 0-23
        psllq   mm4, 8                          ; mm4 = p3 in bits 8-31
        psrlq   mm1, 16                         ; mm1 = byte 2 of p2 in bits 0-7
        por     mm1, mm4                        ; mm1 = output bytes 8-11

        movq    [edi], mm0
        movd    [edi+8], mm1
        add     esi, byte 16
        add     edi, byte 12
        sub     ecx, byte 4
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, byte 3                     ; 0-3 leftover pixels
        jz      .done

.tail_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, byte 4
        add     edi, byte 3
        dec     ecx
        jnz     .tail_loop

.done:
        retn
123 | |||
124 | |||
125 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;
; Packs 4-byte pixels into 16-bit words laid out as
;   bits 11-15 = top 5 bits of source byte 2
;   bits  5-10 = top 6 bits of source byte 1
;   bits  0-4  = top 5 bits of source byte 0
;
; In:    esi = source      (advanced by 4 bytes per pixel)
;        edi = destination (advanced by 2 bytes per pixel)
;        ecx = pixel count
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
; Note:  no EMMS is issued here.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        load_immq mm5, mmx32_rgb565_b           ; mm5 = byte-0 mask
        load_immq mm6, mmx32_rgb565_g           ; mm6 = byte-1 mask
        load_immq mm7, mmx32_rgb565_r           ; mm7 = byte-2 mask
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                        ; edx = full pixel count
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                        ; byte-1 field, bits 10-15
        pand    mm1, mm5                        ; byte-0 field, bits 3-7
        pand    mm3, mm7                        ; byte-2 field, bits 19-23
        pslld   mm1, 2                          ; byte-0 field -> bits 5-9
        por     mm0, mm1                        ; bits 5-15
        psrld   mm0, 5                          ; low 11 bits of each result

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; Each masked dword is the word pair (00f8h-masked, 0000h); packing
        ; words to bytes turns that into one word with the field in bits 11-15.
        packuswb mm3, mm1
        packssdw mm0, mm4                       ; four 11-bit values -> four words
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, byte 3                     ; 0-3 leftover pixels
        jz      .done

.tail_loop:
        mov     al, [esi]                       ; al = byte 0
        mov     bh, [esi+1]                     ; bh = byte 1
        mov     ah, [esi+2]                     ; ah = byte 2
        shr     al, 3                           ; top 5 bits of byte 0 -> bits 0-4
        and     eax, 0F81Fh                     ; keep bits 0-4 and 11-15
        shr     ebx, 5                          ; byte 1 -> bits 3-10
        and     ebx, 07E0h                      ; keep its top 6 bits at bits 5-10
        add     eax, ebx                        ; fields are disjoint, so add == or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, byte 4
        add     edi, byte 2
        dec     ecx
        jnz     .tail_loop

.done:
        retn
192 | |||
193 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;
; Packs 4-byte pixels into 16-bit words laid out as
;   bits 11-15 = top 5 bits of source byte 0
;   bits  5-10 = top 6 bits of source byte 1
;   bits  0-4  = top 5 bits of source byte 2
;
; In:    esi = source      (advanced by 4 bytes per pixel)
;        edi = destination (advanced by 2 bytes per pixel)
;        ecx = pixel count
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
; Note:  no EMMS is issued here.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        load_immq mm5, mmx32_rgb565_r           ; mm5 = byte-2 mask
        load_immq mm6, mmx32_rgb565_g           ; mm6 = byte-1 mask
        load_immq mm7, mmx32_rgb565_b           ; mm7 = byte-0 mask
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                        ; edx = full pixel count
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                        ; byte-1 field, bits 10-15
        pand    mm1, mm5                        ; byte-2 field, bits 19-23
        pand    mm3, mm7                        ; byte-0 field, bits 3-7
        psllq   mm3, 16                         ; byte-0 field -> bits 19-23 of each dword
        psrld   mm1, 14                         ; byte-2 field -> bits 5-9
        por     mm0, mm1                        ; bits 5-15
        psrld   mm0, 5                          ; low 11 bits of each result

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; Word-to-byte packing moves each byte-0 field to bits 11-15 of
        ; its 16-bit output word.
        packuswb mm3, mm1
        packssdw mm0, mm4                       ; four 11-bit values -> four words
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, byte 16
        add     edi, byte 8
        dec     ecx
        jnz     .quad_loop

.tail:
        and     edx, byte 3                     ; edx = 0-3 leftover pixels
        jz      .done

.tail_loop:
        mov     al, [esi+2]                     ; al = byte 2
        mov     bh, [esi+1]                     ; bh = byte 1
        mov     ah, [esi]                       ; ah = byte 0
        shr     al, 3                           ; top 5 bits of byte 2 -> bits 0-4
        and     eax, 0F81Fh                     ; keep bits 0-4 and 11-15
        shr     ebx, 5                          ; byte 1 -> bits 3-10
        and     ebx, 07E0h                      ; keep its top 6 bits at bits 5-10
        add     eax, ebx                        ; fields are disjoint, so add == or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, byte 4
        add     edi, byte 2
        dec     edx
        jnz     .tail_loop

.done:
        retn
262 | |||
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;
; Pack 4-byte pixels into 15-bit values stored as 16-bit words.
;   RGB555: bits 10-14 = byte 2, bits 5-9 = byte 1, bits 0-4 = byte 0
;   BGR555: bits 10-14 = byte 0, bits 5-9 = byte 1, bits 0-4 = byte 2
; (top five bits of each source byte).
;
; The two entry points differ only in the pmaddwd multiplier held in
; mm7; everything after _convert_bgr555_cheat is shared.
;
; In:    esi = source      (advanced by 4 bytes per pixel)
;        edi = destination (advanced by 2 bytes per pixel)
;        ecx = pixel count
; Clobb: eax, ecx, edx, esi, edi, mm0-mm7, flags
; Stack: up to 32 bytes of temporaries below the entry esp.
; Note:  no EMMS is issued here.
;
; Changes from the previous version:
;  * The leftover-pixel loop now uses the mm7 multiplier as well.  It
;    used to hard-code the RGB555 layout, so pixels handled by the tail
;    came out with bytes 0 and 2 swapped when entered via the BGR555
;    entry point.
;  * The main loop no longer preloads the next 16 source bytes at the
;    end of an iteration; on the final iteration that read went past the
;    last pixel the MMX loop converts.  All loads for a group of eight
;    pixels now happen inside the iteration that converts them.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul

_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g           ; mm6 = byte-1 mask
        CLEANUP_IMMQ_LOADS(2)                   ; releases the mm7 and mm6 temporaries

        mov     edx, ecx                        ; edx = full pixel count
        and     ecx, dword 0fffffff8h           ; ecx = count rounded down to a multiple of 8
        jz      near .tail                      ; fewer than eight pixels

        ; Eight pixels p0..p7 (32 bytes) in, 16 bytes out, per iteration.
        ; For each pixel: pmaddwd of the masked byte-0/byte-2 words with
        ; mm7 places both fields, the byte-1 field is or-ed in between
        ; them, and a right shift by 6 yields the 15-bit value.
.octet_loop:
        ; --- byte-0/byte-2 products for p0..p3 ---
        movq    mm2, [esi+8]                    ; mm2 = p3:p2
        movq    mm0, [esi]                      ; mm0 = p1:p0
        movq    mm3, mm2
        pand_immq mm3, mmx32_rgb555_rb
        movq    mm1, mm0
        pand_immq mm1, mmx32_rgb555_rb
        pmaddwd mm3, mm7
        pmaddwd mm1, mm7
        pand    mm2, mm6                        ; byte-1 field of p2, p3

        ; --- finish p0..p3, start p4..p7 ---
        movq    mm4, [esi+24]                   ; mm4 = p7:p6
        pand    mm0, mm6                        ; byte-1 field of p0, p1
        movq    mm5, [esi+16]                   ; mm5 = p5:p4
        por     mm3, mm2
        psrld   mm3, 6                          ; p2, p3 converted
        por     mm1, mm0
        movq    mm0, mm4
        psrld   mm1, 6                          ; p0, p1 converted

        pand_immq mm0, mmx32_rgb555_rb
        packssdw mm1, mm3                       ; mm1 = four output words for p0..p3
        movq    mm3, mm5
        pmaddwd mm0, mm7
        pand_immq mm3, mmx32_rgb555_rb
        pand    mm4, mm6

        CLEANUP_IMMQ_LOADS(4)                   ; four pand_immq temporaries this iteration

        movq    [edi], mm1
        pmaddwd mm3, mm7
        add     esi, byte 32

        ; --- finish p4..p7 ---
        por     mm4, mm0
        pand    mm5, mm6
        psrld   mm4, 6                          ; p6, p7 converted
        por     mm5, mm3
        psrld   mm5, 6                          ; p4, p5 converted
        packssdw mm5, mm4                       ; mm5 = four output words for p4..p7
        movq    [edi+8], mm5

        add     edi, byte 16
        sub     ecx, byte 8
        jnz     near .octet_loop

.tail:
        mov     ecx, edx
        and     ecx, byte 7                     ; 0-7 leftover pixels
        jz      .done

        ; mm5 is free once the main loop has finished; keep the
        ; byte-0/byte-2 mask there for the tail.
        load_immq mm5, mmx32_rgb555_rb
        CLEANUP_IMMQ_LOADS(1)

.tail_loop:
        movd    mm0, [esi]                      ; one source pixel
        add     esi, byte 4
        movq    mm1, mm0
        pand    mm1, mm5                        ; byte-0 and byte-2 fields
        pand    mm0, mm6                        ; byte-1 field
        pmaddwd mm1, mm7                        ; same placement as the main loop
        por     mm0, mm1
        psrld   mm0, 6
        movd    eax, mm0
        mov     [edi], ax
        add     edi, byte 2
        dec     ecx
        jnz     .tail_loop

.done:
        retn
402 | |||
403 | %ifidn __OUTPUT_FORMAT__,elf32 | ||
404 | section .note.GNU-stack noalloc noexec nowrite progbits | ||
405 | %endif | ||