diff options
author | Franklin Wei <git@fwei.tk> | 2017-01-21 15:18:31 -0500 |
---|---|---|
committer | Franklin Wei <git@fwei.tk> | 2017-12-23 21:01:26 -0500 |
commit | a855d6202536ff28e5aae4f22a0f31d8f5b325d0 (patch) | |
tree | 8c75f224dd64ed360505afa8843d016b0d75000b /apps/plugins/sdl/src/video/SDL_blit_A.c | |
parent | 01c6dcf6c7b9bb1ad2fa0450f99bacc5f3d3e04b (diff) | |
download | rockbox-a855d6202536ff28e5aae4f22a0f31d8f5b325d0.tar.gz rockbox-a855d6202536ff28e5aae4f22a0f31d8f5b325d0.zip |
Port of Duke Nukem 3D
This ports Fabien Sanglard's Chocolate Duke to run on a version of SDL
for Rockbox.
Change-Id: I8f2c4c78af19de10c1633ed7bb7a997b43256dd9
Diffstat (limited to 'apps/plugins/sdl/src/video/SDL_blit_A.c')
-rw-r--r-- | apps/plugins/sdl/src/video/SDL_blit_A.c | 2873 |
1 files changed, 2873 insertions, 0 deletions
diff --git a/apps/plugins/sdl/src/video/SDL_blit_A.c b/apps/plugins/sdl/src/video/SDL_blit_A.c new file mode 100644 index 0000000000..219cdccf5b --- /dev/null +++ b/apps/plugins/sdl/src/video/SDL_blit_A.c | |||
@@ -0,0 +1,2873 @@ | |||
1 | /* | ||
2 | SDL - Simple DirectMedia Layer | ||
3 | Copyright (C) 1997-2012 Sam Lantinga | ||
4 | |||
5 | This library is free software; you can redistribute it and/or | ||
6 | modify it under the terms of the GNU Lesser General Public | ||
7 | License as published by the Free Software Foundation; either | ||
8 | version 2.1 of the License, or (at your option) any later version. | ||
9 | |||
10 | This library is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | Lesser General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU Lesser General Public | ||
16 | License along with this library; if not, write to the Free Software | ||
17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
18 | |||
19 | Sam Lantinga | ||
20 | slouken@libsdl.org | ||
21 | */ | ||
22 | #include "SDL_config.h" | ||
23 | |||
24 | #include "SDL_video.h" | ||
25 | #include "SDL_blit.h" | ||
26 | |||
27 | /* | ||
28 | In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on. | ||
29 | Checking if _mm_free is #defined in malloc.h is the only way to | ||
30 | determine if the Processor Pack is installed, as far as I can tell. | ||
31 | */ | ||
32 | |||
33 | #if SDL_ASSEMBLY_ROUTINES | ||
34 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | ||
35 | /* forced MMX to 0...it breaks on most compilers now. --ryan. */ | ||
36 | # define MMX_ASMBLIT 0 | ||
37 | # define GCC_ASMBLIT 0 | ||
38 | # elif defined(_MSC_VER) && defined(_M_IX86) | ||
39 | # if (_MSC_VER <= 1200) | ||
40 | # include <malloc.h> | ||
41 | # if defined(_mm_free) | ||
42 | # define HAVE_MMINTRIN_H 1 | ||
43 | # endif | ||
44 | # else /* Visual Studio > VC6 always has mmintrin.h */ | ||
45 | # define HAVE_MMINTRIN_H 1 | ||
46 | # endif | ||
47 | # if HAVE_MMINTRIN_H | ||
48 | # define MMX_ASMBLIT 1 | ||
49 | # define MSVC_ASMBLIT 1 | ||
50 | # endif | ||
51 | # endif | ||
52 | #endif /* SDL_ASSEMBLY_ROUTINES */ | ||
53 | |||
54 | /* Function to check the CPU flags */ | ||
55 | #include "SDL_cpuinfo.h" | ||
56 | #if GCC_ASMBLIT | ||
57 | #include "mmx.h" | ||
58 | #elif MSVC_ASMBLIT | ||
59 | #include <mmintrin.h> | ||
60 | #include <mm3dnow.h> | ||
61 | #endif | ||
62 | |||
63 | /* Functions to perform alpha blended blitting */ | ||
64 | |||
65 | /* N->1 blending with per-surface alpha */ | ||
66 | static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) | ||
67 | { | ||
68 | int width = info->d_width; | ||
69 | int height = info->d_height; | ||
70 | Uint8 *src = info->s_pixels; | ||
71 | int srcskip = info->s_skip; | ||
72 | Uint8 *dst = info->d_pixels; | ||
73 | int dstskip = info->d_skip; | ||
74 | Uint8 *palmap = info->table; | ||
75 | SDL_PixelFormat *srcfmt = info->src; | ||
76 | SDL_PixelFormat *dstfmt = info->dst; | ||
77 | int srcbpp = srcfmt->BytesPerPixel; | ||
78 | |||
79 | const unsigned A = srcfmt->alpha; | ||
80 | |||
81 | while ( height-- ) { | ||
82 | DUFFS_LOOP4( | ||
83 | { | ||
84 | Uint32 Pixel; | ||
85 | unsigned sR; | ||
86 | unsigned sG; | ||
87 | unsigned sB; | ||
88 | unsigned dR; | ||
89 | unsigned dG; | ||
90 | unsigned dB; | ||
91 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); | ||
92 | dR = dstfmt->palette->colors[*dst].r; | ||
93 | dG = dstfmt->palette->colors[*dst].g; | ||
94 | dB = dstfmt->palette->colors[*dst].b; | ||
95 | ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB); | ||
96 | dR &= 0xff; | ||
97 | dG &= 0xff; | ||
98 | dB &= 0xff; | ||
99 | /* Pack RGB into 8bit pixel */ | ||
100 | if ( palmap == NULL ) { | ||
101 | *dst =((dR>>5)<<(3+2))| | ||
102 | ((dG>>5)<<(2))| | ||
103 | ((dB>>6)<<(0)); | ||
104 | } else { | ||
105 | *dst = palmap[((dR>>5)<<(3+2))| | ||
106 | ((dG>>5)<<(2)) | | ||
107 | ((dB>>6)<<(0))]; | ||
108 | } | ||
109 | dst++; | ||
110 | src += srcbpp; | ||
111 | }, | ||
112 | width); | ||
113 | src += srcskip; | ||
114 | dst += dstskip; | ||
115 | } | ||
116 | } | ||
117 | |||
118 | /* N->1 blending with pixel alpha */ | ||
119 | static void BlitNto1PixelAlpha(SDL_BlitInfo *info) | ||
120 | { | ||
121 | int width = info->d_width; | ||
122 | int height = info->d_height; | ||
123 | Uint8 *src = info->s_pixels; | ||
124 | int srcskip = info->s_skip; | ||
125 | Uint8 *dst = info->d_pixels; | ||
126 | int dstskip = info->d_skip; | ||
127 | Uint8 *palmap = info->table; | ||
128 | SDL_PixelFormat *srcfmt = info->src; | ||
129 | SDL_PixelFormat *dstfmt = info->dst; | ||
130 | int srcbpp = srcfmt->BytesPerPixel; | ||
131 | |||
132 | /* FIXME: fix alpha bit field expansion here too? */ | ||
133 | while ( height-- ) { | ||
134 | DUFFS_LOOP4( | ||
135 | { | ||
136 | Uint32 Pixel; | ||
137 | unsigned sR; | ||
138 | unsigned sG; | ||
139 | unsigned sB; | ||
140 | unsigned sA; | ||
141 | unsigned dR; | ||
142 | unsigned dG; | ||
143 | unsigned dB; | ||
144 | DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA); | ||
145 | dR = dstfmt->palette->colors[*dst].r; | ||
146 | dG = dstfmt->palette->colors[*dst].g; | ||
147 | dB = dstfmt->palette->colors[*dst].b; | ||
148 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); | ||
149 | dR &= 0xff; | ||
150 | dG &= 0xff; | ||
151 | dB &= 0xff; | ||
152 | /* Pack RGB into 8bit pixel */ | ||
153 | if ( palmap == NULL ) { | ||
154 | *dst =((dR>>5)<<(3+2))| | ||
155 | ((dG>>5)<<(2))| | ||
156 | ((dB>>6)<<(0)); | ||
157 | } else { | ||
158 | *dst = palmap[((dR>>5)<<(3+2))| | ||
159 | ((dG>>5)<<(2)) | | ||
160 | ((dB>>6)<<(0)) ]; | ||
161 | } | ||
162 | dst++; | ||
163 | src += srcbpp; | ||
164 | }, | ||
165 | width); | ||
166 | src += srcskip; | ||
167 | dst += dstskip; | ||
168 | } | ||
169 | } | ||
170 | |||
171 | /* colorkeyed N->1 blending with per-surface alpha */ | ||
172 | static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info) | ||
173 | { | ||
174 | int width = info->d_width; | ||
175 | int height = info->d_height; | ||
176 | Uint8 *src = info->s_pixels; | ||
177 | int srcskip = info->s_skip; | ||
178 | Uint8 *dst = info->d_pixels; | ||
179 | int dstskip = info->d_skip; | ||
180 | Uint8 *palmap = info->table; | ||
181 | SDL_PixelFormat *srcfmt = info->src; | ||
182 | SDL_PixelFormat *dstfmt = info->dst; | ||
183 | int srcbpp = srcfmt->BytesPerPixel; | ||
184 | Uint32 ckey = srcfmt->colorkey; | ||
185 | |||
186 | const int A = srcfmt->alpha; | ||
187 | |||
188 | while ( height-- ) { | ||
189 | DUFFS_LOOP( | ||
190 | { | ||
191 | Uint32 Pixel; | ||
192 | unsigned sR; | ||
193 | unsigned sG; | ||
194 | unsigned sB; | ||
195 | unsigned dR; | ||
196 | unsigned dG; | ||
197 | unsigned dB; | ||
198 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); | ||
199 | if ( Pixel != ckey ) { | ||
200 | dR = dstfmt->palette->colors[*dst].r; | ||
201 | dG = dstfmt->palette->colors[*dst].g; | ||
202 | dB = dstfmt->palette->colors[*dst].b; | ||
203 | ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB); | ||
204 | dR &= 0xff; | ||
205 | dG &= 0xff; | ||
206 | dB &= 0xff; | ||
207 | /* Pack RGB into 8bit pixel */ | ||
208 | if ( palmap == NULL ) { | ||
209 | *dst =((dR>>5)<<(3+2))| | ||
210 | ((dG>>5)<<(2)) | | ||
211 | ((dB>>6)<<(0)); | ||
212 | } else { | ||
213 | *dst = palmap[((dR>>5)<<(3+2))| | ||
214 | ((dG>>5)<<(2)) | | ||
215 | ((dB>>6)<<(0)) ]; | ||
216 | } | ||
217 | } | ||
218 | dst++; | ||
219 | src += srcbpp; | ||
220 | }, | ||
221 | width); | ||
222 | src += srcskip; | ||
223 | dst += dstskip; | ||
224 | } | ||
225 | } | ||
226 | |||
227 | #if GCC_ASMBLIT | ||
228 | /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | ||
/*
 * GCC inline-MMX variant (only compiled when GCC_ASMBLIT is non-zero).
 *
 * Averages each 32-bit source pixel with the corresponding destination
 * pixel, channel by channel, and ORs the destination format's Amask into
 * the stored result.
 *
 * info->s_skip and info->d_skip are shifted right by 2 so they can be added
 * to Uint32 pointers (presumably they are byte counts -- confirm against
 * SDL_BlitInfo).
 *
 * The first arm of DUFFS_LOOP_DOUBLE2 handles one pixel in plain C:
 *   (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)
 * The second arm performs the same arithmetic on two pixels at once using
 * mm4 (0x00fefefe mask), mm3 (0x00010101 mask) and mm7 (dst alpha mask),
 * which are loaded once before the row loop.  emms() is issued once after
 * the last row.
 */
229 | static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) | ||
230 | { | ||
231 | int width = info->d_width; | ||
232 | int height = info->d_height; | ||
233 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
234 | int srcskip = info->s_skip >> 2; | ||
235 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
236 | int dstskip = info->d_skip >> 2; | ||
237 | Uint32 dalpha = info->dst->Amask; | ||
238 | Uint64 load; | ||
239 | |||
240 | load = 0x00fefefe00fefefeULL;/* alpha128 mask */ | ||
241 | movq_m2r(load, mm4); /* alpha128 mask -> mm4 */ | ||
242 | load = 0x0001010100010101ULL;/* !alpha128 mask */ | ||
243 | movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */ | ||
244 | movd_m2r(dalpha, mm7); /* dst alpha mask */ | ||
245 | punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ | ||
246 | while(height--) { | ||
247 | DUFFS_LOOP_DOUBLE2( | ||
248 | { | ||
/* single pixel, scalar path */
249 | Uint32 s = *srcp++; | ||
250 | Uint32 d = *dstp; | ||
251 | *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) | ||
252 | + (s & d & 0x00010101)) | dalpha; | ||
253 | },{ | ||
/* two pixels per iteration, MMX path */
254 | movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ | ||
255 | movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ | ||
256 | |||
257 | movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ | ||
258 | movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ | ||
259 | |||
260 | pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ | ||
261 | pand_r2r(mm4, mm5); /* src & mask -> mm5 */ | ||
262 | paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ | ||
263 | pand_r2r(mm1, mm2); /* src & dst -> mm2 */ | ||
264 | psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ | ||
265 | pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ | ||
266 | paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ | ||
267 | |||
268 | por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | ||
269 | movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ | ||
270 | dstp += 2; | ||
271 | srcp += 2; | ||
272 | }, width); | ||
273 | srcp += srcskip; | ||
274 | dstp += dstskip; | ||
275 | } | ||
276 | emms(); | ||
277 | } | ||
278 | |||
279 | /* fast RGB888->(A)RGB888 blending with surface alpha */ | ||
280 | static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) | ||
281 | { | ||
282 | SDL_PixelFormat* df = info->dst; | ||
283 | unsigned alpha = info->src->alpha; | ||
284 | |||
285 | if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { | ||
286 | /* only call a128 version when R,G,B occupy lower bits */ | ||
287 | BlitRGBtoRGBSurfaceAlpha128MMX(info); | ||
288 | } else { | ||
289 | int width = info->d_width; | ||
290 | int height = info->d_height; | ||
291 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
292 | int srcskip = info->s_skip >> 2; | ||
293 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
294 | int dstskip = info->d_skip >> 2; | ||
295 | |||
296 | pxor_r2r(mm5, mm5); /* 0 -> mm5 */ | ||
297 | /* form the alpha mult */ | ||
298 | movd_m2r(alpha, mm4); /* 0000000A -> mm4 */ | ||
299 | punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ | ||
300 | punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ | ||
301 | alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); | ||
302 | movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */ | ||
303 | punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */ | ||
304 | pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */ | ||
305 | /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */ | ||
306 | movd_m2r(df->Amask, mm7); /* dst alpha mask */ | ||
307 | punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ | ||
308 | |||
309 | while(height--) { | ||
310 | DUFFS_LOOP_DOUBLE2({ | ||
311 | /* One Pixel Blend */ | ||
312 | movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | ||
313 | movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | ||
314 | punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */ | ||
315 | punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */ | ||
316 | |||
317 | psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ | ||
318 | pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | ||
319 | psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | ||
320 | paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ | ||
321 | |||
322 | packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */ | ||
323 | por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | ||
324 | movd_r2m(mm2, *dstp);/* mm2 -> pixel */ | ||
325 | ++srcp; | ||
326 | ++dstp; | ||
327 | },{ | ||
328 | /* Two Pixels Blend */ | ||
329 | movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ | ||
330 | movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ | ||
331 | movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ | ||
332 | movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ | ||
333 | |||
334 | punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */ | ||
335 | punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */ | ||
336 | punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */ | ||
337 | punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */ | ||
338 | |||
339 | psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ | ||
340 | pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ | ||
341 | psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ | ||
342 | paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ | ||
343 | |||
344 | psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ | ||
345 | pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | ||
346 | psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | ||
347 | paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ | ||
348 | |||
349 | packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */ | ||
350 | por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */ | ||
351 | |||
352 | movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */ | ||
353 | |||
354 | srcp += 2; | ||
355 | dstp += 2; | ||
356 | }, width); | ||
357 | srcp += srcskip; | ||
358 | dstp += dstskip; | ||
359 | } | ||
360 | emms(); | ||
361 | } | ||
362 | } | ||
363 | |||
364 | /* fast ARGB888->(A)RGB888 blending with pixel alpha */ | ||
/*
 * GCC inline-MMX variant (only compiled when GCC_ASMBLIT is non-zero).
 *
 * The blend factor is taken from each source pixel via the source format's
 * Amask / Ashift.  Per pixel:
 *   - alpha == 0     : destination is left untouched
 *   - alpha == amask : source bits under the channel mask are copied and
 *                      destination bits outside it are kept
 *   - otherwise      : dst += ((src - dst) * alpha) >> 8 in 16-bit lanes,
 *                      with the multiplier ANDed with mm7 first
 *
 * Registers prepared before the row loop:
 *   mm6 = 0, mm7 = multiplication mask, mm0 = channel mask,
 *   mm3 = ~channel mask, mm5 = Ashift.
 *
 * s_skip / d_skip are shifted right by 2 so they can be added to Uint32
 * pointers (presumably they are byte counts -- confirm against
 * SDL_BlitInfo).  emms() is issued once after the last row.
 */
365 | static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) | ||
366 | { | ||
367 | int width = info->d_width; | ||
368 | int height = info->d_height; | ||
369 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
370 | int srcskip = info->s_skip >> 2; | ||
371 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
372 | int dstskip = info->d_skip >> 2; | ||
373 | SDL_PixelFormat* sf = info->src; | ||
374 | Uint32 amask = sf->Amask; | ||
375 | |||
376 | pxor_r2r(mm6, mm6); /* 0 -> mm6 */ | ||
377 | /* form multiplication mask */ | ||
378 | movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */ | ||
379 | punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */ | ||
380 | pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */ | ||
381 | movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */ | ||
382 | pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */ | ||
383 | /* form channel masks */ | ||
384 | movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */ | ||
385 | packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */ | ||
386 | packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */ | ||
387 | pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */ | ||
388 | /* get alpha channel shift */ | ||
389 | __asm__ __volatile__ ( | ||
390 | "movd %0, %%mm5" | ||
391 | : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */ | ||
392 | |||
393 | while(height--) { | ||
394 | DUFFS_LOOP4({ | ||
/* alpha stays in its original bit position; it is shifted down in mm4 below */
395 | Uint32 alpha = *srcp & amask; | ||
396 | /* FIXME: Here we special-case opaque alpha since the | ||
397 | compositioning used (>>8 instead of /255) doesn't handle | ||
398 | it correctly. Also special-case alpha=0 for speed? | ||
399 | Benchmark this! */ | ||
400 | if(alpha == 0) { | ||
401 | /* do nothing */ | ||
402 | } else if(alpha == amask) { | ||
403 | /* opaque alpha -- copy RGB, keep dst alpha */ | ||
404 | /* using MMX here to free up regular registers for other things */ | ||
405 | movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | ||
406 | movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | ||
407 | pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */ | ||
408 | pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */ | ||
409 | por_r2r(mm1, mm2); /* src | dst -> mm2 */ | ||
410 | movd_r2m(mm2, (*dstp)); /* mm2 -> dst */ | ||
411 | } else { | ||
412 | movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | ||
413 | punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */ | ||
414 | |||
415 | movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | ||
416 | punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */ | ||
417 | |||
418 | __asm__ __volatile__ ( | ||
419 | "movd %0, %%mm4" | ||
420 | : : "r" (alpha) ); /* 0000A000 -> mm4 */ | ||
421 | psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */ | ||
422 | punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ | ||
423 | punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ | ||
424 | pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */ | ||
425 | |||
426 | /* blend */ | ||
427 | psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ | ||
428 | pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | ||
429 | psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */ | ||
430 | paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ | ||
431 | |||
432 | packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */ | ||
433 | movd_r2m(mm2, *dstp);/* mm2 -> dst */ | ||
434 | } | ||
435 | ++srcp; | ||
436 | ++dstp; | ||
437 | }, width); | ||
438 | srcp += srcskip; | ||
439 | dstp += dstskip; | ||
440 | } | ||
441 | emms(); | ||
442 | } | ||
443 | /* End GCC_ASMBLIT */ | ||
444 | |||
445 | #elif MSVC_ASMBLIT | ||
446 | /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | ||
447 | static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) | ||
448 | { | ||
449 | int width = info->d_width; | ||
450 | int height = info->d_height; | ||
451 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
452 | int srcskip = info->s_skip >> 2; | ||
453 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
454 | int dstskip = info->d_skip >> 2; | ||
455 | Uint32 dalpha = info->dst->Amask; | ||
456 | |||
457 | __m64 src1, src2, dst1, dst2, lmask, hmask, dsta; | ||
458 | |||
459 | hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */ | ||
460 | lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */ | ||
461 | dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ | ||
462 | |||
463 | while (height--) { | ||
464 | int n = width; | ||
465 | if ( n & 1 ) { | ||
466 | Uint32 s = *srcp++; | ||
467 | Uint32 d = *dstp; | ||
468 | *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) | ||
469 | + (s & d & 0x00010101)) | dalpha; | ||
470 | n--; | ||
471 | } | ||
472 | |||
473 | for (n >>= 1; n > 0; --n) { | ||
474 | dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */ | ||
475 | dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ | ||
476 | |||
477 | src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */ | ||
478 | src2 = src1; /* 2 x src -> src2(ARGBARGB) */ | ||
479 | |||
480 | dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */ | ||
481 | src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */ | ||
482 | src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */ | ||
483 | src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */ | ||
484 | |||
485 | dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */ | ||
486 | dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */ | ||
487 | dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */ | ||
488 | dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */ | ||
489 | |||
490 | *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */ | ||
491 | dstp += 2; | ||
492 | srcp += 2; | ||
493 | } | ||
494 | |||
495 | srcp += srcskip; | ||
496 | dstp += dstskip; | ||
497 | } | ||
498 | _mm_empty(); | ||
499 | } | ||
500 | |||
501 | /* fast RGB888->(A)RGB888 blending with surface alpha */ | ||
502 | static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) | ||
503 | { | ||
504 | SDL_PixelFormat* df = info->dst; | ||
505 | Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask; | ||
506 | unsigned alpha = info->src->alpha; | ||
507 | |||
508 | if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { | ||
509 | /* only call a128 version when R,G,B occupy lower bits */ | ||
510 | BlitRGBtoRGBSurfaceAlpha128MMX(info); | ||
511 | } else { | ||
512 | int width = info->d_width; | ||
513 | int height = info->d_height; | ||
514 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
515 | int srcskip = info->s_skip >> 2; | ||
516 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
517 | int dstskip = info->d_skip >> 2; | ||
518 | Uint32 dalpha = df->Amask; | ||
519 | Uint32 amult; | ||
520 | |||
521 | __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta; | ||
522 | |||
523 | mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ | ||
524 | /* form the alpha mult */ | ||
525 | amult = alpha | (alpha << 8); | ||
526 | amult = amult | (amult << 16); | ||
527 | chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); | ||
528 | mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */ | ||
529 | mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */ | ||
530 | /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */ | ||
531 | dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ | ||
532 | |||
533 | while (height--) { | ||
534 | int n = width; | ||
535 | if (n & 1) { | ||
536 | /* One Pixel Blend */ | ||
537 | src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/ | ||
538 | src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */ | ||
539 | |||
540 | dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ | ||
541 | dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ | ||
542 | |||
543 | src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */ | ||
544 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
545 | src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ | ||
546 | dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */ | ||
547 | |||
548 | dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ | ||
549 | dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ | ||
550 | *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ | ||
551 | |||
552 | ++srcp; | ||
553 | ++dstp; | ||
554 | |||
555 | n--; | ||
556 | } | ||
557 | |||
558 | for (n >>= 1; n > 0; --n) { | ||
559 | /* Two Pixels Blend */ | ||
560 | src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/ | ||
561 | src2 = src1; /* 2 x src -> src2(ARGBARGB) */ | ||
562 | src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */ | ||
563 | src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */ | ||
564 | |||
565 | dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */ | ||
566 | dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ | ||
567 | dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */ | ||
568 | dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */ | ||
569 | |||
570 | src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ | ||
571 | src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */ | ||
572 | src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */ | ||
573 | dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */ | ||
574 | |||
575 | src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */ | ||
576 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
577 | src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ | ||
578 | dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */ | ||
579 | |||
580 | dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */ | ||
581 | dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ | ||
582 | |||
583 | *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */ | ||
584 | |||
585 | srcp += 2; | ||
586 | dstp += 2; | ||
587 | } | ||
588 | srcp += srcskip; | ||
589 | dstp += dstskip; | ||
590 | } | ||
591 | _mm_empty(); | ||
592 | } | ||
593 | } | ||
594 | |||
595 | /* fast ARGB888->(A)RGB888 blending with pixel alpha */ | ||
596 | static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) | ||
597 | { | ||
598 | int width = info->d_width; | ||
599 | int height = info->d_height; | ||
600 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
601 | int srcskip = info->s_skip >> 2; | ||
602 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
603 | int dstskip = info->d_skip >> 2; | ||
604 | SDL_PixelFormat* sf = info->src; | ||
605 | Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; | ||
606 | Uint32 amask = sf->Amask; | ||
607 | Uint32 ashift = sf->Ashift; | ||
608 | Uint64 multmask; | ||
609 | |||
610 | __m64 src1, dst1, mm_alpha, mm_zero, dmask; | ||
611 | |||
612 | mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ | ||
613 | multmask = ~(0xFFFFi64 << (ashift * 2)); | ||
614 | dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */ | ||
615 | |||
616 | while(height--) { | ||
617 | DUFFS_LOOP4({ | ||
618 | Uint32 alpha = *srcp & amask; | ||
619 | if (alpha == 0) { | ||
620 | /* do nothing */ | ||
621 | } else if (alpha == amask) { | ||
622 | /* opaque alpha -- copy RGB, keep dst alpha */ | ||
623 | *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); | ||
624 | } else { | ||
625 | src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ | ||
626 | src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ | ||
627 | |||
628 | dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ | ||
629 | dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ | ||
630 | |||
631 | mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ | ||
632 | mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ | ||
633 | mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | ||
634 | mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | ||
635 | mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ | ||
636 | |||
637 | /* blend */ | ||
638 | src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ | ||
639 | src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */ | ||
640 | src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ | ||
641 | dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */ | ||
642 | dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ | ||
643 | |||
644 | *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ | ||
645 | } | ||
646 | ++srcp; | ||
647 | ++dstp; | ||
648 | }, width); | ||
649 | srcp += srcskip; | ||
650 | dstp += dstskip; | ||
651 | } | ||
652 | _mm_empty(); | ||
653 | } | ||
654 | /* End MSVC_ASMBLIT */ | ||
655 | |||
656 | #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ | ||
657 | |||
658 | #if SDL_ALTIVEC_BLITTERS | ||
659 | #if __MWERKS__ | ||
660 | #pragma altivec_model on | ||
661 | #endif | ||
662 | #if HAVE_ALTIVEC_H | ||
663 | #include <altivec.h> | ||
664 | #endif | ||
665 | #include <assert.h> | ||
666 | |||
/*
 * Vector literal helpers.  With GCC older than version 4 on Mac OS X the
 * element list is wrapped in parentheses; everywhere else it is wrapped
 * in braces.  VECUINT8_LITERAL takes 16 byte values, VECUINT16_LITERAL
 * takes 8 short values.
 */
667 | #if (defined(__MACOSX__) && (__GNUC__ < 4)) | ||
668 | #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ | ||
669 | (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p ) | ||
670 | #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \ | ||
671 | (vector unsigned short) ( a,b,c,d,e,f,g,h ) | ||
672 | #else | ||
673 | #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ | ||
674 | (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p } | ||
675 | #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \ | ||
676 | (vector unsigned short) { a,b,c,d,e,f,g,h } | ||
677 | #endif | ||
678 | |||
/*
 * Low four bits of an address: non-zero when x is not 16-byte aligned.
 * The argument is parenthesised so that expressions such as `p + 1` are
 * evaluated before the cast to size_t.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/* Debug helper: prints the four 32-bit words of vector v, labelled msg. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)
685 | |||
686 | /* the permutation vector that takes the high bytes out of all the appropriate shorts | ||
687 | (vector unsigned char)( | ||
688 | 0x00, 0x10, 0x02, 0x12, | ||
689 | 0x04, 0x14, 0x06, 0x16, | ||
690 | 0x08, 0x18, 0x0A, 0x1A, | ||
691 | 0x0C, 0x1C, 0x0E, 0x1E ); | ||
692 | */ | ||
/* Built at run time as vec_lvsl(0, NULL) plus a splatted 16-bit 0x000F. */
693 | #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F))) | ||
/* The value 24 in every 32-bit element, formed as 12 + 12. */
694 | #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12))) | ||
/* All-ones shifted left by 24 in every 32-bit element, viewed as bytes. */
695 | #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24())) | ||
/*
 * Permute vector for loading from src: vec_lvsl(0, src) when the low four
 * address bits are non-zero, otherwise vec_lvsl(8, src) with 8 added to
 * every byte.
 */
696 | #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \ | ||
697 | ? vec_lvsl(0, src) \ | ||
698 | : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) | ||
699 | |||
700 | |||
/*
 * Blends the byte vector vs into vd (vd is overwritten) using the per-byte
 * weights in valpha:
 *   t  = vs * valpha + vd * ~valpha       (16-bit products, even and odd
 *                                          bytes handled separately)
 *   t  = (t + 1) + ((t + 1) >> 8)
 *   vd = bytes of the two 16-bit result vectors selected by mergePermute
 *
 * v1_16 and v8_16 are added / used as the shift count above; callers are
 * presumably expected to pass vectors of 16-bit 1s and 8s, and
 * VEC_MERGE_PERMUTE() for mergePermute -- confirm at the call sites.
 */
701 | #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \ | ||
702 | /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \ | ||
703 | vector unsigned short vtemp1 = vec_mule(vs, valpha); \ | ||
704 | /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \ | ||
705 | vector unsigned short vtemp2 = vec_mulo(vs, valpha); \ | ||
706 | /* valpha2 is 255-alpha */ \ | ||
707 | vector unsigned char valpha2 = vec_nor(valpha, valpha); \ | ||
708 | /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \ | ||
709 | vector unsigned short vtemp3 = vec_mule(vd, valpha2); \ | ||
710 | /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \ | ||
711 | vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \ | ||
712 | /* add source and dest */ \ | ||
713 | vtemp1 = vec_add(vtemp1, vtemp3); \ | ||
714 | vtemp2 = vec_add(vtemp2, vtemp4); \ | ||
715 | /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \ | ||
716 | vtemp1 = vec_add(vtemp1, v1_16); \ | ||
717 | vtemp3 = vec_sr(vtemp1, v8_16); \ | ||
718 | vtemp1 = vec_add(vtemp1, vtemp3); \ | ||
719 | /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \ | ||
720 | vtemp2 = vec_add(vtemp2, v1_16); \ | ||
721 | vtemp4 = vec_sr(vtemp2, v8_16); \ | ||
722 | vtemp2 = vec_add(vtemp2, vtemp4); \ | ||
723 | /* (>>8) and get ARGBARGBARGBARGB */ \ | ||
724 | vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \ | ||
725 | } while (0) | ||
726 | |||
727 | /* Calculate the permute vector used for 32->32 swizzling */ | ||
728 | static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt, | ||
729 | const SDL_PixelFormat *dstfmt) | ||
730 | { | ||
731 | /* | ||
732 | * We have to assume that the bits that aren't used by other | ||
733 | * colors is alpha, and it's one complete byte, since some formats | ||
734 | * leave alpha with a zero mask, but we should still swizzle the bits. | ||
735 | */ | ||
736 | /* ARGB */ | ||
737 | const static struct SDL_PixelFormat default_pixel_format = { | ||
738 | NULL, 0, 0, | ||
739 | 0, 0, 0, 0, | ||
740 | 16, 8, 0, 24, | ||
741 | 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, | ||
742 | 0, 0}; | ||
743 | if (!srcfmt) { | ||
744 | srcfmt = &default_pixel_format; | ||
745 | } | ||
746 | if (!dstfmt) { | ||
747 | dstfmt = &default_pixel_format; | ||
748 | } | ||
749 | const vector unsigned char plus = VECUINT8_LITERAL | ||
750 | ( 0x00, 0x00, 0x00, 0x00, | ||
751 | 0x04, 0x04, 0x04, 0x04, | ||
752 | 0x08, 0x08, 0x08, 0x08, | ||
753 | 0x0C, 0x0C, 0x0C, 0x0C ); | ||
754 | vector unsigned char vswiz; | ||
755 | vector unsigned int srcvec; | ||
756 | #define RESHIFT(X) (3 - ((X) >> 3)) | ||
757 | Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); | ||
758 | Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift); | ||
759 | Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift); | ||
760 | Uint32 amask; | ||
761 | /* Use zero for alpha if either surface doesn't have alpha */ | ||
762 | if (dstfmt->Amask) { | ||
763 | amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift); | ||
764 | } else { | ||
765 | amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); | ||
766 | } | ||
767 | #undef RESHIFT | ||
768 | ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask); | ||
769 | vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0)); | ||
770 | return(vswiz); | ||
771 | } | ||
772 | |||
773 | static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info) | ||
774 | { | ||
775 | int height = info->d_height; | ||
776 | Uint8 *src = (Uint8 *)info->s_pixels; | ||
777 | int srcskip = info->s_skip; | ||
778 | Uint8 *dst = (Uint8 *)info->d_pixels; | ||
779 | int dstskip = info->d_skip; | ||
780 | SDL_PixelFormat *srcfmt = info->src; | ||
781 | |||
782 | vector unsigned char v0 = vec_splat_u8(0); | ||
783 | vector unsigned short v8_16 = vec_splat_u16(8); | ||
784 | vector unsigned short v1_16 = vec_splat_u16(1); | ||
785 | vector unsigned short v2_16 = vec_splat_u16(2); | ||
786 | vector unsigned short v3_16 = vec_splat_u16(3); | ||
787 | vector unsigned int v8_32 = vec_splat_u32(8); | ||
788 | vector unsigned int v16_32 = vec_add(v8_32, v8_32); | ||
789 | vector unsigned short v3f = VECUINT16_LITERAL( | ||
790 | 0x003f, 0x003f, 0x003f, 0x003f, | ||
791 | 0x003f, 0x003f, 0x003f, 0x003f); | ||
792 | vector unsigned short vfc = VECUINT16_LITERAL( | ||
793 | 0x00fc, 0x00fc, 0x00fc, 0x00fc, | ||
794 | 0x00fc, 0x00fc, 0x00fc, 0x00fc); | ||
795 | |||
796 | /* | ||
797 | 0x10 - 0x1f is the alpha | ||
798 | 0x00 - 0x0e evens are the red | ||
799 | 0x01 - 0x0f odds are zero | ||
800 | */ | ||
801 | vector unsigned char vredalpha1 = VECUINT8_LITERAL( | ||
802 | 0x10, 0x00, 0x01, 0x01, | ||
803 | 0x10, 0x02, 0x01, 0x01, | ||
804 | 0x10, 0x04, 0x01, 0x01, | ||
805 | 0x10, 0x06, 0x01, 0x01 | ||
806 | ); | ||
807 | vector unsigned char vredalpha2 = (vector unsigned char)( | ||
808 | vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32)) | ||
809 | ); | ||
810 | /* | ||
811 | 0x00 - 0x0f is ARxx ARxx ARxx ARxx | ||
812 | 0x11 - 0x0f odds are blue | ||
813 | */ | ||
814 | vector unsigned char vblue1 = VECUINT8_LITERAL( | ||
815 | 0x00, 0x01, 0x02, 0x11, | ||
816 | 0x04, 0x05, 0x06, 0x13, | ||
817 | 0x08, 0x09, 0x0a, 0x15, | ||
818 | 0x0c, 0x0d, 0x0e, 0x17 | ||
819 | ); | ||
820 | vector unsigned char vblue2 = (vector unsigned char)( | ||
821 | vec_add((vector unsigned int)vblue1, v8_32) | ||
822 | ); | ||
823 | /* | ||
824 | 0x00 - 0x0f is ARxB ARxB ARxB ARxB | ||
825 | 0x10 - 0x0e evens are green | ||
826 | */ | ||
827 | vector unsigned char vgreen1 = VECUINT8_LITERAL( | ||
828 | 0x00, 0x01, 0x10, 0x03, | ||
829 | 0x04, 0x05, 0x12, 0x07, | ||
830 | 0x08, 0x09, 0x14, 0x0b, | ||
831 | 0x0c, 0x0d, 0x16, 0x0f | ||
832 | ); | ||
833 | vector unsigned char vgreen2 = (vector unsigned char)( | ||
834 | vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32)) | ||
835 | ); | ||
836 | vector unsigned char vgmerge = VECUINT8_LITERAL( | ||
837 | 0x00, 0x02, 0x00, 0x06, | ||
838 | 0x00, 0x0a, 0x00, 0x0e, | ||
839 | 0x00, 0x12, 0x00, 0x16, | ||
840 | 0x00, 0x1a, 0x00, 0x1e); | ||
841 | vector unsigned char mergePermute = VEC_MERGE_PERMUTE(); | ||
842 | vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); | ||
843 | vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); | ||
844 | |||
845 | vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7); | ||
846 | vf800 = vec_sl(vf800, vec_splat_u16(8)); | ||
847 | |||
848 | while(height--) { | ||
849 | int extrawidth; | ||
850 | vector unsigned char valigner; | ||
851 | vector unsigned char vsrc; | ||
852 | vector unsigned char voverflow; | ||
853 | int width = info->d_width; | ||
854 | |||
855 | #define ONE_PIXEL_BLEND(condition, widthvar) \ | ||
856 | while (condition) { \ | ||
857 | Uint32 Pixel; \ | ||
858 | unsigned sR, sG, sB, dR, dG, dB, sA; \ | ||
859 | DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \ | ||
860 | if(sA) { \ | ||
861 | unsigned short dstpixel = *((unsigned short *)dst); \ | ||
862 | dR = (dstpixel >> 8) & 0xf8; \ | ||
863 | dG = (dstpixel >> 3) & 0xfc; \ | ||
864 | dB = (dstpixel << 3) & 0xf8; \ | ||
865 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | ||
866 | *((unsigned short *)dst) = ( \ | ||
867 | ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \ | ||
868 | ); \ | ||
869 | } \ | ||
870 | src += 4; \ | ||
871 | dst += 2; \ | ||
872 | widthvar--; \ | ||
873 | } | ||
874 | ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width); | ||
875 | extrawidth = (width % 8); | ||
876 | valigner = VEC_ALIGNER(src); | ||
877 | vsrc = (vector unsigned char)vec_ld(0, src); | ||
878 | width -= extrawidth; | ||
879 | while (width) { | ||
880 | vector unsigned char valpha; | ||
881 | vector unsigned char vsrc1, vsrc2; | ||
882 | vector unsigned char vdst1, vdst2; | ||
883 | vector unsigned short vR, vG, vB; | ||
884 | vector unsigned short vpixel, vrpixel, vgpixel, vbpixel; | ||
885 | |||
886 | /* Load 8 pixels from src as ARGB */ | ||
887 | voverflow = (vector unsigned char)vec_ld(15, src); | ||
888 | vsrc = vec_perm(vsrc, voverflow, valigner); | ||
889 | vsrc1 = vec_perm(vsrc, vsrc, vpermute); | ||
890 | src += 16; | ||
891 | vsrc = (vector unsigned char)vec_ld(15, src); | ||
892 | voverflow = vec_perm(voverflow, vsrc, valigner); | ||
893 | vsrc2 = vec_perm(voverflow, voverflow, vpermute); | ||
894 | src += 16; | ||
895 | |||
896 | /* Load 8 pixels from dst as XRGB */ | ||
897 | voverflow = vec_ld(0, dst); | ||
898 | vR = vec_and((vector unsigned short)voverflow, vf800); | ||
899 | vB = vec_sl((vector unsigned short)voverflow, v3_16); | ||
900 | vG = vec_sl(vB, v2_16); | ||
901 | vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1); | ||
902 | vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1); | ||
903 | vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1); | ||
904 | vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2); | ||
905 | vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2); | ||
906 | vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2); | ||
907 | |||
908 | /* Alpha blend 8 pixels as ARGB */ | ||
909 | valpha = vec_perm(vsrc1, v0, valphaPermute); | ||
910 | VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16); | ||
911 | valpha = vec_perm(vsrc2, v0, valphaPermute); | ||
912 | VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16); | ||
913 | |||
914 | /* Convert 8 pixels to 565 */ | ||
915 | vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2); | ||
916 | vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge); | ||
917 | vgpixel = vec_and(vgpixel, vfc); | ||
918 | vgpixel = vec_sl(vgpixel, v3_16); | ||
919 | vrpixel = vec_sl(vpixel, v1_16); | ||
920 | vrpixel = vec_and(vrpixel, vf800); | ||
921 | vbpixel = vec_and(vpixel, v3f); | ||
922 | vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel); | ||
923 | vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel); | ||
924 | |||
925 | /* Store 8 pixels */ | ||
926 | vec_st(vdst1, 0, dst); | ||
927 | |||
928 | width -= 8; | ||
929 | dst += 16; | ||
930 | } | ||
931 | ONE_PIXEL_BLEND((extrawidth), extrawidth); | ||
932 | #undef ONE_PIXEL_BLEND | ||
933 | src += srcskip; | ||
934 | dst += dstskip; | ||
935 | } | ||
936 | } | ||
937 | |||
938 | static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info) | ||
939 | { | ||
940 | unsigned alpha = info->src->alpha; | ||
941 | int height = info->d_height; | ||
942 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
943 | int srcskip = info->s_skip >> 2; | ||
944 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
945 | int dstskip = info->d_skip >> 2; | ||
946 | SDL_PixelFormat *srcfmt = info->src; | ||
947 | SDL_PixelFormat *dstfmt = info->dst; | ||
948 | unsigned sA = srcfmt->alpha; | ||
949 | unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | ||
950 | Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask; | ||
951 | Uint32 ckey = info->src->colorkey; | ||
952 | vector unsigned char mergePermute; | ||
953 | vector unsigned char vsrcPermute; | ||
954 | vector unsigned char vdstPermute; | ||
955 | vector unsigned char vsdstPermute; | ||
956 | vector unsigned char valpha; | ||
957 | vector unsigned char valphamask; | ||
958 | vector unsigned char vbits; | ||
959 | vector unsigned char v0; | ||
960 | vector unsigned short v1; | ||
961 | vector unsigned short v8; | ||
962 | vector unsigned int vckey; | ||
963 | vector unsigned int vrgbmask; | ||
964 | |||
965 | mergePermute = VEC_MERGE_PERMUTE(); | ||
966 | v0 = vec_splat_u8(0); | ||
967 | v1 = vec_splat_u16(1); | ||
968 | v8 = vec_splat_u16(8); | ||
969 | |||
970 | /* set the alpha to 255 on the destination surf */ | ||
971 | valphamask = VEC_ALPHA_MASK(); | ||
972 | |||
973 | vsrcPermute = calc_swizzle32(srcfmt, NULL); | ||
974 | vdstPermute = calc_swizzle32(NULL, dstfmt); | ||
975 | vsdstPermute = calc_swizzle32(dstfmt, NULL); | ||
976 | |||
977 | /* set a vector full of alpha and 255-alpha */ | ||
978 | ((unsigned char *)&valpha)[0] = alpha; | ||
979 | valpha = vec_splat(valpha, 0); | ||
980 | vbits = (vector unsigned char)vec_splat_s8(-1); | ||
981 | |||
982 | ckey &= rgbmask; | ||
983 | ((unsigned int *)(char*)&vckey)[0] = ckey; | ||
984 | vckey = vec_splat(vckey, 0); | ||
985 | ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask; | ||
986 | vrgbmask = vec_splat(vrgbmask, 0); | ||
987 | |||
988 | while(height--) { | ||
989 | int width = info->d_width; | ||
990 | #define ONE_PIXEL_BLEND(condition, widthvar) \ | ||
991 | while (condition) { \ | ||
992 | Uint32 Pixel; \ | ||
993 | unsigned sR, sG, sB, dR, dG, dB; \ | ||
994 | RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \ | ||
995 | if(sA && Pixel != ckey) { \ | ||
996 | RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \ | ||
997 | DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \ | ||
998 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | ||
999 | ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ | ||
1000 | } \ | ||
1001 | dstp++; \ | ||
1002 | srcp++; \ | ||
1003 | widthvar--; \ | ||
1004 | } | ||
1005 | ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | ||
1006 | if (width > 0) { | ||
1007 | int extrawidth = (width % 4); | ||
1008 | vector unsigned char valigner = VEC_ALIGNER(srcp); | ||
1009 | vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | ||
1010 | width -= extrawidth; | ||
1011 | while (width) { | ||
1012 | vector unsigned char vsel; | ||
1013 | vector unsigned char voverflow; | ||
1014 | vector unsigned char vd; | ||
1015 | vector unsigned char vd_orig; | ||
1016 | |||
1017 | /* s = *srcp */ | ||
1018 | voverflow = (vector unsigned char)vec_ld(15, srcp); | ||
1019 | vs = vec_perm(vs, voverflow, valigner); | ||
1020 | |||
1021 | /* vsel is set for items that match the key */ | ||
1022 | vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask); | ||
1023 | vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey); | ||
1024 | |||
1025 | /* permute to source format */ | ||
1026 | vs = vec_perm(vs, valpha, vsrcPermute); | ||
1027 | |||
1028 | /* d = *dstp */ | ||
1029 | vd = (vector unsigned char)vec_ld(0, dstp); | ||
1030 | vd_orig = vd = vec_perm(vd, v0, vsdstPermute); | ||
1031 | |||
1032 | VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | ||
1033 | |||
1034 | /* set the alpha channel to full on */ | ||
1035 | vd = vec_or(vd, valphamask); | ||
1036 | |||
1037 | /* mask out color key */ | ||
1038 | vd = vec_sel(vd, vd_orig, vsel); | ||
1039 | |||
1040 | /* permute to dest format */ | ||
1041 | vd = vec_perm(vd, vbits, vdstPermute); | ||
1042 | |||
1043 | /* *dstp = res */ | ||
1044 | vec_st((vector unsigned int)vd, 0, dstp); | ||
1045 | |||
1046 | srcp += 4; | ||
1047 | dstp += 4; | ||
1048 | width -= 4; | ||
1049 | vs = voverflow; | ||
1050 | } | ||
1051 | ONE_PIXEL_BLEND((extrawidth), extrawidth); | ||
1052 | } | ||
1053 | #undef ONE_PIXEL_BLEND | ||
1054 | |||
1055 | srcp += srcskip; | ||
1056 | dstp += dstskip; | ||
1057 | } | ||
1058 | } | ||
1059 | |||
1060 | |||
1061 | static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info) | ||
1062 | { | ||
1063 | int width = info->d_width; | ||
1064 | int height = info->d_height; | ||
1065 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1066 | int srcskip = info->s_skip >> 2; | ||
1067 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1068 | int dstskip = info->d_skip >> 2; | ||
1069 | SDL_PixelFormat *srcfmt = info->src; | ||
1070 | SDL_PixelFormat *dstfmt = info->dst; | ||
1071 | vector unsigned char mergePermute; | ||
1072 | vector unsigned char valphaPermute; | ||
1073 | vector unsigned char vsrcPermute; | ||
1074 | vector unsigned char vdstPermute; | ||
1075 | vector unsigned char vsdstPermute; | ||
1076 | vector unsigned char valphamask; | ||
1077 | vector unsigned char vpixelmask; | ||
1078 | vector unsigned char v0; | ||
1079 | vector unsigned short v1; | ||
1080 | vector unsigned short v8; | ||
1081 | |||
1082 | v0 = vec_splat_u8(0); | ||
1083 | v1 = vec_splat_u16(1); | ||
1084 | v8 = vec_splat_u16(8); | ||
1085 | mergePermute = VEC_MERGE_PERMUTE(); | ||
1086 | valphamask = VEC_ALPHA_MASK(); | ||
1087 | valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); | ||
1088 | vpixelmask = vec_nor(valphamask, v0); | ||
1089 | vsrcPermute = calc_swizzle32(srcfmt, NULL); | ||
1090 | vdstPermute = calc_swizzle32(NULL, dstfmt); | ||
1091 | vsdstPermute = calc_swizzle32(dstfmt, NULL); | ||
1092 | |||
1093 | while ( height-- ) { | ||
1094 | width = info->d_width; | ||
1095 | #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ | ||
1096 | Uint32 Pixel; \ | ||
1097 | unsigned sR, sG, sB, dR, dG, dB, sA, dA; \ | ||
1098 | DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \ | ||
1099 | if(sA) { \ | ||
1100 | DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \ | ||
1101 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | ||
1102 | ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \ | ||
1103 | } \ | ||
1104 | ++srcp; \ | ||
1105 | ++dstp; \ | ||
1106 | widthvar--; \ | ||
1107 | } | ||
1108 | ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | ||
1109 | if (width > 0) { | ||
1110 | /* vsrcPermute */ | ||
1111 | /* vdstPermute */ | ||
1112 | int extrawidth = (width % 4); | ||
1113 | vector unsigned char valigner = VEC_ALIGNER(srcp); | ||
1114 | vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | ||
1115 | width -= extrawidth; | ||
1116 | while (width) { | ||
1117 | vector unsigned char voverflow; | ||
1118 | vector unsigned char vd; | ||
1119 | vector unsigned char valpha; | ||
1120 | vector unsigned char vdstalpha; | ||
1121 | /* s = *srcp */ | ||
1122 | voverflow = (vector unsigned char)vec_ld(15, srcp); | ||
1123 | vs = vec_perm(vs, voverflow, valigner); | ||
1124 | vs = vec_perm(vs, v0, vsrcPermute); | ||
1125 | |||
1126 | valpha = vec_perm(vs, v0, valphaPermute); | ||
1127 | |||
1128 | /* d = *dstp */ | ||
1129 | vd = (vector unsigned char)vec_ld(0, dstp); | ||
1130 | vd = vec_perm(vd, v0, vsdstPermute); | ||
1131 | vdstalpha = vec_and(vd, valphamask); | ||
1132 | |||
1133 | VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | ||
1134 | |||
1135 | /* set the alpha to the dest alpha */ | ||
1136 | vd = vec_and(vd, vpixelmask); | ||
1137 | vd = vec_or(vd, vdstalpha); | ||
1138 | vd = vec_perm(vd, v0, vdstPermute); | ||
1139 | |||
1140 | /* *dstp = res */ | ||
1141 | vec_st((vector unsigned int)vd, 0, dstp); | ||
1142 | |||
1143 | srcp += 4; | ||
1144 | dstp += 4; | ||
1145 | width -= 4; | ||
1146 | vs = voverflow; | ||
1147 | |||
1148 | } | ||
1149 | ONE_PIXEL_BLEND((extrawidth), extrawidth); | ||
1150 | } | ||
1151 | srcp += srcskip; | ||
1152 | dstp += dstskip; | ||
1153 | #undef ONE_PIXEL_BLEND | ||
1154 | } | ||
1155 | } | ||
1156 | |||
1157 | /* fast ARGB888->(A)RGB888 blending with pixel alpha */ | ||
1158 | static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info) | ||
1159 | { | ||
1160 | int width = info->d_width; | ||
1161 | int height = info->d_height; | ||
1162 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1163 | int srcskip = info->s_skip >> 2; | ||
1164 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1165 | int dstskip = info->d_skip >> 2; | ||
1166 | vector unsigned char mergePermute; | ||
1167 | vector unsigned char valphaPermute; | ||
1168 | vector unsigned char valphamask; | ||
1169 | vector unsigned char vpixelmask; | ||
1170 | vector unsigned char v0; | ||
1171 | vector unsigned short v1; | ||
1172 | vector unsigned short v8; | ||
1173 | v0 = vec_splat_u8(0); | ||
1174 | v1 = vec_splat_u16(1); | ||
1175 | v8 = vec_splat_u16(8); | ||
1176 | mergePermute = VEC_MERGE_PERMUTE(); | ||
1177 | valphamask = VEC_ALPHA_MASK(); | ||
1178 | valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); | ||
1179 | |||
1180 | |||
1181 | vpixelmask = vec_nor(valphamask, v0); | ||
1182 | while(height--) { | ||
1183 | width = info->d_width; | ||
1184 | #define ONE_PIXEL_BLEND(condition, widthvar) \ | ||
1185 | while ((condition)) { \ | ||
1186 | Uint32 dalpha; \ | ||
1187 | Uint32 d; \ | ||
1188 | Uint32 s1; \ | ||
1189 | Uint32 d1; \ | ||
1190 | Uint32 s = *srcp; \ | ||
1191 | Uint32 alpha = s >> 24; \ | ||
1192 | if(alpha) { \ | ||
1193 | if(alpha == SDL_ALPHA_OPAQUE) { \ | ||
1194 | *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \ | ||
1195 | } else { \ | ||
1196 | d = *dstp; \ | ||
1197 | dalpha = d & 0xff000000; \ | ||
1198 | s1 = s & 0xff00ff; \ | ||
1199 | d1 = d & 0xff00ff; \ | ||
1200 | d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \ | ||
1201 | s &= 0xff00; \ | ||
1202 | d &= 0xff00; \ | ||
1203 | d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ | ||
1204 | *dstp = d1 | d | dalpha; \ | ||
1205 | } \ | ||
1206 | } \ | ||
1207 | ++srcp; \ | ||
1208 | ++dstp; \ | ||
1209 | widthvar--; \ | ||
1210 | } | ||
1211 | ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | ||
1212 | if (width > 0) { | ||
1213 | int extrawidth = (width % 4); | ||
1214 | vector unsigned char valigner = VEC_ALIGNER(srcp); | ||
1215 | vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | ||
1216 | width -= extrawidth; | ||
1217 | while (width) { | ||
1218 | vector unsigned char voverflow; | ||
1219 | vector unsigned char vd; | ||
1220 | vector unsigned char valpha; | ||
1221 | vector unsigned char vdstalpha; | ||
1222 | /* s = *srcp */ | ||
1223 | voverflow = (vector unsigned char)vec_ld(15, srcp); | ||
1224 | vs = vec_perm(vs, voverflow, valigner); | ||
1225 | |||
1226 | valpha = vec_perm(vs, v0, valphaPermute); | ||
1227 | |||
1228 | /* d = *dstp */ | ||
1229 | vd = (vector unsigned char)vec_ld(0, dstp); | ||
1230 | vdstalpha = vec_and(vd, valphamask); | ||
1231 | |||
1232 | VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | ||
1233 | |||
1234 | /* set the alpha to the dest alpha */ | ||
1235 | vd = vec_and(vd, vpixelmask); | ||
1236 | vd = vec_or(vd, vdstalpha); | ||
1237 | |||
1238 | /* *dstp = res */ | ||
1239 | vec_st((vector unsigned int)vd, 0, dstp); | ||
1240 | |||
1241 | srcp += 4; | ||
1242 | dstp += 4; | ||
1243 | width -= 4; | ||
1244 | vs = voverflow; | ||
1245 | } | ||
1246 | ONE_PIXEL_BLEND((extrawidth), extrawidth); | ||
1247 | } | ||
1248 | srcp += srcskip; | ||
1249 | dstp += dstskip; | ||
1250 | } | ||
1251 | #undef ONE_PIXEL_BLEND | ||
1252 | } | ||
1253 | |||
1254 | static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info) | ||
1255 | { | ||
1256 | /* XXX : 6 */ | ||
1257 | unsigned alpha = info->src->alpha; | ||
1258 | int height = info->d_height; | ||
1259 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1260 | int srcskip = info->s_skip >> 2; | ||
1261 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1262 | int dstskip = info->d_skip >> 2; | ||
1263 | SDL_PixelFormat *srcfmt = info->src; | ||
1264 | SDL_PixelFormat *dstfmt = info->dst; | ||
1265 | unsigned sA = srcfmt->alpha; | ||
1266 | unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | ||
1267 | vector unsigned char mergePermute; | ||
1268 | vector unsigned char vsrcPermute; | ||
1269 | vector unsigned char vdstPermute; | ||
1270 | vector unsigned char vsdstPermute; | ||
1271 | vector unsigned char valpha; | ||
1272 | vector unsigned char valphamask; | ||
1273 | vector unsigned char vbits; | ||
1274 | vector unsigned short v1; | ||
1275 | vector unsigned short v8; | ||
1276 | |||
1277 | mergePermute = VEC_MERGE_PERMUTE(); | ||
1278 | v1 = vec_splat_u16(1); | ||
1279 | v8 = vec_splat_u16(8); | ||
1280 | |||
1281 | /* set the alpha to 255 on the destination surf */ | ||
1282 | valphamask = VEC_ALPHA_MASK(); | ||
1283 | |||
1284 | vsrcPermute = calc_swizzle32(srcfmt, NULL); | ||
1285 | vdstPermute = calc_swizzle32(NULL, dstfmt); | ||
1286 | vsdstPermute = calc_swizzle32(dstfmt, NULL); | ||
1287 | |||
1288 | /* set a vector full of alpha and 255-alpha */ | ||
1289 | ((unsigned char *)&valpha)[0] = alpha; | ||
1290 | valpha = vec_splat(valpha, 0); | ||
1291 | vbits = (vector unsigned char)vec_splat_s8(-1); | ||
1292 | |||
1293 | while(height--) { | ||
1294 | int width = info->d_width; | ||
1295 | #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ | ||
1296 | Uint32 Pixel; \ | ||
1297 | unsigned sR, sG, sB, dR, dG, dB; \ | ||
1298 | DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \ | ||
1299 | DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \ | ||
1300 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | ||
1301 | ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ | ||
1302 | ++srcp; \ | ||
1303 | ++dstp; \ | ||
1304 | widthvar--; \ | ||
1305 | } | ||
1306 | ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | ||
1307 | if (width > 0) { | ||
1308 | int extrawidth = (width % 4); | ||
1309 | vector unsigned char valigner = VEC_ALIGNER(srcp); | ||
1310 | vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | ||
1311 | width -= extrawidth; | ||
1312 | while (width) { | ||
1313 | vector unsigned char voverflow; | ||
1314 | vector unsigned char vd; | ||
1315 | |||
1316 | /* s = *srcp */ | ||
1317 | voverflow = (vector unsigned char)vec_ld(15, srcp); | ||
1318 | vs = vec_perm(vs, voverflow, valigner); | ||
1319 | vs = vec_perm(vs, valpha, vsrcPermute); | ||
1320 | |||
1321 | /* d = *dstp */ | ||
1322 | vd = (vector unsigned char)vec_ld(0, dstp); | ||
1323 | vd = vec_perm(vd, vd, vsdstPermute); | ||
1324 | |||
1325 | VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | ||
1326 | |||
1327 | /* set the alpha channel to full on */ | ||
1328 | vd = vec_or(vd, valphamask); | ||
1329 | vd = vec_perm(vd, vbits, vdstPermute); | ||
1330 | |||
1331 | /* *dstp = res */ | ||
1332 | vec_st((vector unsigned int)vd, 0, dstp); | ||
1333 | |||
1334 | srcp += 4; | ||
1335 | dstp += 4; | ||
1336 | width -= 4; | ||
1337 | vs = voverflow; | ||
1338 | } | ||
1339 | ONE_PIXEL_BLEND((extrawidth), extrawidth); | ||
1340 | } | ||
1341 | #undef ONE_PIXEL_BLEND | ||
1342 | |||
1343 | srcp += srcskip; | ||
1344 | dstp += dstskip; | ||
1345 | } | ||
1346 | |||
1347 | } | ||
1348 | |||
1349 | |||
1350 | /* fast RGB888->(A)RGB888 blending */ | ||
1351 | static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info) | ||
1352 | { | ||
1353 | unsigned alpha = info->src->alpha; | ||
1354 | int height = info->d_height; | ||
1355 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1356 | int srcskip = info->s_skip >> 2; | ||
1357 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1358 | int dstskip = info->d_skip >> 2; | ||
1359 | vector unsigned char mergePermute; | ||
1360 | vector unsigned char valpha; | ||
1361 | vector unsigned char valphamask; | ||
1362 | vector unsigned short v1; | ||
1363 | vector unsigned short v8; | ||
1364 | |||
1365 | mergePermute = VEC_MERGE_PERMUTE(); | ||
1366 | v1 = vec_splat_u16(1); | ||
1367 | v8 = vec_splat_u16(8); | ||
1368 | |||
1369 | /* set the alpha to 255 on the destination surf */ | ||
1370 | valphamask = VEC_ALPHA_MASK(); | ||
1371 | |||
1372 | /* set a vector full of alpha and 255-alpha */ | ||
1373 | ((unsigned char *)&valpha)[0] = alpha; | ||
1374 | valpha = vec_splat(valpha, 0); | ||
1375 | |||
1376 | while(height--) { | ||
1377 | int width = info->d_width; | ||
1378 | #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ | ||
1379 | Uint32 s = *srcp; \ | ||
1380 | Uint32 d = *dstp; \ | ||
1381 | Uint32 s1 = s & 0xff00ff; \ | ||
1382 | Uint32 d1 = d & 0xff00ff; \ | ||
1383 | d1 = (d1 + ((s1 - d1) * alpha >> 8)) \ | ||
1384 | & 0xff00ff; \ | ||
1385 | s &= 0xff00; \ | ||
1386 | d &= 0xff00; \ | ||
1387 | d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ | ||
1388 | *dstp = d1 | d | 0xff000000; \ | ||
1389 | ++srcp; \ | ||
1390 | ++dstp; \ | ||
1391 | widthvar--; \ | ||
1392 | } | ||
1393 | ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | ||
1394 | if (width > 0) { | ||
1395 | int extrawidth = (width % 4); | ||
1396 | vector unsigned char valigner = VEC_ALIGNER(srcp); | ||
1397 | vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | ||
1398 | width -= extrawidth; | ||
1399 | while (width) { | ||
1400 | vector unsigned char voverflow; | ||
1401 | vector unsigned char vd; | ||
1402 | |||
1403 | /* s = *srcp */ | ||
1404 | voverflow = (vector unsigned char)vec_ld(15, srcp); | ||
1405 | vs = vec_perm(vs, voverflow, valigner); | ||
1406 | |||
1407 | /* d = *dstp */ | ||
1408 | vd = (vector unsigned char)vec_ld(0, dstp); | ||
1409 | |||
1410 | VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | ||
1411 | |||
1412 | /* set the alpha channel to full on */ | ||
1413 | vd = vec_or(vd, valphamask); | ||
1414 | |||
1415 | /* *dstp = res */ | ||
1416 | vec_st((vector unsigned int)vd, 0, dstp); | ||
1417 | |||
1418 | srcp += 4; | ||
1419 | dstp += 4; | ||
1420 | width -= 4; | ||
1421 | vs = voverflow; | ||
1422 | } | ||
1423 | ONE_PIXEL_BLEND((extrawidth), extrawidth); | ||
1424 | } | ||
1425 | #undef ONE_PIXEL_BLEND | ||
1426 | |||
1427 | srcp += srcskip; | ||
1428 | dstp += dstskip; | ||
1429 | } | ||
1430 | } | ||
1431 | #if __MWERKS__ | ||
1432 | #pragma altivec_model off | ||
1433 | #endif | ||
1434 | #endif /* SDL_ALTIVEC_BLITTERS */ | ||
1435 | |||
1436 | /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | ||
1437 | static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) | ||
1438 | { | ||
1439 | int width = info->d_width; | ||
1440 | int height = info->d_height; | ||
1441 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1442 | int srcskip = info->s_skip >> 2; | ||
1443 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1444 | int dstskip = info->d_skip >> 2; | ||
1445 | |||
1446 | while(height--) { | ||
1447 | DUFFS_LOOP4({ | ||
1448 | Uint32 s = *srcp++; | ||
1449 | Uint32 d = *dstp; | ||
1450 | *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) | ||
1451 | + (s & d & 0x00010101)) | 0xff000000; | ||
1452 | }, width); | ||
1453 | srcp += srcskip; | ||
1454 | dstp += dstskip; | ||
1455 | } | ||
1456 | } | ||
1457 | |||
1458 | /* fast RGB888->(A)RGB888 blending with surface alpha */ | ||
1459 | static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info) | ||
1460 | { | ||
1461 | unsigned alpha = info->src->alpha; | ||
1462 | if(alpha == 128) { | ||
1463 | BlitRGBtoRGBSurfaceAlpha128(info); | ||
1464 | } else { | ||
1465 | int width = info->d_width; | ||
1466 | int height = info->d_height; | ||
1467 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1468 | int srcskip = info->s_skip >> 2; | ||
1469 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1470 | int dstskip = info->d_skip >> 2; | ||
1471 | Uint32 s; | ||
1472 | Uint32 d; | ||
1473 | Uint32 s1; | ||
1474 | Uint32 d1; | ||
1475 | |||
1476 | while(height--) { | ||
1477 | DUFFS_LOOP_DOUBLE2({ | ||
1478 | /* One Pixel Blend */ | ||
1479 | s = *srcp; | ||
1480 | d = *dstp; | ||
1481 | s1 = s & 0xff00ff; | ||
1482 | d1 = d & 0xff00ff; | ||
1483 | d1 = (d1 + ((s1 - d1) * alpha >> 8)) | ||
1484 | & 0xff00ff; | ||
1485 | s &= 0xff00; | ||
1486 | d &= 0xff00; | ||
1487 | d = (d + ((s - d) * alpha >> 8)) & 0xff00; | ||
1488 | *dstp = d1 | d | 0xff000000; | ||
1489 | ++srcp; | ||
1490 | ++dstp; | ||
1491 | },{ | ||
1492 | /* Two Pixels Blend */ | ||
1493 | s = *srcp; | ||
1494 | d = *dstp; | ||
1495 | s1 = s & 0xff00ff; | ||
1496 | d1 = d & 0xff00ff; | ||
1497 | d1 += (s1 - d1) * alpha >> 8; | ||
1498 | d1 &= 0xff00ff; | ||
1499 | |||
1500 | s = ((s & 0xff00) >> 8) | | ||
1501 | ((srcp[1] & 0xff00) << 8); | ||
1502 | d = ((d & 0xff00) >> 8) | | ||
1503 | ((dstp[1] & 0xff00) << 8); | ||
1504 | d += (s - d) * alpha >> 8; | ||
1505 | d &= 0x00ff00ff; | ||
1506 | |||
1507 | *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000; | ||
1508 | ++srcp; | ||
1509 | |||
1510 | s1 = *srcp; | ||
1511 | d1 = *dstp; | ||
1512 | s1 &= 0xff00ff; | ||
1513 | d1 &= 0xff00ff; | ||
1514 | d1 += (s1 - d1) * alpha >> 8; | ||
1515 | d1 &= 0xff00ff; | ||
1516 | |||
1517 | *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000; | ||
1518 | ++srcp; | ||
1519 | ++dstp; | ||
1520 | }, width); | ||
1521 | srcp += srcskip; | ||
1522 | dstp += dstskip; | ||
1523 | } | ||
1524 | } | ||
1525 | } | ||
1526 | |||
1527 | /* fast ARGB888->(A)RGB888 blending with pixel alpha */ | ||
1528 | static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info) | ||
1529 | { | ||
1530 | int width = info->d_width; | ||
1531 | int height = info->d_height; | ||
1532 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1533 | int srcskip = info->s_skip >> 2; | ||
1534 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1535 | int dstskip = info->d_skip >> 2; | ||
1536 | |||
1537 | while(height--) { | ||
1538 | DUFFS_LOOP4({ | ||
1539 | Uint32 dalpha; | ||
1540 | Uint32 d; | ||
1541 | Uint32 s1; | ||
1542 | Uint32 d1; | ||
1543 | Uint32 s = *srcp; | ||
1544 | Uint32 alpha = s >> 24; | ||
1545 | /* FIXME: Here we special-case opaque alpha since the | ||
1546 | compositioning used (>>8 instead of /255) doesn't handle | ||
1547 | it correctly. Also special-case alpha=0 for speed? | ||
1548 | Benchmark this! */ | ||
1549 | if(alpha) { | ||
1550 | if(alpha == SDL_ALPHA_OPAQUE) { | ||
1551 | *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); | ||
1552 | } else { | ||
1553 | /* | ||
1554 | * take out the middle component (green), and process | ||
1555 | * the other two in parallel. One multiply less. | ||
1556 | */ | ||
1557 | d = *dstp; | ||
1558 | dalpha = d & 0xff000000; | ||
1559 | s1 = s & 0xff00ff; | ||
1560 | d1 = d & 0xff00ff; | ||
1561 | d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; | ||
1562 | s &= 0xff00; | ||
1563 | d &= 0xff00; | ||
1564 | d = (d + ((s - d) * alpha >> 8)) & 0xff00; | ||
1565 | *dstp = d1 | d | dalpha; | ||
1566 | } | ||
1567 | } | ||
1568 | ++srcp; | ||
1569 | ++dstp; | ||
1570 | }, width); | ||
1571 | srcp += srcskip; | ||
1572 | dstp += dstskip; | ||
1573 | } | ||
1574 | } | ||
1575 | |||
1576 | #if GCC_ASMBLIT | ||
1577 | /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ | ||
1578 | static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) | ||
1579 | { | ||
1580 | int width = info->d_width; | ||
1581 | int height = info->d_height; | ||
1582 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1583 | int srcskip = info->s_skip >> 2; | ||
1584 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1585 | int dstskip = info->d_skip >> 2; | ||
1586 | SDL_PixelFormat* sf = info->src; | ||
1587 | Uint32 amask = sf->Amask; | ||
1588 | |||
1589 | __asm__ ( | ||
1590 | /* make mm6 all zeros. */ | ||
1591 | "pxor %%mm6, %%mm6\n" | ||
1592 | |||
1593 | /* Make a mask to preserve the alpha. */ | ||
1594 | "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */ | ||
1595 | "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */ | ||
1596 | "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */ | ||
1597 | "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */ | ||
1598 | "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */ | ||
1599 | |||
1600 | /* form channel masks */ | ||
1601 | "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */ | ||
1602 | "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */ | ||
1603 | "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */ | ||
1604 | "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */ | ||
1605 | |||
1606 | /* get alpha channel shift */ | ||
1607 | "movd %1, %%mm5\n\t" /* Ashift -> mm5 */ | ||
1608 | |||
1609 | : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) ); | ||
1610 | |||
1611 | while(height--) { | ||
1612 | |||
1613 | DUFFS_LOOP4({ | ||
1614 | Uint32 alpha; | ||
1615 | |||
1616 | __asm__ ( | ||
1617 | "prefetch 64(%0)\n" | ||
1618 | "prefetch 64(%1)\n" | ||
1619 | : : "r" (srcp), "r" (dstp) ); | ||
1620 | |||
1621 | alpha = *srcp & amask; | ||
1622 | /* FIXME: Here we special-case opaque alpha since the | ||
1623 | compositioning used (>>8 instead of /255) doesn't handle | ||
1624 | it correctly. Also special-case alpha=0 for speed? | ||
1625 | Benchmark this! */ | ||
1626 | if(alpha == 0) { | ||
1627 | /* do nothing */ | ||
1628 | } | ||
1629 | else if(alpha == amask) { | ||
1630 | /* opaque alpha -- copy RGB, keep dst alpha */ | ||
1631 | /* using MMX here to free up regular registers for other things */ | ||
1632 | __asm__ ( | ||
1633 | "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/ | ||
1634 | "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/ | ||
1635 | "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */ | ||
1636 | "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */ | ||
1637 | "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */ | ||
1638 | "movd %%mm1, (%1) \n\t" /* mm1 -> dst */ | ||
1639 | |||
1640 | : : "r" (srcp), "r" (dstp) ); | ||
1641 | } | ||
1642 | |||
1643 | else { | ||
1644 | __asm__ ( | ||
1645 | /* load in the source, and dst. */ | ||
1646 | "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */ | ||
1647 | "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */ | ||
1648 | |||
1649 | /* Move the src alpha into mm2 */ | ||
1650 | |||
1651 | /* if supporting pshufw */ | ||
1652 | /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */ | ||
1653 | /*"psrlw $8, %%mm2\n" */ | ||
1654 | |||
1655 | /* else: */ | ||
1656 | "movd %2, %%mm2\n" | ||
1657 | "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ | ||
1658 | "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ | ||
1659 | "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ | ||
1660 | "pand %%mm7, %%mm2\n" /* to preserve dest alpha */ | ||
1661 | |||
1662 | /* move the colors into words. */ | ||
1663 | "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ | ||
1664 | "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */ | ||
1665 | |||
1666 | /* src - dst */ | ||
1667 | "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ | ||
1668 | |||
1669 | /* A * (src-dst) */ | ||
1670 | "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */ | ||
1671 | "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */ | ||
1672 | "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */ | ||
1673 | |||
1674 | "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ | ||
1675 | |||
1676 | "movd %%mm0, (%1)\n" /* result in mm0 */ | ||
1677 | |||
1678 | : : "r" (srcp), "r" (dstp), "r" (alpha) ); | ||
1679 | |||
1680 | } | ||
1681 | ++srcp; | ||
1682 | ++dstp; | ||
1683 | }, width); | ||
1684 | srcp += srcskip; | ||
1685 | dstp += dstskip; | ||
1686 | } | ||
1687 | |||
1688 | __asm__ ( | ||
1689 | "emms\n" | ||
1690 | : ); | ||
1691 | } | ||
1692 | /* End GCC_ASMBLIT*/ | ||
1693 | |||
1694 | #elif MSVC_ASMBLIT | ||
1695 | /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ | ||
1696 | static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) | ||
1697 | { | ||
1698 | int width = info->d_width; | ||
1699 | int height = info->d_height; | ||
1700 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
1701 | int srcskip = info->s_skip >> 2; | ||
1702 | Uint32 *dstp = (Uint32 *)info->d_pixels; | ||
1703 | int dstskip = info->d_skip >> 2; | ||
1704 | SDL_PixelFormat* sf = info->src; | ||
1705 | Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; | ||
1706 | Uint32 amask = sf->Amask; | ||
1707 | Uint32 ashift = sf->Ashift; | ||
1708 | Uint64 multmask; | ||
1709 | |||
1710 | __m64 src1, dst1, mm_alpha, mm_zero, dmask; | ||
1711 | |||
1712 | mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ | ||
1713 | multmask = ~(0xFFFFi64 << (ashift * 2)); | ||
1714 | dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */ | ||
1715 | |||
1716 | while(height--) { | ||
1717 | DUFFS_LOOP4({ | ||
1718 | Uint32 alpha; | ||
1719 | |||
1720 | _m_prefetch(srcp + 16); | ||
1721 | _m_prefetch(dstp + 16); | ||
1722 | |||
1723 | alpha = *srcp & amask; | ||
1724 | if (alpha == 0) { | ||
1725 | /* do nothing */ | ||
1726 | } else if (alpha == amask) { | ||
1727 | /* copy RGB, keep dst alpha */ | ||
1728 | *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); | ||
1729 | } else { | ||
1730 | src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ | ||
1731 | src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ | ||
1732 | |||
1733 | dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ | ||
1734 | dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ | ||
1735 | |||
1736 | mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ | ||
1737 | mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ | ||
1738 | mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | ||
1739 | mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | ||
1740 | mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ | ||
1741 | |||
1742 | /* blend */ | ||
1743 | src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */ | ||
1744 | src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */ | ||
1745 | src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ | ||
1746 | dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */ | ||
1747 | dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ | ||
1748 | |||
1749 | *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ | ||
1750 | } | ||
1751 | ++srcp; | ||
1752 | ++dstp; | ||
1753 | }, width); | ||
1754 | srcp += srcskip; | ||
1755 | dstp += dstskip; | ||
1756 | } | ||
1757 | _mm_empty(); | ||
1758 | } | ||
1759 | /* End MSVC_ASMBLIT */ | ||
1760 | |||
1761 | #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ | ||
1762 | |||
1763 | /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ | ||
1764 | |||
/*
 * Replicate a 16-bit channel mask into both halves of a 32-bit word.
 * The unsigned multiply is equivalent to (mask | mask << 16) for a 16-bit
 * mask, but keeps the arithmetic out of signed int, where shifting a mask
 * with bit 15 set left by 16 would overflow.
 */
#define BLEND2x16_MASK(mask) \
    ((mask) * 0x10001U)

/*
 * blend a single 16 bit pixel at 50%
 *
 * mask selects every channel bit except the lowest bit of each channel
 * (e.g. 0xf7de for RGB565).  The masked halves are averaged and the low
 * bits common to both pixels are added back.  Arguments are evaluated more
 * than once, so they must be free of side effects.
 */
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + \
     ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * Same computation as BLEND16_50 applied to two pixels packed in one 32-bit
 * word; d and s are expected to be unsigned 32-bit values.
 */
#define BLEND2x16_50(d, s, mask) \
    ((((s) & BLEND2x16_MASK(mask)) >> 1) + \
     (((d) & BLEND2x16_MASK(mask)) >> 1) + \
     ((s) & (d) & ~BLEND2x16_MASK(mask)))
1773 | |||
1774 | static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask) | ||
1775 | { | ||
1776 | int width = info->d_width; | ||
1777 | int height = info->d_height; | ||
1778 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
1779 | int srcskip = info->s_skip >> 1; | ||
1780 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
1781 | int dstskip = info->d_skip >> 1; | ||
1782 | |||
1783 | while(height--) { | ||
1784 | if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) { | ||
1785 | /* | ||
1786 | * Source and destination not aligned, pipeline it. | ||
1787 | * This is mostly a win for big blits but no loss for | ||
1788 | * small ones | ||
1789 | */ | ||
1790 | Uint32 prev_sw; | ||
1791 | int w = width; | ||
1792 | |||
1793 | /* handle odd destination */ | ||
1794 | if((uintptr_t)dstp & 2) { | ||
1795 | Uint16 d = *dstp, s = *srcp; | ||
1796 | *dstp = BLEND16_50(d, s, mask); | ||
1797 | dstp++; | ||
1798 | srcp++; | ||
1799 | w--; | ||
1800 | } | ||
1801 | srcp++; /* srcp is now 32-bit aligned */ | ||
1802 | |||
1803 | /* bootstrap pipeline with first halfword */ | ||
1804 | prev_sw = ((Uint32 *)srcp)[-1]; | ||
1805 | |||
1806 | while(w > 1) { | ||
1807 | Uint32 sw, dw, s; | ||
1808 | sw = *(Uint32 *)srcp; | ||
1809 | dw = *(Uint32 *)dstp; | ||
1810 | #if SDL_BYTEORDER == SDL_BIG_ENDIAN | ||
1811 | s = (prev_sw << 16) + (sw >> 16); | ||
1812 | #else | ||
1813 | s = (prev_sw >> 16) + (sw << 16); | ||
1814 | #endif | ||
1815 | prev_sw = sw; | ||
1816 | *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask); | ||
1817 | dstp += 2; | ||
1818 | srcp += 2; | ||
1819 | w -= 2; | ||
1820 | } | ||
1821 | |||
1822 | /* final pixel if any */ | ||
1823 | if(w) { | ||
1824 | Uint16 d = *dstp, s; | ||
1825 | #if SDL_BYTEORDER == SDL_BIG_ENDIAN | ||
1826 | s = (Uint16)prev_sw; | ||
1827 | #else | ||
1828 | s = (Uint16)(prev_sw >> 16); | ||
1829 | #endif | ||
1830 | *dstp = BLEND16_50(d, s, mask); | ||
1831 | srcp++; | ||
1832 | dstp++; | ||
1833 | } | ||
1834 | srcp += srcskip - 1; | ||
1835 | dstp += dstskip; | ||
1836 | } else { | ||
1837 | /* source and destination are aligned */ | ||
1838 | int w = width; | ||
1839 | |||
1840 | /* first odd pixel? */ | ||
1841 | if((uintptr_t)srcp & 2) { | ||
1842 | Uint16 d = *dstp, s = *srcp; | ||
1843 | *dstp = BLEND16_50(d, s, mask); | ||
1844 | srcp++; | ||
1845 | dstp++; | ||
1846 | w--; | ||
1847 | } | ||
1848 | /* srcp and dstp are now 32-bit aligned */ | ||
1849 | |||
1850 | while(w > 1) { | ||
1851 | Uint32 sw = *(Uint32 *)srcp; | ||
1852 | Uint32 dw = *(Uint32 *)dstp; | ||
1853 | *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask); | ||
1854 | srcp += 2; | ||
1855 | dstp += 2; | ||
1856 | w -= 2; | ||
1857 | } | ||
1858 | |||
1859 | /* last odd pixel? */ | ||
1860 | if(w) { | ||
1861 | Uint16 d = *dstp, s = *srcp; | ||
1862 | *dstp = BLEND16_50(d, s, mask); | ||
1863 | srcp++; | ||
1864 | dstp++; | ||
1865 | } | ||
1866 | srcp += srcskip; | ||
1867 | dstp += dstskip; | ||
1868 | } | ||
1869 | } | ||
1870 | } | ||
1871 | |||
1872 | #if GCC_ASMBLIT | ||
1873 | /* fast RGB565->RGB565 blending with surface alpha */ | ||
1874 | static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) | ||
1875 | { | ||
1876 | unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ | ||
1877 | if(alpha == 128) { | ||
1878 | Blit16to16SurfaceAlpha128(info, 0xf7de); | ||
1879 | } else { | ||
1880 | int width = info->d_width; | ||
1881 | int height = info->d_height; | ||
1882 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
1883 | int srcskip = info->s_skip >> 1; | ||
1884 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
1885 | int dstskip = info->d_skip >> 1; | ||
1886 | Uint32 s, d; | ||
1887 | Uint64 load; | ||
1888 | |||
1889 | alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | ||
1890 | load = alpha; | ||
1891 | alpha >>= 3; /* downscale alpha to 5 bits */ | ||
1892 | |||
1893 | movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */ | ||
1894 | punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ | ||
1895 | punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ | ||
1896 | /* position alpha to allow for mullo and mulhi on diff channels | ||
1897 | to reduce the number of operations */ | ||
1898 | psllq_i2r(3, mm0); | ||
1899 | |||
1900 | /* Setup the 565 color channel masks */ | ||
1901 | load = 0x07E007E007E007E0ULL; | ||
1902 | movq_m2r(load, mm4); /* MASKGREEN -> mm4 */ | ||
1903 | load = 0x001F001F001F001FULL; | ||
1904 | movq_m2r(load, mm7); /* MASKBLUE -> mm7 */ | ||
1905 | while(height--) { | ||
1906 | DUFFS_LOOP_QUATRO2( | ||
1907 | { | ||
1908 | s = *srcp++; | ||
1909 | d = *dstp; | ||
1910 | /* | ||
1911 | * shift out the middle component (green) to | ||
1912 | * the high 16 bits, and process all three RGB | ||
1913 | * components at the same time. | ||
1914 | */ | ||
1915 | s = (s | s << 16) & 0x07e0f81f; | ||
1916 | d = (d | d << 16) & 0x07e0f81f; | ||
1917 | d += (s - d) * alpha >> 5; | ||
1918 | d &= 0x07e0f81f; | ||
1919 | *dstp++ = d | d >> 16; | ||
1920 | },{ | ||
1921 | s = *srcp++; | ||
1922 | d = *dstp; | ||
1923 | /* | ||
1924 | * shift out the middle component (green) to | ||
1925 | * the high 16 bits, and process all three RGB | ||
1926 | * components at the same time. | ||
1927 | */ | ||
1928 | s = (s | s << 16) & 0x07e0f81f; | ||
1929 | d = (d | d << 16) & 0x07e0f81f; | ||
1930 | d += (s - d) * alpha >> 5; | ||
1931 | d &= 0x07e0f81f; | ||
1932 | *dstp++ = d | d >> 16; | ||
1933 | s = *srcp++; | ||
1934 | d = *dstp; | ||
1935 | /* | ||
1936 | * shift out the middle component (green) to | ||
1937 | * the high 16 bits, and process all three RGB | ||
1938 | * components at the same time. | ||
1939 | */ | ||
1940 | s = (s | s << 16) & 0x07e0f81f; | ||
1941 | d = (d | d << 16) & 0x07e0f81f; | ||
1942 | d += (s - d) * alpha >> 5; | ||
1943 | d &= 0x07e0f81f; | ||
1944 | *dstp++ = d | d >> 16; | ||
1945 | },{ | ||
1946 | movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ | ||
1947 | movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ | ||
1948 | |||
1949 | /* red -- does not need a mask since the right shift clears | ||
1950 | the uninteresting bits */ | ||
1951 | movq_r2r(mm2, mm5); /* src -> mm5 */ | ||
1952 | movq_r2r(mm3, mm6); /* dst -> mm6 */ | ||
1953 | psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ | ||
1954 | psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ | ||
1955 | |||
1956 | /* blend */ | ||
1957 | psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | ||
1958 | pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | ||
1959 | /* alpha used is actually 11 bits | ||
1960 | 11 + 5 = 16 bits, so the sign bits are lost */ | ||
1961 | psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ | ||
1962 | paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | ||
1963 | psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */ | ||
1964 | |||
1965 | movq_r2r(mm6, mm1); /* save new reds in dsts */ | ||
1966 | |||
1967 | /* green -- process the bits in place */ | ||
1968 | movq_r2r(mm2, mm5); /* src -> mm5 */ | ||
1969 | movq_r2r(mm3, mm6); /* dst -> mm6 */ | ||
1970 | pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ | ||
1971 | pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ | ||
1972 | |||
1973 | /* blend */ | ||
1974 | psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | ||
1975 | pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | ||
1976 | /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting | ||
1977 | bits are gone and the sign bits present */ | ||
1978 | psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ | ||
1979 | paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | ||
1980 | |||
1981 | por_r2r(mm6, mm1); /* save new greens in dsts */ | ||
1982 | |||
1983 | /* blue */ | ||
1984 | movq_r2r(mm2, mm5); /* src -> mm5 */ | ||
1985 | movq_r2r(mm3, mm6); /* dst -> mm6 */ | ||
1986 | pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ | ||
1987 | pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ | ||
1988 | |||
1989 | /* blend */ | ||
1990 | psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | ||
1991 | pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | ||
1992 | /* 11 + 5 = 16 bits, so the sign bits are lost and | ||
1993 | the interesting bits will need to be MASKed */ | ||
1994 | psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ | ||
1995 | paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | ||
1996 | pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ | ||
1997 | |||
1998 | por_r2r(mm6, mm1); /* save new blues in dsts */ | ||
1999 | |||
2000 | movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */ | ||
2001 | |||
2002 | srcp += 4; | ||
2003 | dstp += 4; | ||
2004 | }, width); | ||
2005 | srcp += srcskip; | ||
2006 | dstp += dstskip; | ||
2007 | } | ||
2008 | emms(); | ||
2009 | } | ||
2010 | } | ||
2011 | |||
2012 | /* fast RGB555->RGB555 blending with surface alpha */ | ||
2013 | static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) | ||
2014 | { | ||
2015 | unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ | ||
2016 | if(alpha == 128) { | ||
2017 | Blit16to16SurfaceAlpha128(info, 0xfbde); | ||
2018 | } else { | ||
2019 | int width = info->d_width; | ||
2020 | int height = info->d_height; | ||
2021 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
2022 | int srcskip = info->s_skip >> 1; | ||
2023 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2024 | int dstskip = info->d_skip >> 1; | ||
2025 | Uint32 s, d; | ||
2026 | Uint64 load; | ||
2027 | |||
2028 | alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | ||
2029 | load = alpha; | ||
2030 | alpha >>= 3; /* downscale alpha to 5 bits */ | ||
2031 | |||
2032 | movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */ | ||
2033 | punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ | ||
2034 | punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ | ||
2035 | /* position alpha to allow for mullo and mulhi on diff channels | ||
2036 | to reduce the number of operations */ | ||
2037 | psllq_i2r(3, mm0); | ||
2038 | |||
2039 | /* Setup the 555 color channel masks */ | ||
2040 | load = 0x03E003E003E003E0ULL; | ||
2041 | movq_m2r(load, mm4); /* MASKGREEN -> mm4 */ | ||
2042 | load = 0x001F001F001F001FULL; | ||
2043 | movq_m2r(load, mm7); /* MASKBLUE -> mm7 */ | ||
2044 | while(height--) { | ||
2045 | DUFFS_LOOP_QUATRO2( | ||
2046 | { | ||
2047 | s = *srcp++; | ||
2048 | d = *dstp; | ||
2049 | /* | ||
2050 | * shift out the middle component (green) to | ||
2051 | * the high 16 bits, and process all three RGB | ||
2052 | * components at the same time. | ||
2053 | */ | ||
2054 | s = (s | s << 16) & 0x03e07c1f; | ||
2055 | d = (d | d << 16) & 0x03e07c1f; | ||
2056 | d += (s - d) * alpha >> 5; | ||
2057 | d &= 0x03e07c1f; | ||
2058 | *dstp++ = d | d >> 16; | ||
2059 | },{ | ||
2060 | s = *srcp++; | ||
2061 | d = *dstp; | ||
2062 | /* | ||
2063 | * shift out the middle component (green) to | ||
2064 | * the high 16 bits, and process all three RGB | ||
2065 | * components at the same time. | ||
2066 | */ | ||
2067 | s = (s | s << 16) & 0x03e07c1f; | ||
2068 | d = (d | d << 16) & 0x03e07c1f; | ||
2069 | d += (s - d) * alpha >> 5; | ||
2070 | d &= 0x03e07c1f; | ||
2071 | *dstp++ = d | d >> 16; | ||
2072 | s = *srcp++; | ||
2073 | d = *dstp; | ||
2074 | /* | ||
2075 | * shift out the middle component (green) to | ||
2076 | * the high 16 bits, and process all three RGB | ||
2077 | * components at the same time. | ||
2078 | */ | ||
2079 | s = (s | s << 16) & 0x03e07c1f; | ||
2080 | d = (d | d << 16) & 0x03e07c1f; | ||
2081 | d += (s - d) * alpha >> 5; | ||
2082 | d &= 0x03e07c1f; | ||
2083 | *dstp++ = d | d >> 16; | ||
2084 | },{ | ||
2085 | movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ | ||
2086 | movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ | ||
2087 | |||
2088 | /* red -- process the bits in place */ | ||
2089 | psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */ | ||
2090 | /* by reusing the GREEN mask we free up another mmx | ||
2091 | register to accumulate the result */ | ||
2092 | |||
2093 | movq_r2r(mm2, mm5); /* src -> mm5 */ | ||
2094 | movq_r2r(mm3, mm6); /* dst -> mm6 */ | ||
2095 | pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */ | ||
2096 | pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */ | ||
2097 | |||
2098 | /* blend */ | ||
2099 | psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | ||
2100 | pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | ||
2101 | /* 11 + 15 - 16 = 10 bits, uninteresting bits will be | ||
2102 | cleared by a MASK below */ | ||
2103 | psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ | ||
2104 | paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | ||
2105 | pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */ | ||
2106 | |||
2107 | psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */ | ||
2108 | |||
2109 | movq_r2r(mm6, mm1); /* save new reds in dsts */ | ||
2110 | |||
2111 | /* green -- process the bits in place */ | ||
2112 | movq_r2r(mm2, mm5); /* src -> mm5 */ | ||
2113 | movq_r2r(mm3, mm6); /* dst -> mm6 */ | ||
2114 | pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ | ||
2115 | pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ | ||
2116 | |||
2117 | /* blend */ | ||
2118 | psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | ||
2119 | pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | ||
2120 | /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting | ||
2121 | bits are gone and the sign bits present */ | ||
2122 | psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ | ||
2123 | paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | ||
2124 | |||
2125 | por_r2r(mm6, mm1); /* save new greens in dsts */ | ||
2126 | |||
2127 | /* blue */ | ||
2128 | movq_r2r(mm2, mm5); /* src -> mm5 */ | ||
2129 | movq_r2r(mm3, mm6); /* dst -> mm6 */ | ||
2130 | pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ | ||
2131 | pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ | ||
2132 | |||
2133 | /* blend */ | ||
2134 | psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | ||
2135 | pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | ||
2136 | /* 11 + 5 = 16 bits, so the sign bits are lost and | ||
2137 | the interesting bits will need to be MASKed */ | ||
2138 | psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ | ||
2139 | paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | ||
2140 | pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ | ||
2141 | |||
2142 | por_r2r(mm6, mm1); /* save new blues in dsts */ | ||
2143 | |||
2144 | movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */ | ||
2145 | |||
2146 | srcp += 4; | ||
2147 | dstp += 4; | ||
2148 | }, width); | ||
2149 | srcp += srcskip; | ||
2150 | dstp += dstskip; | ||
2151 | } | ||
2152 | emms(); | ||
2153 | } | ||
2154 | } | ||
2155 | /* End GCC_ASMBLIT */ | ||
2156 | |||
2157 | #elif MSVC_ASMBLIT | ||
2158 | /* fast RGB565->RGB565 blending with surface alpha */ | ||
2159 | static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) | ||
2160 | { | ||
2161 | unsigned alpha = info->src->alpha; | ||
2162 | if(alpha == 128) { | ||
2163 | Blit16to16SurfaceAlpha128(info, 0xf7de); | ||
2164 | } else { | ||
2165 | int width = info->d_width; | ||
2166 | int height = info->d_height; | ||
2167 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
2168 | int srcskip = info->s_skip >> 1; | ||
2169 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2170 | int dstskip = info->d_skip >> 1; | ||
2171 | Uint32 s, d; | ||
2172 | |||
2173 | __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha; | ||
2174 | |||
2175 | alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | ||
2176 | mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ | ||
2177 | alpha >>= 3; /* downscale alpha to 5 bits */ | ||
2178 | |||
2179 | mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | ||
2180 | mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | ||
2181 | /* position alpha to allow for mullo and mulhi on diff channels | ||
2182 | to reduce the number of operations */ | ||
2183 | mm_alpha = _mm_slli_si64(mm_alpha, 3); | ||
2184 | |||
2185 | /* Setup the 565 color channel masks */ | ||
2186 | gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */ | ||
2187 | bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ | ||
2188 | |||
2189 | while(height--) { | ||
2190 | DUFFS_LOOP_QUATRO2( | ||
2191 | { | ||
2192 | s = *srcp++; | ||
2193 | d = *dstp; | ||
2194 | /* | ||
2195 | * shift out the middle component (green) to | ||
2196 | * the high 16 bits, and process all three RGB | ||
2197 | * components at the same time. | ||
2198 | */ | ||
2199 | s = (s | s << 16) & 0x07e0f81f; | ||
2200 | d = (d | d << 16) & 0x07e0f81f; | ||
2201 | d += (s - d) * alpha >> 5; | ||
2202 | d &= 0x07e0f81f; | ||
2203 | *dstp++ = (Uint16)(d | d >> 16); | ||
2204 | },{ | ||
2205 | s = *srcp++; | ||
2206 | d = *dstp; | ||
2207 | /* | ||
2208 | * shift out the middle component (green) to | ||
2209 | * the high 16 bits, and process all three RGB | ||
2210 | * components at the same time. | ||
2211 | */ | ||
2212 | s = (s | s << 16) & 0x07e0f81f; | ||
2213 | d = (d | d << 16) & 0x07e0f81f; | ||
2214 | d += (s - d) * alpha >> 5; | ||
2215 | d &= 0x07e0f81f; | ||
2216 | *dstp++ = (Uint16)(d | d >> 16); | ||
2217 | s = *srcp++; | ||
2218 | d = *dstp; | ||
2219 | /* | ||
2220 | * shift out the middle component (green) to | ||
2221 | * the high 16 bits, and process all three RGB | ||
2222 | * components at the same time. | ||
2223 | */ | ||
2224 | s = (s | s << 16) & 0x07e0f81f; | ||
2225 | d = (d | d << 16) & 0x07e0f81f; | ||
2226 | d += (s - d) * alpha >> 5; | ||
2227 | d &= 0x07e0f81f; | ||
2228 | *dstp++ = (Uint16)(d | d >> 16); | ||
2229 | },{ | ||
2230 | src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ | ||
2231 | dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ | ||
2232 | |||
2233 | /* red */ | ||
2234 | src2 = src1; | ||
2235 | src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */ | ||
2236 | |||
2237 | dst2 = dst1; | ||
2238 | dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */ | ||
2239 | |||
2240 | /* blend */ | ||
2241 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | ||
2242 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
2243 | src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ | ||
2244 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | ||
2245 | dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */ | ||
2246 | |||
2247 | mm_res = dst2; /* RED -> mm_res */ | ||
2248 | |||
2249 | /* green -- process the bits in place */ | ||
2250 | src2 = src1; | ||
2251 | src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ | ||
2252 | |||
2253 | dst2 = dst1; | ||
2254 | dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ | ||
2255 | |||
2256 | /* blend */ | ||
2257 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | ||
2258 | src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
2259 | src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ | ||
2260 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | ||
2261 | |||
2262 | mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ | ||
2263 | |||
2264 | /* blue */ | ||
2265 | src2 = src1; | ||
2266 | src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ | ||
2267 | |||
2268 | dst2 = dst1; | ||
2269 | dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ | ||
2270 | |||
2271 | /* blend */ | ||
2272 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | ||
2273 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
2274 | src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ | ||
2275 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | ||
2276 | dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ | ||
2277 | |||
2278 | mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ | ||
2279 | |||
2280 | *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ | ||
2281 | |||
2282 | srcp += 4; | ||
2283 | dstp += 4; | ||
2284 | }, width); | ||
2285 | srcp += srcskip; | ||
2286 | dstp += dstskip; | ||
2287 | } | ||
2288 | _mm_empty(); | ||
2289 | } | ||
2290 | } | ||
2291 | |||
2292 | /* fast RGB555->RGB555 blending with surface alpha */ | ||
2293 | static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) | ||
2294 | { | ||
2295 | unsigned alpha = info->src->alpha; | ||
2296 | if(alpha == 128) { | ||
2297 | Blit16to16SurfaceAlpha128(info, 0xfbde); | ||
2298 | } else { | ||
2299 | int width = info->d_width; | ||
2300 | int height = info->d_height; | ||
2301 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
2302 | int srcskip = info->s_skip >> 1; | ||
2303 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2304 | int dstskip = info->d_skip >> 1; | ||
2305 | Uint32 s, d; | ||
2306 | |||
2307 | __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; | ||
2308 | |||
2309 | alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | ||
2310 | mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ | ||
2311 | alpha >>= 3; /* downscale alpha to 5 bits */ | ||
2312 | |||
2313 | mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | ||
2314 | mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | ||
2315 | /* position alpha to allow for mullo and mulhi on diff channels | ||
2316 | to reduce the number of operations */ | ||
2317 | mm_alpha = _mm_slli_si64(mm_alpha, 3); | ||
2318 | |||
2319 | /* Setup the 555 color channel masks */ | ||
2320 | rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */ | ||
2321 | gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */ | ||
2322 | bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ | ||
2323 | |||
2324 | while(height--) { | ||
2325 | DUFFS_LOOP_QUATRO2( | ||
2326 | { | ||
2327 | s = *srcp++; | ||
2328 | d = *dstp; | ||
2329 | /* | ||
2330 | * shift out the middle component (green) to | ||
2331 | * the high 16 bits, and process all three RGB | ||
2332 | * components at the same time. | ||
2333 | */ | ||
2334 | s = (s | s << 16) & 0x03e07c1f; | ||
2335 | d = (d | d << 16) & 0x03e07c1f; | ||
2336 | d += (s - d) * alpha >> 5; | ||
2337 | d &= 0x03e07c1f; | ||
2338 | *dstp++ = (Uint16)(d | d >> 16); | ||
2339 | },{ | ||
2340 | s = *srcp++; | ||
2341 | d = *dstp; | ||
2342 | /* | ||
2343 | * shift out the middle component (green) to | ||
2344 | * the high 16 bits, and process all three RGB | ||
2345 | * components at the same time. | ||
2346 | */ | ||
2347 | s = (s | s << 16) & 0x03e07c1f; | ||
2348 | d = (d | d << 16) & 0x03e07c1f; | ||
2349 | d += (s - d) * alpha >> 5; | ||
2350 | d &= 0x03e07c1f; | ||
2351 | *dstp++ = (Uint16)(d | d >> 16); | ||
2352 | s = *srcp++; | ||
2353 | d = *dstp; | ||
2354 | /* | ||
2355 | * shift out the middle component (green) to | ||
2356 | * the high 16 bits, and process all three RGB | ||
2357 | * components at the same time. | ||
2358 | */ | ||
2359 | s = (s | s << 16) & 0x03e07c1f; | ||
2360 | d = (d | d << 16) & 0x03e07c1f; | ||
2361 | d += (s - d) * alpha >> 5; | ||
2362 | d &= 0x03e07c1f; | ||
2363 | *dstp++ = (Uint16)(d | d >> 16); | ||
2364 | },{ | ||
2365 | src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ | ||
2366 | dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ | ||
2367 | |||
2368 | /* red -- process the bits in place */ | ||
2369 | src2 = src1; | ||
2370 | src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */ | ||
2371 | |||
2372 | dst2 = dst1; | ||
2373 | dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */ | ||
2374 | |||
2375 | /* blend */ | ||
2376 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | ||
2377 | src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
2378 | src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ | ||
2379 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | ||
2380 | dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */ | ||
2381 | |||
2382 | mm_res = dst2; /* RED -> mm_res */ | ||
2383 | |||
2384 | /* green -- process the bits in place */ | ||
2385 | src2 = src1; | ||
2386 | src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ | ||
2387 | |||
2388 | dst2 = dst1; | ||
2389 | dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ | ||
2390 | |||
2391 | /* blend */ | ||
2392 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | ||
2393 | src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
2394 | src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ | ||
2395 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | ||
2396 | |||
2397 | mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ | ||
2398 | |||
2399 | /* blue */ | ||
2400 | src2 = src1; /* src -> src2 */ | ||
2401 | src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ | ||
2402 | |||
2403 | dst2 = dst1; /* dst -> dst2 */ | ||
2404 | dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ | ||
2405 | |||
2406 | /* blend */ | ||
2407 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | ||
2408 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | ||
2409 | src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ | ||
2410 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | ||
2411 | dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ | ||
2412 | |||
2413 | mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ | ||
2414 | |||
2415 | *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ | ||
2416 | |||
2417 | srcp += 4; | ||
2418 | dstp += 4; | ||
2419 | }, width); | ||
2420 | srcp += srcskip; | ||
2421 | dstp += dstskip; | ||
2422 | } | ||
2423 | _mm_empty(); | ||
2424 | } | ||
2425 | } | ||
2426 | #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ | ||
2427 | |||
2428 | /* fast RGB565->RGB565 blending with surface alpha */ | ||
2429 | static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) | ||
2430 | { | ||
2431 | unsigned alpha = info->src->alpha; | ||
2432 | if(alpha == 128) { | ||
2433 | Blit16to16SurfaceAlpha128(info, 0xf7de); | ||
2434 | } else { | ||
2435 | int width = info->d_width; | ||
2436 | int height = info->d_height; | ||
2437 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
2438 | int srcskip = info->s_skip >> 1; | ||
2439 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2440 | int dstskip = info->d_skip >> 1; | ||
2441 | alpha >>= 3; /* downscale alpha to 5 bits */ | ||
2442 | |||
2443 | while(height--) { | ||
2444 | DUFFS_LOOP4({ | ||
2445 | Uint32 s = *srcp++; | ||
2446 | Uint32 d = *dstp; | ||
2447 | /* | ||
2448 | * shift out the middle component (green) to | ||
2449 | * the high 16 bits, and process all three RGB | ||
2450 | * components at the same time. | ||
2451 | */ | ||
2452 | s = (s | s << 16) & 0x07e0f81f; | ||
2453 | d = (d | d << 16) & 0x07e0f81f; | ||
2454 | d += (s - d) * alpha >> 5; | ||
2455 | d &= 0x07e0f81f; | ||
2456 | *dstp++ = (Uint16)(d | d >> 16); | ||
2457 | }, width); | ||
2458 | srcp += srcskip; | ||
2459 | dstp += dstskip; | ||
2460 | } | ||
2461 | } | ||
2462 | } | ||
2463 | |||
2464 | /* fast RGB555->RGB555 blending with surface alpha */ | ||
2465 | static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info) | ||
2466 | { | ||
2467 | unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ | ||
2468 | if(alpha == 128) { | ||
2469 | Blit16to16SurfaceAlpha128(info, 0xfbde); | ||
2470 | } else { | ||
2471 | int width = info->d_width; | ||
2472 | int height = info->d_height; | ||
2473 | Uint16 *srcp = (Uint16 *)info->s_pixels; | ||
2474 | int srcskip = info->s_skip >> 1; | ||
2475 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2476 | int dstskip = info->d_skip >> 1; | ||
2477 | alpha >>= 3; /* downscale alpha to 5 bits */ | ||
2478 | |||
2479 | while(height--) { | ||
2480 | DUFFS_LOOP4({ | ||
2481 | Uint32 s = *srcp++; | ||
2482 | Uint32 d = *dstp; | ||
2483 | /* | ||
2484 | * shift out the middle component (green) to | ||
2485 | * the high 16 bits, and process all three RGB | ||
2486 | * components at the same time. | ||
2487 | */ | ||
2488 | s = (s | s << 16) & 0x03e07c1f; | ||
2489 | d = (d | d << 16) & 0x03e07c1f; | ||
2490 | d += (s - d) * alpha >> 5; | ||
2491 | d &= 0x03e07c1f; | ||
2492 | *dstp++ = (Uint16)(d | d >> 16); | ||
2493 | }, width); | ||
2494 | srcp += srcskip; | ||
2495 | dstp += dstskip; | ||
2496 | } | ||
2497 | } | ||
2498 | } | ||
2499 | |||
2500 | /* fast ARGB8888->RGB565 blending with pixel alpha */ | ||
2501 | static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info) | ||
2502 | { | ||
2503 | int width = info->d_width; | ||
2504 | int height = info->d_height; | ||
2505 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
2506 | int srcskip = info->s_skip >> 2; | ||
2507 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2508 | int dstskip = info->d_skip >> 1; | ||
2509 | |||
2510 | while(height--) { | ||
2511 | DUFFS_LOOP4({ | ||
2512 | Uint32 s = *srcp; | ||
2513 | unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ | ||
2514 | /* FIXME: Here we special-case opaque alpha since the | ||
2515 | compositioning used (>>8 instead of /255) doesn't handle | ||
2516 | it correctly. Also special-case alpha=0 for speed? | ||
2517 | Benchmark this! */ | ||
2518 | if(alpha) { | ||
2519 | if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { | ||
2520 | *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f)); | ||
2521 | } else { | ||
2522 | Uint32 d = *dstp; | ||
2523 | /* | ||
2524 | * convert source and destination to G0RAB65565 | ||
2525 | * and blend all components at the same time | ||
2526 | */ | ||
2527 | s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) | ||
2528 | + (s >> 3 & 0x1f); | ||
2529 | d = (d | d << 16) & 0x07e0f81f; | ||
2530 | d += (s - d) * alpha >> 5; | ||
2531 | d &= 0x07e0f81f; | ||
2532 | *dstp = (Uint16)(d | d >> 16); | ||
2533 | } | ||
2534 | } | ||
2535 | srcp++; | ||
2536 | dstp++; | ||
2537 | }, width); | ||
2538 | srcp += srcskip; | ||
2539 | dstp += dstskip; | ||
2540 | } | ||
2541 | } | ||
2542 | |||
2543 | /* fast ARGB8888->RGB555 blending with pixel alpha */ | ||
2544 | static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info) | ||
2545 | { | ||
2546 | int width = info->d_width; | ||
2547 | int height = info->d_height; | ||
2548 | Uint32 *srcp = (Uint32 *)info->s_pixels; | ||
2549 | int srcskip = info->s_skip >> 2; | ||
2550 | Uint16 *dstp = (Uint16 *)info->d_pixels; | ||
2551 | int dstskip = info->d_skip >> 1; | ||
2552 | |||
2553 | while(height--) { | ||
2554 | DUFFS_LOOP4({ | ||
2555 | unsigned alpha; | ||
2556 | Uint32 s = *srcp; | ||
2557 | alpha = s >> 27; /* downscale alpha to 5 bits */ | ||
2558 | /* FIXME: Here we special-case opaque alpha since the | ||
2559 | compositioning used (>>8 instead of /255) doesn't handle | ||
2560 | it correctly. Also special-case alpha=0 for speed? | ||
2561 | Benchmark this! */ | ||
2562 | if(alpha) { | ||
2563 | if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { | ||
2564 | *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f)); | ||
2565 | } else { | ||
2566 | Uint32 d = *dstp; | ||
2567 | /* | ||
2568 | * convert source and destination to G0RAB65565 | ||
2569 | * and blend all components at the same time | ||
2570 | */ | ||
2571 | s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) | ||
2572 | + (s >> 3 & 0x1f); | ||
2573 | d = (d | d << 16) & 0x03e07c1f; | ||
2574 | d += (s - d) * alpha >> 5; | ||
2575 | d &= 0x03e07c1f; | ||
2576 | *dstp = (Uint16)(d | d >> 16); | ||
2577 | } | ||
2578 | } | ||
2579 | srcp++; | ||
2580 | dstp++; | ||
2581 | }, width); | ||
2582 | srcp += srcskip; | ||
2583 | dstp += dstskip; | ||
2584 | } | ||
2585 | } | ||
2586 | |||
2587 | /* General (slow) N->N blending with per-surface alpha */ | ||
2588 | static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info) | ||
2589 | { | ||
2590 | int width = info->d_width; | ||
2591 | int height = info->d_height; | ||
2592 | Uint8 *src = info->s_pixels; | ||
2593 | int srcskip = info->s_skip; | ||
2594 | Uint8 *dst = info->d_pixels; | ||
2595 | int dstskip = info->d_skip; | ||
2596 | SDL_PixelFormat *srcfmt = info->src; | ||
2597 | SDL_PixelFormat *dstfmt = info->dst; | ||
2598 | int srcbpp = srcfmt->BytesPerPixel; | ||
2599 | int dstbpp = dstfmt->BytesPerPixel; | ||
2600 | unsigned sA = srcfmt->alpha; | ||
2601 | unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | ||
2602 | |||
2603 | if(sA) { | ||
2604 | while ( height-- ) { | ||
2605 | DUFFS_LOOP4( | ||
2606 | { | ||
2607 | Uint32 Pixel; | ||
2608 | unsigned sR; | ||
2609 | unsigned sG; | ||
2610 | unsigned sB; | ||
2611 | unsigned dR; | ||
2612 | unsigned dG; | ||
2613 | unsigned dB; | ||
2614 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); | ||
2615 | DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB); | ||
2616 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); | ||
2617 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); | ||
2618 | src += srcbpp; | ||
2619 | dst += dstbpp; | ||
2620 | }, | ||
2621 | width); | ||
2622 | src += srcskip; | ||
2623 | dst += dstskip; | ||
2624 | } | ||
2625 | } | ||
2626 | } | ||
2627 | |||
2628 | /* General (slow) colorkeyed N->N blending with per-surface alpha */ | ||
2629 | static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) | ||
2630 | { | ||
2631 | int width = info->d_width; | ||
2632 | int height = info->d_height; | ||
2633 | Uint8 *src = info->s_pixels; | ||
2634 | int srcskip = info->s_skip; | ||
2635 | Uint8 *dst = info->d_pixels; | ||
2636 | int dstskip = info->d_skip; | ||
2637 | SDL_PixelFormat *srcfmt = info->src; | ||
2638 | SDL_PixelFormat *dstfmt = info->dst; | ||
2639 | Uint32 ckey = srcfmt->colorkey; | ||
2640 | int srcbpp = srcfmt->BytesPerPixel; | ||
2641 | int dstbpp = dstfmt->BytesPerPixel; | ||
2642 | unsigned sA = srcfmt->alpha; | ||
2643 | unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | ||
2644 | |||
2645 | while ( height-- ) { | ||
2646 | DUFFS_LOOP4( | ||
2647 | { | ||
2648 | Uint32 Pixel; | ||
2649 | unsigned sR; | ||
2650 | unsigned sG; | ||
2651 | unsigned sB; | ||
2652 | unsigned dR; | ||
2653 | unsigned dG; | ||
2654 | unsigned dB; | ||
2655 | RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel); | ||
2656 | if(sA && Pixel != ckey) { | ||
2657 | RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); | ||
2658 | DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB); | ||
2659 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); | ||
2660 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); | ||
2661 | } | ||
2662 | src += srcbpp; | ||
2663 | dst += dstbpp; | ||
2664 | }, | ||
2665 | width); | ||
2666 | src += srcskip; | ||
2667 | dst += dstskip; | ||
2668 | } | ||
2669 | } | ||
2670 | |||
2671 | /* General (slow) N->N blending with pixel alpha */ | ||
2672 | static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) | ||
2673 | { | ||
2674 | int width = info->d_width; | ||
2675 | int height = info->d_height; | ||
2676 | Uint8 *src = info->s_pixels; | ||
2677 | int srcskip = info->s_skip; | ||
2678 | Uint8 *dst = info->d_pixels; | ||
2679 | int dstskip = info->d_skip; | ||
2680 | SDL_PixelFormat *srcfmt = info->src; | ||
2681 | SDL_PixelFormat *dstfmt = info->dst; | ||
2682 | |||
2683 | int srcbpp; | ||
2684 | int dstbpp; | ||
2685 | |||
2686 | /* Set up some basic variables */ | ||
2687 | srcbpp = srcfmt->BytesPerPixel; | ||
2688 | dstbpp = dstfmt->BytesPerPixel; | ||
2689 | |||
2690 | /* FIXME: for 8bpp source alpha, this doesn't get opaque values | ||
2691 | quite right. for <8bpp source alpha, it gets them very wrong | ||
2692 | (check all macros!) | ||
2693 | It is unclear whether there is a good general solution that doesn't | ||
2694 | need a branch (or a divide). */ | ||
2695 | while ( height-- ) { | ||
2696 | DUFFS_LOOP4( | ||
2697 | { | ||
2698 | Uint32 Pixel; | ||
2699 | unsigned sR; | ||
2700 | unsigned sG; | ||
2701 | unsigned sB; | ||
2702 | unsigned dR; | ||
2703 | unsigned dG; | ||
2704 | unsigned dB; | ||
2705 | unsigned sA; | ||
2706 | unsigned dA; | ||
2707 | DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); | ||
2708 | if(sA) { | ||
2709 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); | ||
2710 | ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); | ||
2711 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); | ||
2712 | } | ||
2713 | src += srcbpp; | ||
2714 | dst += dstbpp; | ||
2715 | }, | ||
2716 | width); | ||
2717 | src += srcskip; | ||
2718 | dst += dstskip; | ||
2719 | } | ||
2720 | } | ||
2721 | |||
2722 | |||
2723 | SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) | ||
2724 | { | ||
2725 | SDL_PixelFormat *sf = surface->format; | ||
2726 | SDL_PixelFormat *df = surface->map->dst->format; | ||
2727 | |||
2728 | if(sf->Amask == 0) { | ||
2729 | if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) { | ||
2730 | if(df->BytesPerPixel == 1) | ||
2731 | return BlitNto1SurfaceAlphaKey; | ||
2732 | else | ||
2733 | #if SDL_ALTIVEC_BLITTERS | ||
2734 | if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && | ||
2735 | !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | ||
2736 | return Blit32to32SurfaceAlphaKeyAltivec; | ||
2737 | else | ||
2738 | #endif | ||
2739 | return BlitNtoNSurfaceAlphaKey; | ||
2740 | } else { | ||
2741 | /* Per-surface alpha blits */ | ||
2742 | switch(df->BytesPerPixel) { | ||
2743 | case 1: | ||
2744 | return BlitNto1SurfaceAlpha; | ||
2745 | |||
2746 | case 2: | ||
2747 | if(surface->map->identity) { | ||
2748 | if(df->Gmask == 0x7e0) | ||
2749 | { | ||
2750 | #if MMX_ASMBLIT | ||
2751 | if(SDL_HasMMX()) | ||
2752 | return Blit565to565SurfaceAlphaMMX; | ||
2753 | else | ||
2754 | #endif | ||
2755 | return Blit565to565SurfaceAlpha; | ||
2756 | } | ||
2757 | else if(df->Gmask == 0x3e0) | ||
2758 | { | ||
2759 | #if MMX_ASMBLIT | ||
2760 | if(SDL_HasMMX()) | ||
2761 | return Blit555to555SurfaceAlphaMMX; | ||
2762 | else | ||
2763 | #endif | ||
2764 | return Blit555to555SurfaceAlpha; | ||
2765 | } | ||
2766 | } | ||
2767 | return BlitNtoNSurfaceAlpha; | ||
2768 | |||
2769 | case 4: | ||
2770 | if(sf->Rmask == df->Rmask | ||
2771 | && sf->Gmask == df->Gmask | ||
2772 | && sf->Bmask == df->Bmask | ||
2773 | && sf->BytesPerPixel == 4) | ||
2774 | { | ||
2775 | #if MMX_ASMBLIT | ||
2776 | if(sf->Rshift % 8 == 0 | ||
2777 | && sf->Gshift % 8 == 0 | ||
2778 | && sf->Bshift % 8 == 0 | ||
2779 | && SDL_HasMMX()) | ||
2780 | return BlitRGBtoRGBSurfaceAlphaMMX; | ||
2781 | #endif | ||
2782 | if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) | ||
2783 | { | ||
2784 | #if SDL_ALTIVEC_BLITTERS | ||
2785 | if(!(surface->map->dst->flags & SDL_HWSURFACE) | ||
2786 | && SDL_HasAltiVec()) | ||
2787 | return BlitRGBtoRGBSurfaceAlphaAltivec; | ||
2788 | #endif | ||
2789 | return BlitRGBtoRGBSurfaceAlpha; | ||
2790 | } | ||
2791 | } | ||
2792 | #if SDL_ALTIVEC_BLITTERS | ||
2793 | if((sf->BytesPerPixel == 4) && | ||
2794 | !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | ||
2795 | return Blit32to32SurfaceAlphaAltivec; | ||
2796 | else | ||
2797 | #endif | ||
2798 | return BlitNtoNSurfaceAlpha; | ||
2799 | |||
2800 | case 3: | ||
2801 | default: | ||
2802 | return BlitNtoNSurfaceAlpha; | ||
2803 | } | ||
2804 | } | ||
2805 | } else { | ||
2806 | /* Per-pixel alpha blits */ | ||
2807 | switch(df->BytesPerPixel) { | ||
2808 | case 1: | ||
2809 | return BlitNto1PixelAlpha; | ||
2810 | |||
2811 | case 2: | ||
2812 | #if SDL_ALTIVEC_BLITTERS | ||
2813 | if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) && | ||
2814 | df->Gmask == 0x7e0 && | ||
2815 | df->Bmask == 0x1f && SDL_HasAltiVec()) | ||
2816 | return Blit32to565PixelAlphaAltivec; | ||
2817 | else | ||
2818 | #endif | ||
2819 | if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 | ||
2820 | && sf->Gmask == 0xff00 | ||
2821 | && ((sf->Rmask == 0xff && df->Rmask == 0x1f) | ||
2822 | || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { | ||
2823 | if(df->Gmask == 0x7e0) | ||
2824 | return BlitARGBto565PixelAlpha; | ||
2825 | else if(df->Gmask == 0x3e0) | ||
2826 | return BlitARGBto555PixelAlpha; | ||
2827 | } | ||
2828 | return BlitNtoNPixelAlpha; | ||
2829 | |||
2830 | case 4: | ||
2831 | if(sf->Rmask == df->Rmask | ||
2832 | && sf->Gmask == df->Gmask | ||
2833 | && sf->Bmask == df->Bmask | ||
2834 | && sf->BytesPerPixel == 4) | ||
2835 | { | ||
2836 | #if MMX_ASMBLIT | ||
2837 | if(sf->Rshift % 8 == 0 | ||
2838 | && sf->Gshift % 8 == 0 | ||
2839 | && sf->Bshift % 8 == 0 | ||
2840 | && sf->Ashift % 8 == 0 | ||
2841 | && sf->Aloss == 0) | ||
2842 | { | ||
2843 | if(SDL_Has3DNow()) | ||
2844 | return BlitRGBtoRGBPixelAlphaMMX3DNOW; | ||
2845 | if(SDL_HasMMX()) | ||
2846 | return BlitRGBtoRGBPixelAlphaMMX; | ||
2847 | } | ||
2848 | #endif | ||
2849 | if(sf->Amask == 0xff000000) | ||
2850 | { | ||
2851 | #if SDL_ALTIVEC_BLITTERS | ||
2852 | if(!(surface->map->dst->flags & SDL_HWSURFACE) | ||
2853 | && SDL_HasAltiVec()) | ||
2854 | return BlitRGBtoRGBPixelAlphaAltivec; | ||
2855 | #endif | ||
2856 | return BlitRGBtoRGBPixelAlpha; | ||
2857 | } | ||
2858 | } | ||
2859 | #if SDL_ALTIVEC_BLITTERS | ||
2860 | if (sf->Amask && sf->BytesPerPixel == 4 && | ||
2861 | !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | ||
2862 | return Blit32to32PixelAlphaAltivec; | ||
2863 | else | ||
2864 | #endif | ||
2865 | return BlitNtoNPixelAlpha; | ||
2866 | |||
2867 | case 3: | ||
2868 | default: | ||
2869 | return BlitNtoNPixelAlpha; | ||
2870 | } | ||
2871 | } | ||
2872 | } | ||
2873 | |||