path: root/apps/plugins/sdl/src/video/SDL_blit_A.c
Diffstat (limited to 'apps/plugins/sdl/src/video/SDL_blit_A.c')
-rw-r--r--  apps/plugins/sdl/src/video/SDL_blit_A.c  2873
1 file changed, 2873 insertions, 0 deletions
diff --git a/apps/plugins/sdl/src/video/SDL_blit_A.c b/apps/plugins/sdl/src/video/SDL_blit_A.c
new file mode 100644
index 0000000000..219cdccf5b
--- /dev/null
+++ b/apps/plugins/sdl/src/video/SDL_blit_A.c
@@ -0,0 +1,2873 @@
1/*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2012 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
27/*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31*/
32
33#if SDL_ASSEMBLY_ROUTINES
34# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 /* forced MMX to 0...it breaks on most compilers now. --ryan. */
36# define MMX_ASMBLIT 0
37# define GCC_ASMBLIT 0
38# elif defined(_MSC_VER) && defined(_M_IX86)
39# if (_MSC_VER <= 1200)
40# include <malloc.h>
41# if defined(_mm_free)
42# define HAVE_MMINTRIN_H 1
43# endif
44# else /* Visual Studio > VC6 always has mmintrin.h */
45# define HAVE_MMINTRIN_H 1
46# endif
47# if HAVE_MMINTRIN_H
48# define MMX_ASMBLIT 1
49# define MSVC_ASMBLIT 1
50# endif
51# endif
52#endif /* SDL_ASSEMBLY_ROUTINES */
53
54/* Function to check the CPU flags */
55#include "SDL_cpuinfo.h"
56#if GCC_ASMBLIT
57#include "mmx.h"
58#elif MSVC_ASMBLIT
59#include <mmintrin.h>
60#include <mm3dnow.h>
61#endif
62
63/* Functions to perform alpha blended blitting */
64
65/* N->1 blending with per-surface alpha */
66static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
67{
68 int width = info->d_width;
69 int height = info->d_height;
70 Uint8 *src = info->s_pixels;
71 int srcskip = info->s_skip;
72 Uint8 *dst = info->d_pixels;
73 int dstskip = info->d_skip;
74 Uint8 *palmap = info->table;
75 SDL_PixelFormat *srcfmt = info->src;
76 SDL_PixelFormat *dstfmt = info->dst;
77 int srcbpp = srcfmt->BytesPerPixel;
78
79 const unsigned A = srcfmt->alpha;
80
81 while ( height-- ) {
82 DUFFS_LOOP4(
83 {
84 Uint32 Pixel;
85 unsigned sR;
86 unsigned sG;
87 unsigned sB;
88 unsigned dR;
89 unsigned dG;
90 unsigned dB;
91 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
92 dR = dstfmt->palette->colors[*dst].r;
93 dG = dstfmt->palette->colors[*dst].g;
94 dB = dstfmt->palette->colors[*dst].b;
95 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
96 dR &= 0xff;
97 dG &= 0xff;
98 dB &= 0xff;
99 /* Pack RGB into 8bit pixel */
100 if ( palmap == NULL ) {
101 *dst =((dR>>5)<<(3+2))|
102 ((dG>>5)<<(2))|
103 ((dB>>6)<<(0));
104 } else {
105 *dst = palmap[((dR>>5)<<(3+2))|
106 ((dG>>5)<<(2)) |
107 ((dB>>6)<<(0))];
108 }
109 dst++;
110 src += srcbpp;
111 },
112 width);
113 src += srcskip;
114 dst += dstskip;
115 }
116}
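/* Editorial note (not part of the original SDL source): the packing above maps
   the blended 8-bit channels into a 3-3-2 layout -- (dR>>5) keeps the top three
   bits of red at bits 7..5, (dG>>5) the top three bits of green at bits 4..2,
   and (dB>>6) the top two bits of blue at bits 1..0 -- which is then either
   stored directly or looked up through the destination palette map `palmap`.
   For example, pure white (0xff, 0xff, 0xff) packs to (7<<5)|(7<<2)|3 = 0xff. */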
117
118/* N->1 blending with pixel alpha */
119static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
120{
121 int width = info->d_width;
122 int height = info->d_height;
123 Uint8 *src = info->s_pixels;
124 int srcskip = info->s_skip;
125 Uint8 *dst = info->d_pixels;
126 int dstskip = info->d_skip;
127 Uint8 *palmap = info->table;
128 SDL_PixelFormat *srcfmt = info->src;
129 SDL_PixelFormat *dstfmt = info->dst;
130 int srcbpp = srcfmt->BytesPerPixel;
131
132 /* FIXME: fix alpha bit field expansion here too? */
133 while ( height-- ) {
134 DUFFS_LOOP4(
135 {
136 Uint32 Pixel;
137 unsigned sR;
138 unsigned sG;
139 unsigned sB;
140 unsigned sA;
141 unsigned dR;
142 unsigned dG;
143 unsigned dB;
144 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
145 dR = dstfmt->palette->colors[*dst].r;
146 dG = dstfmt->palette->colors[*dst].g;
147 dB = dstfmt->palette->colors[*dst].b;
148 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
149 dR &= 0xff;
150 dG &= 0xff;
151 dB &= 0xff;
152 /* Pack RGB into 8bit pixel */
153 if ( palmap == NULL ) {
154 *dst =((dR>>5)<<(3+2))|
155 ((dG>>5)<<(2))|
156 ((dB>>6)<<(0));
157 } else {
158 *dst = palmap[((dR>>5)<<(3+2))|
159 ((dG>>5)<<(2)) |
160 ((dB>>6)<<(0)) ];
161 }
162 dst++;
163 src += srcbpp;
164 },
165 width);
166 src += srcskip;
167 dst += dstskip;
168 }
169}
170
171/* colorkeyed N->1 blending with per-surface alpha */
172static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
173{
174 int width = info->d_width;
175 int height = info->d_height;
176 Uint8 *src = info->s_pixels;
177 int srcskip = info->s_skip;
178 Uint8 *dst = info->d_pixels;
179 int dstskip = info->d_skip;
180 Uint8 *palmap = info->table;
181 SDL_PixelFormat *srcfmt = info->src;
182 SDL_PixelFormat *dstfmt = info->dst;
183 int srcbpp = srcfmt->BytesPerPixel;
184 Uint32 ckey = srcfmt->colorkey;
185
186 const int A = srcfmt->alpha;
187
188 while ( height-- ) {
189 DUFFS_LOOP(
190 {
191 Uint32 Pixel;
192 unsigned sR;
193 unsigned sG;
194 unsigned sB;
195 unsigned dR;
196 unsigned dG;
197 unsigned dB;
198 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
199 if ( Pixel != ckey ) {
200 dR = dstfmt->palette->colors[*dst].r;
201 dG = dstfmt->palette->colors[*dst].g;
202 dB = dstfmt->palette->colors[*dst].b;
203 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
204 dR &= 0xff;
205 dG &= 0xff;
206 dB &= 0xff;
207 /* Pack RGB into 8bit pixel */
208 if ( palmap == NULL ) {
209 *dst =((dR>>5)<<(3+2))|
210 ((dG>>5)<<(2)) |
211 ((dB>>6)<<(0));
212 } else {
213 *dst = palmap[((dR>>5)<<(3+2))|
214 ((dG>>5)<<(2)) |
215 ((dB>>6)<<(0)) ];
216 }
217 }
218 dst++;
219 src += srcbpp;
220 },
221 width);
222 src += srcskip;
223 dst += dstskip;
224 }
225}
226
227#if GCC_ASMBLIT
228/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
229static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
230{
231 int width = info->d_width;
232 int height = info->d_height;
233 Uint32 *srcp = (Uint32 *)info->s_pixels;
234 int srcskip = info->s_skip >> 2;
235 Uint32 *dstp = (Uint32 *)info->d_pixels;
236 int dstskip = info->d_skip >> 2;
237 Uint32 dalpha = info->dst->Amask;
238 Uint64 load;
239
240 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
241 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
242 load = 0x0001010100010101ULL;/* !alpha128 mask */
243 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
244 movd_m2r(dalpha, mm7); /* dst alpha mask */
245 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
246 while(height--) {
247 DUFFS_LOOP_DOUBLE2(
248 {
249 Uint32 s = *srcp++;
250 Uint32 d = *dstp;
251 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
252 + (s & d & 0x00010101)) | dalpha;
253 },{
254 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
255 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
256
257 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
258 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
259
260 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
261 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
262 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
263 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
264 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
265 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
266 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
267
268 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
269 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
270 dstp += 2;
271 srcp += 2;
272 }, width);
273 srcp += srcskip;
274 dstp += dstskip;
275 }
276 emms();
277}
278
279/* fast RGB888->(A)RGB888 blending with surface alpha */
280static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
281{
282 SDL_PixelFormat* df = info->dst;
283 unsigned alpha = info->src->alpha;
284
285 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
286 /* only call a128 version when R,G,B occupy lower bits */
287 BlitRGBtoRGBSurfaceAlpha128MMX(info);
288 } else {
289 int width = info->d_width;
290 int height = info->d_height;
291 Uint32 *srcp = (Uint32 *)info->s_pixels;
292 int srcskip = info->s_skip >> 2;
293 Uint32 *dstp = (Uint32 *)info->d_pixels;
294 int dstskip = info->d_skip >> 2;
295
296 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
297 /* form the alpha mult */
298 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
299 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
300 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
301 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
302 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
303 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
304 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
305 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
306 movd_m2r(df->Amask, mm7); /* dst alpha mask */
307 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
308
309 while(height--) {
310 DUFFS_LOOP_DOUBLE2({
311 /* One Pixel Blend */
312 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
313 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
314 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
315 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
316
317 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
318 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
319 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
320 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
321
322 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
323 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
324 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
325 ++srcp;
326 ++dstp;
327 },{
328 /* Two Pixels Blend */
329 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
330 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
331 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
332 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
333
334 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
335 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
336 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
337 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
338
339 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
340 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
341 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
342 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
343
344 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
345 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
346 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
347 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
348
349 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
350 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
351
352 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
353
354 srcp += 2;
355 dstp += 2;
356 }, width);
357 srcp += srcskip;
358 dstp += dstskip;
359 }
360 emms();
361 }
362}
363
364/* fast ARGB888->(A)RGB888 blending with pixel alpha */
365static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
366{
367 int width = info->d_width;
368 int height = info->d_height;
369 Uint32 *srcp = (Uint32 *)info->s_pixels;
370 int srcskip = info->s_skip >> 2;
371 Uint32 *dstp = (Uint32 *)info->d_pixels;
372 int dstskip = info->d_skip >> 2;
373 SDL_PixelFormat* sf = info->src;
374 Uint32 amask = sf->Amask;
375
376 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
377 /* form multiplication mask */
378 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
379 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
380 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
381 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
382 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
383 /* form channel masks */
384 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
385 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
386 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
387 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
388 /* get alpha channel shift */
389 __asm__ __volatile__ (
390 "movd %0, %%mm5"
391 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
392
393 while(height--) {
394 DUFFS_LOOP4({
395 Uint32 alpha = *srcp & amask;
396 /* FIXME: Here we special-case opaque alpha since the
397 compositing used (>>8 instead of /255) doesn't handle
398 it correctly. Also special-case alpha=0 for speed?
399 Benchmark this! */
400 if(alpha == 0) {
401 /* do nothing */
402 } else if(alpha == amask) {
403 /* opaque alpha -- copy RGB, keep dst alpha */
404 /* using MMX here to free up regular registers for other things */
405 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
406 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
407 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
408 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
409 por_r2r(mm1, mm2); /* src | dst -> mm2 */
410 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
411 } else {
412 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
413 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
414
415 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
416 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
417
418 __asm__ __volatile__ (
419 "movd %0, %%mm4"
420 : : "r" (alpha) ); /* 0000A000 -> mm4 */
421 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
422 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
423 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
424 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
425
426 /* blend */
427 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
428 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
429 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
430 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
431
432 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
433 movd_r2m(mm2, *dstp);/* mm2 -> dst */
434 }
435 ++srcp;
436 ++dstp;
437 }, width);
438 srcp += srcskip;
439 dstp += dstskip;
440 }
441 emms();
442}
443/* End GCC_ASMBLIT */
444
445#elif MSVC_ASMBLIT
446/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
447static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
448{
449 int width = info->d_width;
450 int height = info->d_height;
451 Uint32 *srcp = (Uint32 *)info->s_pixels;
452 int srcskip = info->s_skip >> 2;
453 Uint32 *dstp = (Uint32 *)info->d_pixels;
454 int dstskip = info->d_skip >> 2;
455 Uint32 dalpha = info->dst->Amask;
456
457 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
458
459 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
460 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
461 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
462
463 while (height--) {
464 int n = width;
465 if ( n & 1 ) {
466 Uint32 s = *srcp++;
467 Uint32 d = *dstp;
468 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
469 + (s & d & 0x00010101)) | dalpha;
470 n--;
471 }
472
473 for (n >>= 1; n > 0; --n) {
474 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
475 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
476
477 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
478 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
479
480 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
481 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
482 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
483 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
484
485 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
486 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
487 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
488 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
489
490 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
491 dstp += 2;
492 srcp += 2;
493 }
494
495 srcp += srcskip;
496 dstp += dstskip;
497 }
498 _mm_empty();
499}
500
501/* fast RGB888->(A)RGB888 blending with surface alpha */
502static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
503{
504 SDL_PixelFormat* df = info->dst;
505 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
506 unsigned alpha = info->src->alpha;
507
508 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
509 /* only call a128 version when R,G,B occupy lower bits */
510 BlitRGBtoRGBSurfaceAlpha128MMX(info);
511 } else {
512 int width = info->d_width;
513 int height = info->d_height;
514 Uint32 *srcp = (Uint32 *)info->s_pixels;
515 int srcskip = info->s_skip >> 2;
516 Uint32 *dstp = (Uint32 *)info->d_pixels;
517 int dstskip = info->d_skip >> 2;
518 Uint32 dalpha = df->Amask;
519 Uint32 amult;
520
521 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
522
523 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
524 /* form the alpha mult */
525 amult = alpha | (alpha << 8);
526 amult = amult | (amult << 16);
527 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
528 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
529 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
530 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
531 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
532
533 while (height--) {
534 int n = width;
535 if (n & 1) {
536 /* One Pixel Blend */
537 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
538 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
539
540 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
541 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
542
543 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
544 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
545 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
546 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
547
548 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
549 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
550 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
551
552 ++srcp;
553 ++dstp;
554
555 n--;
556 }
557
558 for (n >>= 1; n > 0; --n) {
559 /* Two Pixels Blend */
560 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
561 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
562 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
563 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
564
565 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
566 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
567 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
568 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
569
570 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
571 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
572 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
573 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
574
575 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
576 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
577 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
578 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
579
580 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
581 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
582
583 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
584
585 srcp += 2;
586 dstp += 2;
587 }
588 srcp += srcskip;
589 dstp += dstskip;
590 }
591 _mm_empty();
592 }
593}
594
595/* fast ARGB888->(A)RGB888 blending with pixel alpha */
596static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
597{
598 int width = info->d_width;
599 int height = info->d_height;
600 Uint32 *srcp = (Uint32 *)info->s_pixels;
601 int srcskip = info->s_skip >> 2;
602 Uint32 *dstp = (Uint32 *)info->d_pixels;
603 int dstskip = info->d_skip >> 2;
604 SDL_PixelFormat* sf = info->src;
605 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
606 Uint32 amask = sf->Amask;
607 Uint32 ashift = sf->Ashift;
608 Uint64 multmask;
609
610 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
611
612 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
613 multmask = ~(0xFFFFi64 << (ashift * 2));
614 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
615
616 while(height--) {
617 DUFFS_LOOP4({
618 Uint32 alpha = *srcp & amask;
619 if (alpha == 0) {
620 /* do nothing */
621 } else if (alpha == amask) {
622 /* opaque alpha -- copy RGB, keep dst alpha */
623 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
624 } else {
625 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
626 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
627
628 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
629 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
630
631 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
632 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
633 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
634 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
635 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
636
637 /* blend */
638 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
639 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
640 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
641 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
642 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
643
644 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
645 }
646 ++srcp;
647 ++dstp;
648 }, width);
649 srcp += srcskip;
650 dstp += dstskip;
651 }
652 _mm_empty();
653}
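/* Editorial note (not part of the original SDL source): in the MSVC pixel-alpha
   blitter above, `multmask = ~(0xFFFFi64 << (ashift * 2))` builds the 64-bit
   mask that zeroes the alpha lane of the unpacked 0A0R0G0B layout.  After
   _mm_unpacklo_pi8 every 8-bit channel occupies 16 bits, so a channel that sits
   at bit `ashift` in the packed pixel sits at bit `ashift * 2` once unpacked;
   e.g. for ARGB8888 (ashift == 24), multmask == ~(0xFFFFULL << 48) ==
   0x0000FFFFFFFFFFFF.  Clearing that lane in mm_alpha keeps the blend from
   disturbing the destination alpha. */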
654/* End MSVC_ASMBLIT */
655
656#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
657
658#if SDL_ALTIVEC_BLITTERS
659#if __MWERKS__
660#pragma altivec_model on
661#endif
662#if HAVE_ALTIVEC_H
663#include <altivec.h>
664#endif
665#include <assert.h>
666
667#if (defined(__MACOSX__) && (__GNUC__ < 4))
668 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
669 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
670 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
671 (vector unsigned short) ( a,b,c,d,e,f,g,h )
672#else
673 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
674 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
675 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
676 (vector unsigned short) { a,b,c,d,e,f,g,h }
677#endif
678
679#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
680#define VECPRINT(msg, v) do { \
681 vector unsigned int tmpvec = (vector unsigned int)(v); \
682 unsigned int *vp = (unsigned int *)&tmpvec; \
683 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
684} while (0)
685
686/* the permutation vector that takes the high bytes out of all the appropriate shorts
687 (vector unsigned char)(
688 0x00, 0x10, 0x02, 0x12,
689 0x04, 0x14, 0x06, 0x16,
690 0x08, 0x18, 0x0A, 0x1A,
691 0x0C, 0x1C, 0x0E, 0x1E );
692*/
693#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
694#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
695#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
696#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
697 ? vec_lvsl(0, src) \
698 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
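/* Editorial note (not part of the original SDL source): the helper macros above
   build AltiVec constants that cannot be expressed as a single splat immediate
   (the vec_splat_* immediates are limited to -16..15).  VEC_U32_24 builds the
   value 24 as 12+12, VEC_ALPHA_MASK shifts an all-ones vector left by 24 to get
   0xFF000000 in every word (the ARGB alpha byte), and VEC_MERGE_PERMUTE adds
   0x000F to the 0,1,2,...,15 ramp from vec_lvsl(0, NULL) to produce the
   0x00,0x10,0x02,0x12,... permute documented in the comment above, which
   gathers the high byte of every 16-bit lane. */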
699
700
701#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
702 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
703 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
704 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
705 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
706 /* valpha2 is 255-alpha */ \
707 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
708 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
709 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
710 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
711 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
712 /* add source and dest */ \
713 vtemp1 = vec_add(vtemp1, vtemp3); \
714 vtemp2 = vec_add(vtemp2, vtemp4); \
715 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
716 vtemp1 = vec_add(vtemp1, v1_16); \
717 vtemp3 = vec_sr(vtemp1, v8_16); \
718 vtemp1 = vec_add(vtemp1, vtemp3); \
719 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
720 vtemp2 = vec_add(vtemp2, v1_16); \
721 vtemp4 = vec_sr(vtemp2, v8_16); \
722 vtemp2 = vec_add(vtemp2, vtemp4); \
723 /* (>>8) and get ARGBARGBARGBARGB */ \
724 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
725} while (0)
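/* Editorial note (not part of the original SDL source): per channel this macro
   computes  d' = (s*a + d*(255-a)) / 255.  vec_mule/vec_mulo give the 16-bit
   products of the even (A,G) and odd (R,B) bytes, vec_nor(valpha, valpha)
   yields 255-a, and the division by 255 is approximated with shifts as
   ((x + 1) + ((x + 1) >> 8)) >> 8, which is exact at the extremes (x = 0 and
   x = 255*255).  A minimal scalar sketch of the same arithmetic on one
   hypothetical channel: */
#if 0 /* illustrative sketch only, not compiled */
static Uint8 blend_channel_div255(Uint8 s, Uint8 d, Uint8 a)
{
    Uint32 x = (Uint32)s * a + (Uint32)d * (255 - a);
    return (Uint8)(((x + 1) + ((x + 1) >> 8)) >> 8);
}
#endif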
726
727/* Calculate the permute vector used for 32->32 swizzling */
728static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
729 const SDL_PixelFormat *dstfmt)
730{
731 /*
732 * We have to assume that the bits that aren't used by the other
733 * colors are alpha, and that it's one complete byte, since some formats
734 * leave alpha with a zero mask, but we should still swizzle the bits.
735 */
736 /* ARGB */
737 const static struct SDL_PixelFormat default_pixel_format = {
738 NULL, 0, 0,
739 0, 0, 0, 0,
740 16, 8, 0, 24,
741 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
742 0, 0};
743 if (!srcfmt) {
744 srcfmt = &default_pixel_format;
745 }
746 if (!dstfmt) {
747 dstfmt = &default_pixel_format;
748 }
749 const vector unsigned char plus = VECUINT8_LITERAL
750 ( 0x00, 0x00, 0x00, 0x00,
751 0x04, 0x04, 0x04, 0x04,
752 0x08, 0x08, 0x08, 0x08,
753 0x0C, 0x0C, 0x0C, 0x0C );
754 vector unsigned char vswiz;
755 vector unsigned int srcvec;
756#define RESHIFT(X) (3 - ((X) >> 3))
757 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
758 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
759 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
760 Uint32 amask;
761 /* Use zero for alpha if either surface doesn't have alpha */
762 if (dstfmt->Amask) {
763 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
764 } else {
765 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
766 }
767#undef RESHIFT
768 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
769 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
770 return(vswiz);
771}
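/* Editorial note (not part of the original SDL source): RESHIFT turns a
   channel's bit shift (0/8/16/24) into its big-endian byte index (3/2/1/0), so
   each byte of `srcvec` records which source byte holds the channel that the
   destination format wants at that position.  Splatting that word across the
   vector and adding 0/4/8/12 per pixel yields a vec_perm control vector; for an
   ARGB source and ARGB destination it reduces to the identity permute
   0x00,0x01,...,0x0F. */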
772
773static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
774{
775 int height = info->d_height;
776 Uint8 *src = (Uint8 *)info->s_pixels;
777 int srcskip = info->s_skip;
778 Uint8 *dst = (Uint8 *)info->d_pixels;
779 int dstskip = info->d_skip;
780 SDL_PixelFormat *srcfmt = info->src;
781
782 vector unsigned char v0 = vec_splat_u8(0);
783 vector unsigned short v8_16 = vec_splat_u16(8);
784 vector unsigned short v1_16 = vec_splat_u16(1);
785 vector unsigned short v2_16 = vec_splat_u16(2);
786 vector unsigned short v3_16 = vec_splat_u16(3);
787 vector unsigned int v8_32 = vec_splat_u32(8);
788 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
789 vector unsigned short v3f = VECUINT16_LITERAL(
790 0x003f, 0x003f, 0x003f, 0x003f,
791 0x003f, 0x003f, 0x003f, 0x003f);
792 vector unsigned short vfc = VECUINT16_LITERAL(
793 0x00fc, 0x00fc, 0x00fc, 0x00fc,
794 0x00fc, 0x00fc, 0x00fc, 0x00fc);
795
796 /*
797 0x10 - 0x1f is the alpha
798 0x00 - 0x0e evens are the red
799 0x01 - 0x0f odds are zero
800 */
801 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
802 0x10, 0x00, 0x01, 0x01,
803 0x10, 0x02, 0x01, 0x01,
804 0x10, 0x04, 0x01, 0x01,
805 0x10, 0x06, 0x01, 0x01
806 );
807 vector unsigned char vredalpha2 = (vector unsigned char)(
808 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
809 );
810 /*
811 0x00 - 0x0f is ARxx ARxx ARxx ARxx
812 0x11 - 0x1f odds are blue
813 */
814 vector unsigned char vblue1 = VECUINT8_LITERAL(
815 0x00, 0x01, 0x02, 0x11,
816 0x04, 0x05, 0x06, 0x13,
817 0x08, 0x09, 0x0a, 0x15,
818 0x0c, 0x0d, 0x0e, 0x17
819 );
820 vector unsigned char vblue2 = (vector unsigned char)(
821 vec_add((vector unsigned int)vblue1, v8_32)
822 );
823 /*
824 0x00 - 0x0f is ARxB ARxB ARxB ARxB
825 0x10 - 0x1e evens are green
826 */
827 vector unsigned char vgreen1 = VECUINT8_LITERAL(
828 0x00, 0x01, 0x10, 0x03,
829 0x04, 0x05, 0x12, 0x07,
830 0x08, 0x09, 0x14, 0x0b,
831 0x0c, 0x0d, 0x16, 0x0f
832 );
833 vector unsigned char vgreen2 = (vector unsigned char)(
834 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
835 );
836 vector unsigned char vgmerge = VECUINT8_LITERAL(
837 0x00, 0x02, 0x00, 0x06,
838 0x00, 0x0a, 0x00, 0x0e,
839 0x00, 0x12, 0x00, 0x16,
840 0x00, 0x1a, 0x00, 0x1e);
841 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
842 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
843 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
844
845 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
846 vf800 = vec_sl(vf800, vec_splat_u16(8));
847
848 while(height--) {
849 int extrawidth;
850 vector unsigned char valigner;
851 vector unsigned char vsrc;
852 vector unsigned char voverflow;
853 int width = info->d_width;
854
855#define ONE_PIXEL_BLEND(condition, widthvar) \
856 while (condition) { \
857 Uint32 Pixel; \
858 unsigned sR, sG, sB, dR, dG, dB, sA; \
859 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
860 if(sA) { \
861 unsigned short dstpixel = *((unsigned short *)dst); \
862 dR = (dstpixel >> 8) & 0xf8; \
863 dG = (dstpixel >> 3) & 0xfc; \
864 dB = (dstpixel << 3) & 0xf8; \
865 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
866 *((unsigned short *)dst) = ( \
867 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
868 ); \
869 } \
870 src += 4; \
871 dst += 2; \
872 widthvar--; \
873 }
874 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
875 extrawidth = (width % 8);
876 valigner = VEC_ALIGNER(src);
877 vsrc = (vector unsigned char)vec_ld(0, src);
878 width -= extrawidth;
879 while (width) {
880 vector unsigned char valpha;
881 vector unsigned char vsrc1, vsrc2;
882 vector unsigned char vdst1, vdst2;
883 vector unsigned short vR, vG, vB;
884 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
885
886 /* Load 8 pixels from src as ARGB */
887 voverflow = (vector unsigned char)vec_ld(15, src);
888 vsrc = vec_perm(vsrc, voverflow, valigner);
889 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
890 src += 16;
891 vsrc = (vector unsigned char)vec_ld(15, src);
892 voverflow = vec_perm(voverflow, vsrc, valigner);
893 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
894 src += 16;
895
896 /* Load 8 pixels from dst as XRGB */
897 voverflow = vec_ld(0, dst);
898 vR = vec_and((vector unsigned short)voverflow, vf800);
899 vB = vec_sl((vector unsigned short)voverflow, v3_16);
900 vG = vec_sl(vB, v2_16);
901 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
902 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
903 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
904 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
905 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
906 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
907
908 /* Alpha blend 8 pixels as ARGB */
909 valpha = vec_perm(vsrc1, v0, valphaPermute);
910 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
911 valpha = vec_perm(vsrc2, v0, valphaPermute);
912 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
913
914 /* Convert 8 pixels to 565 */
915 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
916 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
917 vgpixel = vec_and(vgpixel, vfc);
918 vgpixel = vec_sl(vgpixel, v3_16);
919 vrpixel = vec_sl(vpixel, v1_16);
920 vrpixel = vec_and(vrpixel, vf800);
921 vbpixel = vec_and(vpixel, v3f);
922 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
923 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
924
925 /* Store 8 pixels */
926 vec_st(vdst1, 0, dst);
927
928 width -= 8;
929 dst += 16;
930 }
931 ONE_PIXEL_BLEND((extrawidth), extrawidth);
932#undef ONE_PIXEL_BLEND
933 src += srcskip;
934 dst += dstskip;
935 }
936}
937
938static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
939{
940 unsigned alpha = info->src->alpha;
941 int height = info->d_height;
942 Uint32 *srcp = (Uint32 *)info->s_pixels;
943 int srcskip = info->s_skip >> 2;
944 Uint32 *dstp = (Uint32 *)info->d_pixels;
945 int dstskip = info->d_skip >> 2;
946 SDL_PixelFormat *srcfmt = info->src;
947 SDL_PixelFormat *dstfmt = info->dst;
948 unsigned sA = srcfmt->alpha;
949 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
950 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
951 Uint32 ckey = info->src->colorkey;
952 vector unsigned char mergePermute;
953 vector unsigned char vsrcPermute;
954 vector unsigned char vdstPermute;
955 vector unsigned char vsdstPermute;
956 vector unsigned char valpha;
957 vector unsigned char valphamask;
958 vector unsigned char vbits;
959 vector unsigned char v0;
960 vector unsigned short v1;
961 vector unsigned short v8;
962 vector unsigned int vckey;
963 vector unsigned int vrgbmask;
964
965 mergePermute = VEC_MERGE_PERMUTE();
966 v0 = vec_splat_u8(0);
967 v1 = vec_splat_u16(1);
968 v8 = vec_splat_u16(8);
969
970 /* set the alpha to 255 on the destination surf */
971 valphamask = VEC_ALPHA_MASK();
972
973 vsrcPermute = calc_swizzle32(srcfmt, NULL);
974 vdstPermute = calc_swizzle32(NULL, dstfmt);
975 vsdstPermute = calc_swizzle32(dstfmt, NULL);
976
977 /* set a vector full of alpha and 255-alpha */
978 ((unsigned char *)&valpha)[0] = alpha;
979 valpha = vec_splat(valpha, 0);
980 vbits = (vector unsigned char)vec_splat_s8(-1);
981
982 ckey &= rgbmask;
983 ((unsigned int *)(char*)&vckey)[0] = ckey;
984 vckey = vec_splat(vckey, 0);
985 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
986 vrgbmask = vec_splat(vrgbmask, 0);
987
988 while(height--) {
989 int width = info->d_width;
990#define ONE_PIXEL_BLEND(condition, widthvar) \
991 while (condition) { \
992 Uint32 Pixel; \
993 unsigned sR, sG, sB, dR, dG, dB; \
994 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
995 if(sA && Pixel != ckey) { \
996 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
997 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
998 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
999 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1000 } \
1001 dstp++; \
1002 srcp++; \
1003 widthvar--; \
1004 }
1005 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1006 if (width > 0) {
1007 int extrawidth = (width % 4);
1008 vector unsigned char valigner = VEC_ALIGNER(srcp);
1009 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1010 width -= extrawidth;
1011 while (width) {
1012 vector unsigned char vsel;
1013 vector unsigned char voverflow;
1014 vector unsigned char vd;
1015 vector unsigned char vd_orig;
1016
1017 /* s = *srcp */
1018 voverflow = (vector unsigned char)vec_ld(15, srcp);
1019 vs = vec_perm(vs, voverflow, valigner);
1020
1021 /* vsel is set for items that match the key */
1022 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1023 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1024
1025 /* permute to source format */
1026 vs = vec_perm(vs, valpha, vsrcPermute);
1027
1028 /* d = *dstp */
1029 vd = (vector unsigned char)vec_ld(0, dstp);
1030 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1031
1032 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1033
1034 /* set the alpha channel to full on */
1035 vd = vec_or(vd, valphamask);
1036
1037 /* mask out color key */
1038 vd = vec_sel(vd, vd_orig, vsel);
1039
1040 /* permute to dest format */
1041 vd = vec_perm(vd, vbits, vdstPermute);
1042
1043 /* *dstp = res */
1044 vec_st((vector unsigned int)vd, 0, dstp);
1045
1046 srcp += 4;
1047 dstp += 4;
1048 width -= 4;
1049 vs = voverflow;
1050 }
1051 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1052 }
1053#undef ONE_PIXEL_BLEND
1054
1055 srcp += srcskip;
1056 dstp += dstskip;
1057 }
1058}
1059
1060
1061static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1062{
1063 int width = info->d_width;
1064 int height = info->d_height;
1065 Uint32 *srcp = (Uint32 *)info->s_pixels;
1066 int srcskip = info->s_skip >> 2;
1067 Uint32 *dstp = (Uint32 *)info->d_pixels;
1068 int dstskip = info->d_skip >> 2;
1069 SDL_PixelFormat *srcfmt = info->src;
1070 SDL_PixelFormat *dstfmt = info->dst;
1071 vector unsigned char mergePermute;
1072 vector unsigned char valphaPermute;
1073 vector unsigned char vsrcPermute;
1074 vector unsigned char vdstPermute;
1075 vector unsigned char vsdstPermute;
1076 vector unsigned char valphamask;
1077 vector unsigned char vpixelmask;
1078 vector unsigned char v0;
1079 vector unsigned short v1;
1080 vector unsigned short v8;
1081
1082 v0 = vec_splat_u8(0);
1083 v1 = vec_splat_u16(1);
1084 v8 = vec_splat_u16(8);
1085 mergePermute = VEC_MERGE_PERMUTE();
1086 valphamask = VEC_ALPHA_MASK();
1087 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1088 vpixelmask = vec_nor(valphamask, v0);
1089 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1090 vdstPermute = calc_swizzle32(NULL, dstfmt);
1091 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1092
1093 while ( height-- ) {
1094 width = info->d_width;
1095#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1096 Uint32 Pixel; \
1097 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1098 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1099 if(sA) { \
1100 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1101 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1102 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1103 } \
1104 ++srcp; \
1105 ++dstp; \
1106 widthvar--; \
1107 }
1108 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1109 if (width > 0) {
1110 /* vsrcPermute */
1111 /* vdstPermute */
1112 int extrawidth = (width % 4);
1113 vector unsigned char valigner = VEC_ALIGNER(srcp);
1114 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1115 width -= extrawidth;
1116 while (width) {
1117 vector unsigned char voverflow;
1118 vector unsigned char vd;
1119 vector unsigned char valpha;
1120 vector unsigned char vdstalpha;
1121 /* s = *srcp */
1122 voverflow = (vector unsigned char)vec_ld(15, srcp);
1123 vs = vec_perm(vs, voverflow, valigner);
1124 vs = vec_perm(vs, v0, vsrcPermute);
1125
1126 valpha = vec_perm(vs, v0, valphaPermute);
1127
1128 /* d = *dstp */
1129 vd = (vector unsigned char)vec_ld(0, dstp);
1130 vd = vec_perm(vd, v0, vsdstPermute);
1131 vdstalpha = vec_and(vd, valphamask);
1132
1133 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1134
1135 /* set the alpha to the dest alpha */
1136 vd = vec_and(vd, vpixelmask);
1137 vd = vec_or(vd, vdstalpha);
1138 vd = vec_perm(vd, v0, vdstPermute);
1139
1140 /* *dstp = res */
1141 vec_st((vector unsigned int)vd, 0, dstp);
1142
1143 srcp += 4;
1144 dstp += 4;
1145 width -= 4;
1146 vs = voverflow;
1147
1148 }
1149 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1150 }
1151 srcp += srcskip;
1152 dstp += dstskip;
1153#undef ONE_PIXEL_BLEND
1154 }
1155}
1156
1157/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1158static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1159{
1160 int width = info->d_width;
1161 int height = info->d_height;
1162 Uint32 *srcp = (Uint32 *)info->s_pixels;
1163 int srcskip = info->s_skip >> 2;
1164 Uint32 *dstp = (Uint32 *)info->d_pixels;
1165 int dstskip = info->d_skip >> 2;
1166 vector unsigned char mergePermute;
1167 vector unsigned char valphaPermute;
1168 vector unsigned char valphamask;
1169 vector unsigned char vpixelmask;
1170 vector unsigned char v0;
1171 vector unsigned short v1;
1172 vector unsigned short v8;
1173 v0 = vec_splat_u8(0);
1174 v1 = vec_splat_u16(1);
1175 v8 = vec_splat_u16(8);
1176 mergePermute = VEC_MERGE_PERMUTE();
1177 valphamask = VEC_ALPHA_MASK();
1178 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1179
1180
1181 vpixelmask = vec_nor(valphamask, v0);
1182 while(height--) {
1183 width = info->d_width;
1184#define ONE_PIXEL_BLEND(condition, widthvar) \
1185 while ((condition)) { \
1186 Uint32 dalpha; \
1187 Uint32 d; \
1188 Uint32 s1; \
1189 Uint32 d1; \
1190 Uint32 s = *srcp; \
1191 Uint32 alpha = s >> 24; \
1192 if(alpha) { \
1193 if(alpha == SDL_ALPHA_OPAQUE) { \
1194 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1195 } else { \
1196 d = *dstp; \
1197 dalpha = d & 0xff000000; \
1198 s1 = s & 0xff00ff; \
1199 d1 = d & 0xff00ff; \
1200 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1201 s &= 0xff00; \
1202 d &= 0xff00; \
1203 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1204 *dstp = d1 | d | dalpha; \
1205 } \
1206 } \
1207 ++srcp; \
1208 ++dstp; \
1209 widthvar--; \
1210 }
1211 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1212 if (width > 0) {
1213 int extrawidth = (width % 4);
1214 vector unsigned char valigner = VEC_ALIGNER(srcp);
1215 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1216 width -= extrawidth;
1217 while (width) {
1218 vector unsigned char voverflow;
1219 vector unsigned char vd;
1220 vector unsigned char valpha;
1221 vector unsigned char vdstalpha;
1222 /* s = *srcp */
1223 voverflow = (vector unsigned char)vec_ld(15, srcp);
1224 vs = vec_perm(vs, voverflow, valigner);
1225
1226 valpha = vec_perm(vs, v0, valphaPermute);
1227
1228 /* d = *dstp */
1229 vd = (vector unsigned char)vec_ld(0, dstp);
1230 vdstalpha = vec_and(vd, valphamask);
1231
1232 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1233
1234 /* set the alpha to the dest alpha */
1235 vd = vec_and(vd, vpixelmask);
1236 vd = vec_or(vd, vdstalpha);
1237
1238 /* *dstp = res */
1239 vec_st((vector unsigned int)vd, 0, dstp);
1240
1241 srcp += 4;
1242 dstp += 4;
1243 width -= 4;
1244 vs = voverflow;
1245 }
1246 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1247 }
1248 srcp += srcskip;
1249 dstp += dstskip;
1250 }
1251#undef ONE_PIXEL_BLEND
1252}
1253
1254static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1255{
1256 /* XXX : 6 */
1257 unsigned alpha = info->src->alpha;
1258 int height = info->d_height;
1259 Uint32 *srcp = (Uint32 *)info->s_pixels;
1260 int srcskip = info->s_skip >> 2;
1261 Uint32 *dstp = (Uint32 *)info->d_pixels;
1262 int dstskip = info->d_skip >> 2;
1263 SDL_PixelFormat *srcfmt = info->src;
1264 SDL_PixelFormat *dstfmt = info->dst;
1265 unsigned sA = srcfmt->alpha;
1266 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1267 vector unsigned char mergePermute;
1268 vector unsigned char vsrcPermute;
1269 vector unsigned char vdstPermute;
1270 vector unsigned char vsdstPermute;
1271 vector unsigned char valpha;
1272 vector unsigned char valphamask;
1273 vector unsigned char vbits;
1274 vector unsigned short v1;
1275 vector unsigned short v8;
1276
1277 mergePermute = VEC_MERGE_PERMUTE();
1278 v1 = vec_splat_u16(1);
1279 v8 = vec_splat_u16(8);
1280
1281 /* set the alpha to 255 on the destination surf */
1282 valphamask = VEC_ALPHA_MASK();
1283
1284 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1285 vdstPermute = calc_swizzle32(NULL, dstfmt);
1286 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1287
1288 /* set a vector full of alpha and 255-alpha */
1289 ((unsigned char *)&valpha)[0] = alpha;
1290 valpha = vec_splat(valpha, 0);
1291 vbits = (vector unsigned char)vec_splat_s8(-1);
1292
1293 while(height--) {
1294 int width = info->d_width;
1295#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1296 Uint32 Pixel; \
1297 unsigned sR, sG, sB, dR, dG, dB; \
1298 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1299 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1300 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1301 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1302 ++srcp; \
1303 ++dstp; \
1304 widthvar--; \
1305 }
1306 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1307 if (width > 0) {
1308 int extrawidth = (width % 4);
1309 vector unsigned char valigner = VEC_ALIGNER(srcp);
1310 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1311 width -= extrawidth;
1312 while (width) {
1313 vector unsigned char voverflow;
1314 vector unsigned char vd;
1315
1316 /* s = *srcp */
1317 voverflow = (vector unsigned char)vec_ld(15, srcp);
1318 vs = vec_perm(vs, voverflow, valigner);
1319 vs = vec_perm(vs, valpha, vsrcPermute);
1320
1321 /* d = *dstp */
1322 vd = (vector unsigned char)vec_ld(0, dstp);
1323 vd = vec_perm(vd, vd, vsdstPermute);
1324
1325 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1326
1327 /* set the alpha channel to full on */
1328 vd = vec_or(vd, valphamask);
1329 vd = vec_perm(vd, vbits, vdstPermute);
1330
1331 /* *dstp = res */
1332 vec_st((vector unsigned int)vd, 0, dstp);
1333
1334 srcp += 4;
1335 dstp += 4;
1336 width -= 4;
1337 vs = voverflow;
1338 }
1339 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1340 }
1341#undef ONE_PIXEL_BLEND
1342
1343 srcp += srcskip;
1344 dstp += dstskip;
1345 }
1346
1347}
1348
1349
1350/* fast RGB888->(A)RGB888 blending */
1351static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1352{
1353 unsigned alpha = info->src->alpha;
1354 int height = info->d_height;
1355 Uint32 *srcp = (Uint32 *)info->s_pixels;
1356 int srcskip = info->s_skip >> 2;
1357 Uint32 *dstp = (Uint32 *)info->d_pixels;
1358 int dstskip = info->d_skip >> 2;
1359 vector unsigned char mergePermute;
1360 vector unsigned char valpha;
1361 vector unsigned char valphamask;
1362 vector unsigned short v1;
1363 vector unsigned short v8;
1364
1365 mergePermute = VEC_MERGE_PERMUTE();
1366 v1 = vec_splat_u16(1);
1367 v8 = vec_splat_u16(8);
1368
1369 /* set the alpha to 255 on the destination surf */
1370 valphamask = VEC_ALPHA_MASK();
1371
1372 /* set a vector full of alpha and 255-alpha */
1373 ((unsigned char *)&valpha)[0] = alpha;
1374 valpha = vec_splat(valpha, 0);
1375
1376 while(height--) {
1377 int width = info->d_width;
1378#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1379 Uint32 s = *srcp; \
1380 Uint32 d = *dstp; \
1381 Uint32 s1 = s & 0xff00ff; \
1382 Uint32 d1 = d & 0xff00ff; \
1383 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1384 & 0xff00ff; \
1385 s &= 0xff00; \
1386 d &= 0xff00; \
1387 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1388 *dstp = d1 | d | 0xff000000; \
1389 ++srcp; \
1390 ++dstp; \
1391 widthvar--; \
1392 }
1393 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1394 if (width > 0) {
1395 int extrawidth = (width % 4);
1396 vector unsigned char valigner = VEC_ALIGNER(srcp);
1397 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1398 width -= extrawidth;
1399 while (width) {
1400 vector unsigned char voverflow;
1401 vector unsigned char vd;
1402
1403 /* s = *srcp */
1404 voverflow = (vector unsigned char)vec_ld(15, srcp);
1405 vs = vec_perm(vs, voverflow, valigner);
1406
1407 /* d = *dstp */
1408 vd = (vector unsigned char)vec_ld(0, dstp);
1409
1410 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1411
1412 /* set the alpha channel to full on */
1413 vd = vec_or(vd, valphamask);
1414
1415 /* *dstp = res */
1416 vec_st((vector unsigned int)vd, 0, dstp);
1417
1418 srcp += 4;
1419 dstp += 4;
1420 width -= 4;
1421 vs = voverflow;
1422 }
1423 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1424 }
1425#undef ONE_PIXEL_BLEND
1426
1427 srcp += srcskip;
1428 dstp += dstskip;
1429 }
1430}
1431#if __MWERKS__
1432#pragma altivec_model off
1433#endif
1434#endif /* SDL_ALTIVEC_BLITTERS */
1435
1436/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1437static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1438{
1439 int width = info->d_width;
1440 int height = info->d_height;
1441 Uint32 *srcp = (Uint32 *)info->s_pixels;
1442 int srcskip = info->s_skip >> 2;
1443 Uint32 *dstp = (Uint32 *)info->d_pixels;
1444 int dstskip = info->d_skip >> 2;
1445
1446 while(height--) {
1447 DUFFS_LOOP4({
1448 Uint32 s = *srcp++;
1449 Uint32 d = *dstp;
1450 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1451 + (s & d & 0x00010101)) | 0xff000000;
1452 }, width);
1453 srcp += srcskip;
1454 dstp += dstskip;
1455 }
1456}
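/* Editorial note (not part of the original SDL source): the masking trick above
   averages all three channels of a packed 0x00RRGGBB pixel in one 32-bit add.
   Clearing the low bit of every byte (& 0x00fefefe) guarantees that the >>1
   cannot borrow across channel boundaries, and (s & d & 0x00010101) restores
   the bit that is lost only when *both* low bits were set, giving an exact
   floor((s+d)/2) per channel.  A minimal sketch on a hypothetical pixel pair: */
#if 0 /* illustrative sketch only, not compiled */
static Uint32 average_rgb888(Uint32 s, Uint32 d)
{
    /* e.g. s = 0x00400507, d = 0x00200709 -> 0x00300608 */
    return (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
           + (s & d & 0x00010101);
}
#endif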
1457
1458/* fast RGB888->(A)RGB888 blending with surface alpha */
1459static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1460{
1461 unsigned alpha = info->src->alpha;
1462 if(alpha == 128) {
1463 BlitRGBtoRGBSurfaceAlpha128(info);
1464 } else {
1465 int width = info->d_width;
1466 int height = info->d_height;
1467 Uint32 *srcp = (Uint32 *)info->s_pixels;
1468 int srcskip = info->s_skip >> 2;
1469 Uint32 *dstp = (Uint32 *)info->d_pixels;
1470 int dstskip = info->d_skip >> 2;
1471 Uint32 s;
1472 Uint32 d;
1473 Uint32 s1;
1474 Uint32 d1;
1475
1476 while(height--) {
1477 DUFFS_LOOP_DOUBLE2({
1478 /* One Pixel Blend */
1479 s = *srcp;
1480 d = *dstp;
1481 s1 = s & 0xff00ff;
1482 d1 = d & 0xff00ff;
1483 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1484 & 0xff00ff;
1485 s &= 0xff00;
1486 d &= 0xff00;
1487 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1488 *dstp = d1 | d | 0xff000000;
1489 ++srcp;
1490 ++dstp;
1491 },{
1492 /* Two Pixels Blend */
1493 s = *srcp;
1494 d = *dstp;
1495 s1 = s & 0xff00ff;
1496 d1 = d & 0xff00ff;
1497 d1 += (s1 - d1) * alpha >> 8;
1498 d1 &= 0xff00ff;
1499
1500 s = ((s & 0xff00) >> 8) |
1501 ((srcp[1] & 0xff00) << 8);
1502 d = ((d & 0xff00) >> 8) |
1503 ((dstp[1] & 0xff00) << 8);
1504 d += (s - d) * alpha >> 8;
1505 d &= 0x00ff00ff;
1506
1507 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1508 ++srcp;
1509
1510 s1 = *srcp;
1511 d1 = *dstp;
1512 s1 &= 0xff00ff;
1513 d1 &= 0xff00ff;
1514 d1 += (s1 - d1) * alpha >> 8;
1515 d1 &= 0xff00ff;
1516
1517 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1518 ++srcp;
1519 ++dstp;
1520 }, width);
1521 srcp += srcskip;
1522 dstp += dstskip;
1523 }
1524 }
1525}
1526
1527/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1528static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1529{
1530 int width = info->d_width;
1531 int height = info->d_height;
1532 Uint32 *srcp = (Uint32 *)info->s_pixels;
1533 int srcskip = info->s_skip >> 2;
1534 Uint32 *dstp = (Uint32 *)info->d_pixels;
1535 int dstskip = info->d_skip >> 2;
1536
1537 while(height--) {
1538 DUFFS_LOOP4({
1539 Uint32 dalpha;
1540 Uint32 d;
1541 Uint32 s1;
1542 Uint32 d1;
1543 Uint32 s = *srcp;
1544 Uint32 alpha = s >> 24;
1545 /* FIXME: Here we special-case opaque alpha since the
1546 compositing used (>>8 instead of /255) doesn't handle
1547 it correctly. Also special-case alpha=0 for speed?
1548 Benchmark this! */
1549 if(alpha) {
1550 if(alpha == SDL_ALPHA_OPAQUE) {
1551 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1552 } else {
1553 /*
1554 * take out the middle component (green), and process
1555 * the other two in parallel. One multiply less.
1556 */
1557 d = *dstp;
1558 dalpha = d & 0xff000000;
1559 s1 = s & 0xff00ff;
1560 d1 = d & 0xff00ff;
1561 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1562 s &= 0xff00;
1563 d &= 0xff00;
1564 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1565 *dstp = d1 | d | dalpha;
1566 }
1567 }
1568 ++srcp;
1569 ++dstp;
1570 }, width);
1571 srcp += srcskip;
1572 dstp += dstskip;
1573 }
1574}
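/* Editorial note (not part of the original SDL source): the blend above is
   d' = d + ((s - d) * alpha >> 8), with >>8 standing in for a /255.  The
   approximation never quite reaches the source value for alpha == 255
   (e.g. s = 255, d = 0 gives (255 * 255) >> 8 = 254), which is why fully
   opaque pixels are special-cased to a plain RGB copy in the FIXME block
   above.  Splitting the pixel into 0x00RR00BB and 0x0000GG00 halves lets red
   and blue share one multiply while green gets its own, hence "one multiply
   less".  A minimal sketch of the opaque-alpha rounding error: */
#if 0 /* illustrative sketch only, not compiled */
static Uint32 opaque_alpha_error_demo(void)
{
    Uint32 s = 255, d = 0, alpha = 255;
    return d + ((s - d) * alpha >> 8);  /* 254, not 255 */
}
#endif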
1575
1576#if GCC_ASMBLIT
1577/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1578static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1579{
1580 int width = info->d_width;
1581 int height = info->d_height;
1582 Uint32 *srcp = (Uint32 *)info->s_pixels;
1583 int srcskip = info->s_skip >> 2;
1584 Uint32 *dstp = (Uint32 *)info->d_pixels;
1585 int dstskip = info->d_skip >> 2;
1586 SDL_PixelFormat* sf = info->src;
1587 Uint32 amask = sf->Amask;
1588
1589 __asm__ (
1590 /* make mm6 all zeros. */
1591 "pxor %%mm6, %%mm6\n"
1592
1593 /* Make a mask to preserve the alpha. */
1594 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1595 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1596 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1597 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1598 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1599
1600 /* form channel masks */
1601 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1602 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1603 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1604 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1605
1606 /* get alpha channel shift */
1607 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1608
1609 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1610
1611 while(height--) {
1612
1613 DUFFS_LOOP4({
1614 Uint32 alpha;
1615
1616 __asm__ (
1617 "prefetch 64(%0)\n"
1618 "prefetch 64(%1)\n"
1619 : : "r" (srcp), "r" (dstp) );
1620
1621 alpha = *srcp & amask;
1622 /* FIXME: Here we special-case opaque alpha since the
1623 compositing used (>>8 instead of /255) doesn't handle
1624 it correctly. Also special-case alpha=0 for speed?
1625 Benchmark this! */
1626 if(alpha == 0) {
1627 /* do nothing */
1628 }
1629 else if(alpha == amask) {
1630 /* opaque alpha -- copy RGB, keep dst alpha */
1631 /* using MMX here to free up regular registers for other things */
1632 __asm__ (
1633 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1634 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1635 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1636 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1637 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1638 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1639
1640 : : "r" (srcp), "r" (dstp) );
1641 }
1642
1643 else {
1644 __asm__ (
1645 /* load in the source, and dst. */
1646 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1647 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1648
1649 /* Move the src alpha into mm2 */
1650
1651 /* if supporting pshufw */
1652 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1653 /*"psrlw $8, %%mm2\n" */
1654
1655 /* else: */
1656 "movd %2, %%mm2\n"
1657 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1658 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1659 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1660 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1661
1662 /* move the colors into words. */
1663 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1664 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1665
1666 /* src - dst */
1667 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1668
1669 /* A * (src-dst) */
1670 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1671 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1672 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1673
1674 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1675
1676 "movd %%mm0, (%1)\n" /* result in mm0 */
1677
1678 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1679
1680 }
1681 ++srcp;
1682 ++dstp;
1683 }, width);
1684 srcp += srcskip;
1685 dstp += dstskip;
1686 }
1687
1688 __asm__ (
1689 "emms\n"
1690 : );
1691}
1692/* End GCC_ASMBLIT*/
1693
1694#elif MSVC_ASMBLIT
1695/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1696static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1697{
1698 int width = info->d_width;
1699 int height = info->d_height;
1700 Uint32 *srcp = (Uint32 *)info->s_pixels;
1701 int srcskip = info->s_skip >> 2;
1702 Uint32 *dstp = (Uint32 *)info->d_pixels;
1703 int dstskip = info->d_skip >> 2;
1704 SDL_PixelFormat* sf = info->src;
1705 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1706 Uint32 amask = sf->Amask;
1707 Uint32 ashift = sf->Ashift;
1708 Uint64 multmask;
1709
1710 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1711
1712 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1713 multmask = ~(0xFFFFi64 << (ashift * 2));
1714 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1715
1716 while(height--) {
1717 DUFFS_LOOP4({
1718 Uint32 alpha;
1719
1720 _m_prefetch(srcp + 16);
1721 _m_prefetch(dstp + 16);
1722
1723 alpha = *srcp & amask;
1724 if (alpha == 0) {
1725 /* do nothing */
1726 } else if (alpha == amask) {
1727 /* copy RGB, keep dst alpha */
1728 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1729 } else {
1730 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1731 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1732
1733 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1734 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1735
1736 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1737 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1738 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1739 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1740 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1741
1742 /* blend */
1743 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1744 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1745 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1746 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1747 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1748
1749 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1750 }
1751 ++srcp;
1752 ++dstp;
1753 }, width);
1754 srcp += srcskip;
1755 dstp += dstskip;
1756 }
1757 _mm_empty();
1758}
1759/* End MSVC_ASMBLIT */
1760
1761#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
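
/*
 * Editor's note: a minimal scalar sketch, not part of the original SDL
 * source, of the per-pixel operation that both MMX pixel-alpha blitters
 * above perform four channels at a time: destination alpha is kept and
 * each colour channel becomes d + ((s - d) * a >> 8), the usual >>8
 * approximation of /255.  It assumes an ARGB8888 layout with alpha in
 * the top byte; the helper name is illustrative only.
 */
#if 0
static Uint32 argb8888_pixel_alpha_blend_sketch(Uint32 src, Uint32 dst)
{
    unsigned a = src >> 24;
    Uint32 out = dst & 0xff000000;  /* keep dst alpha, like the dmask/pand step */
    int shift;
    for (shift = 0; shift < 24; shift += 8) {
        Uint32 s = (src >> shift) & 0xff;
        Uint32 d = (dst >> shift) & 0xff;
        /* unsigned wraparound plus the final & 0xff mirrors the 16-bit
           modular arithmetic used by the MMX code */
        out |= ((d + ((a * (s - d)) >> 8)) & 0xff) << shift;
    }
    return out;
}
#endif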
1762
1763/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1764
1765/* blend a single 16 bit pixel at 50% */
1766#define BLEND16_50(d, s, mask) \
1767 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
1768
1769/* blend two 16 bit pixels at 50% */
1770#define BLEND2x16_50(d, s, mask) \
1771 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
1772 + (s & d & (~(mask | mask << 16))))
1773
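/*
 * Editor's note: a short sketch, not part of the original SDL source, of
 * why the two macros above work.  The mask argument (0xf7de for RGB565,
 * 0xfbde for RGB555) is the full pixel mask with the lowest bit of every
 * colour field cleared, so ((s & mask) + (d & mask)) >> 1 averages all
 * fields in one integer add: a carry out of one field lands in a bit the
 * mask cleared (or, for the top field, in bit 16 of the int-sized sum)
 * and is shifted back into place by the >> 1, while (s & d & ~mask)
 * re-adds the 1 lost when both pixels have a field's low bit set.
 * Example: blending 0xffff with 0x0000 in 565 gives 0x7bef, i.e.
 * (15, 31, 15), half of (31, 63, 31).
 */
#if 0
static Uint16 blend565_half_sketch(Uint16 d, Uint16 s)
{
    return (Uint16)BLEND16_50(d, s, 0xf7de);
}
#endif
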
1774static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1775{
1776 int width = info->d_width;
1777 int height = info->d_height;
1778 Uint16 *srcp = (Uint16 *)info->s_pixels;
1779 int srcskip = info->s_skip >> 1;
1780 Uint16 *dstp = (Uint16 *)info->d_pixels;
1781 int dstskip = info->d_skip >> 1;
1782
1783 while(height--) {
1784 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1785 /*
1786 * Source and destination not aligned, pipeline it.
1787 * This is mostly a win for big blits but no loss for
1788 * small ones
1789 */
1790 Uint32 prev_sw;
1791 int w = width;
1792
1793 /* handle odd destination */
1794 if((uintptr_t)dstp & 2) {
1795 Uint16 d = *dstp, s = *srcp;
1796 *dstp = BLEND16_50(d, s, mask);
1797 dstp++;
1798 srcp++;
1799 w--;
1800 }
1801 srcp++; /* srcp is now 32-bit aligned */
1802
1803 /* bootstrap pipeline with first halfword */
1804 prev_sw = ((Uint32 *)srcp)[-1];
1805
1806 while(w > 1) {
1807 Uint32 sw, dw, s;
1808 sw = *(Uint32 *)srcp;
1809 dw = *(Uint32 *)dstp;
1810#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1811 s = (prev_sw << 16) + (sw >> 16);
1812#else
1813 s = (prev_sw >> 16) + (sw << 16);
1814#endif
1815 prev_sw = sw;
1816 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1817 dstp += 2;
1818 srcp += 2;
1819 w -= 2;
1820 }
1821
1822 /* final pixel if any */
1823 if(w) {
1824 Uint16 d = *dstp, s;
1825#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1826 s = (Uint16)prev_sw;
1827#else
1828 s = (Uint16)(prev_sw >> 16);
1829#endif
1830 *dstp = BLEND16_50(d, s, mask);
1831 srcp++;
1832 dstp++;
1833 }
1834 srcp += srcskip - 1;
1835 dstp += dstskip;
1836 } else {
1837 /* source and destination are aligned */
1838 int w = width;
1839
1840 /* first odd pixel? */
1841 if((uintptr_t)srcp & 2) {
1842 Uint16 d = *dstp, s = *srcp;
1843 *dstp = BLEND16_50(d, s, mask);
1844 srcp++;
1845 dstp++;
1846 w--;
1847 }
1848 /* srcp and dstp are now 32-bit aligned */
1849
1850 while(w > 1) {
1851 Uint32 sw = *(Uint32 *)srcp;
1852 Uint32 dw = *(Uint32 *)dstp;
1853 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1854 srcp += 2;
1855 dstp += 2;
1856 w -= 2;
1857 }
1858
1859 /* last odd pixel? */
1860 if(w) {
1861 Uint16 d = *dstp, s = *srcp;
1862 *dstp = BLEND16_50(d, s, mask);
1863 srcp++;
1864 dstp++;
1865 }
1866 srcp += srcskip;
1867 dstp += dstskip;
1868 }
1869 }
1870}
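
/*
 * Editor's note: a reference sketch, not part of the original SDL source.
 * Both branches above (the pipelined misaligned path and the aligned
 * path) must produce exactly the result of this one-pixel-at-a-time loop;
 * they only differ in how two pixels are packed into each 32-bit access.
 */
#if 0
static void blit16_alpha128_reference(Uint16 *dst, const Uint16 *src,
                                      int n, Uint16 mask)
{
    while (n--) {
        *dst = BLEND16_50(*dst, *src, mask);
        ++dst;
        ++src;
    }
}
#endif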
1871
1872#if GCC_ASMBLIT
1873/* fast RGB565->RGB565 blending with surface alpha */
1874static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1875{
1876 unsigned alpha = info->src->alpha;
1877 if(alpha == 128) {
1878 Blit16to16SurfaceAlpha128(info, 0xf7de);
1879 } else {
1880 int width = info->d_width;
1881 int height = info->d_height;
1882 Uint16 *srcp = (Uint16 *)info->s_pixels;
1883 int srcskip = info->s_skip >> 1;
1884 Uint16 *dstp = (Uint16 *)info->d_pixels;
1885 int dstskip = info->d_skip >> 1;
1886 Uint32 s, d;
1887 Uint64 load;
1888
1889 alpha &= ~(1+2+4); /* clear the low 3 bits so this matches the 5-bit scalar blend exactly */
1890 load = alpha;
1891 alpha >>= 3; /* downscale alpha to 5 bits */
1892
1893 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1894 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1895 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1896 /* position alpha to allow for mullo and mulhi on diff channels
1897 to reduce the number of operations */
1898 psllq_i2r(3, mm0);
1899
1900 /* Setup the 565 color channel masks */
1901 load = 0x07E007E007E007E0ULL;
1902 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1903 load = 0x001F001F001F001FULL;
1904 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1905 while(height--) {
1906 DUFFS_LOOP_QUATRO2(
1907 {
1908 s = *srcp++;
1909 d = *dstp;
1910 /*
1911 * shift out the middle component (green) to
1912 * the high 16 bits, and process all three RGB
1913 * components at the same time.
1914 */
1915 s = (s | s << 16) & 0x07e0f81f;
1916 d = (d | d << 16) & 0x07e0f81f;
1917 d += (s - d) * alpha >> 5;
1918 d &= 0x07e0f81f;
1919 *dstp++ = d | d >> 16;
1920 },{
1921 s = *srcp++;
1922 d = *dstp;
1923 /*
1924 * shift out the middle component (green) to
1925 * the high 16 bits, and process all three RGB
1926 * components at the same time.
1927 */
1928 s = (s | s << 16) & 0x07e0f81f;
1929 d = (d | d << 16) & 0x07e0f81f;
1930 d += (s - d) * alpha >> 5;
1931 d &= 0x07e0f81f;
1932 *dstp++ = d | d >> 16;
1933 s = *srcp++;
1934 d = *dstp;
1935 /*
1936 * shift out the middle component (green) to
1937 * the high 16 bits, and process all three RGB
1938 * components at the same time.
1939 */
1940 s = (s | s << 16) & 0x07e0f81f;
1941 d = (d | d << 16) & 0x07e0f81f;
1942 d += (s - d) * alpha >> 5;
1943 d &= 0x07e0f81f;
1944 *dstp++ = d | d >> 16;
1945 },{
1946 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1947 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1948
1949 /* red -- does not need a mask since the right shift clears
1950 the uninteresting bits */
1951 movq_r2r(mm2, mm5); /* src -> mm5 */
1952 movq_r2r(mm3, mm6); /* dst -> mm6 */
1953 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1954 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1955
1956 /* blend */
1957 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1958 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1959 /* alpha used is actually 11 bits
1960 11 + 5 = 16 bits, so the sign bits are lost */
1961 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1962 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1963 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1964
1965 movq_r2r(mm6, mm1); /* save new reds in dsts */
1966
1967 /* green -- process the bits in place */
1968 movq_r2r(mm2, mm5); /* src -> mm5 */
1969 movq_r2r(mm3, mm6); /* dst -> mm6 */
1970 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1971 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1972
1973 /* blend */
1974 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1975 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1976 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
1977 bits are gone and the sign bits are preserved */
1978 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1979 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1980
1981 por_r2r(mm6, mm1); /* save new greens in dsts */
1982
1983 /* blue */
1984 movq_r2r(mm2, mm5); /* src -> mm5 */
1985 movq_r2r(mm3, mm6); /* dst -> mm6 */
1986 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1987 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1988
1989 /* blend */
1990 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1991 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1992 /* 11 + 5 = 16 bits, so the sign bits are lost and
1993 the interesting bits will need to be MASKed */
1994 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1995 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1996 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
1997
1998 por_r2r(mm6, mm1); /* save new blues in dsts */
1999
2000 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2001
2002 srcp += 4;
2003 dstp += 4;
2004 }, width);
2005 srcp += srcskip;
2006 dstp += dstskip;
2007 }
2008 emms();
2009 }
2010}
2011
2012/* fast RGB555->RGB555 blending with surface alpha */
2013static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2014{
2015 unsigned alpha = info->src->alpha;
2016 if(alpha == 128) {
2017 Blit16to16SurfaceAlpha128(info, 0xfbde);
2018 } else {
2019 int width = info->d_width;
2020 int height = info->d_height;
2021 Uint16 *srcp = (Uint16 *)info->s_pixels;
2022 int srcskip = info->s_skip >> 1;
2023 Uint16 *dstp = (Uint16 *)info->d_pixels;
2024 int dstskip = info->d_skip >> 1;
2025 Uint32 s, d;
2026 Uint64 load;
2027
2028 alpha &= ~(1+2+4); /* clear the low 3 bits so this matches the 5-bit scalar blend exactly */
2029 load = alpha;
2030 alpha >>= 3; /* downscale alpha to 5 bits */
2031
2032 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2033 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2034 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2035 /* position alpha to allow for mullo and mulhi on diff channels
2036 to reduce the number of operations */
2037 psllq_i2r(3, mm0);
2038
2039 /* Setup the 555 color channel masks */
2040 load = 0x03E003E003E003E0ULL;
2041 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2042 load = 0x001F001F001F001FULL;
2043 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2044 while(height--) {
2045 DUFFS_LOOP_QUATRO2(
2046 {
2047 s = *srcp++;
2048 d = *dstp;
2049 /*
2050 * shift out the middle component (green) to
2051 * the high 16 bits, and process all three RGB
2052 * components at the same time.
2053 */
2054 s = (s | s << 16) & 0x03e07c1f;
2055 d = (d | d << 16) & 0x03e07c1f;
2056 d += (s - d) * alpha >> 5;
2057 d &= 0x03e07c1f;
2058 *dstp++ = d | d >> 16;
2059 },{
2060 s = *srcp++;
2061 d = *dstp;
2062 /*
2063 * shift out the middle component (green) to
2064 * the high 16 bits, and process all three RGB
2065 * components at the same time.
2066 */
2067 s = (s | s << 16) & 0x03e07c1f;
2068 d = (d | d << 16) & 0x03e07c1f;
2069 d += (s - d) * alpha >> 5;
2070 d &= 0x03e07c1f;
2071 *dstp++ = d | d >> 16;
2072 s = *srcp++;
2073 d = *dstp;
2074 /*
2075 * shift out the middle component (green) to
2076 * the high 16 bits, and process all three RGB
2077 * components at the same time.
2078 */
2079 s = (s | s << 16) & 0x03e07c1f;
2080 d = (d | d << 16) & 0x03e07c1f;
2081 d += (s - d) * alpha >> 5;
2082 d &= 0x03e07c1f;
2083 *dstp++ = d | d >> 16;
2084 },{
2085 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2086 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2087
2088 /* red -- process the bits in place */
2089 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2090 /* by reusing the GREEN mask we free up another mmx
2091 register to accumulate the result */
2092
2093 movq_r2r(mm2, mm5); /* src -> mm5 */
2094 movq_r2r(mm3, mm6); /* dst -> mm6 */
2095 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2096 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2097
2098 /* blend */
2099 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2100 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2101 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2102 cleared by a MASK below */
2103 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2104 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2105 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2106
2107 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2108
2109 movq_r2r(mm6, mm1); /* save new reds in dsts */
2110
2111 /* green -- process the bits in place */
2112 movq_r2r(mm2, mm5); /* src -> mm5 */
2113 movq_r2r(mm3, mm6); /* dst -> mm6 */
2114 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2115 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2116
2117 /* blend */
2118 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2119 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2120 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2121 bits are gone and the sign bits are preserved */
2122 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2123 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2124
2125 por_r2r(mm6, mm1); /* save new greens in dsts */
2126
2127 /* blue */
2128 movq_r2r(mm2, mm5); /* src -> mm5 */
2129 movq_r2r(mm3, mm6); /* dst -> mm6 */
2130 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2131 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2132
2133 /* blend */
2134 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2135 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2136 /* 11 + 5 = 16 bits, so the sign bits are lost and
2137 the interesting bits will need to be MASKed */
2138 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2139 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2140 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2141
2142 por_r2r(mm6, mm1); /* save new blues in dsts */
2143
2144 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2145
2146 srcp += 4;
2147 dstp += 4;
2148 }, width);
2149 srcp += srcskip;
2150 dstp += dstskip;
2151 }
2152 emms();
2153 }
2154}
2155/* End GCC_ASMBLIT */
2156
2157#elif MSVC_ASMBLIT
2158/* fast RGB565->RGB565 blending with surface alpha */
2159static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2160{
2161 unsigned alpha = info->src->alpha;
2162 if(alpha == 128) {
2163 Blit16to16SurfaceAlpha128(info, 0xf7de);
2164 } else {
2165 int width = info->d_width;
2166 int height = info->d_height;
2167 Uint16 *srcp = (Uint16 *)info->s_pixels;
2168 int srcskip = info->s_skip >> 1;
2169 Uint16 *dstp = (Uint16 *)info->d_pixels;
2170 int dstskip = info->d_skip >> 1;
2171 Uint32 s, d;
2172
2173 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2174
2175 alpha &= ~(1+2+4); /* clear the low 3 bits so this matches the 5-bit scalar blend exactly */
2176 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2177 alpha >>= 3; /* downscale alpha to 5 bits */
2178
2179 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2180 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2181 /* position alpha to allow for mullo and mulhi on diff channels
2182 to reduce the number of operations */
2183 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2184
2185 /* Setup the 565 color channel masks */
2186 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2187 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2188
2189 while(height--) {
2190 DUFFS_LOOP_QUATRO2(
2191 {
2192 s = *srcp++;
2193 d = *dstp;
2194 /*
2195 * shift out the middle component (green) to
2196 * the high 16 bits, and process all three RGB
2197 * components at the same time.
2198 */
2199 s = (s | s << 16) & 0x07e0f81f;
2200 d = (d | d << 16) & 0x07e0f81f;
2201 d += (s - d) * alpha >> 5;
2202 d &= 0x07e0f81f;
2203 *dstp++ = (Uint16)(d | d >> 16);
2204 },{
2205 s = *srcp++;
2206 d = *dstp;
2207 /*
2208 * shift out the middle component (green) to
2209 * the high 16 bits, and process all three RGB
2210 * components at the same time.
2211 */
2212 s = (s | s << 16) & 0x07e0f81f;
2213 d = (d | d << 16) & 0x07e0f81f;
2214 d += (s - d) * alpha >> 5;
2215 d &= 0x07e0f81f;
2216 *dstp++ = (Uint16)(d | d >> 16);
2217 s = *srcp++;
2218 d = *dstp;
2219 /*
2220 * shift out the middle component (green) to
2221 * the high 16 bits, and process all three RGB
2222 * components at the same time.
2223 */
2224 s = (s | s << 16) & 0x07e0f81f;
2225 d = (d | d << 16) & 0x07e0f81f;
2226 d += (s - d) * alpha >> 5;
2227 d &= 0x07e0f81f;
2228 *dstp++ = (Uint16)(d | d >> 16);
2229 },{
2230 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2231 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2232
2233 /* red */
2234 src2 = src1;
2235 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2236
2237 dst2 = dst1;
2238 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2239
2240 /* blend */
2241 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2242 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2243 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2244 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2245 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2246
2247 mm_res = dst2; /* RED -> mm_res */
2248
2249 /* green -- process the bits in place */
2250 src2 = src1;
2251 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2252
2253 dst2 = dst1;
2254 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2255
2256 /* blend */
2257 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2258 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2259 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2260 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2261
2262 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2263
2264 /* blue */
2265 src2 = src1;
2266 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2267
2268 dst2 = dst1;
2269 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2270
2271 /* blend */
2272 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2273 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2274 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2275 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2276 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2277
2278 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2279
2280 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2281
2282 srcp += 4;
2283 dstp += 4;
2284 }, width);
2285 srcp += srcskip;
2286 dstp += dstskip;
2287 }
2288 _mm_empty();
2289 }
2290}
2291
2292/* fast RGB555->RGB555 blending with surface alpha */
2293static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2294{
2295 unsigned alpha = info->src->alpha;
2296 if(alpha == 128) {
2297 Blit16to16SurfaceAlpha128(info, 0xfbde);
2298 } else {
2299 int width = info->d_width;
2300 int height = info->d_height;
2301 Uint16 *srcp = (Uint16 *)info->s_pixels;
2302 int srcskip = info->s_skip >> 1;
2303 Uint16 *dstp = (Uint16 *)info->d_pixels;
2304 int dstskip = info->d_skip >> 1;
2305 Uint32 s, d;
2306
2307 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2308
2309 alpha &= ~(1+2+4); /* clear the low 3 bits so this matches the 5-bit scalar blend exactly */
2310 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2311 alpha >>= 3; /* downscale alpha to 5 bits */
2312
2313 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2314 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2315 /* position alpha to allow for mullo and mulhi on diff channels
2316 to reduce the number of operations */
2317 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2318
2319 /* Setup the 555 color channel masks */
2320 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2321 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2322 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2323
2324 while(height--) {
2325 DUFFS_LOOP_QUATRO2(
2326 {
2327 s = *srcp++;
2328 d = *dstp;
2329 /*
2330 * shift out the middle component (green) to
2331 * the high 16 bits, and process all three RGB
2332 * components at the same time.
2333 */
2334 s = (s | s << 16) & 0x03e07c1f;
2335 d = (d | d << 16) & 0x03e07c1f;
2336 d += (s - d) * alpha >> 5;
2337 d &= 0x03e07c1f;
2338 *dstp++ = (Uint16)(d | d >> 16);
2339 },{
2340 s = *srcp++;
2341 d = *dstp;
2342 /*
2343 * shift out the middle component (green) to
2344 * the high 16 bits, and process all three RGB
2345 * components at the same time.
2346 */
2347 s = (s | s << 16) & 0x03e07c1f;
2348 d = (d | d << 16) & 0x03e07c1f;
2349 d += (s - d) * alpha >> 5;
2350 d &= 0x03e07c1f;
2351 *dstp++ = (Uint16)(d | d >> 16);
2352 s = *srcp++;
2353 d = *dstp;
2354 /*
2355 * shift out the middle component (green) to
2356 * the high 16 bits, and process all three RGB
2357 * components at the same time.
2358 */
2359 s = (s | s << 16) & 0x03e07c1f;
2360 d = (d | d << 16) & 0x03e07c1f;
2361 d += (s - d) * alpha >> 5;
2362 d &= 0x03e07c1f;
2363 *dstp++ = (Uint16)(d | d >> 16);
2364 },{
2365 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2366 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2367
2368 /* red -- process the bits in place */
2369 src2 = src1;
2370 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2371
2372 dst2 = dst1;
2373 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2374
2375 /* blend */
2376 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2377 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2378 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2379 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2380 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2381
2382 mm_res = dst2; /* RED -> mm_res */
2383
2384 /* green -- process the bits in place */
2385 src2 = src1;
2386 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2387
2388 dst2 = dst1;
2389 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2390
2391 /* blend */
2392 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2393 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2394 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2395 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2396
2397 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2398
2399 /* blue */
2400 src2 = src1; /* src -> src2 */
2401 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2402
2403 dst2 = dst1; /* dst -> dst2 */
2404 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2405
2406 /* blend */
2407 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2408 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2409 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2410 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2411 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2412
2413 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2414
2415 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2416
2417 srcp += 4;
2418 dstp += 4;
2419 }, width);
2420 srcp += srcskip;
2421 dstp += dstskip;
2422 }
2423 _mm_empty();
2424 }
2425}
2426#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2427
2428/* fast RGB565->RGB565 blending with surface alpha */
2429static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2430{
2431 unsigned alpha = info->src->alpha;
2432 if(alpha == 128) {
2433 Blit16to16SurfaceAlpha128(info, 0xf7de);
2434 } else {
2435 int width = info->d_width;
2436 int height = info->d_height;
2437 Uint16 *srcp = (Uint16 *)info->s_pixels;
2438 int srcskip = info->s_skip >> 1;
2439 Uint16 *dstp = (Uint16 *)info->d_pixels;
2440 int dstskip = info->d_skip >> 1;
2441 alpha >>= 3; /* downscale alpha to 5 bits */
2442
2443 while(height--) {
2444 DUFFS_LOOP4({
2445 Uint32 s = *srcp++;
2446 Uint32 d = *dstp;
2447 /*
2448 * shift out the middle component (green) to
2449 * the high 16 bits, and process all three RGB
2450 * components at the same time.
2451 */
2452 s = (s | s << 16) & 0x07e0f81f;
2453 d = (d | d << 16) & 0x07e0f81f;
2454 d += (s - d) * alpha >> 5;
2455 d &= 0x07e0f81f;
2456 *dstp++ = (Uint16)(d | d >> 16);
2457 }, width);
2458 srcp += srcskip;
2459 dstp += dstskip;
2460 }
2461 }
2462}
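
/*
 * Editor's note: a single-pixel sketch, not part of the original SDL
 * source, of the 0x07e0f81f trick used by the 565 blitters above.
 * Widening the pixel to (p | p << 16) & 0x07e0f81f moves green into the
 * upper halfword, leaving at least 5 zero bits above every field, so a
 * single multiply by the 5-bit alpha blends all three channels at once.
 */
#if 0
static Uint16 blend565_sketch(Uint16 dst, Uint16 src, unsigned alpha5)
{
    Uint32 s = src, d = dst;
    s = (s | s << 16) & 0x07e0f81f;
    d = (d | d << 16) & 0x07e0f81f;
    d += (s - d) * alpha5 >> 5;     /* same wraparound-safe step as above */
    d &= 0x07e0f81f;
    return (Uint16)(d | d >> 16);
}
#endif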
2463
2464/* fast RGB555->RGB555 blending with surface alpha */
2465static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2466{
2467 unsigned alpha = info->src->alpha;
2468 if(alpha == 128) {
2469 Blit16to16SurfaceAlpha128(info, 0xfbde);
2470 } else {
2471 int width = info->d_width;
2472 int height = info->d_height;
2473 Uint16 *srcp = (Uint16 *)info->s_pixels;
2474 int srcskip = info->s_skip >> 1;
2475 Uint16 *dstp = (Uint16 *)info->d_pixels;
2476 int dstskip = info->d_skip >> 1;
2477 alpha >>= 3; /* downscale alpha to 5 bits */
2478
2479 while(height--) {
2480 DUFFS_LOOP4({
2481 Uint32 s = *srcp++;
2482 Uint32 d = *dstp;
2483 /*
2484 * shift out the middle component (green) to
2485 * the high 16 bits, and process all three RGB
2486 * components at the same time.
2487 */
2488 s = (s | s << 16) & 0x03e07c1f;
2489 d = (d | d << 16) & 0x03e07c1f;
2490 d += (s - d) * alpha >> 5;
2491 d &= 0x03e07c1f;
2492 *dstp++ = (Uint16)(d | d >> 16);
2493 }, width);
2494 srcp += srcskip;
2495 dstp += dstskip;
2496 }
2497 }
2498}
2499
2500/* fast ARGB8888->RGB565 blending with pixel alpha */
2501static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2502{
2503 int width = info->d_width;
2504 int height = info->d_height;
2505 Uint32 *srcp = (Uint32 *)info->s_pixels;
2506 int srcskip = info->s_skip >> 2;
2507 Uint16 *dstp = (Uint16 *)info->d_pixels;
2508 int dstskip = info->d_skip >> 1;
2509
2510 while(height--) {
2511 DUFFS_LOOP4({
2512 Uint32 s = *srcp;
2513 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2514 /* FIXME: Here we special-case opaque alpha since the
2515 compositing used (>>8 instead of /255) doesn't handle
2516 it correctly. Also special-case alpha=0 for speed?
2517 Benchmark this! */
2518 if(alpha) {
2519 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2520 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2521 } else {
2522 Uint32 d = *dstp;
2523 /*
2524 * convert source and destination to the interleaved 0x07e0f81f layout
2525 * and blend all components at the same time
2526 */
2527 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2528 + (s >> 3 & 0x1f);
2529 d = (d | d << 16) & 0x07e0f81f;
2530 d += (s - d) * alpha >> 5;
2531 d &= 0x07e0f81f;
2532 *dstp = (Uint16)(d | d >> 16);
2533 }
2534 }
2535 srcp++;
2536 dstp++;
2537 }, width);
2538 srcp += srcskip;
2539 dstp += dstskip;
2540 }
2541}
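
/*
 * Editor's note: a single-pixel sketch, not part of the original SDL
 * source, of the conversion step in BlitARGBto565PixelAlpha above.  The
 * ARGB8888 source is packed straight into the interleaved 0x07e0f81f
 * layout (green in bits 21-26, red in 11-15, blue in 0-4), so the same
 * one-multiply blend as in the 16bpp blitters can be reused.
 */
#if 0
static Uint16 argb8888_to_565_blend_sketch(Uint32 s, Uint16 dst, unsigned alpha5)
{
    Uint32 d = dst;
    s = ((s & 0xfc00) << 11)    /* top 6 bits of G -> bits 21..26 */
      + (s >> 8 & 0xf800)       /* top 5 bits of R -> bits 11..15 */
      + (s >> 3 & 0x1f);        /* top 5 bits of B -> bits  0..4  */
    d = (d | d << 16) & 0x07e0f81f;
    d += (s - d) * alpha5 >> 5;
    d &= 0x07e0f81f;
    return (Uint16)(d | d >> 16);
}
#endif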
2542
2543/* fast ARGB8888->RGB555 blending with pixel alpha */
2544static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2545{
2546 int width = info->d_width;
2547 int height = info->d_height;
2548 Uint32 *srcp = (Uint32 *)info->s_pixels;
2549 int srcskip = info->s_skip >> 2;
2550 Uint16 *dstp = (Uint16 *)info->d_pixels;
2551 int dstskip = info->d_skip >> 1;
2552
2553 while(height--) {
2554 DUFFS_LOOP4({
2555 unsigned alpha;
2556 Uint32 s = *srcp;
2557 alpha = s >> 27; /* downscale alpha to 5 bits */
2558 /* FIXME: Here we special-case opaque alpha since the
2559 compositing used (>>8 instead of /255) doesn't handle
2560 it correctly. Also special-case alpha=0 for speed?
2561 Benchmark this! */
2562 if(alpha) {
2563 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2564 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2565 } else {
2566 Uint32 d = *dstp;
2567 /*
2568 * convert source and destination to the interleaved 0x03e07c1f layout
2569 * and blend all components at the same time
2570 */
2571 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2572 + (s >> 3 & 0x1f);
2573 d = (d | d << 16) & 0x03e07c1f;
2574 d += (s - d) * alpha >> 5;
2575 d &= 0x03e07c1f;
2576 *dstp = (Uint16)(d | d >> 16);
2577 }
2578 }
2579 srcp++;
2580 dstp++;
2581 }, width);
2582 srcp += srcskip;
2583 dstp += dstskip;
2584 }
2585}
2586
2587/* General (slow) N->N blending with per-surface alpha */
2588static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2589{
2590 int width = info->d_width;
2591 int height = info->d_height;
2592 Uint8 *src = info->s_pixels;
2593 int srcskip = info->s_skip;
2594 Uint8 *dst = info->d_pixels;
2595 int dstskip = info->d_skip;
2596 SDL_PixelFormat *srcfmt = info->src;
2597 SDL_PixelFormat *dstfmt = info->dst;
2598 int srcbpp = srcfmt->BytesPerPixel;
2599 int dstbpp = dstfmt->BytesPerPixel;
2600 unsigned sA = srcfmt->alpha;
2601 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2602
2603 if(sA) {
2604 while ( height-- ) {
2605 DUFFS_LOOP4(
2606 {
2607 Uint32 Pixel;
2608 unsigned sR;
2609 unsigned sG;
2610 unsigned sB;
2611 unsigned dR;
2612 unsigned dG;
2613 unsigned dB;
2614 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2615 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2616 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2617 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2618 src += srcbpp;
2619 dst += dstbpp;
2620 },
2621 width);
2622 src += srcskip;
2623 dst += dstskip;
2624 }
2625 }
2626}
2627
2628/* General (slow) colorkeyed N->N blending with per-surface alpha */
2629static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2630{
2631 int width = info->d_width;
2632 int height = info->d_height;
2633 Uint8 *src = info->s_pixels;
2634 int srcskip = info->s_skip;
2635 Uint8 *dst = info->d_pixels;
2636 int dstskip = info->d_skip;
2637 SDL_PixelFormat *srcfmt = info->src;
2638 SDL_PixelFormat *dstfmt = info->dst;
2639 Uint32 ckey = srcfmt->colorkey;
2640 int srcbpp = srcfmt->BytesPerPixel;
2641 int dstbpp = dstfmt->BytesPerPixel;
2642 unsigned sA = srcfmt->alpha;
2643 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2644
2645 while ( height-- ) {
2646 DUFFS_LOOP4(
2647 {
2648 Uint32 Pixel;
2649 unsigned sR;
2650 unsigned sG;
2651 unsigned sB;
2652 unsigned dR;
2653 unsigned dG;
2654 unsigned dB;
2655 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2656 if(sA && Pixel != ckey) {
2657 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2658 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2659 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2660 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2661 }
2662 src += srcbpp;
2663 dst += dstbpp;
2664 },
2665 width);
2666 src += srcskip;
2667 dst += dstskip;
2668 }
2669}
2670
2671/* General (slow) N->N blending with pixel alpha */
2672static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2673{
2674 int width = info->d_width;
2675 int height = info->d_height;
2676 Uint8 *src = info->s_pixels;
2677 int srcskip = info->s_skip;
2678 Uint8 *dst = info->d_pixels;
2679 int dstskip = info->d_skip;
2680 SDL_PixelFormat *srcfmt = info->src;
2681 SDL_PixelFormat *dstfmt = info->dst;
2682
2683 int srcbpp;
2684 int dstbpp;
2685
2686 /* Set up some basic variables */
2687 srcbpp = srcfmt->BytesPerPixel;
2688 dstbpp = dstfmt->BytesPerPixel;
2689
2690 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2691 quite right. for <8bpp source alpha, it gets them very wrong
2692 (check all macros!)
2693 It is unclear whether there is a good general solution that doesn't
2694 need a branch (or a divide). */
2695 while ( height-- ) {
2696 DUFFS_LOOP4(
2697 {
2698 Uint32 Pixel;
2699 unsigned sR;
2700 unsigned sG;
2701 unsigned sB;
2702 unsigned dR;
2703 unsigned dG;
2704 unsigned dB;
2705 unsigned sA;
2706 unsigned dA;
2707 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2708 if(sA) {
2709 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2710 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2711 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2712 }
2713 src += srcbpp;
2714 dst += dstbpp;
2715 },
2716 width);
2717 src += srcskip;
2718 dst += dstskip;
2719 }
2720}
2721
2722
2723SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2724{
2725 SDL_PixelFormat *sf = surface->format;
2726 SDL_PixelFormat *df = surface->map->dst->format;
2727
2728 if(sf->Amask == 0) {
2729 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2730 if(df->BytesPerPixel == 1)
2731 return BlitNto1SurfaceAlphaKey;
2732 else
2733#if SDL_ALTIVEC_BLITTERS
2734 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2735 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2736 return Blit32to32SurfaceAlphaKeyAltivec;
2737 else
2738#endif
2739 return BlitNtoNSurfaceAlphaKey;
2740 } else {
2741 /* Per-surface alpha blits */
2742 switch(df->BytesPerPixel) {
2743 case 1:
2744 return BlitNto1SurfaceAlpha;
2745
2746 case 2:
2747 if(surface->map->identity) {
2748 if(df->Gmask == 0x7e0)
2749 {
2750#if MMX_ASMBLIT
2751 if(SDL_HasMMX())
2752 return Blit565to565SurfaceAlphaMMX;
2753 else
2754#endif
2755 return Blit565to565SurfaceAlpha;
2756 }
2757 else if(df->Gmask == 0x3e0)
2758 {
2759#if MMX_ASMBLIT
2760 if(SDL_HasMMX())
2761 return Blit555to555SurfaceAlphaMMX;
2762 else
2763#endif
2764 return Blit555to555SurfaceAlpha;
2765 }
2766 }
2767 return BlitNtoNSurfaceAlpha;
2768
2769 case 4:
2770 if(sf->Rmask == df->Rmask
2771 && sf->Gmask == df->Gmask
2772 && sf->Bmask == df->Bmask
2773 && sf->BytesPerPixel == 4)
2774 {
2775#if MMX_ASMBLIT
2776 if(sf->Rshift % 8 == 0
2777 && sf->Gshift % 8 == 0
2778 && sf->Bshift % 8 == 0
2779 && SDL_HasMMX())
2780 return BlitRGBtoRGBSurfaceAlphaMMX;
2781#endif
2782 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2783 {
2784#if SDL_ALTIVEC_BLITTERS
2785 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2786 && SDL_HasAltiVec())
2787 return BlitRGBtoRGBSurfaceAlphaAltivec;
2788#endif
2789 return BlitRGBtoRGBSurfaceAlpha;
2790 }
2791 }
2792#if SDL_ALTIVEC_BLITTERS
2793 if((sf->BytesPerPixel == 4) &&
2794 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2795 return Blit32to32SurfaceAlphaAltivec;
2796 else
2797#endif
2798 return BlitNtoNSurfaceAlpha;
2799
2800 case 3:
2801 default:
2802 return BlitNtoNSurfaceAlpha;
2803 }
2804 }
2805 } else {
2806 /* Per-pixel alpha blits */
2807 switch(df->BytesPerPixel) {
2808 case 1:
2809 return BlitNto1PixelAlpha;
2810
2811 case 2:
2812#if SDL_ALTIVEC_BLITTERS
2813 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2814 df->Gmask == 0x7e0 &&
2815 df->Bmask == 0x1f && SDL_HasAltiVec())
2816 return Blit32to565PixelAlphaAltivec;
2817 else
2818#endif
2819 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2820 && sf->Gmask == 0xff00
2821 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2822 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2823 if(df->Gmask == 0x7e0)
2824 return BlitARGBto565PixelAlpha;
2825 else if(df->Gmask == 0x3e0)
2826 return BlitARGBto555PixelAlpha;
2827 }
2828 return BlitNtoNPixelAlpha;
2829
2830 case 4:
2831 if(sf->Rmask == df->Rmask
2832 && sf->Gmask == df->Gmask
2833 && sf->Bmask == df->Bmask
2834 && sf->BytesPerPixel == 4)
2835 {
2836#if MMX_ASMBLIT
2837 if(sf->Rshift % 8 == 0
2838 && sf->Gshift % 8 == 0
2839 && sf->Bshift % 8 == 0
2840 && sf->Ashift % 8 == 0
2841 && sf->Aloss == 0)
2842 {
2843 if(SDL_Has3DNow())
2844 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2845 if(SDL_HasMMX())
2846 return BlitRGBtoRGBPixelAlphaMMX;
2847 }
2848#endif
2849 if(sf->Amask == 0xff000000)
2850 {
2851#if SDL_ALTIVEC_BLITTERS
2852 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2853 && SDL_HasAltiVec())
2854 return BlitRGBtoRGBPixelAlphaAltivec;
2855#endif
2856 return BlitRGBtoRGBPixelAlpha;
2857 }
2858 }
2859#if SDL_ALTIVEC_BLITTERS
2860 if (sf->Amask && sf->BytesPerPixel == 4 &&
2861 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2862 return Blit32to32PixelAlphaAltivec;
2863 else
2864#endif
2865 return BlitNtoNPixelAlpha;
2866
2867 case 3:
2868 default:
2869 return BlitNtoNPixelAlpha;
2870 }
2871 }
2872}
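
/*
 * Editor's note: a usage sketch, not part of this file.  Applications do
 * not call SDL_CalculateAlphaBlit() directly; it is picked when SDL maps
 * the source surface to its destination, so application code along these
 * lines ends up routed to one of the blitters above (the surface, screen
 * and rect names here are placeholders):
 */
#if 0
    SDL_Surface *sprite = SDL_DisplayFormatAlpha(loaded);  /* ARGB, per-pixel alpha */
    SDL_SetAlpha(sprite, SDL_SRCALPHA, SDL_ALPHA_OPAQUE);  /* enable alpha blending */
    SDL_BlitSurface(sprite, NULL, screen, &dstrect);       /* uses the blitter chosen here */
#endif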
2873