diff options
Diffstat (limited to 'apps/plugins/sdl/src/video/mmx.h')
-rw-r--r-- | apps/plugins/sdl/src/video/mmx.h | 704 |
1 files changed, 704 insertions, 0 deletions
diff --git a/apps/plugins/sdl/src/video/mmx.h b/apps/plugins/sdl/src/video/mmx.h new file mode 100644 index 0000000000..dcee7b0931 --- /dev/null +++ b/apps/plugins/sdl/src/video/mmx.h | |||
@@ -0,0 +1,704 @@ | |||
1 | /* mmx.h | ||
2 | |||
3 | MultiMedia eXtensions GCC interface library for IA32. | ||
4 | |||
5 | To use this library, simply include this header file | ||
6 | and compile with GCC. You MUST have inlining enabled | ||
7 | in order for mmx_ok() to work; this can be done by | ||
8 | simply using -O on the GCC command line. | ||
9 | |||
10 | Compiling with -DMMX_TRACE will cause detailed trace | ||
11 | output to be sent to stderr for each mmx operation. | ||
12 | This adds lots of code, and obviously slows execution to | ||
13 | a crawl, but can be very useful for debugging. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY | ||
16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT | ||
17 | LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY | ||
18 | AND FITNESS FOR ANY PARTICULAR PURPOSE. | ||
19 | |||
20 | 1997-99 by H. Dietz and R. Fisher | ||
21 | |||
22 | Notes: | ||
23 | It appears that the latest gas has the pand problem fixed, therefore | ||
24 | I'll undefine BROKEN_PAND by default. | ||
25 | */ | ||
26 | |||
27 | #ifndef _MMX_H | ||
28 | #define _MMX_H | ||
29 | |||
30 | |||
31 | /* Warning: at this writing, the version of GAS packaged | ||
32 | with most Linux distributions does not handle the | ||
33 | parallel AND operation mnemonic correctly. If the | ||
34 | symbol BROKEN_PAND is defined, a slower alternative | ||
35 | coding will be used. If execution of mmxtest results | ||
36 | in an illegal instruction fault, define this symbol. | ||
37 | */ | ||
38 | #undef BROKEN_PAND | ||
39 | |||
40 | |||
/* A 64-bit value shaped to fit one MMX register, viewable as packed
   fields of every supported width.
   (64-bit constants MUST carry the LL suffix, or ULL when unsigned,
   otherwise the compiler may truncate them.)
*/
typedef	union {
	/* One 64-bit quadword */
	long long		q;
	unsigned long long	uq;

	/* Two 32-bit doublewords */
	int			d[2];
	unsigned int		ud[2];

	/* Four 16-bit words */
	short			w[4];
	unsigned short		uw[4];

	/* Eight 8-bit bytes */
	char			b[8];
	unsigned char		ub[8];

	/* Two single-precision (32-bit) floats */
	float			s[2];
} __attribute__ ((aligned (8))) mmx_t;	/* Kept on an 8-byte (64-bit) boundary */
57 | |||
58 | |||
59 | #if 0 | ||
/* Helper for mm_support(): reports whether the CPUID instruction exists.

   On 32-bit x86 this is probed by trying to flip the ID bit (bit 21,
   0x200000) of EFLAGS; the original flags are restored afterwards.
   64-bit x86 always has CPUID.  Any other architecture reports 0.
*/
static inline int
mm_cpuid_available(void)
{
#if defined(__x86_64__)
	return 1;
#elif defined(__i386__)
	unsigned int original, toggled;

	__asm__ __volatile__ (
		/* Fetch EFLAGS and keep an untouched copy */
		"pushfl\n\t"
		"popl %0\n\t"
		"movl %0, %1\n\t"

		/* Flip the ID bit in the copy and write it back */
		"xorl $0x200000, %1\n\t"
		"pushl %1\n\t"
		"popfl\n\t"

		/* Read back the (hopefully modified) EFLAGS */
		"pushfl\n\t"
		"popl %1\n\t"

		/* Put the caller's flags back */
		"pushl %0\n\t"
		"popfl\n\t"
		: "=&r" (original), "=&r" (toggled)
		: /* no input */
		: "cc");

	return ((original ^ toggled) & 0x200000) != 0;
#else
	return 0;
#endif
}

/* Helper for mm_support(): runs CPUID for the given leaf (sub-leaf 0)
   and stores the four result registers.  Only meaningful after
   mm_cpuid_available() returned non-zero; on non-x86 targets all
   outputs are set to 0.
*/
static inline void
mm_cpuid(unsigned int leaf, unsigned int *eax, unsigned int *ebx,
	 unsigned int *ecx, unsigned int *edx)
{
#if defined(__i386__) || defined(__x86_64__)
	__asm__ __volatile__ ("cpuid"
			      : "=a" (*eax), "=b" (*ebx),
				"=c" (*ecx), "=d" (*edx)
			      : "0" (leaf), "2" (0u));
#else
	(void) leaf;
	*eax = *ebx = *ecx = *edx = 0;
#endif
}

/* Function to test if multimedia instructions are supported...

   Returns 1 if MMX instructions are supported,
	   3 if Cyrix MMX and Extended MMX instructions are supported
	   5 if AMD MMX and 3DNow! instructions are supported
	   0 if hardware does not support any of these

   Declared static inline so that including this header from several
   translation units neither duplicates nor loses the definition.
*/
static inline int
mm_support(void)
{
	const unsigned int feature_mmx   = 0x00800000u;	/* EDX bit 23 */
	const unsigned int feature_emmx  = 0x01000000u;	/* EDX bit 24 (Cyrix) */
	const unsigned int feature_3dnow = 0x80000000u;	/* EDX bit 31 (AMD) */
	unsigned int eax, ebx, ecx, edx;
	unsigned int max_std_leaf;

	if (!mm_cpuid_available())
		return 0;

	/* Leaf 0: highest standard leaf in EAX, vendor string in EBX:EDX:ECX */
	mm_cpuid(0, &eax, &ebx, &ecx, &edx);
	max_std_leaf = eax;

	if (ebx == 0x68747541 && edx == 0x69746e65 && ecx == 0x444d4163) {
		/* "AuthenticAMD": prefer the extended feature leaf if present.
		   The comparison is unsigned; a signed compare against
		   0x80000000 can never be "less than". */
		mm_cpuid(0x80000000u, &eax, &ebx, &ecx, &edx);
		if (eax >= 0x80000001u) {
			mm_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx);
			if (!(edx & feature_mmx))
				return 0;
			return (edx & feature_3dnow) ? 5 : 1;
		}
		/* No extended leaves: fall through to the standard test */
	} else if (ebx == 0x69727943 && edx == 0x736e4978 &&
		   ecx == 0x64616574) {
		/* "CyrixInstead".  The value of CPUID/80000001 for the 6x86MX
		   is undefined according to the Cyrix CPU Detection Guide
		   (Preliminary Rev. 1.01 table 1), so standard level 2 is
		   used as the indicator that extended levels exist. */
		if (max_std_leaf == 2) {
			mm_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx);
			/* Feature flags are returned in EDX, not EAX */
			if (!(edx & feature_mmx))
				return 0;
			return (edx & feature_emmx) ? 3 : 1;
		}
		/* Otherwise fall through to the standard test */
	} else if (!(ebx == 0x756e6547 && edx == 0x49656e69 &&
		     ecx == 0x6c65746e)) {
		/* Not "GenuineIntel" either: unknown vendor, nothing supported */
		return 0;
	}

	/* Standard test (Intel, and the AMD/Cyrix fallbacks): leaf 1, EDX */
	if (max_std_leaf < 1)
		return 0;
	mm_cpuid(1, &eax, &ebx, &ecx, &edx);
	return (edx & feature_mmx) ? 1 : 0;
}
218 | |||
/* Convenience test for callers that only care whether plain MMX
   instructions can be used.
*/
inline extern int
mmx_ok(void)
{
	/* Only bit 0 of the mm_support() result is kept, so the answer
	   is 1 when MMX instructions are supported and 0 otherwise. */
	const int features = mm_support();

	return (features & 0x1);
}
227 | #endif | ||
228 | |||
229 | /* Helper functions for the instruction macros that follow... | ||
230 | (note that memory-to-register, m2r, instructions are nearly | ||
231 | as efficient as register-to-register, r2r, instructions; | ||
232 | however, memory-to-memory instructions are really simulated | ||
233 | as a convenience, and are only 1/3 as efficient) | ||
234 | */ | ||
235 | #ifdef MMX_TRACE | ||
236 | |||
237 | /* Include the stuff for printing a trace to stderr... | ||
238 | */ | ||
239 | |||
/* Traced variants of the helper macros.  Each one writes the operands
   before the operation and the destination afterwards to stderr, and
   relies on the including file having <stdio.h> and mmx_t in scope.

   The macros are wrapped in do { } while (0) so they behave as a single
   statement (safe inside if/else); like the untraced variants they must
   be followed by a semicolon.  Register snapshots go through an "m"
   operand so that the compiler never has to pick a scratch MMX register
   that hand-written code may be using, and the traced operation uses the
   same operand constraints as the untraced one.
*/

/* mmx_i2r: "op imm, reg" */
#define	mmx_i2r(op, imm, reg) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "y" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

/* mmx_m2r: "op mem, reg" (mem is an mmx_t lvalue) */
#define	mmx_m2r(op, mem, reg) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "m" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

/* mmx_r2m: "op reg, mem" (mem is an mmx_t lvalue) */
#define	mmx_r2m(op, reg, mem) \
	do { \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=m" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

/* mmx_r2r: "op regs, regd" */
#define	mmx_r2r(op, regs, regd) \
	do { \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #regd "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=m" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #regd "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

/* mmx_m2m: memd = memd "op" mems, computed through mm0 (which is
   overwritten).  memd is a read-write operand: it is loaded before
   the operation and stored afterwards. */
#define	mmx_m2m(op, mems, memd) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "+m" (memd) \
				      : "m" (mems)); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)
340 | |||
341 | #else | ||
342 | |||
343 | /* These macros are a lot simpler without the tracing... | ||
344 | */ | ||
345 | |||
/* mmx_i2r: "op imm, reg".  The count is handed over through a "y"
   operand, i.e. the compiler places it in an MMX register of its
   choosing before the instruction runs. */
#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "y" (imm) )

/* mmx_m2r: "op mem, reg" with mem read directly from memory */
#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

/* mmx_r2m: "op reg, mem" with mem written directly in memory */
#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=m" (mem) \
			      : /* nothing */ )

/* mmx_r2r: "op regs, regd"; no operands are exposed to the compiler */
#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

/* mmx_m2m: memd = memd "op" mems, simulated by loading memd into mm0,
   applying op with mems, and storing mm0 back (mm0 is overwritten).
   memd is a read-write ("+m") operand because the sequence reads it
   before writing it; both operands must be addressable objects. */
#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "+m" (memd) \
			      : "m" (mems))
370 | |||
371 | #endif | ||
372 | |||
373 | |||
/* movq: move one 64-bit quadword.
   Serves as both the load and the store instruction (it is the only
   way to store).  The plain movq(from, to) form copies through mm0.
*/
#define	movq_m2r(mem, dst)		mmx_m2r(movq, mem, dst)
#define	movq_r2m(src, mem)		mmx_r2m(movq, src, mem)
#define	movq_r2r(src, dst)		mmx_r2r(movq, src, dst)
#define	movq(from, to) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=y" (to) \
			      : "y" (from))
386 | |||
387 | |||
/* movd: move one 32-bit doubleword.
   Like movq it acts as both load and store, but it is most useful for
   moving things between mmx registers and ordinary registers.  The
   plain movd(from, to) form copies through mm0.
*/
#define	movd_m2r(mem, dst)		mmx_m2r(movd, mem, dst)
#define	movd_r2m(src, mem)		mmx_r2m(movd, src, mem)
#define	movd_r2r(src, dst)		mmx_r2r(movd, src, dst)
#define	movd(from, to) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=y" (to) \
			      : "y" (from))
401 | |||
402 | |||
/* Parallel ADD on 2x32 (paddd), 4x16 (paddw) and 8x8 (paddb) fields.
   Each width comes as memory-to-register, register-to-register and
   memory-to-memory.
*/
#define	paddd_m2r(mem, dst)		mmx_m2r(paddd, mem, dst)
#define	paddd_r2r(src, dst)		mmx_r2r(paddd, src, dst)
#define	paddd(msrc, mdst)		mmx_m2m(paddd, msrc, mdst)

#define	paddw_m2r(mem, dst)		mmx_m2r(paddw, mem, dst)
#define	paddw_r2r(src, dst)		mmx_r2r(paddw, src, dst)
#define	paddw(msrc, mdst)		mmx_m2m(paddw, msrc, mdst)

#define	paddb_m2r(mem, dst)		mmx_m2r(paddb, mem, dst)
#define	paddb_r2r(src, dst)		mmx_r2r(paddb, src, dst)
#define	paddb(msrc, mdst)		mmx_m2m(paddb, msrc, mdst)
416 | |||
417 | |||
/* Parallel ADD with Saturation arithmetic on 4x16 (paddsw) and
   8x8 (paddsb) fields.
*/
#define	paddsw_m2r(mem, dst)		mmx_m2r(paddsw, mem, dst)
#define	paddsw_r2r(src, dst)		mmx_r2r(paddsw, src, dst)
#define	paddsw(msrc, mdst)		mmx_m2m(paddsw, msrc, mdst)

#define	paddsb_m2r(mem, dst)		mmx_m2r(paddsb, mem, dst)
#define	paddsb_r2r(src, dst)		mmx_r2r(paddsb, src, dst)
#define	paddsb(msrc, mdst)		mmx_m2m(paddsb, msrc, mdst)
427 | |||
428 | |||
/* Parallel ADD with Unsigned Saturation arithmetic on 4x16 (paddusw)
   and 8x8 (paddusb) fields.
*/
#define	paddusw_m2r(mem, dst)		mmx_m2r(paddusw, mem, dst)
#define	paddusw_r2r(src, dst)		mmx_r2r(paddusw, src, dst)
#define	paddusw(msrc, mdst)		mmx_m2m(paddusw, msrc, mdst)

#define	paddusb_m2r(mem, dst)		mmx_m2r(paddusb, mem, dst)
#define	paddusb_r2r(src, dst)		mmx_r2r(paddusb, src, dst)
#define	paddusb(msrc, mdst)		mmx_m2m(paddusb, msrc, mdst)
438 | |||
439 | |||
/* Parallel SUB on 2x32 (psubd), 4x16 (psubw) and 8x8 (psubb) fields.
*/
#define	psubd_m2r(mem, dst)		mmx_m2r(psubd, mem, dst)
#define	psubd_r2r(src, dst)		mmx_r2r(psubd, src, dst)
#define	psubd(msrc, mdst)		mmx_m2m(psubd, msrc, mdst)

#define	psubw_m2r(mem, dst)		mmx_m2r(psubw, mem, dst)
#define	psubw_r2r(src, dst)		mmx_r2r(psubw, src, dst)
#define	psubw(msrc, mdst)		mmx_m2m(psubw, msrc, mdst)

#define	psubb_m2r(mem, dst)		mmx_m2r(psubb, mem, dst)
#define	psubb_r2r(src, dst)		mmx_r2r(psubb, src, dst)
#define	psubb(msrc, mdst)		mmx_m2m(psubb, msrc, mdst)
453 | |||
454 | |||
/* Parallel SUB with Saturation arithmetic on 4x16 (psubsw) and
   8x8 (psubsb) fields.
*/
#define	psubsw_m2r(mem, dst)		mmx_m2r(psubsw, mem, dst)
#define	psubsw_r2r(src, dst)		mmx_r2r(psubsw, src, dst)
#define	psubsw(msrc, mdst)		mmx_m2m(psubsw, msrc, mdst)

#define	psubsb_m2r(mem, dst)		mmx_m2r(psubsb, mem, dst)
#define	psubsb_r2r(src, dst)		mmx_r2r(psubsb, src, dst)
#define	psubsb(msrc, mdst)		mmx_m2m(psubsb, msrc, mdst)
464 | |||
465 | |||
/* Parallel SUB with Unsigned Saturation arithmetic on 4x16 (psubusw)
   and 8x8 (psubusb) fields.
*/
#define	psubusw_m2r(mem, dst)		mmx_m2r(psubusw, mem, dst)
#define	psubusw_r2r(src, dst)		mmx_r2r(psubusw, src, dst)
#define	psubusw(msrc, mdst)		mmx_m2m(psubusw, msrc, mdst)

#define	psubusb_m2r(mem, dst)		mmx_m2r(psubusb, mem, dst)
#define	psubusb_r2r(src, dst)		mmx_r2r(psubusb, src, dst)
#define	psubusb(msrc, mdst)		mmx_m2m(psubusb, msrc, mdst)
475 | |||
476 | |||
/* Parallel MUL on 4x16 fields, keeping the Low 4x16 portions of the
   results.
*/
#define	pmullw_m2r(mem, dst)		mmx_m2r(pmullw, mem, dst)
#define	pmullw_r2r(src, dst)		mmx_r2r(pmullw, src, dst)
#define	pmullw(msrc, mdst)		mmx_m2m(pmullw, msrc, mdst)
482 | |||
483 | |||
/* Parallel MUL on 4x16 fields, keeping the High 4x16 portions of the
   results.
*/
#define	pmulhw_m2r(mem, dst)		mmx_m2r(pmulhw, mem, dst)
#define	pmulhw_r2r(src, dst)		mmx_r2r(pmulhw, src, dst)
#define	pmulhw(msrc, mdst)		mmx_m2m(pmulhw, msrc, mdst)
489 | |||
490 | |||
/* Parallel Mul-ADD, 4x16 -> 2x32.
   The 16-bit fields are multiplied, then adjacent fields of the
   multiply result are added together to form the final 2x32 result.
*/
#define	pmaddwd_m2r(mem, dst)		mmx_m2r(pmaddwd, mem, dst)
#define	pmaddwd_r2r(src, dst)		mmx_r2r(pmaddwd, src, dst)
#define	pmaddwd(msrc, mdst)		mmx_m2m(pmaddwd, msrc, mdst)
498 | |||
499 | |||
/* 1x64 bitwise AND

   When BROKEN_PAND is defined the pand mnemonic is avoided and the AND
   is built from two pandn steps instead (pandn computes
   dest = ~dest & source): the first pandn against an all-ones constant
   inverts the destination, the second one inverts it back while ANDing
   in the source.  The memory-to-memory fallback goes through mm0.

   The fallback macros are wrapped in do { } while (0) so that each one
   expands to exactly one statement and must be followed by a semicolon,
   just like the single-instruction variants.
*/
#ifdef	BROKEN_PAND
#define	pand_m2r(var, reg) \
	do { \
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
		mmx_m2r(pandn, var, reg); \
	} while (0)
#define	pand_r2r(regs, regd) \
	do { \
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
		mmx_r2r(pandn, regs, regd); \
	} while (0)
#define	pand(vars, vard) \
	do { \
		movq_m2r(vard, mm0); \
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
		mmx_m2r(pandn, vars, mm0); \
		movq_r2m(mm0, vard); \
	} while (0)
#else
#define	pand_m2r(var, reg)		mmx_m2r(pand, var, reg)
#define	pand_r2r(regs, regd)		mmx_r2r(pand, regs, regd)
#define	pand(vars, vard)		mmx_m2m(pand, vars, vard)
#endif
525 | |||
526 | |||
/* Bitwise AND on 1x64, with the destination NOT-ed first.
*/
#define	pandn_m2r(mem, dst)		mmx_m2r(pandn, mem, dst)
#define	pandn_r2r(src, dst)		mmx_r2r(pandn, src, dst)
#define	pandn(msrc, mdst)		mmx_m2m(pandn, msrc, mdst)
532 | |||
533 | |||
/* Bitwise OR on 1x64.
*/
#define	por_m2r(mem, dst)		mmx_m2r(por, mem, dst)
#define	por_r2r(src, dst)		mmx_r2r(por, src, dst)
#define	por(msrc, mdst)			mmx_m2m(por, msrc, mdst)
539 | |||
540 | |||
/* Bitwise eXclusive OR on 1x64.
*/
#define	pxor_m2r(mem, dst)		mmx_m2r(pxor, mem, dst)
#define	pxor_r2r(src, dst)		mmx_r2r(pxor, src, dst)
#define	pxor(msrc, mdst)		mmx_m2m(pxor, msrc, mdst)
546 | |||
547 | |||
/* Parallel CoMPare for EQuality on 2x32 (pcmpeqd), 4x16 (pcmpeqw) and
   8x8 (pcmpeqb) fields.
   Every resulting field is either 0 or -1.
*/
#define	pcmpeqd_m2r(mem, dst)		mmx_m2r(pcmpeqd, mem, dst)
#define	pcmpeqd_r2r(src, dst)		mmx_r2r(pcmpeqd, src, dst)
#define	pcmpeqd(msrc, mdst)		mmx_m2m(pcmpeqd, msrc, mdst)

#define	pcmpeqw_m2r(mem, dst)		mmx_m2r(pcmpeqw, mem, dst)
#define	pcmpeqw_r2r(src, dst)		mmx_r2r(pcmpeqw, src, dst)
#define	pcmpeqw(msrc, mdst)		mmx_m2m(pcmpeqw, msrc, mdst)

#define	pcmpeqb_m2r(mem, dst)		mmx_m2r(pcmpeqb, mem, dst)
#define	pcmpeqb_r2r(src, dst)		mmx_r2r(pcmpeqb, src, dst)
#define	pcmpeqb(msrc, mdst)		mmx_m2m(pcmpeqb, msrc, mdst)
562 | |||
563 | |||
/* Parallel CoMPare for Greater Than on 2x32 (pcmpgtd), 4x16 (pcmpgtw)
   and 8x8 (pcmpgtb) fields.
   Every resulting field is either 0 or -1.
*/
#define	pcmpgtd_m2r(mem, dst)		mmx_m2r(pcmpgtd, mem, dst)
#define	pcmpgtd_r2r(src, dst)		mmx_r2r(pcmpgtd, src, dst)
#define	pcmpgtd(msrc, mdst)		mmx_m2m(pcmpgtd, msrc, mdst)

#define	pcmpgtw_m2r(mem, dst)		mmx_m2r(pcmpgtw, mem, dst)
#define	pcmpgtw_r2r(src, dst)		mmx_r2r(pcmpgtw, src, dst)
#define	pcmpgtw(msrc, mdst)		mmx_m2m(pcmpgtw, msrc, mdst)

#define	pcmpgtb_m2r(mem, dst)		mmx_m2r(pcmpgtb, mem, dst)
#define	pcmpgtb_r2r(src, dst)		mmx_r2r(pcmpgtb, src, dst)
#define	pcmpgtb(msrc, mdst)		mmx_m2m(pcmpgtb, msrc, mdst)
578 | |||
579 | |||
/* Parallel Shift Left Logical on 1x64 (psllq), 2x32 (pslld) and
   4x16 (psllw) fields.
   Besides the usual three forms, each width also has an _i2r form
   that takes the shift count as an immediate-style operand.
*/
#define	psllq_i2r(count, dst)		mmx_i2r(psllq, count, dst)
#define	psllq_m2r(mem, dst)		mmx_m2r(psllq, mem, dst)
#define	psllq_r2r(src, dst)		mmx_r2r(psllq, src, dst)
#define	psllq(msrc, mdst)		mmx_m2m(psllq, msrc, mdst)

#define	pslld_i2r(count, dst)		mmx_i2r(pslld, count, dst)
#define	pslld_m2r(mem, dst)		mmx_m2r(pslld, mem, dst)
#define	pslld_r2r(src, dst)		mmx_r2r(pslld, src, dst)
#define	pslld(msrc, mdst)		mmx_m2m(pslld, msrc, mdst)

#define	psllw_i2r(count, dst)		mmx_i2r(psllw, count, dst)
#define	psllw_m2r(mem, dst)		mmx_m2r(psllw, mem, dst)
#define	psllw_r2r(src, dst)		mmx_r2r(psllw, src, dst)
#define	psllw(msrc, mdst)		mmx_m2m(psllw, msrc, mdst)
596 | |||
597 | |||
/* Parallel Shift Right Logical on 1x64 (psrlq), 2x32 (psrld) and
   4x16 (psrlw) fields.
   Besides the usual three forms, each width also has an _i2r form
   that takes the shift count as an immediate-style operand.
*/
#define	psrlq_i2r(count, dst)		mmx_i2r(psrlq, count, dst)
#define	psrlq_m2r(mem, dst)		mmx_m2r(psrlq, mem, dst)
#define	psrlq_r2r(src, dst)		mmx_r2r(psrlq, src, dst)
#define	psrlq(msrc, mdst)		mmx_m2m(psrlq, msrc, mdst)

#define	psrld_i2r(count, dst)		mmx_i2r(psrld, count, dst)
#define	psrld_m2r(mem, dst)		mmx_m2r(psrld, mem, dst)
#define	psrld_r2r(src, dst)		mmx_r2r(psrld, src, dst)
#define	psrld(msrc, mdst)		mmx_m2m(psrld, msrc, mdst)

#define	psrlw_i2r(count, dst)		mmx_i2r(psrlw, count, dst)
#define	psrlw_m2r(mem, dst)		mmx_m2r(psrlw, mem, dst)
#define	psrlw_r2r(src, dst)		mmx_r2r(psrlw, src, dst)
#define	psrlw(msrc, mdst)		mmx_m2m(psrlw, msrc, mdst)
614 | |||
615 | |||
/* Parallel Shift Right Arithmetic on 2x32 (psrad) and 4x16 (psraw)
   fields.
   Besides the usual three forms, each width also has an _i2r form
   that takes the shift count as an immediate-style operand.
*/
#define	psrad_i2r(count, dst)		mmx_i2r(psrad, count, dst)
#define	psrad_m2r(mem, dst)		mmx_m2r(psrad, mem, dst)
#define	psrad_r2r(src, dst)		mmx_r2r(psrad, src, dst)
#define	psrad(msrc, mdst)		mmx_m2m(psrad, msrc, mdst)

#define	psraw_i2r(count, dst)		mmx_i2r(psraw, count, dst)
#define	psraw_m2r(mem, dst)		mmx_m2r(psraw, mem, dst)
#define	psraw_r2r(src, dst)		mmx_r2r(psraw, src, dst)
#define	psraw(msrc, mdst)		mmx_m2m(psraw, msrc, mdst)
627 | |||
628 | |||
/* PACK and Signed Saturate: 2x32 -> 4x16 (packssdw) and
   4x16 -> 8x8 (packsswb).
   The fields of both the source and the dest operand are packed into
   the dest.
*/
#define	packssdw_m2r(mem, dst)		mmx_m2r(packssdw, mem, dst)
#define	packssdw_r2r(src, dst)		mmx_r2r(packssdw, src, dst)
#define	packssdw(msrc, mdst)		mmx_m2m(packssdw, msrc, mdst)

#define	packsswb_m2r(mem, dst)		mmx_m2r(packsswb, mem, dst)
#define	packsswb_r2r(src, dst)		mmx_r2r(packsswb, src, dst)
#define	packsswb(msrc, mdst)		mmx_m2m(packsswb, msrc, mdst)
639 | |||
640 | |||
/* PACK and Unsigned Saturate: 4x16 -> 8x8.
   The fields of both the source and the dest operand are packed into
   the dest.
*/
#define	packuswb_m2r(mem, dst)		mmx_m2r(packuswb, mem, dst)
#define	packuswb_r2r(src, dst)		mmx_r2r(packuswb, src, dst)
#define	packuswb(msrc, mdst)		mmx_m2m(packuswb, msrc, mdst)
647 | |||
648 | |||
/* UNPaCK Low: 2x32 -> 1x64 (punpckldq), 4x16 -> 2x32 (punpcklwd) and
   8x8 -> 4x16 (punpcklbw).
   The low half of the dest is interleaved with the low half of the
   source, which acts as padding in each result field.
*/
#define	punpckldq_m2r(mem, dst)		mmx_m2r(punpckldq, mem, dst)
#define	punpckldq_r2r(src, dst)		mmx_r2r(punpckldq, src, dst)
#define	punpckldq(msrc, mdst)		mmx_m2m(punpckldq, msrc, mdst)

#define	punpcklwd_m2r(mem, dst)		mmx_m2r(punpcklwd, mem, dst)
#define	punpcklwd_r2r(src, dst)		mmx_r2r(punpcklwd, src, dst)
#define	punpcklwd(msrc, mdst)		mmx_m2m(punpcklwd, msrc, mdst)

#define	punpcklbw_m2r(mem, dst)		mmx_m2r(punpcklbw, mem, dst)
#define	punpcklbw_r2r(src, dst)		mmx_r2r(punpcklbw, src, dst)
#define	punpcklbw(msrc, mdst)		mmx_m2m(punpcklbw, msrc, mdst)
664 | |||
665 | |||
/* UNPaCK High: 2x32 -> 1x64 (punpckhdq), 4x16 -> 2x32 (punpckhwd) and
   8x8 -> 4x16 (punpckhbw).
   The high half of the dest is interleaved with the high half of the
   source, which acts as padding in each result field.
*/
#define	punpckhdq_m2r(mem, dst)		mmx_m2r(punpckhdq, mem, dst)
#define	punpckhdq_r2r(src, dst)		mmx_r2r(punpckhdq, src, dst)
#define	punpckhdq(msrc, mdst)		mmx_m2m(punpckhdq, msrc, mdst)

#define	punpckhwd_m2r(mem, dst)		mmx_m2r(punpckhwd, mem, dst)
#define	punpckhwd_r2r(src, dst)		mmx_r2r(punpckhwd, src, dst)
#define	punpckhwd(msrc, mdst)		mmx_m2m(punpckhwd, msrc, mdst)

#define	punpckhbw_m2r(mem, dst)		mmx_m2r(punpckhbw, mem, dst)
#define	punpckhbw_r2r(src, dst)		mmx_r2r(punpckhbw, src, dst)
#define	punpckhbw(msrc, mdst)		mmx_m2m(punpckhbw, msrc, mdst)
681 | |||
682 | |||
/* Empty MMX State
   (used to clean-up when going from mmx to float use
   of the registers that are shared by both; note that
   there is no float-to-mmx operation needed, because
   only the float tag word info is corruptible)

   With MMX_TRACE defined the call is also logged to stderr; that
   variant is wrapped in do { } while (0) so that emms() is a single
   statement in both configurations and needs a trailing semicolon.
*/
#ifdef	MMX_TRACE

#define	emms() \
	do { \
		fprintf(stderr, "emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	} while (0)

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif
702 | |||
703 | #endif | ||
704 | |||