/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef float __m128 __attribute__((__vector_size__(16)));

static inline __m128 __attribute__((__always_inline__)) _mm_add_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_addss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static inline __m128 __attribute__((__always_inline__)) _mm_sub_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_subss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static inline __m128 __attribute__((__always_inline__)) _mm_mul_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_mulss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static inline __m128 __attribute__((__always_inline__)) _mm_div_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_divss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

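/* Usage note (illustrative sketch, not part of the original header): the
 * "_ss" forms operate only on the lowest element and pass the upper three
 * elements of the first operand through unchanged, while the "_ps" forms
 * operate on all four elements. For example:
 *
 *   __m128 a = _mm_set_ps(8.0f, 6.0f, 4.0f, 2.0f); // a = {2, 4, 6, 8}
 *   __m128 b = _mm_set1_ps(1.0f);                  // b = {1, 1, 1, 1}
 *   __m128 s = _mm_add_ss(a, b); // s = {3, 4, 6, 8}; only element 0 changes
 *   __m128 p = _mm_add_ps(a, b); // p = {3, 5, 7, 9}; all elements change
 */
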
static inline __m128 __attribute__((__always_inline__)) _mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_and_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_andps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_andnot_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_andnps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_or_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_orps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_xor_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_xorps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpeqss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpeqps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpless(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpleps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltss(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltps(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpless(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpleps(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpneqss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpneqps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnless(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnleps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltss(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltps(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnless(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnleps(b, a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpordss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpordps(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpunordss(a, b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpunordps(a, b);
}

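/* Usage note (illustrative sketch, not part of the original header): each
 * packed comparison yields a per-element bit mask, all ones where the
 * relation holds and all zeros where it does not, which composes with the
 * logical operations above into branch-free selects:
 *
 *   __m128 m = _mm_cmpgt_ps(a, b);             // mask of elements where a > b
 *   __m128 r = _mm_or_ps(_mm_and_ps(m, a),     // take a where the mask is set
 *                        _mm_andnot_ps(m, b)); // take b elsewhere
 */
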
static inline int __attribute__((__always_inline__)) _mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

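/* Usage note (illustrative sketch, not part of the original header): unlike
 * the packed comparisons, these compare only the lowest elements and return
 * a plain int, so they can drive ordinary control flow; the "ucomi" forms
 * differ only in not raising an invalid-operation exception on quiet NaNs:
 *
 *   if (_mm_comilt_ss(a, b)) {
 *     // a[0] < b[0], and neither value was a NaN
 *   }
 */
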
static inline int __attribute__((__always_inline__)) _mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

#ifdef __x86_64__

static inline long long __attribute__((__always_inline__)) _mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static inline __m64 __attribute__((__always_inline__)) _mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static inline int __attribute__((__always_inline__)) _mm_cvttss_si32(__m128 a)
{
  return __builtin_ia32_cvttss2si(a);
}

#ifdef __x86_64__

static inline long long __attribute__((__always_inline__)) _mm_cvttss_si64(__m128 a)
{
  return __builtin_ia32_cvttss2si64(a);
}

#endif

static inline __m64 __attribute__((__always_inline__)) _mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtsi32_ss(__m128 a, int b)
{
  return __builtin_ia32_cvtsi2ss(a, b);
}

#ifdef __x86_64__

static inline __m128 __attribute__((__always_inline__)) _mm_cvtsi64_ss(__m128 a, long long b)
{
  return __builtin_ia32_cvtsi642ss(a, b);
}

#endif

static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi16_ps(__m64 a)
{
  /* FIXME: Implement */
  return (__m128){ 0, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtpu16_ps(__m64 a)
{
  /* FIXME: Implement */
  return (__m128){ 0, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi8_ps(__m64 a)
{
  /* FIXME: Implement */
  return (__m128){ 0, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtpu8_ps(__m64 a)
{
  /* FIXME: Implement */
  return (__m128){ 0, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  /* FIXME: Implement */
  return (__m128){ 0, 0, 0, 0 };
}

static inline __m64 __attribute__((__always_inline__)) _mm_cvtps_pi16(__m128 a)
{
  /* FIXME: Implement */
  return _mm_setzero_si64();
}

static inline __m64 __attribute__((__always_inline__)) _mm_cvtps_pi8(__m128 a)
{
  /* FIXME: Implement */
  return _mm_setzero_si64();
}

static inline float __attribute__((__always_inline__)) _mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static inline __m128 __attribute__((__always_inline__)) _mm_loadh_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadhps(a, (__v2si *)p);
}

static inline __m128 __attribute__((__always_inline__)) _mm_loadl_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadlps(a, (__v2si *)p);
}

static inline __m128 __attribute__((__always_inline__)) _mm_load_ss(float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__)) _mm_load1_ps(float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

static inline __m128 __attribute__((__always_inline__)) _mm_load_ps(float *p)
{
  return *(__m128 *)p;
}

static inline __m128 __attribute__((__always_inline__)) _mm_loadu_ps(float *p)
{
  return __builtin_ia32_loadups(p);
}

static inline __m128 __attribute__((__always_inline__)) _mm_loadr_ps(float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

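/* Usage note (illustrative sketch, not part of the original header):
 * _mm_load_ps requires p to be 16-byte aligned, while _mm_loadu_ps accepts
 * any address at some cost in speed. For example:
 *
 *   float buf[4] __attribute__((aligned(16))) = { 1.0f, 2.0f, 3.0f, 4.0f };
 *   __m128 v = _mm_load_ps(buf);  // v = {1, 2, 3, 4}
 *   __m128 r = _mm_loadr_ps(buf); // r = {4, 3, 2, 1}
 */
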
static inline __m128 __attribute__((__always_inline__)) _mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__)) _mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

static inline __m128 __attribute__((__always_inline__)) _mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static inline __m128 __attribute__((__always_inline__)) _mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

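/* Usage note (illustrative sketch, not part of the original header):
 * _mm_set_ps lists elements from highest to lowest, _mm_setr_ps from lowest
 * to highest:
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // a = {1, 2, 3, 4}
 *   __m128 b = _mm_setr_ps(4.0f, 3.0f, 2.0f, 1.0f); // b = {4, 3, 2, 1}
 */
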
static inline __m128 __attribute__((__always_inline__)) _mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static inline void __attribute__((__always_inline__)) _mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__)) _mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__)) _mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static inline void __attribute__((__always_inline__)) _mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static inline void __attribute__((__always_inline__)) _mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))

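/* Usage note (illustrative sketch, not part of the original header; "data"
 * and "i" are hypothetical): prefetch ahead of a streaming loop into all
 * cache levels:
 *
 *   _mm_prefetch((char *)&data[i + 16], _MM_HINT_T0);
 */
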
static inline void __attribute__((__always_inline__)) _mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static inline int __attribute__((__always_inline__)) _mm_extract_pi16(__m64 a, int n)
{
  /* FIXME:
   * This should force n to be an immediate.
   * This does not use the PEXTRW instruction. From looking at the LLVM
   * source, the instruction doesn't seem to be hooked up.
   * The code could probably be made better :)
   */
  __v4hi b = (__v4hi)a;
  return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))];
}

static inline __m64 __attribute__((__always_inline__)) _mm_insert_pi16(__m64 a, int d, int n)
{
  /* FIXME: Implement this. We could add a __builtin_insertelement function
   * that's similar to the already existing __builtin_shufflevector.
   */
  return (__m64){ 0LL };
}

static inline __m64 __attribute__((__always_inline__)) _mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static inline int __attribute__((__always_inline__)) _mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static inline __m64 __attribute__((__always_inline__)) _mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)(a), (n)))

static inline void __attribute__((__always_inline__)) _mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static inline __m64 __attribute__((__always_inline__)) _mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static inline unsigned int __attribute__((__always_inline__)) _mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static inline void __attribute__((__always_inline__)) _mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps((a), (b), (mask)))

static inline __m128 __attribute__((__always_inline__)) _mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static inline __m128 __attribute__((__always_inline__)) _mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static inline __m128 __attribute__((__always_inline__)) _mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__)) _mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static inline __m128 __attribute__((__always_inline__)) _mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static inline int __attribute__((__always_inline__)) _mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

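/* Usage note (illustrative sketch, not part of the original header):
 * _MM_SHUFFLE packs four 2-bit element indices into the immediate expected
 * by _mm_shuffle_ps and _mm_shuffle_pi16; the first macro argument selects
 * the highest result element. With _mm_shuffle_ps, the two low result
 * elements come from a and the two high ones from b:
 *
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 2, 0));
 *   // r = { a[0], a[2], b[1], b[3] }
 */
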
#define _MM_MASK_MASK (0x1f80)
#define _MM_EXCEPT_MASK (0x003f)
#define _MM_FLUSH_MASK (0x8000)
#define _MM_ROUND_MASK (0x6000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

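/* Usage note (illustrative sketch, not part of the original header): these
 * macros read-modify-write the MXCSR register via _mm_getcsr/_mm_setcsr;
 * for example, enabling flush-to-zero mode for denormal results:
 *
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_MASK);      // set the FTZ bit
 *   unsigned int ftz = _MM_GET_FLUSH_ZERO_MODE(); // now reads back 0x8000
 */
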
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

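/* Usage note (illustrative sketch, not part of the original header):
 * transpose a row-major 4x4 matrix held in four vectors, in place:
 *
 *   __m128 r0 = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
 *   __m128 r1 = _mm_setr_ps( 5.0f,  6.0f,  7.0f,  8.0f);
 *   __m128 r2 = _mm_setr_ps( 9.0f, 10.0f, 11.0f, 12.0f);
 *   __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 *   // r0 = {1, 5, 9, 13}, r1 = {2, 6, 10, 14}, and so on
 */
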
#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */