/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 128-bit vector types: two doubles, and two 64-bit integers. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal lane views of a 128-bit vector, used for casts in this header. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d __a, __m128d __b)
44{
45 __a[0] += __b[0];
46 return __a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d __a, __m128d __b)
51{
52 return __a + __b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d __a, __m128d __b)
57{
58 __a[0] -= __b[0];
59 return __a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d __a, __m128d __b)
64{
65 return __a - __b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d __a, __m128d __b)
70{
71 __a[0] *= __b[0];
72 return __a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d __a, __m128d __b)
77{
78 return __a * __b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d __a, __m128d __b)
83{
84 __a[0] /= __b[0];
85 return __a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d __a, __m128d __b)
90{
91 return __a / __b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d __a, __m128d __b)
96{
97 __m128d __c = __builtin_ia32_sqrtsd(__b);
98 return (__m128d) { __c[0], __a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d __a)
103{
104 return __builtin_ia32_sqrtpd(__a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d __a, __m128d __b)
109{
110 return __builtin_ia32_minsd(__a, __b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d __a, __m128d __b)
115{
116 return __builtin_ia32_minpd(__a, __b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d __a, __m128d __b)
121{
122 return __builtin_ia32_maxsd(__a, __b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d __a, __m128d __b)
127{
128 return __builtin_ia32_maxpd(__a, __b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d __a, __m128d __b)
133{
134 return (__m128d)((__v4si)__a & (__v4si)__b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d __a, __m128d __b)
139{
140 return (__m128d)(~(__v4si)__a & (__v4si)__b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d __a, __m128d __b)
145{
146 return (__m128d)((__v4si)__a | (__v4si)__b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d __a, __m128d __b)
151{
152 return (__m128d)((__v4si)__a ^ (__v4si)__b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d __a, __m128d __b)
157{
158 return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d __a, __m128d __b)
163{
164 return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d __a, __m128d __b)
169{
170 return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d __a, __m128d __b)
175{
176 return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d __a, __m128d __b)
181{
182 return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d __a, __m128d __b)
187{
188 return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d __a, __m128d __b)
193{
194 return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d __a, __m128d __b)
199{
200 return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d __a, __m128d __b)
205{
206 return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d __a, __m128d __b)
211{
212 return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d __a, __m128d __b)
217{
218 return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d __a, __m128d __b)
223{
224 return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d __a, __m128d __b)
229{
230 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d __a, __m128d __b)
235{
236 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d __a, __m128d __b)
241{
242 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d __a, __m128d __b)
247{
248 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
249 return (__m128d) { __c[0], __a[1] };
250}
251
252static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
253_mm_cmpge_sd(__m128d __a, __m128d __b)
254{
255 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
256 return (__m128d) { __c[0], __a[1] };
257}
258
259static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
260_mm_cmpord_sd(__m128d __a, __m128d __b)
261{
262 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
263}
264
265static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
266_mm_cmpunord_sd(__m128d __a, __m128d __b)
267{
268 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
269}
270
271static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
272_mm_cmpneq_sd(__m128d __a, __m128d __b)
273{
274 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
275}
276
277static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
278_mm_cmpnlt_sd(__m128d __a, __m128d __b)
279{
280 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
281}
282
283static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
284_mm_cmpnle_sd(__m128d __a, __m128d __b)
285{
286 return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
287}
288
289static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
290_mm_cmpngt_sd(__m128d __a, __m128d __b)
291{
292 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
293 return (__m128d) { __c[0], __a[1] };
294}
295
296static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
297_mm_cmpnge_sd(__m128d __a, __m128d __b)
298{
299 __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
300 return (__m128d) { __c[0], __a[1] };
301}
302
303static __inline__ int __attribute__((__always_inline__, __nodebug__))
304_mm_comieq_sd(__m128d __a, __m128d __b)
305{
306 return __builtin_ia32_comisdeq(__a, __b);
307}
308
309static __inline__ int __attribute__((__always_inline__, __nodebug__))
310_mm_comilt_sd(__m128d __a, __m128d __b)
311{
312 return __builtin_ia32_comisdlt(__a, __b);
313}
314
315static __inline__ int __attribute__((__always_inline__, __nodebug__))
316_mm_comile_sd(__m128d __a, __m128d __b)
317{
318 return __builtin_ia32_comisdle(__a, __b);
319}
320
321static __inline__ int __attribute__((__always_inline__, __nodebug__))
322_mm_comigt_sd(__m128d __a, __m128d __b)
323{
324 return __builtin_ia32_comisdgt(__a, __b);
325}
326
327static __inline__ int __attribute__((__always_inline__, __nodebug__))
328_mm_comige_sd(__m128d __a, __m128d __b)
329{
330 return __builtin_ia32_comisdge(__a, __b);
331}
332
333static __inline__ int __attribute__((__always_inline__, __nodebug__))
334_mm_comineq_sd(__m128d __a, __m128d __b)
335{
336 return __builtin_ia32_comisdneq(__a, __b);
337}
338
339static __inline__ int __attribute__((__always_inline__, __nodebug__))
340_mm_ucomieq_sd(__m128d __a, __m128d __b)
341{
342 return __builtin_ia32_ucomisdeq(__a, __b);
343}
344
345static __inline__ int __attribute__((__always_inline__, __nodebug__))
346_mm_ucomilt_sd(__m128d __a, __m128d __b)
347{
348 return __builtin_ia32_ucomisdlt(__a, __b);
349}
350
351static __inline__ int __attribute__((__always_inline__, __nodebug__))
352_mm_ucomile_sd(__m128d __a, __m128d __b)
353{
354 return __builtin_ia32_ucomisdle(__a, __b);
355}
356
357static __inline__ int __attribute__((__always_inline__, __nodebug__))
358_mm_ucomigt_sd(__m128d __a, __m128d __b)
359{
360 return __builtin_ia32_ucomisdgt(__a, __b);
361}
362
363static __inline__ int __attribute__((__always_inline__, __nodebug__))
364_mm_ucomige_sd(__m128d __a, __m128d __b)
365{
366 return __builtin_ia32_ucomisdge(__a, __b);
367}
368
369static __inline__ int __attribute__((__always_inline__, __nodebug__))
370_mm_ucomineq_sd(__m128d __a, __m128d __b)
371{
372 return __builtin_ia32_ucomisdneq(__a, __b);
373}
374
375static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
376_mm_cvtpd_ps(__m128d __a)
377{
378 return __builtin_ia32_cvtpd2ps(__a);
379}
380
381static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
382_mm_cvtps_pd(__m128 __a)
383{
384 return __builtin_ia32_cvtps2pd(__a);
385}
386
387static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
388_mm_cvtepi32_pd(__m128i __a)
389{
390 return __builtin_ia32_cvtdq2pd((__v4si)__a);
391}
392
393static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
394_mm_cvtpd_epi32(__m128d __a)
395{
396 return __builtin_ia32_cvtpd2dq(__a);
397}
398
399static __inline__ int __attribute__((__always_inline__, __nodebug__))
400_mm_cvtsd_si32(__m128d __a)
401{
402 return __builtin_ia32_cvtsd2si(__a);
403}
404
405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
406_mm_cvtsd_ss(__m128 __a, __m128d __b)
407{
408 __a[0] = __b[0];
409 return __a;
410}
411
412static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
413_mm_cvtsi32_sd(__m128d __a, int __b)
414{
415 __a[0] = __b;
416 return __a;
417}
418
419static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
420_mm_cvtss_sd(__m128d __a, __m128 __b)
421{
422 __a[0] = __b[0];
423 return __a;
424}
425
426static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
427_mm_cvttpd_epi32(__m128d __a)
428{
429 return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430}
431
432static __inline__ int __attribute__((__always_inline__, __nodebug__))
433_mm_cvttsd_si32(__m128d __a)
434{
435 return __a[0];
436}
437
438static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
439_mm_cvtpd_pi32(__m128d __a)
440{
441 return (__m64)__builtin_ia32_cvtpd2pi(__a);
442}
443
444static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
445_mm_cvttpd_pi32(__m128d __a)
446{
447 return (__m64)__builtin_ia32_cvttpd2pi(__a);
448}
449
450static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
451_mm_cvtpi32_pd(__m64 __a)
452{
453 return __builtin_ia32_cvtpi2pd((__v2si)__a);
454}
455
456static __inline__ double __attribute__((__always_inline__, __nodebug__))
457_mm_cvtsd_f64(__m128d __a)
458{
459 return __a[0];
460}
461
462static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
463_mm_load_pd(double const *__dp)
464{
465 return *(__m128d*)__dp;
466}
467
468static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
469_mm_load1_pd(double const *__dp)
470{
471 struct __mm_load1_pd_struct {
472 double __u;
473 } __attribute__((__packed__, __may_alias__));
474 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475 return (__m128d){ __u, __u };
476}
477
478#define _mm_load_pd1(dp) _mm_load1_pd(dp)
479
480static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
481_mm_loadr_pd(double const *__dp)
482{
483 __m128d __u = *(__m128d*)__dp;
484 return __builtin_shufflevector(__u, __u, 1, 0);
485}
486
487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488_mm_loadu_pd(double const *__dp)
489{
490 struct __loadu_pd {
491 __m128d __v;
492 } __attribute__((packed, may_alias));
493 return ((struct __loadu_pd*)__dp)->__v;
494}
495
496static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497_mm_load_sd(double const *__dp)
498{
499 struct __mm_load_sd_struct {
500 double __u;
501 } __attribute__((__packed__, __may_alias__));
502 double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503 return (__m128d){ __u, 0 };
504}
505
506static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
507_mm_loadh_pd(__m128d __a, double const *__dp)
508{
509 struct __mm_loadh_pd_struct {
510 double __u;
511 } __attribute__((__packed__, __may_alias__));
512 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513 return (__m128d){ __a[0], __u };
514}
515
516static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
517_mm_loadl_pd(__m128d __a, double const *__dp)
518{
519 struct __mm_loadl_pd_struct {
520 double __u;
521 } __attribute__((__packed__, __may_alias__));
522 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523 return (__m128d){ __u, __a[1] };
524}
525
526static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
527_mm_set_sd(double __w)
528{
529 return (__m128d){ __w, 0 };
530}
531
532static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
533_mm_set1_pd(double __w)
534{
535 return (__m128d){ __w, __w };
536}
537
538static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
539_mm_set_pd(double __w, double __x)
540{
541 return (__m128d){ __x, __w };
542}
543
544static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
545_mm_setr_pd(double __w, double __x)
546{
547 return (__m128d){ __w, __x };
548}
549
550static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
551_mm_setzero_pd(void)
552{
553 return (__m128d){ 0, 0 };
554}
555
556static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
557_mm_move_sd(__m128d __a, __m128d __b)
558{
559 return (__m128d){ __b[0], __a[1] };
560}
561
562static __inline__ void __attribute__((__always_inline__, __nodebug__))
563_mm_store_sd(double *__dp, __m128d __a)
564{
565 struct __mm_store_sd_struct {
566 double __u;
567 } __attribute__((__packed__, __may_alias__));
568 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569}
570
571static __inline__ void __attribute__((__always_inline__, __nodebug__))
572_mm_store1_pd(double *__dp, __m128d __a)
573{
574 struct __mm_store1_pd_struct {
575 double __u[2];
576 } __attribute__((__packed__, __may_alias__));
577 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579}
580
581static __inline__ void __attribute__((__always_inline__, __nodebug__))
582_mm_store_pd(double *__dp, __m128d __a)
583{
584 *(__m128d *)__dp = __a;
585}
586
587static __inline__ void __attribute__((__always_inline__, __nodebug__))
588_mm_storeu_pd(double *__dp, __m128d __a)
589{
590 __builtin_ia32_storeupd(__dp, __a);
591}
592
593static __inline__ void __attribute__((__always_inline__, __nodebug__))
594_mm_storer_pd(double *__dp, __m128d __a)
595{
596 __a = __builtin_shufflevector(__a, __a, 1, 0);
597 *(__m128d *)__dp = __a;
598}
599
600static __inline__ void __attribute__((__always_inline__, __nodebug__))
601_mm_storeh_pd(double *__dp, __m128d __a)
602{
603 struct __mm_storeh_pd_struct {
604 double __u;
605 } __attribute__((__packed__, __may_alias__));
606 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607}
608
609static __inline__ void __attribute__((__always_inline__, __nodebug__))
610_mm_storel_pd(double *__dp, __m128d __a)
611{
612 struct __mm_storeh_pd_struct {
613 double __u;
614 } __attribute__((__packed__, __may_alias__));
615 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616}
617
618static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619_mm_add_epi8(__m128i __a, __m128i __b)
620{
621 return (__m128i)((__v16qi)__a + (__v16qi)__b);
622}
623
624static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625_mm_add_epi16(__m128i __a, __m128i __b)
626{
627 return (__m128i)((__v8hi)__a + (__v8hi)__b);
628}
629
630static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631_mm_add_epi32(__m128i __a, __m128i __b)
632{
633 return (__m128i)((__v4si)__a + (__v4si)__b);
634}
635
636static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
637_mm_add_si64(__m64 __a, __m64 __b)
638{
639 return __a + __b;
640}
641
642static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643_mm_add_epi64(__m128i __a, __m128i __b)
644{
645 return __a + __b;
646}
647
648static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649_mm_adds_epi8(__m128i __a, __m128i __b)
650{
651 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652}
653
654static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655_mm_adds_epi16(__m128i __a, __m128i __b)
656{
657 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658}
659
660static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661_mm_adds_epu8(__m128i __a, __m128i __b)
662{
663 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664}
665
666static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667_mm_adds_epu16(__m128i __a, __m128i __b)
668{
669 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670}
671
672static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673_mm_avg_epu8(__m128i __a, __m128i __b)
674{
675 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676}
677
678static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679_mm_avg_epu16(__m128i __a, __m128i __b)
680{
681 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682}
683
684static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
685_mm_madd_epi16(__m128i __a, __m128i __b)
686{
687 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688}
689
690static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691_mm_max_epi16(__m128i __a, __m128i __b)
692{
693 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694}
695
696static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697_mm_max_epu8(__m128i __a, __m128i __b)
698{
699 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700}
701
702static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703_mm_min_epi16(__m128i __a, __m128i __b)
704{
705 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706}
707
708static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709_mm_min_epu8(__m128i __a, __m128i __b)
710{
711 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712}
713
714static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715_mm_mulhi_epi16(__m128i __a, __m128i __b)
716{
717 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718}
719
720static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
721_mm_mulhi_epu16(__m128i __a, __m128i __b)
722{
723 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724}
725
726static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727_mm_mullo_epi16(__m128i __a, __m128i __b)
728{
729 return (__m128i)((__v8hi)__a * (__v8hi)__b);
730}
731
732static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
733_mm_mul_su32(__m64 __a, __m64 __b)
734{
735 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736}
737
738static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739_mm_mul_epu32(__m128i __a, __m128i __b)
740{
741 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742}
743
744static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745_mm_sad_epu8(__m128i __a, __m128i __b)
746{
747 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748}
749
750static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751_mm_sub_epi8(__m128i __a, __m128i __b)
752{
753 return (__m128i)((__v16qi)__a - (__v16qi)__b);
754}
755
756static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757_mm_sub_epi16(__m128i __a, __m128i __b)
758{
759 return (__m128i)((__v8hi)__a - (__v8hi)__b);
760}
761
762static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763_mm_sub_epi32(__m128i __a, __m128i __b)
764{
765 return (__m128i)((__v4si)__a - (__v4si)__b);
766}
767
768static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
769_mm_sub_si64(__m64 __a, __m64 __b)
770{
771 return __a - __b;
772}
773
774static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775_mm_sub_epi64(__m128i __a, __m128i __b)
776{
777 return __a - __b;
778}
779
780static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
781_mm_subs_epi8(__m128i __a, __m128i __b)
782{
783 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784}
785
786static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787_mm_subs_epi16(__m128i __a, __m128i __b)
788{
789 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790}
791
792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793_mm_subs_epu8(__m128i __a, __m128i __b)
794{
795 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796}
797
798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799_mm_subs_epu16(__m128i __a, __m128i __b)
800{
801 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802}
803
804static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805_mm_and_si128(__m128i __a, __m128i __b)
806{
807 return __a & __b;
808}
809
810static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811_mm_andnot_si128(__m128i __a, __m128i __b)
812{
813 return ~__a & __b;
814}
815
816static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817_mm_or_si128(__m128i __a, __m128i __b)
818{
819 return __a | __b;
820}
821
822static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823_mm_xor_si128(__m128i __a, __m128i __b)
824{
825 return __a ^ __b;
826}
827
/* Whole-register left shift via the PSLLDQ builtin. The `count` argument is
 * multiplied by 8 before it reaches the builtin (presumably converting a byte
 * count to a bit count - confirm against the builtin's contract). The local
 * __a copy evaluates `a` once; -Wshadow is silenced around its declaration. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count) * 8); \
})
833
834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835_mm_slli_epi16(__m128i __a, int __count)
836{
837 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
838}
839
840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841_mm_sll_epi16(__m128i __a, __m128i __count)
842{
843 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
844}
845
846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847_mm_slli_epi32(__m128i __a, int __count)
848{
849 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
850}
851
852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853_mm_sll_epi32(__m128i __a, __m128i __count)
854{
855 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
856}
857
858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859_mm_slli_epi64(__m128i __a, int __count)
860{
861 return __builtin_ia32_psllqi128(__a, __count);
862}
863
864static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865_mm_sll_epi64(__m128i __a, __m128i __count)
866{
867 return __builtin_ia32_psllq128(__a, __count);
868}
869
870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871_mm_srai_epi16(__m128i __a, int __count)
872{
873 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
874}
875
876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877_mm_sra_epi16(__m128i __a, __m128i __count)
878{
879 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
880}
881
882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883_mm_srai_epi32(__m128i __a, int __count)
884{
885 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
886}
887
888static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
889_mm_sra_epi32(__m128i __a, __m128i __count)
890{
891 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
892}
893
894
/* Whole-register right shift via the PSRLDQ builtin. The `count` argument is
 * multiplied by 8 before it reaches the builtin (presumably converting a byte
 * count to a bit count - confirm against the builtin's contract). The local
 * __a copy evaluates `a` once; -Wshadow is silenced around its declaration. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count) * 8); \
})
900
901static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
902_mm_srli_epi16(__m128i __a, int __count)
903{
904 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
905}
906
907static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
908_mm_srl_epi16(__m128i __a, __m128i __count)
909{
910 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
911}
912
913static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
914_mm_srli_epi32(__m128i __a, int __count)
915{
916 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
917}
918
919static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
920_mm_srl_epi32(__m128i __a, __m128i __count)
921{
922 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
923}
924
925static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
926_mm_srli_epi64(__m128i __a, int __count)
927{
928 return __builtin_ia32_psrlqi128(__a, __count);
929}
930
931static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
932_mm_srl_epi64(__m128i __a, __m128i __count)
933{
934 return __builtin_ia32_psrlq128(__a, __count);
935}
936
937static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
938_mm_cmpeq_epi8(__m128i __a, __m128i __b)
939{
940 return (__m128i)((__v16qi)__a == (__v16qi)__b);
941}
942
943static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
944_mm_cmpeq_epi16(__m128i __a, __m128i __b)
945{
946 return (__m128i)((__v8hi)__a == (__v8hi)__b);
947}
948
949static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
950_mm_cmpeq_epi32(__m128i __a, __m128i __b)
951{
952 return (__m128i)((__v4si)__a == (__v4si)__b);
953}
954
955static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
956_mm_cmpgt_epi8(__m128i __a, __m128i __b)
957{
958 /* This function always performs a signed comparison, but __v16qi is a char
959 which may be signed or unsigned. */
960 typedef signed char __v16qs __attribute__((__vector_size__(16)));
961 return (__m128i)((__v16qs)__a > (__v16qs)__b);
962}
963
964static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
965_mm_cmpgt_epi16(__m128i __a, __m128i __b)
966{
967 return (__m128i)((__v8hi)__a > (__v8hi)__b);
968}
969
970static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
971_mm_cmpgt_epi32(__m128i __a, __m128i __b)
972{
973 return (__m128i)((__v4si)__a > (__v4si)__b);
974}
975
976static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
977_mm_cmplt_epi8(__m128i __a, __m128i __b)
978{
979 return _mm_cmpgt_epi8(__b, __a);
980}
981
982static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
983_mm_cmplt_epi16(__m128i __a, __m128i __b)
984{
985 return _mm_cmpgt_epi16(__b, __a);
986}
987
988static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
989_mm_cmplt_epi32(__m128i __a, __m128i __b)
990{
991 return _mm_cmpgt_epi32(__b, __a);
992}
993
994#ifdef __x86_64__
995static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
996_mm_cvtsi64_sd(__m128d __a, long long __b)
997{
998 __a[0] = __b;
999 return __a;
1000}
1001
1002static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1003_mm_cvtsd_si64(__m128d __a)
1004{
1005 return __builtin_ia32_cvtsd2si64(__a);
1006}
1007
1008static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1009_mm_cvttsd_si64(__m128d __a)
1010{
1011 return __a[0];
1012}
1013#endif
1014
1015static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1016_mm_cvtepi32_ps(__m128i __a)
1017{
1018 return __builtin_ia32_cvtdq2ps((__v4si)__a);
1019}
1020
1021static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1022_mm_cvtps_epi32(__m128 __a)
1023{
1024 return (__m128i)__builtin_ia32_cvtps2dq(__a);
1025}
1026
1027static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1028_mm_cvttps_epi32(__m128 __a)
1029{
1030 return (__m128i)__builtin_ia32_cvttps2dq(__a);
1031}
1032
1033static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1034_mm_cvtsi32_si128(int __a)
1035{
1036 return (__m128i)(__v4si){ __a, 0, 0, 0 };
1037}
1038
1039#ifdef __x86_64__
1040static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1041_mm_cvtsi64_si128(long long __a)
1042{
1043 return (__m128i){ __a, 0 };
1044}
1045#endif
1046
1047static __inline__ int __attribute__((__always_inline__, __nodebug__))
1048_mm_cvtsi128_si32(__m128i __a)
1049{
1050 __v4si __b = (__v4si)__a;
1051 return __b[0];
1052}
1053
1054#ifdef __x86_64__
1055static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1056_mm_cvtsi128_si64(__m128i __a)
1057{
1058 return __a[0];
1059}
1060#endif
1061
1062static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1063_mm_load_si128(__m128i const *__p)
1064{
1065 return *__p;
1066}
1067
1068static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1069_mm_loadu_si128(__m128i const *__p)
1070{
1071 struct __loadu_si128 {
1072 __m128i __v;
1073 } __attribute__((packed, may_alias));
1074 return ((struct __loadu_si128*)__p)->__v;
1075}
1076
1077static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078_mm_loadl_epi64(__m128i const *__p)
1079{
1080 struct __mm_loadl_epi64_struct {
1081 long long __u;
1082 } __attribute__((__packed__, __may_alias__));
1083 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1084}
1085
1086static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1087_mm_set_epi64x(long long q1, long long q0)
1088{
1089 return (__m128i){ q0, q1 };
1090}
1091
1092static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1093_mm_set_epi64(__m64 q1, __m64 q0)
1094{
1095 return (__m128i){ (long long)q0, (long long)q1 };
1096}
1097
1098static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1099_mm_set_epi32(int i3, int i2, int i1, int i0)
1100{
1101 return (__m128i)(__v4si){ i0, i1, i2, i3};
1102}
1103
1104static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1105_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1106{
1107 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1108}
1109
1110static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1111_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1112{
1113 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1114}
1115
1116static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1117_mm_set1_epi64x(long long __q)
1118{
1119 return (__m128i){ __q, __q };
1120}
1121
1122static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1123_mm_set1_epi64(__m64 __q)
1124{
1125 return (__m128i){ (long long)__q, (long long)__q };
1126}
1127
1128static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1129_mm_set1_epi32(int __i)
1130{
1131 return (__m128i)(__v4si){ __i, __i, __i, __i };
1132}
1133
1134static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1135_mm_set1_epi16(short __w)
1136{
1137 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1138}
1139
1140static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1141_mm_set1_epi8(char __b)
1142{
1143 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1144}
1145
1146static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1147_mm_setr_epi64(__m64 q0, __m64 q1)
1148{
1149 return (__m128i){ (long long)q0, (long long)q1 };
1150}
1151
1152static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1153_mm_setr_epi32(int i0, int i1, int i2, int i3)
1154{
1155 return (__m128i)(__v4si){ i0, i1, i2, i3};
1156}
1157
1158static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1159_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1160{
1161 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1162}
1163
1164static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1165_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1166{
1167 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1168}
1169
1170static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1171_mm_setzero_si128(void)
1172{
1173 return (__m128i){ 0LL, 0LL };
1174}
1175
1176static __inline__ void __attribute__((__always_inline__, __nodebug__))
1177_mm_store_si128(__m128i *__p, __m128i __b)
1178{
1179 *__p = __b;
1180}
1181
1182static __inline__ void __attribute__((__always_inline__, __nodebug__))
1183_mm_storeu_si128(__m128i *__p, __m128i __b)
1184{
1185 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1186}
1187
1188static __inline__ void __attribute__((__always_inline__, __nodebug__))
1189_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1190{
1191 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1192}
1193
1194static __inline__ void __attribute__((__always_inline__, __nodebug__))
1195_mm_storel_epi64(__m128i *__p, __m128i __a)
1196{
1197 struct __mm_storel_epi64_struct {
1198 long long __u;
1199 } __attribute__((__packed__, __may_alias__));
1200 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1201}
1202
1203static __inline__ void __attribute__((__always_inline__, __nodebug__))
1204_mm_stream_pd(double *__p, __m128d __a)
1205{
1206 __builtin_ia32_movntpd(__p, __a);
1207}
1208
1209static __inline__ void __attribute__((__always_inline__, __nodebug__))
1210_mm_stream_si128(__m128i *__p, __m128i __a)
1211{
1212 __builtin_ia32_movntdq(__p, __a);
1213}
1214
/* Stores the 32-bit integer __a to __p through the movnti builtin (MOVNTI, a
 * non-temporal store). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1220
1221#ifdef __x86_64__
/* Stores the 64-bit integer __a to __p through the movnti64 builtin (MOVNTI,
 * a non-temporal store). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
1227#endif
1228
/* Issues a cache-line flush for the line containing __p via the clflush
 * builtin.  The pointed-to data itself is not modified. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__line = __p;
  __builtin_ia32_clflush(__line);
}
1234
/* Emits a load fence (LFENCE) via the corresponding builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1240
/* Emits a full memory fence (MFENCE) via the corresponding builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1246
1247static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1248_mm_packs_epi16(__m128i __a, __m128i __b)
1249{
1250 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1251}
1252
1253static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1254_mm_packs_epi32(__m128i __a, __m128i __b)
1255{
1256 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1257}
1258
1259static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1260_mm_packus_epi16(__m128i __a, __m128i __b)
1261{
1262 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1263}
1264
1265static __inline__ int __attribute__((__always_inline__, __nodebug__))
1266_mm_extract_epi16(__m128i __a, int __imm)
1267{
1268 __v8hi __b = (__v8hi)__a;
1269 return (unsigned short)__b[__imm & 7];
1270}
1271
1272static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273_mm_insert_epi16(__m128i __a, int __b, int __imm)
1274{
1275 __v8hi __c = (__v8hi)__a;
1276 __c[__imm & 7] = __b;
1277 return (__m128i)__c;
1278}
1279
1280static __inline__ int __attribute__((__always_inline__, __nodebug__))
1281_mm_movemask_epi8(__m128i __a)
1282{
1283 return __builtin_ia32_pmovmskb128((__v16qi)__a);
1284}
1285
/* Permutes the four 32-bit lanes of `a`.  Result lane k is source lane
 * ((imm >> 2k) & 3); `imm` must be an integer constant expression because it
 * feeds __builtin_shufflevector.
 *
 * The argument is cast in place instead of being copied into a local named
 * __a: such a local would capture any `__a` appearing in the caller's
 * argument expression and end up initialized from itself.  `a` is still
 * expanded exactly once. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
1293
/* Permutes the four low 16-bit lanes of `a` and passes lanes 4..7 through
 * unchanged.  Result lane k (k < 4) is source lane ((imm >> 2k) & 3); `imm`
 * must be an integer constant expression because it feeds
 * __builtin_shufflevector.
 *
 * The argument is cast in place instead of being copied into a local named
 * __a: such a local would capture any `__a` appearing in the caller's
 * argument expression and end up initialized from itself.  `a` is still
 * expanded exactly once. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })
1302
/* Permutes the four high 16-bit lanes of `a` and passes lanes 0..3 through
 * unchanged.  Result lane 4+k is source lane 4 + ((imm >> 2k) & 3); `imm`
 * must be an integer constant expression because it feeds
 * __builtin_shufflevector.
 *
 * The argument is cast in place instead of being copied into a local named
 * __a: such a local would capture any `__a` appearing in the caller's
 * argument expression and end up initialized from itself.  `a` is still
 * expanded exactly once. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
1313
1314static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1315_mm_unpackhi_epi8(__m128i __a, __m128i __b)
1316{
1317 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1318}
1319
1320static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1321_mm_unpackhi_epi16(__m128i __a, __m128i __b)
1322{
1323 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1324}
1325
1326static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1327_mm_unpackhi_epi32(__m128i __a, __m128i __b)
1328{
1329 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1330}
1331
1332static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1333_mm_unpackhi_epi64(__m128i __a, __m128i __b)
1334{
1335 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1336}
1337
1338static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1339_mm_unpacklo_epi8(__m128i __a, __m128i __b)
1340{
1341 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1342}
1343
1344static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1345_mm_unpacklo_epi16(__m128i __a, __m128i __b)
1346{
1347 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1348}
1349
1350static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1351_mm_unpacklo_epi32(__m128i __a, __m128i __b)
1352{
1353 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1354}
1355
1356static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1357_mm_unpacklo_epi64(__m128i __a, __m128i __b)
1358{
1359 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1360}
1361
1362static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1363_mm_movepi64_pi64(__m128i __a)
1364{
1365 return (__m64)__a[0];
1366}
1367
1368static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1369_mm_movpi64_epi64(__m64 __a)
1370{
1371 return (__m128i){ (long long)__a, 0 };
1372}
1373
1374static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1375_mm_move_epi64(__m128i __a)
1376{
1377 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1378}
1379
1380static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1381_mm_unpackhi_pd(__m128d __a, __m128d __b)
1382{
1383 return __builtin_shufflevector(__a, __b, 1, 2+1);
1384}
1385
1386static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1387_mm_unpacklo_pd(__m128d __a, __m128d __b)
1388{
1389 return __builtin_shufflevector(__a, __b, 0, 2+0);
1390}
1391
1392static __inline__ int __attribute__((__always_inline__, __nodebug__))
1393_mm_movemask_pd(__m128d __a)
1394{
1395 return __builtin_ia32_movmskpd(__a);
1396}
1397
/* Selects one double from each operand: result lane 0 is lane (i & 1) of
 * `a`, result lane 1 is lane ((i >> 1) & 1) of `b`.  `i` must be an integer
 * constant expression because it feeds __builtin_shufflevector.
 *
 * The arguments are cast in place instead of being copied into locals named
 * __a and __b: such locals would capture any `__a` / `__b` appearing in the
 * caller's argument expressions (for example, `b` mentioning `__a` would
 * silently read the first temporary).  Each argument is still expanded
 * exactly once. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                          (i) & 1, (((i) & 2) >> 1) + 2); })
1404
1405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1406_mm_castpd_ps(__m128d __a)
1407{
1408 return (__m128)__a;
1409}
1410
1411static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1412_mm_castpd_si128(__m128d __a)
1413{
1414 return (__m128i)__a;
1415}
1416
1417static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1418_mm_castps_pd(__m128 __a)
1419{
1420 return (__m128d)__a;
1421}
1422
1423static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1424_mm_castps_si128(__m128 __a)
1425{
1426 return (__m128i)__a;
1427}
1428
1429static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1430_mm_castsi128_ps(__m128i __a)
1431{
1432 return (__m128)__a;
1433}
1434
1435static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1436_mm_castsi128_pd(__m128i __a)
1437{
1438 return (__m128d)__a;
1439}
1440
/* Emits a single PAUSE instruction via inline assembly; the asm is marked
 * volatile so the compiler does not drop it. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1446
/* Builds the 2-bit selector used by _mm_shuffle_pd: bit 1 comes from x,
 * bit 0 from y. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1448
1449#endif /* __SSE2__ */
1450
1451#endif /* __EMMINTRIN_H */