/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

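/* Arithmetic on vectors of two double-precision values. The _sd forms operate
 * on the low element only and copy the high element of the first operand into
 * the result; the _pd forms operate on both elements.
 *
 * A minimal illustration (not part of this header's interface):
 *
 *   __m128d x = _mm_set_pd(2.0, 1.0);   // x = { 1.0, 2.0 }  (low, high)
 *   __m128d y = _mm_set1_pd(10.0);      // y = { 10.0, 10.0 }
 *   __m128d z = _mm_add_pd(x, y);       // z = { 11.0, 12.0 }
 */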
static inline __m128d __attribute__((__always_inline__)) _mm_add_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_addsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_add_pd(__m128d a, __m128d b)
{
  return a + b;
}

static inline __m128d __attribute__((__always_inline__)) _mm_sub_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_subsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_sub_pd(__m128d a, __m128d b)
{
  return a - b;
}

static inline __m128d __attribute__((__always_inline__)) _mm_mul_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_mulsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_mul_pd(__m128d a, __m128d b)
{
  return a * b;
}

static inline __m128d __attribute__((__always_inline__)) _mm_div_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_divsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_div_pd(__m128d a, __m128d b)
{
  return a / b;
}

static inline __m128d __attribute__((__always_inline__)) _mm_sqrt_sd(__m128d a, __m128d b)
{
  __m128d c = __builtin_ia32_sqrtsd(b);
  return (__m128d) { c[0], a[1] };
}

static inline __m128d __attribute__((__always_inline__)) _mm_sqrt_pd(__m128d a)
{
  return __builtin_ia32_sqrtpd(a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_min_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_minsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_min_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_minpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_max_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_max_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxpd(a, b);
}

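/* Bitwise logic on the full 128-bit value, expressed through the ANDPD /
 * ANDNPD / ORPD / XORPD builtins so the operands keep their
 * double-precision type. */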
static inline __m128d __attribute__((__always_inline__)) _mm_and_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_andpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_andnot_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_andnpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_or_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_orpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_xor_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_xorpd(a, b);
}

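/* Packed comparisons: each result element is all ones when the relation holds
 * for that element pair and all zeros otherwise. The "n" variants are the
 * negated relations; cmpord is true when neither element is NaN, cmpunord
 * when either is. */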
static inline __m128d __attribute__((__always_inline__)) _mm_cmpeq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpeqpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmplt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpltpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmple_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmplepd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpgt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpltpd(b, a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmplepd(b, a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpordpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpunord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpunordpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpneq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpneqpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpnlt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnltpd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpnle_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnlepd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpngt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnltpd(b, a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpnge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnlepd(b, a);
}

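/* Scalar comparisons: only the low elements are compared; the low result
 * element is the all-ones/all-zeros mask and the high element of the first
 * operand is passed through unchanged. */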
static inline __m128d __attribute__((__always_inline__)) _mm_cmpeq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpeqsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmplt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpltsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmple_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmplesd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpgt_sd(__m128d a, __m128d b)
{
  /* Swap the operands for the compare, but keep the high element of a. */
  __m128d c = (__m128d)__builtin_ia32_cmpltsd(b, a);
  return (__m128d){ c[0], a[1] };
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpge_sd(__m128d a, __m128d b)
{
  __m128d c = (__m128d)__builtin_ia32_cmplesd(b, a);
  return (__m128d){ c[0], a[1] };
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpordsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpunord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpunordsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpneq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpneqsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpnlt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnltsd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpnle_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpnlesd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpngt_sd(__m128d a, __m128d b)
{
  __m128d c = (__m128d)__builtin_ia32_cmpnltsd(b, a);
  return (__m128d){ c[0], a[1] };
}

static inline __m128d __attribute__((__always_inline__)) _mm_cmpnge_sd(__m128d a, __m128d b)
{
  __m128d c = (__m128d)__builtin_ia32_cmpnlesd(b, a);
  return (__m128d){ c[0], a[1] };
}

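/* Scalar compares that return an int: the comi forms use COMISD, the ucomi
 * forms use UCOMISD, which differs only in not raising an invalid exception
 * for quiet NaNs. */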
static inline int __attribute__((__always_inline__)) _mm_comieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdeq(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdlt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdle(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdgt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_comineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdneq(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdeq(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdlt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdle(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdgt(a, b);
}

static inline int __attribute__((__always_inline__)) _mm_ucomineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdneq(a, b);
}

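/* Conversions between double-precision, single-precision, and integer
 * vectors. The cvtt forms truncate toward zero instead of using the current
 * rounding mode. */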
static inline __m128 __attribute__((__always_inline__)) _mm_cvtpd_ps(__m128d a)
{
  return __builtin_ia32_cvtpd2ps(a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cvtps_pd(__m128 a)
{
  return __builtin_ia32_cvtps2pd(a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cvtepi32_pd(__m128i a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)a);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cvtpd_epi32(__m128d a)
{
  return __builtin_ia32_cvtpd2dq(a);
}

static inline int __attribute__((__always_inline__)) _mm_cvtsd_si32(__m128d a)
{
  return __builtin_ia32_cvtsd2si(a);
}

static inline __m128 __attribute__((__always_inline__)) _mm_cvtsd_ss(__m128 a, __m128d b)
{
  return __builtin_ia32_cvtsd2ss(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cvtsi32_sd(__m128d a, int b)
{
  return __builtin_ia32_cvtsi2sd(a, b);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cvtss_sd(__m128d a, __m128 b)
{
  return __builtin_ia32_cvtss2sd(a, b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cvttpd_epi32(__m128d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(a);
}

static inline int __attribute__((__always_inline__)) _mm_cvttsd_si32(__m128d a)
{
  return __builtin_ia32_cvttsd2si(a);
}

static inline __m64 __attribute__((__always_inline__)) _mm_cvtpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(a);
}

static inline __m64 __attribute__((__always_inline__)) _mm_cvttpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(a);
}

static inline __m128d __attribute__((__always_inline__)) _mm_cvtpi32_pd(__m64 a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)a);
}

static inline double __attribute__((__always_inline__)) _mm_cvtsd_f64(__m128d a)
{
  return a[0];
}

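/* Loads: _mm_load_pd requires 16-byte alignment, _mm_loadu_pd does not, and
 * the h/l forms replace only one half of the destination vector. */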
static inline __m128d __attribute__((__always_inline__)) _mm_load_pd(double const *dp)
{
  return *(__m128d*)dp;
}

static inline __m128d __attribute__((__always_inline__)) _mm_load1_pd(double const *dp)
{
  return (__m128d){ dp[0], dp[0] };
}

static inline __m128d __attribute__((__always_inline__)) _mm_loadr_pd(double const *dp)
{
  return (__m128d){ dp[1], dp[0] };
}

static inline __m128d __attribute__((__always_inline__)) _mm_loadu_pd(double const *dp)
{
  return __builtin_ia32_loadupd(dp);
}

static inline __m128d __attribute__((__always_inline__)) _mm_load_sd(double const *dp)
{
  return (__m128d){ *dp, 0.0 };
}

static inline __m128d __attribute__((__always_inline__)) _mm_loadh_pd(__m128d a, double const *dp)
{
  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
}

static inline __m128d __attribute__((__always_inline__)) _mm_loadl_pd(__m128d a, double const *dp)
{
  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
}

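/* Initializers: _mm_set_pd takes its arguments from the highest element to
 * the lowest, _mm_setr_pd in the reverse (memory) order. */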
static inline __m128d __attribute__((__always_inline__)) _mm_set_sd(double w)
{
  return (__m128d){ w, 0 };
}

static inline __m128d __attribute__((__always_inline__)) _mm_set1_pd(double w)
{
  return (__m128d){ w, w };
}

static inline __m128d __attribute__((__always_inline__)) _mm_set_pd(double w, double x)
{
  return (__m128d){ x, w };
}

static inline __m128d __attribute__((__always_inline__)) _mm_setr_pd(double w, double x)
{
  return (__m128d){ w, x };
}

static inline __m128d __attribute__((__always_inline__)) _mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

static inline __m128d __attribute__((__always_inline__)) _mm_move_sd(__m128d a, __m128d b)
{
  return (__m128d){ b[0], a[1] };
}

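/* Stores: _mm_store_pd requires 16-byte alignment, _mm_storeu_pd does not. */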
static inline void __attribute__((__always_inline__)) _mm_store_sd(double *dp, __m128d a)
{
  dp[0] = a[0];
}

static inline void __attribute__((__always_inline__)) _mm_store1_pd(double *dp, __m128d a)
{
  dp[0] = a[0];
  dp[1] = a[0];
}

static inline void __attribute__((__always_inline__)) _mm_store_pd(double *dp, __m128d a)
{
  *(__m128d *)dp = a;
}

static inline void __attribute__((__always_inline__)) _mm_storeu_pd(double *dp, __m128d a)
{
  __builtin_ia32_storeupd(dp, a);
}

static inline void __attribute__((__always_inline__)) _mm_storer_pd(double *dp, __m128d a)
{
  dp[0] = a[1];
  dp[1] = a[0];
}

static inline void __attribute__((__always_inline__)) _mm_storeh_pd(double *dp, __m128d a)
{
  dp[0] = a[1];
}

static inline void __attribute__((__always_inline__)) _mm_storel_pd(double *dp, __m128d a)
{
  dp[0] = a[0];
}

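/* Packed integer arithmetic on __m128i, interpreted as 16 bytes, 8 shorts,
 * 4 ints, or 2 long longs. The adds/subs forms saturate instead of wrapping. */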
static inline __m128i __attribute__((__always_inline__)) _mm_add_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a + (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_add_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a + (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_add_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a + (__v4si)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_add_si64(__m64 a, __m64 b)
{
  return a + b;
}

static inline __m128i __attribute__((__always_inline__)) _mm_add_epi64(__m128i a, __m128i b)
{
  return a + b;
}

static inline __m128i __attribute__((__always_inline__)) _mm_adds_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_adds_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_adds_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_adds_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_avg_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_avg_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_madd_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_max_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_max_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_min_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_min_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_mulhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_mulhi_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_mullo_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_mul_su32(__m64 a, __m64 b)
{
  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_mul_epu32(__m128i a, __m128i b)
{
  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sad_epu8(__m128i a, __m128i b)
{
  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a - (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a - (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a - (__v4si)b);
}

static inline __m64 __attribute__((__always_inline__)) _mm_sub_si64(__m64 a, __m64 b)
{
  return a - b;
}

static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi64(__m128i a, __m128i b)
{
  return a - b;
}

static inline __m128i __attribute__((__always_inline__)) _mm_subs_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_subs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_subs_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_subs_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
}

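/* Bitwise logic on 128-bit integer vectors (PAND, PANDN, POR, PXOR). */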
static inline __m128i __attribute__((__always_inline__)) _mm_and_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_pand128(a, b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_andnot_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_pandn128(a, b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_or_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_por128(a, b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_xor_si128(__m128i a, __m128i b)
{
  return __builtin_ia32_pxor128(a, b);
}

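/* Shifts. _mm_slli_si128 and _mm_srli_si128 shift the whole 128-bit value by
 * a byte count; the epi16/epi32/epi64 forms shift each element by a bit count
 * given either as an immediate (slli/srli/srai) or in a vector (sll/srl/sra). */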
static inline __m128i __attribute__((__always_inline__)) _mm_slli_si128(__m128i a, int imm)
{
  return __builtin_ia32_pslldqi128(a, imm * 8);
}

static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psllqi128(a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psllq128(a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srai_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sra_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srai_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_sra_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srli_si128(__m128i a, int imm)
{
  return __builtin_ia32_psrldqi128(a, imm * 8);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psrlqi128(a, count);
}

static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psrlq128(a, count);
}

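/* Signed element-wise integer comparisons; each result element is all ones
 * when the relation holds and zero otherwise. The cmplt forms reuse PCMPGT
 * with the operands swapped. */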
static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
}

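/* Scalar conversions and moves between __m128d/__m128/__m128i and 32-bit or
 * 64-bit integers; the 64-bit forms are only available on x86-64. */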
#ifdef __x86_64__
static inline __m128d __attribute__((__always_inline__)) _mm_cvtsi64_sd(__m128d a, long long b)
{
  return __builtin_ia32_cvtsi642sd(a, b);
}

static inline long long __attribute__((__always_inline__)) _mm_cvtsd_si64(__m128d a)
{
  return __builtin_ia32_cvtsd2si64(a);
}

static inline long long __attribute__((__always_inline__)) _mm_cvttsd_si64(__m128d a)
{
  return __builtin_ia32_cvttsd2si64(a);
}
#endif

static inline __m128 __attribute__((__always_inline__)) _mm_cvtepi32_ps(__m128i a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)a);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cvtps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(a);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cvttps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(a);
}

static inline __m128i __attribute__((__always_inline__)) _mm_cvtsi32_si128(int a)
{
  return (__m128i)(__v4si){ a, 0, 0, 0 };
}

#ifdef __x86_64__
static inline __m128i __attribute__((__always_inline__)) _mm_cvtsi64_si128(long long a)
{
  return (__m128i){ a, 0 };
}
#endif

static inline int __attribute__((__always_inline__)) _mm_cvtsi128_si32(__m128i a)
{
  __v4si b = (__v4si)a;
  return b[0];
}

#ifdef __x86_64__
static inline long long __attribute__((__always_inline__)) _mm_cvtsi128_si64(__m128i a)
{
  return a[0];
}
#endif

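/* 128-bit integer loads: aligned, unaligned, and a 64-bit load that zeroes
 * the upper half. */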
static inline __m128i __attribute__((__always_inline__)) _mm_load_si128(__m128i const *p)
{
  return *p;
}

static inline __m128i __attribute__((__always_inline__)) _mm_loadu_si128(__m128i const *p)
{
  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
}

static inline __m128i __attribute__((__always_inline__)) _mm_loadl_epi64(__m128i const *p)
{
  return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p);
}

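/* Integer initializers. As with the _pd forms, _mm_set_* take their arguments
 * from the highest element to the lowest and _mm_setr_* take them in memory
 * order. */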
static inline __m128i __attribute__((__always_inline__)) _mm_set_epi64(__m64 q1, __m64 q0)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi64(__m64 q)
{
  return (__m128i){ (long long)q, (long long)q };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi32(int i)
{
  return (__m128i)(__v4si){ i, i, i, i };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi16(short w)
{
  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
}

static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi8(char b)
{
  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi64(__m64 q0, __m64 q1)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

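/* Integer stores, the byte-masked store (MASKMOVDQU), non-temporal
 * (streaming) stores that bypass the cache, cache-line flush, and the
 * load/full memory fences. */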
static inline void __attribute__((__always_inline__)) _mm_store_si128(__m128i *p, __m128i b)
{
  *p = b;
}

static inline void __attribute__((__always_inline__)) _mm_storeu_si128(__m128i *p, __m128i b)
{
  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
}

static inline void __attribute__((__always_inline__)) _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

static inline void __attribute__((__always_inline__)) _mm_storel_epi64(__m128i *p, __m128i a)
{
  __builtin_ia32_storelv4si((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__)) _mm_stream_pd(double *p, __m128d a)
{
  __builtin_ia32_movntpd(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_stream_si128(__m128i *p, __m128i a)
{
  __builtin_ia32_movntdq(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}

static inline void __attribute__((__always_inline__)) _mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}

static inline void __attribute__((__always_inline__)) _mm_lfence(void)
{
  __builtin_ia32_lfence();
}

static inline void __attribute__((__always_inline__)) _mm_mfence(void)
{
  __builtin_ia32_mfence();
}

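/* Pack with saturation, 16-bit element extract/insert (PEXTRW/PINSRW), and
 * the byte sign-bit movemask (PMOVMSKB). */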
static inline __m128i __attribute__((__always_inline__)) _mm_packs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_packs_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

static inline __m128i __attribute__((__always_inline__)) _mm_packus_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

static inline int __attribute__((__always_inline__)) _mm_extract_epi16(__m128i a, int imm)
{
  __v8hi b = (__v8hi)a;
  /* PEXTRW zero-extends the selected 16-bit element. */
  return (unsigned short)b[imm];
}

static inline __m128i __attribute__((__always_inline__)) _mm_insert_epi16(__m128i a, int b, int imm)
{
  return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm);
}

static inline int __attribute__((__always_inline__)) _mm_movemask_epi8(__m128i a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)a);
}

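/* Shuffles of 32-bit and 16-bit elements selected by an 8-bit immediate; the
 * four 2-bit selectors can be built with the _MM_SHUFFLE macro from
 * <xmmintrin.h>. */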
#define _mm_shuffle_epi32(a, imm) ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm)))
#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm)))
#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm)))

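/* Interleaves: the unpackhi forms merge the high halves of the two operands,
 * the unpacklo forms the low halves. */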
static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
}

static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
}

static inline __m64 __attribute__((__always_inline__)) _mm_movepi64_pi64(__m128i a)
{
  return (__m64)a[0];
}

static inline __m128i __attribute__((__always_inline__)) _mm_movpi64_epi64(__m64 a)
{
  return (__m128i){ (long long)a, 0 };
}

static inline __m128i __attribute__((__always_inline__)) _mm_move_epi64(__m128i a)
{
  return (__m128i){ a[0], 0 };
}

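/* Double-precision interleaves (UNPCKHPD/UNPCKLPD), the sign-bit movemask
 * (MOVMSKPD), and the immediate shuffle that picks one element from each
 * operand (SHUFPD). */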
static inline __m128d __attribute__((__always_inline__)) _mm_unpackhi_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 1, 2+1);
}

static inline __m128d __attribute__((__always_inline__)) _mm_unpacklo_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 0, 2+0);
}

static inline int __attribute__((__always_inline__)) _mm_movemask_pd(__m128d a)
{
  return __builtin_ia32_movmskpd(a);
}

#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i)))

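/* Casts reinterpret the bits of a vector as another vector type; they do not
 * convert values and generate no instructions. */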
static inline __m128 __attribute__((__always_inline__)) _mm_castpd_ps(__m128d in)
{
  return (__m128)in;
}

static inline __m128i __attribute__((__always_inline__)) _mm_castpd_si128(__m128d in)
{
  return (__m128i)in;
}

static inline __m128d __attribute__((__always_inline__)) _mm_castps_pd(__m128 in)
{
  return (__m128d)in;
}

static inline __m128i __attribute__((__always_inline__)) _mm_castps_si128(__m128 in)
{
  return (__m128i)in;
}

static inline __m128 __attribute__((__always_inline__)) _mm_castsi128_ps(__m128i in)
{
  return (__m128)in;
}

static inline __m128d __attribute__((__always_inline__)) _mm_castsi128_pd(__m128i in)
{
  return (__m128d)in;
}

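/* _mm_pause is a spin-wait loop hint; _MM_SHUFFLE2 builds the immediate for
 * _mm_shuffle_pd from two element selectors. */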
static inline void __attribute__((__always_inline__)) _mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __SSE2__ */

#endif /* __EMMINTRIN_H */