/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

#ifndef __AVX__
#error "AVX instruction set not enabled"
#else

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

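/* Usage sketch (illustrative, not part of the original header): element-wise
 * arithmetic on four packed doubles, plus rounding via the _MM_FROUND_*
 * macros from smmintrin.h:
 *
 *   __m256d v = _mm256_set1_pd(2.5);
 *   __m256d s = _mm256_mul_pd(v, v);    // each lane: 6.25
 *   __m256d f = _mm256_floor_pd(s);     // each lane: 6.0
 */
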
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_andpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_andps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_andnpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_andnps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_orpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_orps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_xorpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_xorps256((__v8sf)a, (__v8sf)b);
}

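/* Usage sketch: _mm256_andnot_pd computes (~a) & b over the raw bit
 * patterns, which makes it handy for clearing sign bits. A common
 * absolute-value idiom, for some __m256d x:
 *
 *   __m256d sign = _mm256_set1_pd(-0.0);       // only the sign bit set
 *   __m256d absx = _mm256_andnot_pd(sign, x);  // x with sign bits cleared
 */
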
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

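/* Note: the 256-bit horizontal ops pair elements within each 128-bit half,
 * so _mm256_hadd_pd(a, b) yields { a0+a1, b0+b1, a2+a3, b2+b3 } rather than
 * a full reduction across a. */
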
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}

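/* Usage sketch: in _mm256_permute2f128_*, each nibble of the immediate picks
 * a 128-bit half (0 = low of a, 1 = high of a, 2 = low of b, 3 = high of b),
 * and setting bit 3 of a nibble zeroes that half instead:
 *
 *   __m256d lo = _mm256_permute2f128_pd(a, b, 0x20);  // { low(a), low(b) }
 *   __m256d hi = _mm256_permute2f128_pd(a, b, 0x31);  // { high(a), high(b) }
 */
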
/* Vector Blend */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blend_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blend_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_dp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
}

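/* Usage sketch: _mm256_blendv_ps selects per element on the sign bit of the
 * third operand (sign set takes the element from b, clear takes it from a),
 * so combined with a compare it gives a branchless select, e.g. for some
 * __m256 x and y:
 *
 *   __m256 mask = _mm256_cmp_ps(x, y, _CMP_LT_OQ);  // see Compare below
 *   __m256 mn   = _mm256_blendv_ps(y, x, mask);     // per-element min(x, y)
 */
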
/* Vector shuffle */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_shuffle_pd(__m256d a, __m256d b, const int s)
{
  return (__m256d)__builtin_ia32_shufpd256((__v4df)a, (__v4df)b, s);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_shuffle_ps(__m256 a, __m256 b, const int s)
{
  return (__m256)__builtin_ia32_shufps256((__v8sf)a, (__v8sf)b, s);
}

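/* Note: like the other in-lane 256-bit shuffles, _mm256_shuffle_pd works
 * within 128-bit halves; for immediate s the result is
 * { a[s&1], b[(s>>1)&1], a[2+((s>>2)&1)], b[2+((s>>3)&1)] }. */
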
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_pd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ps(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_sd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ss(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c);
}

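/* Usage sketch: each cmp intrinsic returns all-ones in every element where
 * the predicate holds and all-zeros elsewhere, so the result can feed a
 * blend or a movemask directly, e.g. for some __m256d x:
 *
 *   __m256d m    = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_GT_OQ);
 *   int     bits = _mm256_movemask_pd(m);   // bit i set iff x[i] > 0.0
 */
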
/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

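/* Usage sketch: the immediate selects which 128-bit half to extract
 * (0 = low, 1 = high), e.g. for some __m256d v:
 *
 *   __m128d hi = _mm256_extractf128_pd(v, 1);   // upper two doubles of v
 */
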
/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

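/* Usage sketch: building a 256-bit vector from two 128-bit halves (0 inserts
 * into the low half, 1 into the high half), for some __m128 lo and hi:
 *
 *   __m256 v = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
 */
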
/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

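/* Note: the cvtt forms truncate toward zero, while the cvt forms use the
 * current rounding mode (round-to-nearest-even by default):
 *
 *   __m128i t = _mm256_cvttpd_epi32(_mm256_set1_pd(2.7));  // each lane: 2
 *   __m128i r = _mm256_cvtpd_epi32(_mm256_set1_pd(2.7));   // each lane: 3
 */
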
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return (__m256)__builtin_ia32_movshdup256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return (__m256)__builtin_ia32_movsldup256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_movddup256((__v4df)a);
}

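/* Note: for a = { a0, a1, a2, a3, a4, a5, a6, a7 },
 * _mm256_moveldup_ps(a) yields { a0, a0, a2, a2, a4, a4, a6, a6 } and
 * _mm256_movehdup_ps(a) yields { a1, a1, a3, a3, a5, a5, a7, a7 };
 * _mm256_movedup_pd duplicates the even-indexed doubles the same way. */
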
/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_unpckhpd256((__v4df)a, (__v4df)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_unpcklpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_unpckhps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_unpcklps256((__v8sf)a, (__v8sf)b);
}

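/* Note: unpacking also stays within 128-bit halves, so
 * _mm256_unpacklo_pd(a, b) yields { a0, b0, a2, b2 } and
 * _mm256_unpackhi_pd(a, b) yields { a1, b1, a3, b3 }. */
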
/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

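/* Usage sketch: testz returns 1 when a & b is all zeros (the ZF result),
 * testc returns 1 when ~a & b is all zeros (the CF result), and testnzc
 * returns 1 when neither holds; the pd/ps forms test only the sign bits.
 * For example, to check whether any element of a mask m is selected:
 *
 *   if (!_mm256_testz_pd(m, m)) { ... at least one sign bit is set ... }
 */
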
/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

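/* Usage sketch: movemask packs the sign bit of each element into the low
 * bits of an int (4 bits for pd, 8 for ps), e.g. for some __m256 v:
 *
 *   int any_negative = _mm256_movemask_ps(v) != 0;
 */
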
/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

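/* Note: _mm256_zeroupper clears the upper 128 bits of every ymm register;
 * issuing it before calling legacy SSE code avoids the AVX-to-SSE
 * transition penalty. */
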
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

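/* Usage sketch: broadcasting one scalar from memory into every element:
 *
 *   double d = 3.14;
 *   __m256d v = _mm256_broadcast_sd(&d);   // { 3.14, 3.14, 3.14, 3.14 }
 */
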
/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

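/* Note: the load forms require 32-byte-aligned pointers, while the loadu
 * forms accept any alignment. _mm256_lddqu_si256 behaves like
 * _mm256_loadu_si256 and may perform better on loads that cross a
 * cache-line boundary. */
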
/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

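/* Usage sketch: the sign bit of each mask element selects that element;
 * unselected elements read as zero on a maskload and leave memory untouched
 * on a maskstore. For example, writing back only the negative elements of
 * some __m256d v to a double *p:
 *
 *   _mm256_maskstore_pd(p, v, v);   // v's own sign bits select the lanes
 */
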
/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

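/* Note: these non-temporal stores write around the cache hierarchy; the
 * destination must be 32-byte aligned, and an sfence is needed before the
 * data may safely be read by another thread. */
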
/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

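/* Usage sketch: the set forms take arguments from the most significant
 * element down to the least, so the first argument lands in the highest
 * lane:
 *
 *   __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);  // v[0]==0.0, v[3]==3.0
 */
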
/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return (__m128d)__builtin_ia32_pd_pd256((__v4df)in);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return (__m128)__builtin_ia32_ps_ps256((__v8sf)in);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return (__m128i)__builtin_ia32_si_si256((__v8si)in);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  return (__m256d)__builtin_ia32_pd256_pd((__v2df)in);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  return (__m256)__builtin_ia32_ps256_ps((__v4sf)in);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  return (__m256i)__builtin_ia32_si256_si((__v4si)in);
}

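/* Note: the casts above only reinterpret the bit pattern and generate no
 * instructions; when widening from 128 to 256 bits, the upper 128 bits of
 * the result are undefined. */
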
#endif /* __AVX__ */

#endif /* __AVXINTRIN_H */