blob: 7beb5ba1c1849c5868adedb3de13e2a77bed3969 [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __AVXINTRIN_H
25#define __AVXINTRIN_H
26
27#ifndef __AVX__
28#error "AVX instruction set not enabled"
29#else
30
/* Internal 256-bit vector types used to implement the intrinsics below. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types: packed float, packed double, and integer. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
41
/* Arithmetic */

/* Element-wise addition of packed doubles / floats (vaddpd, vaddps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

/* Element-wise subtraction a - b (vsubpd, vsubps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

/* Alternating subtract/add: even-index elements get a - b, odd-index
   elements get a + b (vaddsubpd, vaddsubps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

/* Element-wise division a / b (vdivpd, vdivps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

/* Element-wise maximum (vmaxpd, vmaxps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

/* Element-wise minimum (vminpd, vminps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

/* Element-wise multiplication (vmulpd, vmulps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

/* Element-wise square root (vsqrtpd, vsqrtps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

/* Approximate reciprocal square root (vrsqrtps). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

/* Approximate reciprocal (vrcpps). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

/* Round each element according to the rounding-control immediate m
   (vroundpd); m must be a compile-time constant. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

/* Round each element according to the rounding-control immediate m
   (vroundps); m must be a compile-time constant. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

/* Convenience wrappers over the round intrinsics. */
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
167
/* Logical */

/* Bitwise AND of the raw 256-bit patterns (vandpd/vandps); the values
   are reinterpreted as integer vectors to perform the operation. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

/* Bitwise AND-NOT: (~a) & b — note the first operand is the one
   complemented (vandnpd/vandnps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

/* Bitwise OR (vorpd/vorps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

/* Bitwise XOR (vxorpd/vxorps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}
216
/* Horizontal arithmetic */

/* Horizontal add: sums adjacent element pairs within a and within b,
   interleaved per 128-bit lane (vhaddpd). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

/* Horizontal subtract: differences of adjacent element pairs,
   interleaved per 128-bit lane (vhsubpd/vhsubps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}
241
/* Vector permutations */

/* Permute elements of a using per-element selectors held in vector c
   (vpermilpd/vpermilps, variable form). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
                                                (__v8si)c);
}

/* Permute elements of a per the immediate selector c
   (vpermilpd/vpermilps, immediate form); c must be a constant. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

/* Select 128-bit halves from {a, b} per the immediate c (vperm2f128);
   c must be a compile-time constant. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}
309
/* Vector Blend */

/* Per-element select between a and b controlled by the immediate mask c
   (vblendpd/vblendps); c must be a compile-time constant. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blend_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blend_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
}

/* Variable blend: element sign bits of vector c select b over a
   (vblendvpd/vblendvps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}
334
/* Vector Dot Product */

/* Conditional dot product per 128-bit lane, controlled by the immediate
   mask c (vdpps); c must be a compile-time constant. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_dp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
}
341
/* Vector shuffle */
/* Shuffle packed floats (vshufps): in each 128-bit lane the two low
   result elements are selected from a and the two high elements from b,
   using the same 8-bit immediate selector for both lanes.
   Fixes two bugs in the previous expansion: a missing comma after the
   "+ 8" selector (syntax error on any use), and "(mask) & 0x3 + 4",
   which parses as "(mask) & 7" because + binds tighter than &. */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000349
/* Shuffle packed doubles (vshufpd): result = { a[mask&1], b[(mask>>1)&1],
   a[2 + ((mask>>2)&1)], b[2 + ((mask>>3)&1)] }; b's elements are indexed
   4..7 in the combined shufflevector operand. */
#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000356
/* Compare */
/* 5-bit comparison predicates for the _mm*_cmp_* intrinsics below.
   O/U = ordered/unordered w.r.t. NaN operands, S/Q = signaling/quiet. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* Packed compares (vcmppd/vcmpps): each result element is all-ones if
   the predicate c holds, else all-zeros; c must be a constant 0..31. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_pd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ps(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cmp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c);
}

/* Scalar compares (vcmpsd/vcmpss): compare the low element only; the
   upper elements are copied from a. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmp_sd(__m128d a, __m128d b, const int c)
{
  return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmp_ss(__m128 a, __m128 b, const int c)
{
  return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c);
}
426
/* Vector extract */

/* Extract the 128-bit half selected by constant o (vextractf128). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

/* Extract integer element imm. The index is NOT masked here (unlike the
   insert functions below); imm out of range is undefined. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

/* NOTE(review): returns the element sign-extended to int; Intel
   documents zero-extension for this intrinsic — confirm intended. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

/* NOTE(review): same sign- vs zero-extension caveat as epi16. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
/* Extract 64-bit element imm (64-bit targets only). */
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif
475
/* Vector insert */

/* Replace the 128-bit half selected by constant o with b (vinsertf128). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

/* Insert integer b at element index imm (masked to the valid range). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}
518
519#ifdef __x86_64__
520static __inline __m256i __attribute__((__always_inline__, __nodebug__))
521_mm256_insert_epi64(__m256i a, int b, int const imm)
522{
523 __v4di c = (__v4di)a;
524 c[imm & 3] = b;
525 return (__m256i)c;
526}
527#endif
528
/* Conversion */

/* Four int32 -> four doubles (vcvtdq2pd). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

/* Eight int32 -> eight floats (vcvtdq2ps). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

/* Four doubles -> four floats (vcvtpd2ps). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

/* Eight floats -> eight int32, current rounding mode (vcvtps2dq). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

/* Four floats -> four doubles (vcvtps2pd). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

/* Four doubles -> four int32 with truncation (vcvttpd2dq). */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

/* Four doubles -> four int32, current rounding mode (vcvtpd2dq). */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

/* Eight floats -> eight int32 with truncation (vcvttps2dq). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}
577
/* Vector replicate */

/* Duplicate odd-indexed floats into the adjacent even slots (vmovshdup). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

/* Duplicate even-indexed floats into the adjacent odd slots (vmovsldup). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

/* Duplicate the even-indexed doubles (vmovddup). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}
596
/* Unpack and Interleave */

/* Interleave the high elements of a and b within each 128-bit lane
   (vunpckhpd/vunpckhps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_unpckhpd256((__v4df)a, (__v4df)b);
}

/* Interleave the low elements of a and b within each 128-bit lane
   (vunpcklpd/vunpcklps). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_unpcklpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_unpckhps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_unpcklps256((__v8sf)a, (__v8sf)b);
}
621
/* Bit Test */
/* vtestpd/vtestps/ptest family: testz returns 1 if (a AND b) has no
   relevant bits set (ZF), testc returns 1 if (~a AND b) has none (CF),
   testnzc returns 1 if both ZF and CF would be 0. For the pd/ps forms
   only the per-element sign bits participate. */

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

/* Integer forms test every bit, not just sign bits (vptest). */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}
712
/* Vector extract sign mask */

/* Pack the element sign bits into the low bits of an int
   (vmovmskpd: 4 bits, vmovmskps: 8 bits). */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}
725
/* Vector zero */

/* Zero all YMM registers (vzeroall). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zero the upper 128 bits of all YMM registers (vzeroupper) — used to
   avoid AVX/SSE transition penalties. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
738
/* Vector load with broadcast */

/* Load one float and replicate it to all elements (vbroadcastss). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

/* Load one double and replicate it to all four elements (vbroadcastsd). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

/* Load 128 bits and replicate into both halves (vbroadcastf128). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}
769
/* SIMD load ops */

/* Aligned load: p must be 32-byte aligned. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

/* Unaligned load (vmovupd/vmovups). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

/* Aligned integer load: p must be 32-byte aligned. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

/* Unaligned integer load (vmovdqu). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

/* Unaligned load optimized for data crossing cache lines (vlddqu). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}
812
/* SIMD store ops */

/* Aligned store: p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

/* Unaligned store (vmovupd/vmovups). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

/* Aligned integer store: p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

/* Unaligned integer store (vmovdqu). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
849
/* Conditional load ops */

/* Masked load (vmaskmovpd/vmaskmovps): elements whose mask sign bit is
   set are loaded from p; the rest are zeroed. Masked-off elements are
   not read from memory. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}
874
/* Conditional store ops */

/* Masked store (vmaskmovps/vmaskmovpd): only elements whose mask sign
   bit is set are written to memory; the rest are untouched. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
899
/* Cacheability support ops */

/* Non-temporal (cache-bypassing) stores; the destination must be
   32-byte aligned (vmovntdq/vmovntpd/vmovntps). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
918
/* Create vectors */
/* The _mm256_set_* functions take arguments highest element first, so
   the initializer lists below are written in reverse. */

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}
973
974/* Create vectors with elements in reverse order */
975static __inline __m256d __attribute__((__always_inline__, __nodebug__))
976_mm256_setr_pd(double a, double b, double c, double d)
977{
978 return (__m256d){ a, b, c, d };
979}
980
981static __inline __m256 __attribute__((__always_inline__, __nodebug__))
982_mm256_setr_ps(float a, float b, float c, float d,
983 float e, float f, float g, float h)
984{
985 return (__m256){ a, b, c, d, e, f, g, h };
986}
987
988static __inline __m256i __attribute__((__always_inline__, __nodebug__))
989_mm256_setr_epi32(int i0, int i1, int i2, int i3,
990 int i4, int i5, int i6, int i7)
991{
992 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
993}
994
995static __inline __m256i __attribute__((__always_inline__, __nodebug__))
996_mm256_setr_epi16(short w15, short w14, short w13, short w12,
997 short w11, short w10, short w09, short w08,
998 short w07, short w06, short w05, short w04,
999 short w03, short w02, short w01, short w00)
1000{
1001 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
1002 w07, w06, w05, w04, w03, w02, w01, w00 };
1003}
1004
1005static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1006_mm256_setr_epi8(char b31, char b30, char b29, char b28,
1007 char b27, char b26, char b25, char b24,
1008 char b23, char b22, char b21, char b20,
1009 char b19, char b18, char b17, char b16,
1010 char b15, char b14, char b13, char b12,
1011 char b11, char b10, char b09, char b08,
1012 char b07, char b06, char b05, char b04,
1013 char b03, char b02, char b01, char b00)
1014{
1015 return (__m256i)(__v32qi){
1016 b31, b30, b29, b28, b27, b26, b25, b24,
1017 b23, b22, b21, b20, b19, b18, b17, b16,
1018 b15, b14, b13, b12, b11, b10, b09, b08,
1019 b07, b06, b05, b04, b03, b02, b01, b00 };
1020}
1021
1022static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1023_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1024{
1025 return (__m256i)(__v4di){ a, b, c, d };
1026}
1027
1028/* Create vectors with repeated elements */
1029static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1030_mm256_set1_pd(double w)
1031{
1032 return (__m256d){ w, w, w, w };
1033}
1034
1035static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1036_mm256_set1_ps(float w)
1037{
1038 return (__m256){ w, w, w, w, w, w, w, w };
1039}
1040
1041static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1042_mm256_set1_epi32(int i)
1043{
1044 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1045}
1046
1047static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1048_mm256_set1_epi16(short w)
1049{
1050 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1051}
1052
1053static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1054_mm256_set1_epi8(char b)
1055{
1056 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1057 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1058}
1059
1060static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1061_mm256_set1_epi64x(long long q)
1062{
1063 return (__m256i)(__v4di){ q, q, q, q };
1064}
1065
1066/* Create zeroed vectors */
1067static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1068_mm256_setzero_pd(void)
1069{
1070 return (__m256d){ 0, 0, 0, 0 };
1071}
1072
1073static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1074_mm256_setzero_ps(void)
1075{
1076 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1077}
1078
1079static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1080_mm256_setzero_si256(void)
1081{
1082 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1083}
1084
1085/* Cast between vector types */
1086static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1087_mm256_castpd_ps(__m256d in)
1088{
1089 return (__m256)in;
1090}
1091
1092static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1093_mm256_castpd_si256(__m256d in)
1094{
1095 return (__m256i)in;
1096}
1097
1098static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1099_mm256_castps_pd(__m256 in)
1100{
1101 return (__m256d)in;
1102}
1103
1104static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1105_mm256_castps_si256(__m256 in)
1106{
1107 return (__m256i)in;
1108}
1109
1110static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1111_mm256_castsi256_ps(__m256i in)
1112{
1113 return (__m256)in;
1114}
1115
1116static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1117_mm256_castsi256_pd(__m256i in)
1118{
1119 return (__m256d)in;
1120}
1121
1122static __inline __m128d __attribute__((__always_inline__, __nodebug__))
1123_mm256_castpd256_pd128(__m256d in)
1124{
1125 return (__m128d)__builtin_ia32_pd_pd256((__v4df)in);
1126}
1127
1128static __inline __m128 __attribute__((__always_inline__, __nodebug__))
1129_mm256_castps256_ps128(__m256 in)
1130{
1131 return (__m128)__builtin_ia32_ps_ps256((__v8sf)in);
1132}
1133
1134static __inline __m128i __attribute__((__always_inline__, __nodebug__))
1135_mm256_castsi256_si128(__m256i in)
1136{
1137 return (__m128i)__builtin_ia32_si_si256((__v8si)in);
1138}
1139
1140static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1141_mm256_castpd128_pd256(__m128d in)
1142{
1143 return (__m256d)__builtin_ia32_pd256_pd((__v2df)in);
1144}
1145
1146static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1147_mm256_castps128_ps256(__m128 in)
1148{
1149 return (__m256)__builtin_ia32_ps256_ps((__v4sf)in);
1150}
1151
1152static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1153_mm256_castsi128_si256(__m128i in)
1154{
1155 return (__m256i)__builtin_ia32_si256_si((__v4si)in);
1156}
1157
1158#endif /* __AVX__ */
1159
1160#endif /* __AVXINTRIN_H */