blob: ea26fa575f45219304084e00aed12d25ba251f9e [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal 256-bit vector element types used to implement the intrinsics. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types exposed by the AVX intrinsics API. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
/* Round according to the rounding mode M (an _MM_FROUND_* constant).
   M must be an integer constant expression. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
    __m256 __V = (V); \
    (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000165 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000171 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000177 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000183 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000189 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000195 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000201 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000207 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258 (__v8si)c);
259}
260
Chad Rosierb8786c42011-12-17 00:15:26 +0000261#define _mm_permute_pd(A, C) __extension__ ({ \
262 __m128d __A = (A); \
263 (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, C); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000264
Chad Rosierb8786c42011-12-17 00:15:26 +0000265#define _mm256_permute_pd(A, C) __extension__ ({ \
266 __m256d __A = (A); \
267 (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, C); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000268
269static __inline __m128 __attribute__((__always_inline__, __nodebug__))
270_mm_permute_ps(__m128 a, const int c)
271{
272 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
273}
274
275static __inline __m256 __attribute__((__always_inline__, __nodebug__))
276_mm256_permute_ps(__m256 a, const int c)
277{
278 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
279}
280
Chad Rosierc5cda112011-12-16 21:07:34 +0000281#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
282 __m256d __V1 = (V1); \
283 __m256d __V2 = (V2); \
284 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000285
Chad Rosierc5cda112011-12-16 21:07:34 +0000286#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
287 __m256 __V1 = (V1); \
288 __m256 __V2 = (V2); \
289 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000290
Chad Rosierc5cda112011-12-16 21:07:34 +0000291#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
292 __m256i __V1 = (V1); \
293 __m256i __V2 = (V2); \
294 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000295
296/* Vector Blend */
Eli Friedman34720892011-11-10 00:11:13 +0000297#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
298 __m256d __V1 = (V1); \
299 __m256d __V2 = (V2); \
300 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000301
Eli Friedman34720892011-11-10 00:11:13 +0000302#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
303 __m256 __V1 = (V1); \
304 __m256 __V2 = (V2); \
305 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000306
307static __inline __m256d __attribute__((__always_inline__, __nodebug__))
308_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
309{
310 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
311}
312
313static __inline __m256 __attribute__((__always_inline__, __nodebug__))
314_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
315{
316 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
317}
318
/* Vector Dot Product (per 128-bit lane; M selects inputs and outputs). */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
/* Vector shuffle */
/* The 8-bit mask is applied independently to each 128-bit lane, so the
   shufflevector indices repeat with lane-local offsets. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })
/* Compare */
/* Predicate immediates for the VCMPPS/VCMPPD/VCMPSS/VCMPSD intrinsics.
   Naming: O=ordered, U=unordered, S=signaling, Q=quiet (non-signaling). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
/* Compare with an explicit predicate immediate c (one of the _CMP_*
   constants above); c must be an integer constant expression. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar forms: compare element 0 only, pass the rest of a through. */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
408/* Vector extract */
409static __inline __m128d __attribute__((__always_inline__, __nodebug__))
410_mm256_extractf128_pd(__m256d a, const int o)
411{
412 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
413}
414
415static __inline __m128 __attribute__((__always_inline__, __nodebug__))
416_mm256_extractf128_ps(__m256 a, const int o)
417{
418 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
419}
420
421static __inline __m128i __attribute__((__always_inline__, __nodebug__))
422_mm256_extractf128_si256(__m256i a, const int o)
423{
424 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
425}
426
427static __inline int __attribute__((__always_inline__, __nodebug__))
428_mm256_extract_epi32(__m256i a, int const imm)
429{
430 __v8si b = (__v8si)a;
431 return b[imm];
432}
433
434static __inline int __attribute__((__always_inline__, __nodebug__))
435_mm256_extract_epi16(__m256i a, int const imm)
436{
437 __v16hi b = (__v16hi)a;
438 return b[imm];
439}
440
441static __inline int __attribute__((__always_inline__, __nodebug__))
442_mm256_extract_epi8(__m256i a, int const imm)
443{
444 __v32qi b = (__v32qi)a;
445 return b[imm];
446}
447
448#ifdef __x86_64__
449static __inline long long __attribute__((__always_inline__, __nodebug__))
450_mm256_extract_epi64(__m256i a, const int imm)
451{
452 __v4di b = (__v4di)a;
453 return b[imm];
454}
455#endif
456
457/* Vector insert */
Chad Rosierb95ddf12011-12-16 21:40:31 +0000458#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
459 __m256d __V1 = (V1); \
460 __m128d __V2 = (V2); \
461 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000462
Chad Rosierb95ddf12011-12-16 21:40:31 +0000463#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
464 __m256 __V1 = (V1); \
465 __m128 __V2 = (V2); \
466 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000467
Chad Rosierb95ddf12011-12-16 21:40:31 +0000468#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
469 __m256i __V1 = (V1); \
470 __m128i __V2 = (V2); \
471 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000472
473static __inline __m256i __attribute__((__always_inline__, __nodebug__))
474_mm256_insert_epi32(__m256i a, int b, int const imm)
475{
476 __v8si c = (__v8si)a;
477 c[imm & 7] = b;
478 return (__m256i)c;
479}
480
481static __inline __m256i __attribute__((__always_inline__, __nodebug__))
482_mm256_insert_epi16(__m256i a, int b, int const imm)
483{
484 __v16hi c = (__v16hi)a;
485 c[imm & 15] = b;
486 return (__m256i)c;
487}
488
489static __inline __m256i __attribute__((__always_inline__, __nodebug__))
490_mm256_insert_epi8(__m256i a, int b, int const imm)
491{
492 __v32qi c = (__v32qi)a;
493 c[imm & 31] = b;
494 return (__m256i)c;
495}
496
497#ifdef __x86_64__
498static __inline __m256i __attribute__((__always_inline__, __nodebug__))
499_mm256_insert_epi64(__m256i a, int b, int const imm)
500{
501 __v4di c = (__v4di)a;
502 c[imm & 3] = b;
503 return (__m256i)c;
504}
505#endif
506
507/* Conversion */
508static __inline __m256d __attribute__((__always_inline__, __nodebug__))
509_mm256_cvtepi32_pd(__m128i a)
510{
511 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
512}
513
514static __inline __m256 __attribute__((__always_inline__, __nodebug__))
515_mm256_cvtepi32_ps(__m256i a)
516{
517 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
518}
519
520static __inline __m128 __attribute__((__always_inline__, __nodebug__))
521_mm256_cvtpd_ps(__m256d a)
522{
523 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
524}
525
526static __inline __m256i __attribute__((__always_inline__, __nodebug__))
527_mm256_cvtps_epi32(__m256 a)
528{
529 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
530}
531
532static __inline __m256d __attribute__((__always_inline__, __nodebug__))
533_mm256_cvtps_pd(__m128 a)
534{
535 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
536}
537
538static __inline __m128i __attribute__((__always_inline__, __nodebug__))
539_mm256_cvttpd_epi32(__m256d a)
540{
541 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
542}
543
544static __inline __m128i __attribute__((__always_inline__, __nodebug__))
545_mm256_cvtpd_epi32(__m256d a)
546{
547 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
548}
549
550static __inline __m256i __attribute__((__always_inline__, __nodebug__))
551_mm256_cvttps_epi32(__m256 a)
552{
553 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
554}
555
556/* Vector replicate */
557static __inline __m256 __attribute__((__always_inline__, __nodebug__))
558_mm256_movehdup_ps(__m256 a)
559{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000560 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000561}
562
563static __inline __m256 __attribute__((__always_inline__, __nodebug__))
564_mm256_moveldup_ps(__m256 a)
565{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000566 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000567}
568
569static __inline __m256d __attribute__((__always_inline__, __nodebug__))
570_mm256_movedup_pd(__m256d a)
571{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000572 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000573}
574
575/* Unpack and Interleave */
576static __inline __m256d __attribute__((__always_inline__, __nodebug__))
577_mm256_unpackhi_pd(__m256d a, __m256d b)
578{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000579 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000580}
581
582static __inline __m256d __attribute__((__always_inline__, __nodebug__))
583_mm256_unpacklo_pd(__m256d a, __m256d b)
584{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000585 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000586}
587
588static __inline __m256 __attribute__((__always_inline__, __nodebug__))
589_mm256_unpackhi_ps(__m256 a, __m256 b)
590{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000591 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000592}
593
594static __inline __m256 __attribute__((__always_inline__, __nodebug__))
595_mm256_unpacklo_ps(__m256 a, __m256 b)
596{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000597 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000598}
599
600/* Bit Test */
601static __inline int __attribute__((__always_inline__, __nodebug__))
602_mm_testz_pd(__m128d a, __m128d b)
603{
604 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
605}
606
607static __inline int __attribute__((__always_inline__, __nodebug__))
608_mm_testc_pd(__m128d a, __m128d b)
609{
610 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
611}
612
613static __inline int __attribute__((__always_inline__, __nodebug__))
614_mm_testnzc_pd(__m128d a, __m128d b)
615{
616 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
617}
618
619static __inline int __attribute__((__always_inline__, __nodebug__))
620_mm_testz_ps(__m128 a, __m128 b)
621{
622 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
623}
624
625static __inline int __attribute__((__always_inline__, __nodebug__))
626_mm_testc_ps(__m128 a, __m128 b)
627{
628 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
629}
630
631static __inline int __attribute__((__always_inline__, __nodebug__))
632_mm_testnzc_ps(__m128 a, __m128 b)
633{
634 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
635}
636
637static __inline int __attribute__((__always_inline__, __nodebug__))
638_mm256_testz_pd(__m256d a, __m256d b)
639{
640 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
641}
642
643static __inline int __attribute__((__always_inline__, __nodebug__))
644_mm256_testc_pd(__m256d a, __m256d b)
645{
646 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
647}
648
649static __inline int __attribute__((__always_inline__, __nodebug__))
650_mm256_testnzc_pd(__m256d a, __m256d b)
651{
652 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
653}
654
655static __inline int __attribute__((__always_inline__, __nodebug__))
656_mm256_testz_ps(__m256 a, __m256 b)
657{
658 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
659}
660
661static __inline int __attribute__((__always_inline__, __nodebug__))
662_mm256_testc_ps(__m256 a, __m256 b)
663{
664 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
665}
666
667static __inline int __attribute__((__always_inline__, __nodebug__))
668_mm256_testnzc_ps(__m256 a, __m256 b)
669{
670 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
671}
672
673static __inline int __attribute__((__always_inline__, __nodebug__))
674_mm256_testz_si256(__m256i a, __m256i b)
675{
676 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
677}
678
679static __inline int __attribute__((__always_inline__, __nodebug__))
680_mm256_testc_si256(__m256i a, __m256i b)
681{
682 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
683}
684
685static __inline int __attribute__((__always_inline__, __nodebug__))
686_mm256_testnzc_si256(__m256i a, __m256i b)
687{
688 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
689}
690
691/* Vector extract sign mask */
692static __inline int __attribute__((__always_inline__, __nodebug__))
693_mm256_movemask_pd(__m256d a)
694{
695 return __builtin_ia32_movmskpd256((__v4df)a);
696}
697
698static __inline int __attribute__((__always_inline__, __nodebug__))
699_mm256_movemask_ps(__m256 a)
700{
701 return __builtin_ia32_movmskps256((__v8sf)a);
702}
703
/* Vector zero */
/* Zero all YMM registers (VZEROALL). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zero only the upper 128 bits of all YMM registers (VZEROUPPER);
   used to avoid AVX/SSE transition penalties. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
717/* Vector load with broadcast */
718static __inline __m128 __attribute__((__always_inline__, __nodebug__))
719_mm_broadcast_ss(float const *a)
720{
721 return (__m128)__builtin_ia32_vbroadcastss(a);
722}
723
724static __inline __m256d __attribute__((__always_inline__, __nodebug__))
725_mm256_broadcast_sd(double const *a)
726{
727 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
728}
729
730static __inline __m256 __attribute__((__always_inline__, __nodebug__))
731_mm256_broadcast_ss(float const *a)
732{
733 return (__m256)__builtin_ia32_vbroadcastss256(a);
734}
735
736static __inline __m256d __attribute__((__always_inline__, __nodebug__))
737_mm256_broadcast_pd(__m128d const *a)
738{
739 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
740}
741
742static __inline __m256 __attribute__((__always_inline__, __nodebug__))
743_mm256_broadcast_ps(__m128 const *a)
744{
745 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
746}
747
748/* SIMD load ops */
749static __inline __m256d __attribute__((__always_inline__, __nodebug__))
750_mm256_load_pd(double const *p)
751{
752 return *(__m256d *)p;
753}
754
755static __inline __m256 __attribute__((__always_inline__, __nodebug__))
756_mm256_load_ps(float const *p)
757{
758 return *(__m256 *)p;
759}
760
761static __inline __m256d __attribute__((__always_inline__, __nodebug__))
762_mm256_loadu_pd(double const *p)
763{
764 return (__m256d)__builtin_ia32_loadupd256(p);
765}
766
767static __inline __m256 __attribute__((__always_inline__, __nodebug__))
768_mm256_loadu_ps(float const *p)
769{
770 return (__m256)__builtin_ia32_loadups256(p);
771}
772
773static __inline __m256i __attribute__((__always_inline__, __nodebug__))
774_mm256_load_si256(__m256i const *p)
775{
776 return *p;
777}
778
779static __inline __m256i __attribute__((__always_inline__, __nodebug__))
780_mm256_loadu_si256(__m256i const *p)
781{
782 return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
783}
784
785static __inline __m256i __attribute__((__always_inline__, __nodebug__))
786_mm256_lddqu_si256(__m256i const *p)
787{
788 return (__m256i)__builtin_ia32_lddqu256((char const *)p);
789}
790
791/* SIMD store ops */
792static __inline void __attribute__((__always_inline__, __nodebug__))
793_mm256_store_pd(double *p, __m256d a)
794{
795 *(__m256d *)p = a;
796}
797
798static __inline void __attribute__((__always_inline__, __nodebug__))
799_mm256_store_ps(float *p, __m256 a)
800{
801 *(__m256 *)p = a;
802}
803
804static __inline void __attribute__((__always_inline__, __nodebug__))
805_mm256_storeu_pd(double *p, __m256d a)
806{
807 __builtin_ia32_storeupd256(p, (__v4df)a);
808}
809
810static __inline void __attribute__((__always_inline__, __nodebug__))
811_mm256_storeu_ps(float *p, __m256 a)
812{
813 __builtin_ia32_storeups256(p, (__v8sf)a);
814}
815
816static __inline void __attribute__((__always_inline__, __nodebug__))
817_mm256_store_si256(__m256i *p, __m256i a)
818{
819 *p = a;
820}
821
822static __inline void __attribute__((__always_inline__, __nodebug__))
823_mm256_storeu_si256(__m256i *p, __m256i a)
824{
825 __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
826}
827
828/* Conditional load ops */
829static __inline __m128d __attribute__((__always_inline__, __nodebug__))
830_mm_maskload_pd(double const *p, __m128d m)
831{
832 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
833}
834
835static __inline __m256d __attribute__((__always_inline__, __nodebug__))
836_mm256_maskload_pd(double const *p, __m256d m)
837{
838 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
839}
840
841static __inline __m128 __attribute__((__always_inline__, __nodebug__))
842_mm_maskload_ps(float const *p, __m128 m)
843{
844 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
845}
846
847static __inline __m256 __attribute__((__always_inline__, __nodebug__))
848_mm256_maskload_ps(float const *p, __m256 m)
849{
850 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
851}
852
853/* Conditional store ops */
854static __inline void __attribute__((__always_inline__, __nodebug__))
855_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
856{
857 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
858}
859
860static __inline void __attribute__((__always_inline__, __nodebug__))
861_mm_maskstore_pd(double *p, __m128d m, __m128d a)
862{
863 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
864}
865
866static __inline void __attribute__((__always_inline__, __nodebug__))
867_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
868{
869 __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
870}
871
872static __inline void __attribute__((__always_inline__, __nodebug__))
873_mm_maskstore_ps(float *p, __m128 m, __m128 a)
874{
875 __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
876}
877
878/* Cacheability support ops */
879static __inline void __attribute__((__always_inline__, __nodebug__))
880_mm256_stream_si256(__m256i *a, __m256i b)
881{
882 __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
883}
884
885static __inline void __attribute__((__always_inline__, __nodebug__))
886_mm256_stream_pd(double *a, __m256d b)
887{
888 __builtin_ia32_movntpd256(a, (__v4df)b);
889}
890
891static __inline void __attribute__((__always_inline__, __nodebug__))
892_mm256_stream_ps(float *p, __m256 a)
893{
894 __builtin_ia32_movntps256(p, (__v8sf)a);
895}
896
897/* Create vectors */
898static __inline __m256d __attribute__((__always_inline__, __nodebug__))
899_mm256_set_pd(double a, double b, double c, double d)
900{
901 return (__m256d){ d, c, b, a };
902}
903
904static __inline __m256 __attribute__((__always_inline__, __nodebug__))
905_mm256_set_ps(float a, float b, float c, float d,
906 float e, float f, float g, float h)
907{
908 return (__m256){ h, g, f, e, d, c, b, a };
909}
910
911static __inline __m256i __attribute__((__always_inline__, __nodebug__))
912_mm256_set_epi32(int i0, int i1, int i2, int i3,
913 int i4, int i5, int i6, int i7)
914{
915 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
916}
917
918static __inline __m256i __attribute__((__always_inline__, __nodebug__))
919_mm256_set_epi16(short w15, short w14, short w13, short w12,
920 short w11, short w10, short w09, short w08,
921 short w07, short w06, short w05, short w04,
922 short w03, short w02, short w01, short w00)
923{
924 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
925 w08, w09, w10, w11, w12, w13, w14, w15 };
926}
927
928static __inline __m256i __attribute__((__always_inline__, __nodebug__))
929_mm256_set_epi8(char b31, char b30, char b29, char b28,
930 char b27, char b26, char b25, char b24,
931 char b23, char b22, char b21, char b20,
932 char b19, char b18, char b17, char b16,
933 char b15, char b14, char b13, char b12,
934 char b11, char b10, char b09, char b08,
935 char b07, char b06, char b05, char b04,
936 char b03, char b02, char b01, char b00)
937{
938 return (__m256i)(__v32qi){
939 b00, b01, b02, b03, b04, b05, b06, b07,
940 b08, b09, b10, b11, b12, b13, b14, b15,
941 b16, b17, b18, b19, b20, b21, b22, b23,
942 b24, b25, b26, b27, b28, b29, b30, b31
943 };
944}
945
946static __inline __m256i __attribute__((__always_inline__, __nodebug__))
947_mm256_set_epi64x(long long a, long long b, long long c, long long d)
948{
949 return (__m256i)(__v4di){ d, c, b, a };
950}
951
952/* Create vectors with elements in reverse order */
953static __inline __m256d __attribute__((__always_inline__, __nodebug__))
954_mm256_setr_pd(double a, double b, double c, double d)
955{
956 return (__m256d){ a, b, c, d };
957}
958
959static __inline __m256 __attribute__((__always_inline__, __nodebug__))
960_mm256_setr_ps(float a, float b, float c, float d,
961 float e, float f, float g, float h)
962{
963 return (__m256){ a, b, c, d, e, f, g, h };
964}
965
966static __inline __m256i __attribute__((__always_inline__, __nodebug__))
967_mm256_setr_epi32(int i0, int i1, int i2, int i3,
968 int i4, int i5, int i6, int i7)
969{
970 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
971}
972
973static __inline __m256i __attribute__((__always_inline__, __nodebug__))
974_mm256_setr_epi16(short w15, short w14, short w13, short w12,
975 short w11, short w10, short w09, short w08,
976 short w07, short w06, short w05, short w04,
977 short w03, short w02, short w01, short w00)
978{
979 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
980 w07, w06, w05, w04, w03, w02, w01, w00 };
981}
982
983static __inline __m256i __attribute__((__always_inline__, __nodebug__))
984_mm256_setr_epi8(char b31, char b30, char b29, char b28,
985 char b27, char b26, char b25, char b24,
986 char b23, char b22, char b21, char b20,
987 char b19, char b18, char b17, char b16,
988 char b15, char b14, char b13, char b12,
989 char b11, char b10, char b09, char b08,
990 char b07, char b06, char b05, char b04,
991 char b03, char b02, char b01, char b00)
992{
993 return (__m256i)(__v32qi){
994 b31, b30, b29, b28, b27, b26, b25, b24,
995 b23, b22, b21, b20, b19, b18, b17, b16,
996 b15, b14, b13, b12, b11, b10, b09, b08,
997 b07, b06, b05, b04, b03, b02, b01, b00 };
998}
999
1000static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1001_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1002{
1003 return (__m256i)(__v4di){ a, b, c, d };
1004}
1005
1006/* Create vectors with repeated elements */
1007static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1008_mm256_set1_pd(double w)
1009{
1010 return (__m256d){ w, w, w, w };
1011}
1012
1013static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1014_mm256_set1_ps(float w)
1015{
1016 return (__m256){ w, w, w, w, w, w, w, w };
1017}
1018
1019static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1020_mm256_set1_epi32(int i)
1021{
1022 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1023}
1024
1025static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1026_mm256_set1_epi16(short w)
1027{
1028 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1029}
1030
1031static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1032_mm256_set1_epi8(char b)
1033{
1034 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1035 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1036}
1037
1038static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1039_mm256_set1_epi64x(long long q)
1040{
1041 return (__m256i)(__v4di){ q, q, q, q };
1042}
1043
1044/* Create zeroed vectors */
1045static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1046_mm256_setzero_pd(void)
1047{
1048 return (__m256d){ 0, 0, 0, 0 };
1049}
1050
1051static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1052_mm256_setzero_ps(void)
1053{
1054 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1055}
1056
1057static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1058_mm256_setzero_si256(void)
1059{
1060 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1061}
1062
1063/* Cast between vector types */
1064static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1065_mm256_castpd_ps(__m256d in)
1066{
1067 return (__m256)in;
1068}
1069
1070static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1071_mm256_castpd_si256(__m256d in)
1072{
1073 return (__m256i)in;
1074}
1075
1076static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1077_mm256_castps_pd(__m256 in)
1078{
1079 return (__m256d)in;
1080}
1081
1082static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1083_mm256_castps_si256(__m256 in)
1084{
1085 return (__m256i)in;
1086}
1087
1088static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1089_mm256_castsi256_ps(__m256i in)
1090{
1091 return (__m256)in;
1092}
1093
1094static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1095_mm256_castsi256_pd(__m256i in)
1096{
1097 return (__m256d)in;
1098}
1099
1100static __inline __m128d __attribute__((__always_inline__, __nodebug__))
1101_mm256_castpd256_pd128(__m256d in)
1102{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001103 return __builtin_shufflevector(in, in, 0, 1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001104}
1105
1106static __inline __m128 __attribute__((__always_inline__, __nodebug__))
1107_mm256_castps256_ps128(__m256 in)
1108{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001109 return __builtin_shufflevector(in, in, 0, 1, 2, 3);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001110}
1111
1112static __inline __m128i __attribute__((__always_inline__, __nodebug__))
1113_mm256_castsi256_si128(__m256i in)
1114{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001115 return __builtin_shufflevector(in, in, 0, 1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001116}
1117
1118static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1119_mm256_castpd128_pd256(__m128d in)
1120{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001121 __m128d zero = _mm_setzero_pd();
1122 return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001123}
1124
1125static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1126_mm256_castps128_ps256(__m128 in)
1127{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001128 __m128 zero = _mm_setzero_ps();
1129 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001130}
1131
1132static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1133_mm256_castsi128_si256(__m128i in)
1134{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001135 __m128i zero = _mm_setzero_si128();
1136 return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001137}