blob: bb3246273613e230070dfd802c254bf98a60a92a [file] [log] [blame]
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal element-typed views of the 256-bit registers, used for casts
 * into the __builtin_ia32_* intrinsics. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types: 8 x float, 4 x double, and integer data. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
/* Round each lane according to the _MM_FROUND_* immediate M.
 * M is now parenthesized in the expansion so expressions like `x | y`
 * bind correctly. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
    __m256 __V = (V); \
    (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

/* Convenience wrappers for rounding toward +inf / -inf. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
160
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000165 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000171 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000177 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000183 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000189 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000195 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000201 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000207 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258 (__v8si)c);
259}
260
Chad Rosier19780872011-12-17 00:50:42 +0000261static __inline __m128d __attribute__((__always_inline__, __nodebug__))
262_mm_permute_pd(__m128d a, const int c)
263{
264 return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
265}
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000266
Chad Rosier19780872011-12-17 00:50:42 +0000267static __inline __m256d __attribute__((__always_inline__, __nodebug__))
268_mm256_permute_pd(__m256d a, const int c)
269{
270 return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
271}
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000272
273static __inline __m128 __attribute__((__always_inline__, __nodebug__))
274_mm_permute_ps(__m128 a, const int c)
275{
276 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
277}
278
279static __inline __m256 __attribute__((__always_inline__, __nodebug__))
280_mm256_permute_ps(__m256 a, const int c)
281{
282 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
283}
284
/* Select 128-bit halves from the concatenation of V1 and V2 according to
 * the immediate M (VPERM2F128). M is parenthesized against precedence. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m256i __V2 = (V2); \
    (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000299
/* Vector Blend */
/* Select each lane from V1 or V2 according to the immediate mask M
 * (VBLENDPD/VBLENDPS). M is parenthesized against precedence. */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000310
311static __inline __m256d __attribute__((__always_inline__, __nodebug__))
312_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
313{
314 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
315}
316
317static __inline __m256 __attribute__((__always_inline__, __nodebug__))
318_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
319{
320 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
321}
322
/* Vector Dot Product */
/* Conditional dot product controlled by immediate M (VDPPS); M is
 * parenthesized against precedence. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000328
/* Vector shuffle */
/* SHUFPS-style shuffle: the same 4-lane pattern encoded in `mask` is
 * applied independently to the low and high 128-bit halves. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
    (mask) & 0x3, ((mask) & 0xc) >> 2, \
    (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
    ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
    (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

/* SHUFPD-style shuffle: one mask bit per double selects within each
 * 128-bit half of the concatenated sources. */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
    (mask) & 0x1, \
    (((mask) & 0x2) >> 1) + 4, \
    (((mask) & 0x4) >> 2) + 2, \
    (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000347
/* Compare */
/* Predicate immediates for the VCMPPS/VCMPPD 5-bit comparison encoding.
 * O/U = ordered/unordered (behavior on NaN operands),
 * S/Q = signaling/quiet (whether QNaN inputs raise #IA). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
381
/* Compare lanes of a and b with the _CMP_* predicate c; each result lane
 * is all-ones on true, all-zeros on false. c must be an immediate. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar forms: compare only the lowest lane; upper lanes copied from a. */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000411
412/* Vector extract */
413static __inline __m128d __attribute__((__always_inline__, __nodebug__))
414_mm256_extractf128_pd(__m256d a, const int o)
415{
416 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
417}
418
419static __inline __m128 __attribute__((__always_inline__, __nodebug__))
420_mm256_extractf128_ps(__m256 a, const int o)
421{
422 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
423}
424
425static __inline __m128i __attribute__((__always_inline__, __nodebug__))
426_mm256_extractf128_si256(__m256i a, const int o)
427{
428 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
429}
430
431static __inline int __attribute__((__always_inline__, __nodebug__))
432_mm256_extract_epi32(__m256i a, int const imm)
433{
434 __v8si b = (__v8si)a;
435 return b[imm];
436}
437
438static __inline int __attribute__((__always_inline__, __nodebug__))
439_mm256_extract_epi16(__m256i a, int const imm)
440{
441 __v16hi b = (__v16hi)a;
442 return b[imm];
443}
444
445static __inline int __attribute__((__always_inline__, __nodebug__))
446_mm256_extract_epi8(__m256i a, int const imm)
447{
448 __v32qi b = (__v32qi)a;
449 return b[imm];
450}
451
452#ifdef __x86_64__
453static __inline long long __attribute__((__always_inline__, __nodebug__))
454_mm256_extract_epi64(__m256i a, const int imm)
455{
456 __v4di b = (__v4di)a;
457 return b[imm];
458}
459#endif
460
/* Vector insert */
/* Replace the 128-bit half of V1 selected by immediate O with V2
 * (VINSERTF128). O is parenthesized against precedence. */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m128d __V2 = (V2); \
    (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m128 __V2 = (V2); \
    (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m128i __V2 = (V2); \
    (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000476
477static __inline __m256i __attribute__((__always_inline__, __nodebug__))
478_mm256_insert_epi32(__m256i a, int b, int const imm)
479{
480 __v8si c = (__v8si)a;
481 c[imm & 7] = b;
482 return (__m256i)c;
483}
484
485static __inline __m256i __attribute__((__always_inline__, __nodebug__))
486_mm256_insert_epi16(__m256i a, int b, int const imm)
487{
488 __v16hi c = (__v16hi)a;
489 c[imm & 15] = b;
490 return (__m256i)c;
491}
492
493static __inline __m256i __attribute__((__always_inline__, __nodebug__))
494_mm256_insert_epi8(__m256i a, int b, int const imm)
495{
496 __v32qi c = (__v32qi)a;
497 c[imm & 31] = b;
498 return (__m256i)c;
499}
500
501#ifdef __x86_64__
502static __inline __m256i __attribute__((__always_inline__, __nodebug__))
503_mm256_insert_epi64(__m256i a, int b, int const imm)
504{
505 __v4di c = (__v4di)a;
506 c[imm & 3] = b;
507 return (__m256i)c;
508}
509#endif
510
511/* Conversion */
512static __inline __m256d __attribute__((__always_inline__, __nodebug__))
513_mm256_cvtepi32_pd(__m128i a)
514{
515 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
516}
517
518static __inline __m256 __attribute__((__always_inline__, __nodebug__))
519_mm256_cvtepi32_ps(__m256i a)
520{
521 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
522}
523
524static __inline __m128 __attribute__((__always_inline__, __nodebug__))
525_mm256_cvtpd_ps(__m256d a)
526{
527 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
528}
529
530static __inline __m256i __attribute__((__always_inline__, __nodebug__))
531_mm256_cvtps_epi32(__m256 a)
532{
533 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
534}
535
536static __inline __m256d __attribute__((__always_inline__, __nodebug__))
537_mm256_cvtps_pd(__m128 a)
538{
539 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
540}
541
542static __inline __m128i __attribute__((__always_inline__, __nodebug__))
543_mm256_cvttpd_epi32(__m256d a)
544{
545 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
546}
547
548static __inline __m128i __attribute__((__always_inline__, __nodebug__))
549_mm256_cvtpd_epi32(__m256d a)
550{
551 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
552}
553
554static __inline __m256i __attribute__((__always_inline__, __nodebug__))
555_mm256_cvttps_epi32(__m256 a)
556{
557 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
558}
559
560/* Vector replicate */
561static __inline __m256 __attribute__((__always_inline__, __nodebug__))
562_mm256_movehdup_ps(__m256 a)
563{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000564 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000565}
566
567static __inline __m256 __attribute__((__always_inline__, __nodebug__))
568_mm256_moveldup_ps(__m256 a)
569{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000570 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000571}
572
573static __inline __m256d __attribute__((__always_inline__, __nodebug__))
574_mm256_movedup_pd(__m256d a)
575{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000576 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000577}
578
579/* Unpack and Interleave */
580static __inline __m256d __attribute__((__always_inline__, __nodebug__))
581_mm256_unpackhi_pd(__m256d a, __m256d b)
582{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000583 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000584}
585
586static __inline __m256d __attribute__((__always_inline__, __nodebug__))
587_mm256_unpacklo_pd(__m256d a, __m256d b)
588{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000589 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000590}
591
592static __inline __m256 __attribute__((__always_inline__, __nodebug__))
593_mm256_unpackhi_ps(__m256 a, __m256 b)
594{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000595 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000596}
597
598static __inline __m256 __attribute__((__always_inline__, __nodebug__))
599_mm256_unpacklo_ps(__m256 a, __m256 b)
600{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000601 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000602}
603
604/* Bit Test */
605static __inline int __attribute__((__always_inline__, __nodebug__))
606_mm_testz_pd(__m128d a, __m128d b)
607{
608 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
609}
610
611static __inline int __attribute__((__always_inline__, __nodebug__))
612_mm_testc_pd(__m128d a, __m128d b)
613{
614 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
615}
616
617static __inline int __attribute__((__always_inline__, __nodebug__))
618_mm_testnzc_pd(__m128d a, __m128d b)
619{
620 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
621}
622
623static __inline int __attribute__((__always_inline__, __nodebug__))
624_mm_testz_ps(__m128 a, __m128 b)
625{
626 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
627}
628
629static __inline int __attribute__((__always_inline__, __nodebug__))
630_mm_testc_ps(__m128 a, __m128 b)
631{
632 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
633}
634
635static __inline int __attribute__((__always_inline__, __nodebug__))
636_mm_testnzc_ps(__m128 a, __m128 b)
637{
638 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
639}
640
641static __inline int __attribute__((__always_inline__, __nodebug__))
642_mm256_testz_pd(__m256d a, __m256d b)
643{
644 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
645}
646
647static __inline int __attribute__((__always_inline__, __nodebug__))
648_mm256_testc_pd(__m256d a, __m256d b)
649{
650 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
651}
652
653static __inline int __attribute__((__always_inline__, __nodebug__))
654_mm256_testnzc_pd(__m256d a, __m256d b)
655{
656 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
657}
658
659static __inline int __attribute__((__always_inline__, __nodebug__))
660_mm256_testz_ps(__m256 a, __m256 b)
661{
662 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
663}
664
665static __inline int __attribute__((__always_inline__, __nodebug__))
666_mm256_testc_ps(__m256 a, __m256 b)
667{
668 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
669}
670
671static __inline int __attribute__((__always_inline__, __nodebug__))
672_mm256_testnzc_ps(__m256 a, __m256 b)
673{
674 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
675}
676
677static __inline int __attribute__((__always_inline__, __nodebug__))
678_mm256_testz_si256(__m256i a, __m256i b)
679{
680 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
681}
682
683static __inline int __attribute__((__always_inline__, __nodebug__))
684_mm256_testc_si256(__m256i a, __m256i b)
685{
686 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
687}
688
689static __inline int __attribute__((__always_inline__, __nodebug__))
690_mm256_testnzc_si256(__m256i a, __m256i b)
691{
692 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
693}
694
695/* Vector extract sign mask */
696static __inline int __attribute__((__always_inline__, __nodebug__))
697_mm256_movemask_pd(__m256d a)
698{
699 return __builtin_ia32_movmskpd256((__v4df)a);
700}
701
702static __inline int __attribute__((__always_inline__, __nodebug__))
703_mm256_movemask_ps(__m256 a)
704{
705 return __builtin_ia32_movmskps256((__v8sf)a);
706}
707
/* Vector zero */
/* Zero all YMM registers (VZEROALL). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zero the upper 128 bits of all YMM registers (VZEROUPPER); avoids
 * SSE/AVX transition penalties. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
720
721/* Vector load with broadcast */
722static __inline __m128 __attribute__((__always_inline__, __nodebug__))
723_mm_broadcast_ss(float const *a)
724{
725 return (__m128)__builtin_ia32_vbroadcastss(a);
726}
727
728static __inline __m256d __attribute__((__always_inline__, __nodebug__))
729_mm256_broadcast_sd(double const *a)
730{
731 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
732}
733
734static __inline __m256 __attribute__((__always_inline__, __nodebug__))
735_mm256_broadcast_ss(float const *a)
736{
737 return (__m256)__builtin_ia32_vbroadcastss256(a);
738}
739
740static __inline __m256d __attribute__((__always_inline__, __nodebug__))
741_mm256_broadcast_pd(__m128d const *a)
742{
743 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
744}
745
746static __inline __m256 __attribute__((__always_inline__, __nodebug__))
747_mm256_broadcast_ps(__m128 const *a)
748{
749 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
750}
751
752/* SIMD load ops */
753static __inline __m256d __attribute__((__always_inline__, __nodebug__))
754_mm256_load_pd(double const *p)
755{
756 return *(__m256d *)p;
757}
758
759static __inline __m256 __attribute__((__always_inline__, __nodebug__))
760_mm256_load_ps(float const *p)
761{
762 return *(__m256 *)p;
763}
764
765static __inline __m256d __attribute__((__always_inline__, __nodebug__))
766_mm256_loadu_pd(double const *p)
767{
768 return (__m256d)__builtin_ia32_loadupd256(p);
769}
770
771static __inline __m256 __attribute__((__always_inline__, __nodebug__))
772_mm256_loadu_ps(float const *p)
773{
774 return (__m256)__builtin_ia32_loadups256(p);
775}
776
777static __inline __m256i __attribute__((__always_inline__, __nodebug__))
778_mm256_load_si256(__m256i const *p)
779{
780 return *p;
781}
782
783static __inline __m256i __attribute__((__always_inline__, __nodebug__))
784_mm256_loadu_si256(__m256i const *p)
785{
786 return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
787}
788
789static __inline __m256i __attribute__((__always_inline__, __nodebug__))
790_mm256_lddqu_si256(__m256i const *p)
791{
792 return (__m256i)__builtin_ia32_lddqu256((char const *)p);
793}
794
795/* SIMD store ops */
796static __inline void __attribute__((__always_inline__, __nodebug__))
797_mm256_store_pd(double *p, __m256d a)
798{
799 *(__m256d *)p = a;
800}
801
802static __inline void __attribute__((__always_inline__, __nodebug__))
803_mm256_store_ps(float *p, __m256 a)
804{
805 *(__m256 *)p = a;
806}
807
808static __inline void __attribute__((__always_inline__, __nodebug__))
809_mm256_storeu_pd(double *p, __m256d a)
810{
811 __builtin_ia32_storeupd256(p, (__v4df)a);
812}
813
814static __inline void __attribute__((__always_inline__, __nodebug__))
815_mm256_storeu_ps(float *p, __m256 a)
816{
817 __builtin_ia32_storeups256(p, (__v8sf)a);
818}
819
820static __inline void __attribute__((__always_inline__, __nodebug__))
821_mm256_store_si256(__m256i *p, __m256i a)
822{
823 *p = a;
824}
825
826static __inline void __attribute__((__always_inline__, __nodebug__))
827_mm256_storeu_si256(__m256i *p, __m256i a)
828{
829 __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
830}
831
832/* Conditional load ops */
833static __inline __m128d __attribute__((__always_inline__, __nodebug__))
834_mm_maskload_pd(double const *p, __m128d m)
835{
836 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
837}
838
839static __inline __m256d __attribute__((__always_inline__, __nodebug__))
840_mm256_maskload_pd(double const *p, __m256d m)
841{
842 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
843}
844
845static __inline __m128 __attribute__((__always_inline__, __nodebug__))
846_mm_maskload_ps(float const *p, __m128 m)
847{
848 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
849}
850
851static __inline __m256 __attribute__((__always_inline__, __nodebug__))
852_mm256_maskload_ps(float const *p, __m256 m)
853{
854 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
855}
856
857/* Conditional store ops */
858static __inline void __attribute__((__always_inline__, __nodebug__))
859_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
860{
861 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
862}
863
864static __inline void __attribute__((__always_inline__, __nodebug__))
865_mm_maskstore_pd(double *p, __m128d m, __m128d a)
866{
867 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
868}
869
870static __inline void __attribute__((__always_inline__, __nodebug__))
871_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
872{
873 __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
874}
875
876static __inline void __attribute__((__always_inline__, __nodebug__))
877_mm_maskstore_ps(float *p, __m128 m, __m128 a)
878{
879 __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
880}
881
882/* Cacheability support ops */
883static __inline void __attribute__((__always_inline__, __nodebug__))
884_mm256_stream_si256(__m256i *a, __m256i b)
885{
886 __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
887}
888
889static __inline void __attribute__((__always_inline__, __nodebug__))
890_mm256_stream_pd(double *a, __m256d b)
891{
892 __builtin_ia32_movntpd256(a, (__v4df)b);
893}
894
895static __inline void __attribute__((__always_inline__, __nodebug__))
896_mm256_stream_ps(float *p, __m256 a)
897{
898 __builtin_ia32_movntps256(p, (__v8sf)a);
899}
900
901/* Create vectors */
902static __inline __m256d __attribute__((__always_inline__, __nodebug__))
903_mm256_set_pd(double a, double b, double c, double d)
904{
905 return (__m256d){ d, c, b, a };
906}
907
908static __inline __m256 __attribute__((__always_inline__, __nodebug__))
909_mm256_set_ps(float a, float b, float c, float d,
910 float e, float f, float g, float h)
911{
912 return (__m256){ h, g, f, e, d, c, b, a };
913}
914
915static __inline __m256i __attribute__((__always_inline__, __nodebug__))
916_mm256_set_epi32(int i0, int i1, int i2, int i3,
917 int i4, int i5, int i6, int i7)
918{
919 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
920}
921
922static __inline __m256i __attribute__((__always_inline__, __nodebug__))
923_mm256_set_epi16(short w15, short w14, short w13, short w12,
924 short w11, short w10, short w09, short w08,
925 short w07, short w06, short w05, short w04,
926 short w03, short w02, short w01, short w00)
927{
928 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
929 w08, w09, w10, w11, w12, w13, w14, w15 };
930}
931
932static __inline __m256i __attribute__((__always_inline__, __nodebug__))
933_mm256_set_epi8(char b31, char b30, char b29, char b28,
934 char b27, char b26, char b25, char b24,
935 char b23, char b22, char b21, char b20,
936 char b19, char b18, char b17, char b16,
937 char b15, char b14, char b13, char b12,
938 char b11, char b10, char b09, char b08,
939 char b07, char b06, char b05, char b04,
940 char b03, char b02, char b01, char b00)
941{
942 return (__m256i)(__v32qi){
943 b00, b01, b02, b03, b04, b05, b06, b07,
944 b08, b09, b10, b11, b12, b13, b14, b15,
945 b16, b17, b18, b19, b20, b21, b22, b23,
946 b24, b25, b26, b27, b28, b29, b30, b31
947 };
948}
949
950static __inline __m256i __attribute__((__always_inline__, __nodebug__))
951_mm256_set_epi64x(long long a, long long b, long long c, long long d)
952{
953 return (__m256i)(__v4di){ d, c, b, a };
954}
955
956/* Create vectors with elements in reverse order */
957static __inline __m256d __attribute__((__always_inline__, __nodebug__))
958_mm256_setr_pd(double a, double b, double c, double d)
959{
960 return (__m256d){ a, b, c, d };
961}
962
963static __inline __m256 __attribute__((__always_inline__, __nodebug__))
964_mm256_setr_ps(float a, float b, float c, float d,
965 float e, float f, float g, float h)
966{
967 return (__m256){ a, b, c, d, e, f, g, h };
968}
969
970static __inline __m256i __attribute__((__always_inline__, __nodebug__))
971_mm256_setr_epi32(int i0, int i1, int i2, int i3,
972 int i4, int i5, int i6, int i7)
973{
974 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
975}
976
977static __inline __m256i __attribute__((__always_inline__, __nodebug__))
978_mm256_setr_epi16(short w15, short w14, short w13, short w12,
979 short w11, short w10, short w09, short w08,
980 short w07, short w06, short w05, short w04,
981 short w03, short w02, short w01, short w00)
982{
983 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
984 w07, w06, w05, w04, w03, w02, w01, w00 };
985}
986
987static __inline __m256i __attribute__((__always_inline__, __nodebug__))
988_mm256_setr_epi8(char b31, char b30, char b29, char b28,
989 char b27, char b26, char b25, char b24,
990 char b23, char b22, char b21, char b20,
991 char b19, char b18, char b17, char b16,
992 char b15, char b14, char b13, char b12,
993 char b11, char b10, char b09, char b08,
994 char b07, char b06, char b05, char b04,
995 char b03, char b02, char b01, char b00)
996{
997 return (__m256i)(__v32qi){
998 b31, b30, b29, b28, b27, b26, b25, b24,
999 b23, b22, b21, b20, b19, b18, b17, b16,
1000 b15, b14, b13, b12, b11, b10, b09, b08,
1001 b07, b06, b05, b04, b03, b02, b01, b00 };
1002}
1003
1004static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1005_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1006{
1007 return (__m256i)(__v4di){ a, b, c, d };
1008}
1009
1010/* Create vectors with repeated elements */
1011static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1012_mm256_set1_pd(double w)
1013{
1014 return (__m256d){ w, w, w, w };
1015}
1016
1017static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1018_mm256_set1_ps(float w)
1019{
1020 return (__m256){ w, w, w, w, w, w, w, w };
1021}
1022
1023static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1024_mm256_set1_epi32(int i)
1025{
1026 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1027}
1028
1029static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1030_mm256_set1_epi16(short w)
1031{
1032 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1033}
1034
1035static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1036_mm256_set1_epi8(char b)
1037{
1038 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1039 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1040}
1041
1042static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1043_mm256_set1_epi64x(long long q)
1044{
1045 return (__m256i)(__v4di){ q, q, q, q };
1046}
1047
1048/* Create zeroed vectors */
/* Return a 256-bit vector of four doubles, all zero. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

/* Return a 256-bit vector of eight floats, all zero. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

/* Return a 256-bit integer vector with all bits zero. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}
1066
1067/* Cast between vector types */
1068static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1069_mm256_castpd_ps(__m256d in)
1070{
1071 return (__m256)in;
1072}
1073
1074static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1075_mm256_castpd_si256(__m256d in)
1076{
1077 return (__m256i)in;
1078}
1079
1080static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1081_mm256_castps_pd(__m256 in)
1082{
1083 return (__m256d)in;
1084}
1085
1086static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1087_mm256_castps_si256(__m256 in)
1088{
1089 return (__m256i)in;
1090}
1091
1092static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1093_mm256_castsi256_ps(__m256i in)
1094{
1095 return (__m256)in;
1096}
1097
1098static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1099_mm256_castsi256_pd(__m256i in)
1100{
1101 return (__m256d)in;
1102}
1103
1104static __inline __m128d __attribute__((__always_inline__, __nodebug__))
1105_mm256_castpd256_pd128(__m256d in)
1106{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001107 return __builtin_shufflevector(in, in, 0, 1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001108}
1109
1110static __inline __m128 __attribute__((__always_inline__, __nodebug__))
1111_mm256_castps256_ps128(__m256 in)
1112{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001113 return __builtin_shufflevector(in, in, 0, 1, 2, 3);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001114}
1115
1116static __inline __m128i __attribute__((__always_inline__, __nodebug__))
1117_mm256_castsi256_si128(__m256i in)
1118{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001119 return __builtin_shufflevector(in, in, 0, 1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001120}
1121
1122static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1123_mm256_castpd128_pd256(__m128d in)
1124{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001125 __m128d zero = _mm_setzero_pd();
1126 return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001127}
1128
1129static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1130_mm256_castps128_ps256(__m128 in)
1131{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001132 __m128 zero = _mm_setzero_ps();
1133 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001134}
1135
1136static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1137_mm256_castsi128_si256(__m128i in)
1138{
Bruno Cardoso Lopes7fc37022010-08-11 02:14:38 +00001139 __m128i zero = _mm_setzero_si128();
1140 return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001141}