blob: 64ab67b5859c8245a639dcc1d8a70c3c76e13435 [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal 256-bit vector types used to select the element interpretation
   when invoking the __builtin_ia32_* intrinsics. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types exposed by the AVX intrinsics API. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
/* Round each double lane of V according to rounding-control bits M
   (an _MM_FROUND_* value); M must be a compile-time constant. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, M); })

/* Round each float lane of V according to rounding-control bits M. */
#define _mm256_round_ps(V, M) __extension__ ({ \
    __m256 __V = (V); \
    (__m256)__builtin_ia32_roundps256((__v8sf)__V, M); })

/* Convenience wrappers: lane-wise ceiling and floor. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
160
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000165 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000171 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000177 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000183 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000189 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000195 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000201 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000207 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258 (__v8si)c);
259}
260
Chad Rosierc17f88e2011-12-17 01:39:56 +0000261#define _mm_permute_pd(A, C) __extension__ ({ \
262 __m128d __A = (A); \
263 (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, C); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000264
Chad Rosierc17f88e2011-12-17 01:39:56 +0000265#define _mm256_permute_pd(A, C) __extension__ ({ \
266 __m256d __A = (A); \
267 (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, C); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000268
269static __inline __m128 __attribute__((__always_inline__, __nodebug__))
270_mm_permute_ps(__m128 a, const int c)
271{
272 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
273}
274
275static __inline __m256 __attribute__((__always_inline__, __nodebug__))
276_mm256_permute_ps(__m256 a, const int c)
277{
278 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
279}
280
/* Select/shuffle whole 128-bit halves of V1 and V2 per immediate M
   (VPERM2F128); M must be a compile-time constant. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, M); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, M); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m256i __V2 = (V2); \
    (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, M); })

/* Vector Blend */
/* Per-lane select between V1 and V2 driven by the constant mask M:
   mask bit set picks the lane from V2, clear picks V1. */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000306
307static __inline __m256d __attribute__((__always_inline__, __nodebug__))
308_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
309{
310 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
311}
312
313static __inline __m256 __attribute__((__always_inline__, __nodebug__))
314_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
315{
316 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
317}
318
/* Vector Dot Product */
/* Conditional dot product within each 128-bit half (VDPPS); the constant
   mask M selects which lanes participate and where results are stored. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000324
/* Vector shuffle */
/* VSHUFPS semantics: the 8-bit mask is applied independently to each
   128-bit half; within a half, the low two selections come from a and the
   high two from b. Shuffle-vector indices 0-7 address a, 8-15 address b. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

/* VSHUFPD semantics: one mask bit per result lane, alternating source
   a/b per lane, applied independently to each 128-bit half. */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000343
/* Compare */
/* Predicate encodings for the 5-bit immediate of VCMPPS/VCMPPD.
   Naming: O = ordered, U = unordered, S = signaling, Q = quiet. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)  */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
377
/* Lane-wise compare using predicate c (a _CMP_* constant); result lanes
   are all-ones on true, all-zeros on false. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar variants: compare only the low lane, upper lanes copied from a. */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000407
/* Vector extract */
/* Extract the 128-bit half selected by the constant O (0 = low, 1 = high). */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
    __m256d __A = (A); \
    (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, O); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
    __m256 __A = (A); \
    (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, O); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
    __m256i __A = (A); \
    (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000420
421static __inline int __attribute__((__always_inline__, __nodebug__))
422_mm256_extract_epi32(__m256i a, int const imm)
423{
424 __v8si b = (__v8si)a;
425 return b[imm];
426}
427
428static __inline int __attribute__((__always_inline__, __nodebug__))
429_mm256_extract_epi16(__m256i a, int const imm)
430{
431 __v16hi b = (__v16hi)a;
432 return b[imm];
433}
434
435static __inline int __attribute__((__always_inline__, __nodebug__))
436_mm256_extract_epi8(__m256i a, int const imm)
437{
438 __v32qi b = (__v32qi)a;
439 return b[imm];
440}
441
442#ifdef __x86_64__
443static __inline long long __attribute__((__always_inline__, __nodebug__))
444_mm256_extract_epi64(__m256i a, const int imm)
445{
446 __v4di b = (__v4di)a;
447 return b[imm];
448}
449#endif
450
/* Vector insert */
/* Return V1 with the 128-bit half selected by constant O (0 = low,
   1 = high) replaced by V2. */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m128d __V2 = (V2); \
    (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, O); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m128 __V2 = (V2); \
    (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, O); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m128i __V2 = (V2); \
    (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000466
467static __inline __m256i __attribute__((__always_inline__, __nodebug__))
468_mm256_insert_epi32(__m256i a, int b, int const imm)
469{
470 __v8si c = (__v8si)a;
471 c[imm & 7] = b;
472 return (__m256i)c;
473}
474
475static __inline __m256i __attribute__((__always_inline__, __nodebug__))
476_mm256_insert_epi16(__m256i a, int b, int const imm)
477{
478 __v16hi c = (__v16hi)a;
479 c[imm & 15] = b;
480 return (__m256i)c;
481}
482
483static __inline __m256i __attribute__((__always_inline__, __nodebug__))
484_mm256_insert_epi8(__m256i a, int b, int const imm)
485{
486 __v32qi c = (__v32qi)a;
487 c[imm & 31] = b;
488 return (__m256i)c;
489}
490
491#ifdef __x86_64__
492static __inline __m256i __attribute__((__always_inline__, __nodebug__))
493_mm256_insert_epi64(__m256i a, int b, int const imm)
494{
495 __v4di c = (__v4di)a;
496 c[imm & 3] = b;
497 return (__m256i)c;
498}
499#endif
500
501/* Conversion */
502static __inline __m256d __attribute__((__always_inline__, __nodebug__))
503_mm256_cvtepi32_pd(__m128i a)
504{
505 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
506}
507
508static __inline __m256 __attribute__((__always_inline__, __nodebug__))
509_mm256_cvtepi32_ps(__m256i a)
510{
511 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
512}
513
514static __inline __m128 __attribute__((__always_inline__, __nodebug__))
515_mm256_cvtpd_ps(__m256d a)
516{
517 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
518}
519
520static __inline __m256i __attribute__((__always_inline__, __nodebug__))
521_mm256_cvtps_epi32(__m256 a)
522{
523 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
524}
525
526static __inline __m256d __attribute__((__always_inline__, __nodebug__))
527_mm256_cvtps_pd(__m128 a)
528{
529 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
530}
531
532static __inline __m128i __attribute__((__always_inline__, __nodebug__))
533_mm256_cvttpd_epi32(__m256d a)
534{
535 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
536}
537
538static __inline __m128i __attribute__((__always_inline__, __nodebug__))
539_mm256_cvtpd_epi32(__m256d a)
540{
541 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
542}
543
544static __inline __m256i __attribute__((__always_inline__, __nodebug__))
545_mm256_cvttps_epi32(__m256 a)
546{
547 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
548}
549
550/* Vector replicate */
551static __inline __m256 __attribute__((__always_inline__, __nodebug__))
552_mm256_movehdup_ps(__m256 a)
553{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000554 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000555}
556
557static __inline __m256 __attribute__((__always_inline__, __nodebug__))
558_mm256_moveldup_ps(__m256 a)
559{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000560 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000561}
562
563static __inline __m256d __attribute__((__always_inline__, __nodebug__))
564_mm256_movedup_pd(__m256d a)
565{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000566 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000567}
568
569/* Unpack and Interleave */
570static __inline __m256d __attribute__((__always_inline__, __nodebug__))
571_mm256_unpackhi_pd(__m256d a, __m256d b)
572{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000573 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000574}
575
576static __inline __m256d __attribute__((__always_inline__, __nodebug__))
577_mm256_unpacklo_pd(__m256d a, __m256d b)
578{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000579 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000580}
581
582static __inline __m256 __attribute__((__always_inline__, __nodebug__))
583_mm256_unpackhi_ps(__m256 a, __m256 b)
584{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000585 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000586}
587
588static __inline __m256 __attribute__((__always_inline__, __nodebug__))
589_mm256_unpacklo_ps(__m256 a, __m256 b)
590{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000591 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000592}
593
594/* Bit Test */
595static __inline int __attribute__((__always_inline__, __nodebug__))
596_mm_testz_pd(__m128d a, __m128d b)
597{
598 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
599}
600
601static __inline int __attribute__((__always_inline__, __nodebug__))
602_mm_testc_pd(__m128d a, __m128d b)
603{
604 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
605}
606
607static __inline int __attribute__((__always_inline__, __nodebug__))
608_mm_testnzc_pd(__m128d a, __m128d b)
609{
610 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
611}
612
613static __inline int __attribute__((__always_inline__, __nodebug__))
614_mm_testz_ps(__m128 a, __m128 b)
615{
616 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
617}
618
619static __inline int __attribute__((__always_inline__, __nodebug__))
620_mm_testc_ps(__m128 a, __m128 b)
621{
622 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
623}
624
625static __inline int __attribute__((__always_inline__, __nodebug__))
626_mm_testnzc_ps(__m128 a, __m128 b)
627{
628 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
629}
630
631static __inline int __attribute__((__always_inline__, __nodebug__))
632_mm256_testz_pd(__m256d a, __m256d b)
633{
634 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
635}
636
637static __inline int __attribute__((__always_inline__, __nodebug__))
638_mm256_testc_pd(__m256d a, __m256d b)
639{
640 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
641}
642
643static __inline int __attribute__((__always_inline__, __nodebug__))
644_mm256_testnzc_pd(__m256d a, __m256d b)
645{
646 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
647}
648
649static __inline int __attribute__((__always_inline__, __nodebug__))
650_mm256_testz_ps(__m256 a, __m256 b)
651{
652 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
653}
654
655static __inline int __attribute__((__always_inline__, __nodebug__))
656_mm256_testc_ps(__m256 a, __m256 b)
657{
658 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
659}
660
661static __inline int __attribute__((__always_inline__, __nodebug__))
662_mm256_testnzc_ps(__m256 a, __m256 b)
663{
664 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
665}
666
667static __inline int __attribute__((__always_inline__, __nodebug__))
668_mm256_testz_si256(__m256i a, __m256i b)
669{
670 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
671}
672
673static __inline int __attribute__((__always_inline__, __nodebug__))
674_mm256_testc_si256(__m256i a, __m256i b)
675{
676 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
677}
678
679static __inline int __attribute__((__always_inline__, __nodebug__))
680_mm256_testnzc_si256(__m256i a, __m256i b)
681{
682 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
683}
684
685/* Vector extract sign mask */
686static __inline int __attribute__((__always_inline__, __nodebug__))
687_mm256_movemask_pd(__m256d a)
688{
689 return __builtin_ia32_movmskpd256((__v4df)a);
690}
691
692static __inline int __attribute__((__always_inline__, __nodebug__))
693_mm256_movemask_ps(__m256 a)
694{
695 return __builtin_ia32_movmskps256((__v8sf)a);
696}
697
/* Vector zero */

/* Zero all YMM registers (VZEROALL). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zero the upper 128 bits of all YMM registers (VZEROUPPER); avoids
   AVX/SSE transition penalties. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
710
711/* Vector load with broadcast */
712static __inline __m128 __attribute__((__always_inline__, __nodebug__))
713_mm_broadcast_ss(float const *a)
714{
715 return (__m128)__builtin_ia32_vbroadcastss(a);
716}
717
718static __inline __m256d __attribute__((__always_inline__, __nodebug__))
719_mm256_broadcast_sd(double const *a)
720{
721 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
722}
723
724static __inline __m256 __attribute__((__always_inline__, __nodebug__))
725_mm256_broadcast_ss(float const *a)
726{
727 return (__m256)__builtin_ia32_vbroadcastss256(a);
728}
729
730static __inline __m256d __attribute__((__always_inline__, __nodebug__))
731_mm256_broadcast_pd(__m128d const *a)
732{
733 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
734}
735
736static __inline __m256 __attribute__((__always_inline__, __nodebug__))
737_mm256_broadcast_ps(__m128 const *a)
738{
739 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
740}
741
742/* SIMD load ops */
743static __inline __m256d __attribute__((__always_inline__, __nodebug__))
744_mm256_load_pd(double const *p)
745{
746 return *(__m256d *)p;
747}
748
749static __inline __m256 __attribute__((__always_inline__, __nodebug__))
750_mm256_load_ps(float const *p)
751{
752 return *(__m256 *)p;
753}
754
755static __inline __m256d __attribute__((__always_inline__, __nodebug__))
756_mm256_loadu_pd(double const *p)
757{
758 return (__m256d)__builtin_ia32_loadupd256(p);
759}
760
761static __inline __m256 __attribute__((__always_inline__, __nodebug__))
762_mm256_loadu_ps(float const *p)
763{
764 return (__m256)__builtin_ia32_loadups256(p);
765}
766
767static __inline __m256i __attribute__((__always_inline__, __nodebug__))
768_mm256_load_si256(__m256i const *p)
769{
770 return *p;
771}
772
773static __inline __m256i __attribute__((__always_inline__, __nodebug__))
774_mm256_loadu_si256(__m256i const *p)
775{
776 return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
777}
778
779static __inline __m256i __attribute__((__always_inline__, __nodebug__))
780_mm256_lddqu_si256(__m256i const *p)
781{
782 return (__m256i)__builtin_ia32_lddqu256((char const *)p);
783}
784
785/* SIMD store ops */
786static __inline void __attribute__((__always_inline__, __nodebug__))
787_mm256_store_pd(double *p, __m256d a)
788{
789 *(__m256d *)p = a;
790}
791
792static __inline void __attribute__((__always_inline__, __nodebug__))
793_mm256_store_ps(float *p, __m256 a)
794{
795 *(__m256 *)p = a;
796}
797
798static __inline void __attribute__((__always_inline__, __nodebug__))
799_mm256_storeu_pd(double *p, __m256d a)
800{
801 __builtin_ia32_storeupd256(p, (__v4df)a);
802}
803
804static __inline void __attribute__((__always_inline__, __nodebug__))
805_mm256_storeu_ps(float *p, __m256 a)
806{
807 __builtin_ia32_storeups256(p, (__v8sf)a);
808}
809
810static __inline void __attribute__((__always_inline__, __nodebug__))
811_mm256_store_si256(__m256i *p, __m256i a)
812{
813 *p = a;
814}
815
816static __inline void __attribute__((__always_inline__, __nodebug__))
817_mm256_storeu_si256(__m256i *p, __m256i a)
818{
819 __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
820}
821
/* Conditional load ops */
/* Masked loads: per the Intel VMASKMOV specification, only the
   elements whose corresponding mask element has its sign (most
   significant) bit set are read from memory; the remaining result
   elements are zeroed.  Note the mask is passed in a floating-point
   vector type here but is interpreted purely as a bit pattern. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}
846
/* Conditional store ops */
/* Masked stores: per the Intel VMASKMOV specification, only the
   elements of a whose corresponding mask element has its sign bit set
   are written to memory; other memory locations are left untouched.
   As with the masked loads, the mask is a bit pattern carried in a
   floating-point vector type. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
871
/* Cacheability support ops */
/* Non-temporal (streaming) stores: write 256 bits to memory with a
   hint to bypass the cache hierarchy.  Per the MOVNT* instruction
   specification the destination pointer must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
890
/* Create vectors */
/* The _mm256_set_* family takes arguments from the HIGHEST element
   down to element 0, so each initializer below is written in reversed
   (memory / element-index) order: the last argument becomes element 0. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  /* a -> element 3 (highest), d -> element 0. */
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  /* a -> element 7 (highest), h -> element 0. */
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  /* i0 -> element 7 (highest), i7 -> element 0. */
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  /* Parameter wNN is stored as element NN. */
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  /* Parameter bNN is stored as element NN. */
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  /* a -> element 3 (highest), d -> element 0. */
  return (__m256i)(__v4di){ d, c, b, a };
}
945
/* Create vectors with elements in reverse order */
/* The _mm256_setr_* family takes arguments in element order: the FIRST
   argument becomes element 0. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  /* a -> element 0, d -> element 3. */
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  /* a -> element 0, h -> element 7. */
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  /* iNN -> element NN. */
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  /* NOTE(review): the parameter names are numbered in REVERSE of their
     element positions — the first argument (w15) becomes element 0 and
     the last (w00) becomes element 15.  Behavior is correct for setr;
     only the naming is misleading. */
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  /* NOTE(review): as in _mm256_setr_epi16, parameter names are numbered
     in reverse of their element positions — b31 becomes element 0. */
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  /* a -> element 0, d -> element 3. */
  return (__m256i)(__v4di){ a, b, c, d };
}
998}
999
1000/* Create vectors with repeated elements */
1001static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1002_mm256_set1_pd(double w)
1003{
1004 return (__m256d){ w, w, w, w };
1005}
1006
1007static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1008_mm256_set1_ps(float w)
1009{
1010 return (__m256){ w, w, w, w, w, w, w, w };
1011}
1012
1013static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1014_mm256_set1_epi32(int i)
1015{
1016 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1017}
1018
1019static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1020_mm256_set1_epi16(short w)
1021{
1022 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1023}
1024
1025static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1026_mm256_set1_epi8(char b)
1027{
1028 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1029 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1030}
1031
1032static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1033_mm256_set1_epi64x(long long q)
1034{
1035 return (__m256i)(__v4di){ q, q, q, q };
1036}
1037
/* Create zeroed vectors */
/* Return a 256-bit vector with all elements (all bits) zero. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}
1056
/* Cast between vector types */
/* Same-width casts: each reinterprets the 256-bit pattern as another
   vector type.  No value conversion is performed — the bits are
   unchanged. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}
1093
/* 256 -> 128 casts: extract the low 128 bits (elements 0..N/2-1) by
   shuffling out the lower half of the vector. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  /* Indices 0,1 select the low two doubles. */
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  /* Indices 0..3 select the low four floats. */
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  /* Indices 0,1 select the low two 64-bit lanes. */
  return __builtin_shufflevector(in, in, 0, 1);
}

/* 128 -> 256 casts: the low 128 bits come from the input.  NOTE(review):
   Intel documents the upper 128 bits of these cast intrinsics as
   UNDEFINED; this implementation happens to zero them (the upper
   shuffle indices select elements of the zero vector), so callers must
   not rely on the zeroing. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}