/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal lane-typed views of a 256-bit register, used when casting
 * arguments for the __builtin_ia32_* intrinsics below:
 * 4 doubles, 8 floats, 4/8/16/32 integer lanes respectively. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types of the AVX API: float, double and integer. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
Chad Rosierb8786c42011-12-17 00:15:26 +0000148#define _mm256_round_pd(V, M) __extension__ ({ \
149 __m256d __V = (V); \
Craig Topper34a1da42011-12-24 07:55:14 +0000150 (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000151
Chad Rosierb8786c42011-12-17 00:15:26 +0000152#define _mm256_round_ps(V, M) __extension__ ({ \
153 __m256 __V = (V); \
Craig Topper34a1da42011-12-24 07:55:14 +0000154 (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000155
156#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
157#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
158#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
159#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
160
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000165 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000171 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000177 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000183 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000189 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000195 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000201 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000207 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258 (__v8si)c);
259}
260
Chad Rosierc17f88e2011-12-17 01:39:56 +0000261#define _mm_permute_pd(A, C) __extension__ ({ \
262 __m128d __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000263 (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, (C)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000264
Chad Rosierc17f88e2011-12-17 01:39:56 +0000265#define _mm256_permute_pd(A, C) __extension__ ({ \
266 __m256d __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000267 (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, (C)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000268
Chad Rosierd7dd7752011-12-17 01:51:05 +0000269#define _mm_permute_ps(A, C) __extension__ ({ \
270 __m128 __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000271 (__m128)__builtin_ia32_vpermilps((__v4sf)__A, (C)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000272
Chad Rosierd7dd7752011-12-17 01:51:05 +0000273#define _mm256_permute_ps(A, C) __extension__ ({ \
274 __m256 __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000275 (__m256)__builtin_ia32_vpermilps256((__v8sf)__A, (C)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000276
Chad Rosierc5cda112011-12-16 21:07:34 +0000277#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
278 __m256d __V1 = (V1); \
279 __m256d __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000280 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000281
Chad Rosierc5cda112011-12-16 21:07:34 +0000282#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
283 __m256 __V1 = (V1); \
284 __m256 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000285 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000286
Chad Rosierc5cda112011-12-16 21:07:34 +0000287#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
288 __m256i __V1 = (V1); \
289 __m256i __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000290 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000291
292/* Vector Blend */
Eli Friedman34720892011-11-10 00:11:13 +0000293#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
294 __m256d __V1 = (V1); \
295 __m256d __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000296 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000297
Eli Friedman34720892011-11-10 00:11:13 +0000298#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
299 __m256 __V1 = (V1); \
300 __m256 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000301 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000302
303static __inline __m256d __attribute__((__always_inline__, __nodebug__))
304_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
305{
306 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
307}
308
309static __inline __m256 __attribute__((__always_inline__, __nodebug__))
310_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
311{
312 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
313}
314
315/* Vector Dot Product */
Eli Friedman34720892011-11-10 00:11:13 +0000316#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
317 __m256 __V1 = (V1); \
318 __m256 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000319 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000320
321/* Vector shuffle */
Bob Wilson32bae372011-11-05 06:08:06 +0000322#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
323 __m256 __a = (a); \
324 __m256 __b = (b); \
325 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
Bruno Cardoso Lopesb33aa0f2010-08-11 01:17:34 +0000326 (mask) & 0x3, ((mask) & 0xc) >> 2, \
Bruno Cardoso Lopes70141c22010-08-11 18:45:43 +0000327 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
Bruno Cardoso Lopes426344d2011-08-23 23:29:45 +0000328 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
Bob Wilson32bae372011-11-05 06:08:06 +0000329 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000330
Bob Wilson32bae372011-11-05 06:08:06 +0000331#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
332 __m256d __a = (a); \
333 __m256d __b = (b); \
334 (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
Bruno Cardoso Lopesb33aa0f2010-08-11 01:17:34 +0000335 (mask) & 0x1, \
336 (((mask) & 0x2) >> 1) + 4, \
337 (((mask) & 0x4) >> 2) + 2, \
Bob Wilson32bae372011-11-05 06:08:06 +0000338 (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000339
/* Compare */
/* Predicate immediates for the VCMP* intrinsics below.
 * Naming: O/U = ordered/unordered w.r.t. NaN, S/Q = signaling/quiet. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* Packed/scalar compares under predicate immediate c; each true lane becomes
 * all-ones, each false lane all-zeros. Macros because c must be constant. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar forms: compare only the low element, pass the rest of a through. */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000403
404/* Vector extract */
Chad Rosier1e4faf52011-12-17 01:22:27 +0000405#define _mm256_extractf128_pd(A, O) __extension__ ({ \
406 __m256d __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000407 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000408
Chad Rosier1e4faf52011-12-17 01:22:27 +0000409#define _mm256_extractf128_ps(A, O) __extension__ ({ \
410 __m256 __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000411 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000412
Chad Rosier1e4faf52011-12-17 01:22:27 +0000413#define _mm256_extractf128_si256(A, O) __extension__ ({ \
414 __m256i __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000415 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000416
417static __inline int __attribute__((__always_inline__, __nodebug__))
418_mm256_extract_epi32(__m256i a, int const imm)
419{
420 __v8si b = (__v8si)a;
421 return b[imm];
422}
423
424static __inline int __attribute__((__always_inline__, __nodebug__))
425_mm256_extract_epi16(__m256i a, int const imm)
426{
427 __v16hi b = (__v16hi)a;
428 return b[imm];
429}
430
431static __inline int __attribute__((__always_inline__, __nodebug__))
432_mm256_extract_epi8(__m256i a, int const imm)
433{
434 __v32qi b = (__v32qi)a;
435 return b[imm];
436}
437
438#ifdef __x86_64__
439static __inline long long __attribute__((__always_inline__, __nodebug__))
440_mm256_extract_epi64(__m256i a, const int imm)
441{
442 __v4di b = (__v4di)a;
443 return b[imm];
444}
445#endif
446
447/* Vector insert */
Chad Rosierb95ddf12011-12-16 21:40:31 +0000448#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
449 __m256d __V1 = (V1); \
450 __m128d __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000451 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000452
Chad Rosierb95ddf12011-12-16 21:40:31 +0000453#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
454 __m256 __V1 = (V1); \
455 __m128 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000456 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000457
Chad Rosierb95ddf12011-12-16 21:40:31 +0000458#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
459 __m256i __V1 = (V1); \
460 __m128i __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000461 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000462
463static __inline __m256i __attribute__((__always_inline__, __nodebug__))
464_mm256_insert_epi32(__m256i a, int b, int const imm)
465{
466 __v8si c = (__v8si)a;
467 c[imm & 7] = b;
468 return (__m256i)c;
469}
470
471static __inline __m256i __attribute__((__always_inline__, __nodebug__))
472_mm256_insert_epi16(__m256i a, int b, int const imm)
473{
474 __v16hi c = (__v16hi)a;
475 c[imm & 15] = b;
476 return (__m256i)c;
477}
478
479static __inline __m256i __attribute__((__always_inline__, __nodebug__))
480_mm256_insert_epi8(__m256i a, int b, int const imm)
481{
482 __v32qi c = (__v32qi)a;
483 c[imm & 31] = b;
484 return (__m256i)c;
485}
486
487#ifdef __x86_64__
488static __inline __m256i __attribute__((__always_inline__, __nodebug__))
489_mm256_insert_epi64(__m256i a, int b, int const imm)
490{
491 __v4di c = (__v4di)a;
492 c[imm & 3] = b;
493 return (__m256i)c;
494}
495#endif
496
497/* Conversion */
498static __inline __m256d __attribute__((__always_inline__, __nodebug__))
499_mm256_cvtepi32_pd(__m128i a)
500{
501 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
502}
503
504static __inline __m256 __attribute__((__always_inline__, __nodebug__))
505_mm256_cvtepi32_ps(__m256i a)
506{
507 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
508}
509
510static __inline __m128 __attribute__((__always_inline__, __nodebug__))
511_mm256_cvtpd_ps(__m256d a)
512{
513 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
514}
515
516static __inline __m256i __attribute__((__always_inline__, __nodebug__))
517_mm256_cvtps_epi32(__m256 a)
518{
519 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
520}
521
522static __inline __m256d __attribute__((__always_inline__, __nodebug__))
523_mm256_cvtps_pd(__m128 a)
524{
525 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
526}
527
528static __inline __m128i __attribute__((__always_inline__, __nodebug__))
529_mm256_cvttpd_epi32(__m256d a)
530{
531 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
532}
533
534static __inline __m128i __attribute__((__always_inline__, __nodebug__))
535_mm256_cvtpd_epi32(__m256d a)
536{
537 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
538}
539
540static __inline __m256i __attribute__((__always_inline__, __nodebug__))
541_mm256_cvttps_epi32(__m256 a)
542{
543 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
544}
545
546/* Vector replicate */
547static __inline __m256 __attribute__((__always_inline__, __nodebug__))
548_mm256_movehdup_ps(__m256 a)
549{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000550 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000551}
552
553static __inline __m256 __attribute__((__always_inline__, __nodebug__))
554_mm256_moveldup_ps(__m256 a)
555{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000556 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000557}
558
559static __inline __m256d __attribute__((__always_inline__, __nodebug__))
560_mm256_movedup_pd(__m256d a)
561{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000562 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000563}
564
565/* Unpack and Interleave */
566static __inline __m256d __attribute__((__always_inline__, __nodebug__))
567_mm256_unpackhi_pd(__m256d a, __m256d b)
568{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000569 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000570}
571
572static __inline __m256d __attribute__((__always_inline__, __nodebug__))
573_mm256_unpacklo_pd(__m256d a, __m256d b)
574{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000575 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000576}
577
578static __inline __m256 __attribute__((__always_inline__, __nodebug__))
579_mm256_unpackhi_ps(__m256 a, __m256 b)
580{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000581 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000582}
583
584static __inline __m256 __attribute__((__always_inline__, __nodebug__))
585_mm256_unpacklo_ps(__m256 a, __m256 b)
586{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000587 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000588}
589
590/* Bit Test */
591static __inline int __attribute__((__always_inline__, __nodebug__))
592_mm_testz_pd(__m128d a, __m128d b)
593{
594 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
595}
596
597static __inline int __attribute__((__always_inline__, __nodebug__))
598_mm_testc_pd(__m128d a, __m128d b)
599{
600 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
601}
602
603static __inline int __attribute__((__always_inline__, __nodebug__))
604_mm_testnzc_pd(__m128d a, __m128d b)
605{
606 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
607}
608
609static __inline int __attribute__((__always_inline__, __nodebug__))
610_mm_testz_ps(__m128 a, __m128 b)
611{
612 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
613}
614
615static __inline int __attribute__((__always_inline__, __nodebug__))
616_mm_testc_ps(__m128 a, __m128 b)
617{
618 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
619}
620
621static __inline int __attribute__((__always_inline__, __nodebug__))
622_mm_testnzc_ps(__m128 a, __m128 b)
623{
624 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
625}
626
627static __inline int __attribute__((__always_inline__, __nodebug__))
628_mm256_testz_pd(__m256d a, __m256d b)
629{
630 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
631}
632
633static __inline int __attribute__((__always_inline__, __nodebug__))
634_mm256_testc_pd(__m256d a, __m256d b)
635{
636 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
637}
638
639static __inline int __attribute__((__always_inline__, __nodebug__))
640_mm256_testnzc_pd(__m256d a, __m256d b)
641{
642 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
643}
644
645static __inline int __attribute__((__always_inline__, __nodebug__))
646_mm256_testz_ps(__m256 a, __m256 b)
647{
648 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
649}
650
651static __inline int __attribute__((__always_inline__, __nodebug__))
652_mm256_testc_ps(__m256 a, __m256 b)
653{
654 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
655}
656
657static __inline int __attribute__((__always_inline__, __nodebug__))
658_mm256_testnzc_ps(__m256 a, __m256 b)
659{
660 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
661}
662
663static __inline int __attribute__((__always_inline__, __nodebug__))
664_mm256_testz_si256(__m256i a, __m256i b)
665{
666 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
667}
668
669static __inline int __attribute__((__always_inline__, __nodebug__))
670_mm256_testc_si256(__m256i a, __m256i b)
671{
672 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
673}
674
675static __inline int __attribute__((__always_inline__, __nodebug__))
676_mm256_testnzc_si256(__m256i a, __m256i b)
677{
678 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
679}
680
681/* Vector extract sign mask */
682static __inline int __attribute__((__always_inline__, __nodebug__))
683_mm256_movemask_pd(__m256d a)
684{
685 return __builtin_ia32_movmskpd256((__v4df)a);
686}
687
688static __inline int __attribute__((__always_inline__, __nodebug__))
689_mm256_movemask_ps(__m256 a)
690{
691 return __builtin_ia32_movmskps256((__v8sf)a);
692}
693
694/* Vector zero */
695static __inline void __attribute__((__always_inline__, __nodebug__))
696_mm256_zeroall(void)
697{
698 __builtin_ia32_vzeroall();
699}
700
701static __inline void __attribute__((__always_inline__, __nodebug__))
702_mm256_zeroupper(void)
703{
704 __builtin_ia32_vzeroupper();
705}
706
707/* Vector load with broadcast */
708static __inline __m128 __attribute__((__always_inline__, __nodebug__))
709_mm_broadcast_ss(float const *a)
710{
711 return (__m128)__builtin_ia32_vbroadcastss(a);
712}
713
714static __inline __m256d __attribute__((__always_inline__, __nodebug__))
715_mm256_broadcast_sd(double const *a)
716{
717 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
718}
719
720static __inline __m256 __attribute__((__always_inline__, __nodebug__))
721_mm256_broadcast_ss(float const *a)
722{
723 return (__m256)__builtin_ia32_vbroadcastss256(a);
724}
725
726static __inline __m256d __attribute__((__always_inline__, __nodebug__))
727_mm256_broadcast_pd(__m128d const *a)
728{
729 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
730}
731
732static __inline __m256 __attribute__((__always_inline__, __nodebug__))
733_mm256_broadcast_ps(__m128 const *a)
734{
735 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
736}
737
738/* SIMD load ops */
739static __inline __m256d __attribute__((__always_inline__, __nodebug__))
740_mm256_load_pd(double const *p)
741{
742 return *(__m256d *)p;
743}
744
745static __inline __m256 __attribute__((__always_inline__, __nodebug__))
746_mm256_load_ps(float const *p)
747{
748 return *(__m256 *)p;
749}
750
751static __inline __m256d __attribute__((__always_inline__, __nodebug__))
752_mm256_loadu_pd(double const *p)
753{
Craig Topper2ee2ac22012-01-25 04:26:17 +0000754 struct __loadu_pd {
755 __m256d v;
756 } __attribute__((packed, may_alias));
757 return ((struct __loadu_pd*)p)->v;
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000758}
759
760static __inline __m256 __attribute__((__always_inline__, __nodebug__))
761_mm256_loadu_ps(float const *p)
762{
Craig Topper2ee2ac22012-01-25 04:26:17 +0000763 struct __loadu_ps {
764 __m256 v;
765 } __attribute__((packed, may_alias));
766 return ((struct __loadu_ps*)p)->v;
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000767}
768
769static __inline __m256i __attribute__((__always_inline__, __nodebug__))
770_mm256_load_si256(__m256i const *p)
771{
772 return *p;
773}
774
775static __inline __m256i __attribute__((__always_inline__, __nodebug__))
776_mm256_loadu_si256(__m256i const *p)
777{
Craig Topper2ee2ac22012-01-25 04:26:17 +0000778 struct __loadu_si256 {
779 __m256i v;
780 } __attribute__((packed, may_alias));
781 return ((struct __loadu_si256*)p)->v;
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000782}
783
784static __inline __m256i __attribute__((__always_inline__, __nodebug__))
785_mm256_lddqu_si256(__m256i const *p)
786{
787 return (__m256i)__builtin_ia32_lddqu256((char const *)p);
788}
789
/* SIMD store ops */

/* Store four doubles to p; p must be 32-byte aligned (aligned vector
   assignment). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

/* Store eight floats to a 32-byte-aligned address p. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

/* Unaligned store of four doubles; the builtin imposes no alignment
   requirement on p. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

/* Unaligned store of eight floats. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

/* Aligned 256-bit integer store; p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

/* Unaligned 256-bit integer store; the builtin takes a byte pointer, so
   no alignment is assumed. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
826
/* Conditional load ops
 *
 * Masked loads (VMASKMOV): only the elements whose corresponding mask
 * element has its sign (most-significant) bit set are read from memory;
 * masked-off result elements are zeroed, and masked-off memory is not
 * accessed, so these are safe at the edge of a mapped region.
 *
 * NOTE(review): the Intel intrinsics guide declares the mask parameter
 * of these intrinsics as an integer vector (__m128i/__m256i), while
 * here it has the same floating-point type as the result.  Code ported
 * from other compilers may need a cast — TODO confirm against the
 * current Intel definitions before changing the signatures. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

/* 256-bit masked load of doubles; mask sign bits select the elements. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

/* 128-bit masked load of floats. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

/* 256-bit masked load of floats. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}
851
/* Conditional store ops
 *
 * Masked stores (VMASKMOV): only the elements of a whose corresponding
 * mask element has its sign bit set are written; masked-off memory
 * locations are left untouched and not accessed.
 *
 * NOTE(review): as with the masked loads above, Intel declares the mask
 * as an integer vector type; here it matches the data type instead —
 * TODO confirm before relying on cross-compiler source compatibility. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

/* 128-bit masked store of doubles. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

/* 256-bit masked store of doubles. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

/* 128-bit masked store of floats. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
876
/* Cacheability support ops
 *
 * Non-temporal (streaming) stores via the MOVNT builtins: the data is
 * written with a hint that it will not be reused soon, so the caches
 * need not be polluted.  The destination must be 32-byte aligned.
 * NOTE(review): non-temporal stores are weakly ordered; callers that
 * need ordering with subsequent loads/stores typically issue a fence —
 * confirm the caller's requirements. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

/* Non-temporal store of four doubles to 32-byte-aligned a. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

/* Non-temporal store of eight floats to 32-byte-aligned p. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
895
/* Create vectors
 *
 * The _mm256_set_* intrinsics take their arguments highest element
 * first (Intel convention).  C vector initializer lists start at
 * element 0 (the lowest element), so the arguments appear reversed in
 * each literal below. */

/* a becomes element 3 (highest), d element 0 (lowest). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

/* a becomes element 7 (highest), h element 0 (lowest). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

/* i0 becomes element 7 (highest), i7 element 0 (lowest). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

/* Parameter names give the element index: w15 is element 15, w00 is
   element 0. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

/* Parameter names give the element index: b31 is element 31, b00 is
   element 0. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

/* a becomes element 3 (highest), d element 0 (lowest). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}
950
/* Create vectors with elements in reverse order
 *
 * The _mm256_setr_* intrinsics take their arguments lowest element
 * first (memory order): the first argument becomes element 0.  Note
 * that in the epi16/epi8 variants the parameter NAMES are inherited
 * from the set_* versions and therefore read backwards — w15/b31 is the
 * first argument and lands in element 0. */

/* a becomes element 0, d element 3. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

/* a becomes element 0, h element 7. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

/* i0 becomes element 0, i7 element 7. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

/* First argument (named w15) becomes element 0; last (w00) element 15. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

/* First argument (named b31) becomes element 0; last (b00) element 31. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

/* a becomes element 0, d element 3. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}
1004
1005/* Create vectors with repeated elements */
1006static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1007_mm256_set1_pd(double w)
1008{
1009 return (__m256d){ w, w, w, w };
1010}
1011
1012static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1013_mm256_set1_ps(float w)
1014{
1015 return (__m256){ w, w, w, w, w, w, w, w };
1016}
1017
1018static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1019_mm256_set1_epi32(int i)
1020{
1021 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1022}
1023
1024static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1025_mm256_set1_epi16(short w)
1026{
1027 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1028}
1029
1030static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1031_mm256_set1_epi8(char b)
1032{
1033 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1034 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1035}
1036
1037static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1038_mm256_set1_epi64x(long long q)
1039{
1040 return (__m256i)(__v4di){ q, q, q, q };
1041}
1042
1043/* Create zeroed vectors */
1044static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1045_mm256_setzero_pd(void)
1046{
1047 return (__m256d){ 0, 0, 0, 0 };
1048}
1049
1050static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1051_mm256_setzero_ps(void)
1052{
1053 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1054}
1055
1056static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1057_mm256_setzero_si256(void)
1058{
1059 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1060}
1061
1062/* Cast between vector types */
1063static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1064_mm256_castpd_ps(__m256d in)
1065{
1066 return (__m256)in;
1067}
1068
1069static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1070_mm256_castpd_si256(__m256d in)
1071{
1072 return (__m256i)in;
1073}
1074
1075static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1076_mm256_castps_pd(__m256 in)
1077{
1078 return (__m256d)in;
1079}
1080
1081static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1082_mm256_castps_si256(__m256 in)
1083{
1084 return (__m256i)in;
1085}
1086
1087static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1088_mm256_castsi256_ps(__m256i in)
1089{
1090 return (__m256)in;
1091}
1092
1093static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1094_mm256_castsi256_pd(__m256i in)
1095{
1096 return (__m256d)in;
1097}
1098
/* Truncate 256-bit vectors to their low 128 bits.  The shufflevector
   index lists (0..N/2-1) select the lower-half elements. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

/* Low 128 bits (elements 0-3) of a 256-bit float vector. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

/* Low 128 bits of a 256-bit integer vector. */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1116
/* Widen 128-bit vectors to 256 bits.  The input occupies the low half;
   the shuffle pulls the remaining elements from a zero vector, so the
   upper 128 bits are zeroed.  NOTE(review): Intel documents the upper
   half of these casts as undefined, so zeroing is a conforming (and
   stronger) choice — callers must not rely on it across compilers. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

/* 8-float vector: in -> elements 0-3, zero -> elements 4-7. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

/* 256-bit integer vector: in -> low half, zero -> high half. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}