blob: fc606b37770f3b16ac6c7b66fc0ae0a6bc1bcbaa [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal element-typed views of the 256-bit registers, used to pick the
   builtin overload and for elementwise casts below. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));    /* 4 x f64 */
typedef float __v8sf __attribute__ ((__vector_size__ (32)));     /* 8 x f32 */
typedef long long __v4di __attribute__ ((__vector_size__ (32))); /* 4 x i64 */
typedef int __v8si __attribute__ ((__vector_size__ (32)));       /* 8 x i32 */
typedef short __v16hi __attribute__ ((__vector_size__ (32)));    /* 16 x i16 */
typedef char __v32qi __attribute__ ((__vector_size__ (32)));     /* 32 x i8 */

/* Public AVX vector types exposed to users of this header. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
/* Round each element of V using rounding-control M (an _MM_FROUND_* value).
   The argument M is now parenthesized in the expansion so that expressions
   like a|b cannot change precedence inside the builtin call. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
    __m256 __V = (V); \
    (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

/* Convenience wrappers: round toward +inf / -inf. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
160
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000165 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000171 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000177 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000183 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000189 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000195 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000201 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000207 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258 (__v8si)c);
259}
260
Chad Rosier19780872011-12-17 00:50:42 +0000261static __inline __m128d __attribute__((__always_inline__, __nodebug__))
262_mm_permute_pd(__m128d a, const int c)
263{
264 return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
265}
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000266
Chad Rosier19780872011-12-17 00:50:42 +0000267static __inline __m256d __attribute__((__always_inline__, __nodebug__))
268_mm256_permute_pd(__m256d a, const int c)
269{
270 return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
271}
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000272
273static __inline __m128 __attribute__((__always_inline__, __nodebug__))
274_mm_permute_ps(__m128 a, const int c)
275{
276 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
277}
278
279static __inline __m256 __attribute__((__always_inline__, __nodebug__))
280_mm256_permute_ps(__m256 a, const int c)
281{
282 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
283}
284
/* 128-bit-lane permute/select across two 256-bit operands (VPERM2F128).
   M selects which 128-bit half of which source lands in each half of the
   result.  M is parenthesized in the expansion to keep operator
   precedence safe for compound arguments. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m256d __V2 = (V2); \
    (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m256i __V2 = (V2); \
    (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000299
300/* Vector Blend */
Eli Friedman34720892011-11-10 00:11:13 +0000301#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
302 __m256d __V1 = (V1); \
303 __m256d __V2 = (V2); \
304 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000305
Eli Friedman34720892011-11-10 00:11:13 +0000306#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
307 __m256 __V1 = (V1); \
308 __m256 __V2 = (V2); \
309 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000310
311static __inline __m256d __attribute__((__always_inline__, __nodebug__))
312_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
313{
314 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
315}
316
317static __inline __m256 __attribute__((__always_inline__, __nodebug__))
318_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
319{
320 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
321}
322
/* Vector Dot Product */
/* Conditional dot product within each 128-bit lane (VDPPS); the high
   nibble of M masks the multiplies, the low nibble masks the result
   broadcast.  M is parenthesized in the expansion for precedence safety. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m256 __V2 = (V2); \
    (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000328
/* Vector shuffle */
/* VSHUFPS-style shuffle: the same 8-bit mask is applied independently to
   the low and high 128-bit lanes.  Shufflevector indices 0-7 pick from
   __a and 8-15 pick from __b, hence the +8/+12 offsets for the lanes
   taken from __b and the +4 offsets for the upper 128-bit lane. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

/* VSHUFPD-style shuffle: one mask bit per double selects the element
   within each source's 128-bit lane; indices 4-7 pick from __b. */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000347
/* Compare */
/* Predicate encodings for the immediate argument of the _mm_cmp_* /
   _mm256_cmp_* macros below.  Naming: O = ordered (false if either
   operand is NaN), U = unordered (true if either operand is NaN),
   Q = quiet (non-signaling on QNaN), S = signaling. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
381
/* Compare with predicate c — one of the _CMP_* encodings above.  Each
   result lane is all-ones when the predicate holds, all-zeros otherwise. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
    __m256d __a = (a); \
    __m256d __b = (b); \
    (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
    __m256 __a = (a); \
    __m256 __b = (b); \
    (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar forms: compare only the low element; the remaining elements of
   the first operand pass through unchanged (cmpsd/cmpss semantics). */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
    __m128d __a = (a); \
    __m128d __b = (b); \
    (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
    __m128 __a = (a); \
    __m128 __b = (b); \
    (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000411
/* Vector extract */
/* Extract the low (O == 0) or high (O == 1) 128-bit half.  O is
   parenthesized in the expansion for precedence safety. */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
    __m256d __A = (A); \
    (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

#define _mm256_extractf128_ps(A, O) __extension__ ({ \
    __m256 __A = (A); \
    (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

#define _mm256_extractf128_si256(A, O) __extension__ ({ \
    __m256i __A = (A); \
    (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000424
425static __inline int __attribute__((__always_inline__, __nodebug__))
426_mm256_extract_epi32(__m256i a, int const imm)
427{
428 __v8si b = (__v8si)a;
429 return b[imm];
430}
431
432static __inline int __attribute__((__always_inline__, __nodebug__))
433_mm256_extract_epi16(__m256i a, int const imm)
434{
435 __v16hi b = (__v16hi)a;
436 return b[imm];
437}
438
439static __inline int __attribute__((__always_inline__, __nodebug__))
440_mm256_extract_epi8(__m256i a, int const imm)
441{
442 __v32qi b = (__v32qi)a;
443 return b[imm];
444}
445
446#ifdef __x86_64__
447static __inline long long __attribute__((__always_inline__, __nodebug__))
448_mm256_extract_epi64(__m256i a, const int imm)
449{
450 __v4di b = (__v4di)a;
451 return b[imm];
452}
453#endif
454
/* Vector insert */
/* Insert a 128-bit vector into the low (O == 0) or high (O == 1) half of
   a 256-bit vector.  O is parenthesized in the expansion for precedence
   safety. */
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
    __m256d __V1 = (V1); \
    __m128d __V2 = (V2); \
    (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
    __m256 __V1 = (V1); \
    __m128 __V2 = (V2); \
    (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
    __m256i __V1 = (V1); \
    __m128i __V2 = (V2); \
    (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000470
471static __inline __m256i __attribute__((__always_inline__, __nodebug__))
472_mm256_insert_epi32(__m256i a, int b, int const imm)
473{
474 __v8si c = (__v8si)a;
475 c[imm & 7] = b;
476 return (__m256i)c;
477}
478
479static __inline __m256i __attribute__((__always_inline__, __nodebug__))
480_mm256_insert_epi16(__m256i a, int b, int const imm)
481{
482 __v16hi c = (__v16hi)a;
483 c[imm & 15] = b;
484 return (__m256i)c;
485}
486
487static __inline __m256i __attribute__((__always_inline__, __nodebug__))
488_mm256_insert_epi8(__m256i a, int b, int const imm)
489{
490 __v32qi c = (__v32qi)a;
491 c[imm & 31] = b;
492 return (__m256i)c;
493}
494
495#ifdef __x86_64__
496static __inline __m256i __attribute__((__always_inline__, __nodebug__))
497_mm256_insert_epi64(__m256i a, int b, int const imm)
498{
499 __v4di c = (__v4di)a;
500 c[imm & 3] = b;
501 return (__m256i)c;
502}
503#endif
504
/* Conversion */
/* Widen 4 x i32 to 4 x double. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

/* Convert 8 x i32 to 8 x float. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

/* Narrow 4 x double to 4 x float (result is a 128-bit vector). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

/* Convert 8 x float to 8 x i32, using the current rounding mode. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

/* Widen 4 x float to 4 x double. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

/* Convert 4 x double to 4 x i32 with truncation ("tt" variant). */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

/* Convert 4 x double to 4 x i32, using the current rounding mode. */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

/* Convert 8 x float to 8 x i32 with truncation ("tt" variant). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}
553
/* Vector replicate */
/* Duplicate the odd-indexed floats into each pair: {1,1,3,3,5,5,7,7}. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

/* Duplicate the even-indexed floats into each pair: {0,0,2,2,4,4,6,6}. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

/* Duplicate the even-indexed doubles: {0,0,2,2}. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}
572
/* Unpack and Interleave */
/* AVX unpacks interleave within each 128-bit lane independently; indices
   >= 4 (pd) or >= 8 (ps) refer to elements of b in the shufflevector. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}
597
/* Bit Test */
/* VTESTPD/VTESTPS/VPTEST wrappers.  Per the Intel intrinsic contract:
   testz returns 1 when (a AND b) has no relevant bits set (ZF), testc
   returns 1 when (NOT a AND b) has none set (CF), and testnzc returns 1
   when both ZF and CF would be 0.  The pd/ps forms test only the sign
   bits; the si256 form tests all 256 bits. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

/* 256-bit variants of the same three predicates. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

/* Whole-register integer tests (VPTEST). */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}
688
/* Vector extract sign mask */
/* Pack the sign bit of each double into the low 4 bits of the result. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

/* Pack the sign bit of each float into the low 8 bits of the result. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}
701
/* Vector zero */
/* Zero all YMM registers (VZEROALL). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zero the upper 128 bits of all YMM registers (VZEROUPPER) — used to
   avoid AVX/SSE transition penalties. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
714
/* Vector load with broadcast */
/* Load one float and replicate it into all 4 lanes. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

/* Load one double and replicate it into all 4 lanes. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

/* Load one float and replicate it into all 8 lanes. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

/* Load a 128-bit vector and replicate it into both halves. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}
745
/* SIMD load ops */
/* Aligned load: p must be 32-byte aligned (the typed dereference implies
   the full alignment of __m256d). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

/* Unaligned loads: no alignment requirement on p. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

/* Aligned integer load: p must be 32-byte aligned. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

/* Unaligned load optimized for data that may cross a cache line (VLDDQU). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}
788
/* SIMD store ops */
/* Aligned store: p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

/* Unaligned stores: no alignment requirement on p. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

/* Aligned integer store: p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
825
/* Conditional load ops */
/* Masked loads: elements whose mask lane has the sign bit set are loaded;
   the rest are zeroed.  Masked-off elements are not read from memory. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}
850
/* Conditional store ops */

/* Per-element masked store of 8 floats: only elements selected by the
   mask are written to memory (VMASKMOV semantics; unselected memory
   locations are left untouched -- see the Intel intrinsics reference). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

/* Masked store of 2 doubles under mask m. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

/* 256-bit variant: masked store of 4 doubles under mask m. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

/* Masked store of 4 floats under mask m. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
875
/* Cacheability support ops */

/* Non-temporal (streaming) store of 256 bits of integer data,
   bypassing the cache.  NOTE(review): the streaming-store instructions
   require a 32-byte-aligned address -- confirm callers guarantee this. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

/* Non-temporal store of 4 doubles. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

/* Non-temporal store of 8 floats. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
894
895/* Create vectors */
896static __inline __m256d __attribute__((__always_inline__, __nodebug__))
897_mm256_set_pd(double a, double b, double c, double d)
898{
899 return (__m256d){ d, c, b, a };
900}
901
902static __inline __m256 __attribute__((__always_inline__, __nodebug__))
903_mm256_set_ps(float a, float b, float c, float d,
904 float e, float f, float g, float h)
905{
906 return (__m256){ h, g, f, e, d, c, b, a };
907}
908
909static __inline __m256i __attribute__((__always_inline__, __nodebug__))
910_mm256_set_epi32(int i0, int i1, int i2, int i3,
911 int i4, int i5, int i6, int i7)
912{
913 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
914}
915
916static __inline __m256i __attribute__((__always_inline__, __nodebug__))
917_mm256_set_epi16(short w15, short w14, short w13, short w12,
918 short w11, short w10, short w09, short w08,
919 short w07, short w06, short w05, short w04,
920 short w03, short w02, short w01, short w00)
921{
922 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
923 w08, w09, w10, w11, w12, w13, w14, w15 };
924}
925
926static __inline __m256i __attribute__((__always_inline__, __nodebug__))
927_mm256_set_epi8(char b31, char b30, char b29, char b28,
928 char b27, char b26, char b25, char b24,
929 char b23, char b22, char b21, char b20,
930 char b19, char b18, char b17, char b16,
931 char b15, char b14, char b13, char b12,
932 char b11, char b10, char b09, char b08,
933 char b07, char b06, char b05, char b04,
934 char b03, char b02, char b01, char b00)
935{
936 return (__m256i)(__v32qi){
937 b00, b01, b02, b03, b04, b05, b06, b07,
938 b08, b09, b10, b11, b12, b13, b14, b15,
939 b16, b17, b18, b19, b20, b21, b22, b23,
940 b24, b25, b26, b27, b28, b29, b30, b31
941 };
942}
943
944static __inline __m256i __attribute__((__always_inline__, __nodebug__))
945_mm256_set_epi64x(long long a, long long b, long long c, long long d)
946{
947 return (__m256i)(__v4di){ d, c, b, a };
948}
949
950/* Create vectors with elements in reverse order */
951static __inline __m256d __attribute__((__always_inline__, __nodebug__))
952_mm256_setr_pd(double a, double b, double c, double d)
953{
954 return (__m256d){ a, b, c, d };
955}
956
957static __inline __m256 __attribute__((__always_inline__, __nodebug__))
958_mm256_setr_ps(float a, float b, float c, float d,
959 float e, float f, float g, float h)
960{
961 return (__m256){ a, b, c, d, e, f, g, h };
962}
963
964static __inline __m256i __attribute__((__always_inline__, __nodebug__))
965_mm256_setr_epi32(int i0, int i1, int i2, int i3,
966 int i4, int i5, int i6, int i7)
967{
968 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
969}
970
971static __inline __m256i __attribute__((__always_inline__, __nodebug__))
972_mm256_setr_epi16(short w15, short w14, short w13, short w12,
973 short w11, short w10, short w09, short w08,
974 short w07, short w06, short w05, short w04,
975 short w03, short w02, short w01, short w00)
976{
977 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
978 w07, w06, w05, w04, w03, w02, w01, w00 };
979}
980
981static __inline __m256i __attribute__((__always_inline__, __nodebug__))
982_mm256_setr_epi8(char b31, char b30, char b29, char b28,
983 char b27, char b26, char b25, char b24,
984 char b23, char b22, char b21, char b20,
985 char b19, char b18, char b17, char b16,
986 char b15, char b14, char b13, char b12,
987 char b11, char b10, char b09, char b08,
988 char b07, char b06, char b05, char b04,
989 char b03, char b02, char b01, char b00)
990{
991 return (__m256i)(__v32qi){
992 b31, b30, b29, b28, b27, b26, b25, b24,
993 b23, b22, b21, b20, b19, b18, b17, b16,
994 b15, b14, b13, b12, b11, b10, b09, b08,
995 b07, b06, b05, b04, b03, b02, b01, b00 };
996}
997
998static __inline __m256i __attribute__((__always_inline__, __nodebug__))
999_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1000{
1001 return (__m256i)(__v4di){ a, b, c, d };
1002}
1003
1004/* Create vectors with repeated elements */
1005static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1006_mm256_set1_pd(double w)
1007{
1008 return (__m256d){ w, w, w, w };
1009}
1010
1011static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1012_mm256_set1_ps(float w)
1013{
1014 return (__m256){ w, w, w, w, w, w, w, w };
1015}
1016
1017static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1018_mm256_set1_epi32(int i)
1019{
1020 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1021}
1022
1023static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1024_mm256_set1_epi16(short w)
1025{
1026 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1027}
1028
1029static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1030_mm256_set1_epi8(char b)
1031{
1032 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1033 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1034}
1035
1036static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1037_mm256_set1_epi64x(long long q)
1038{
1039 return (__m256i)(__v4di){ q, q, q, q };
1040}
1041
1042/* Create zeroed vectors */
1043static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1044_mm256_setzero_pd(void)
1045{
1046 return (__m256d){ 0, 0, 0, 0 };
1047}
1048
1049static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1050_mm256_setzero_ps(void)
1051{
1052 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1053}
1054
1055static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1056_mm256_setzero_si256(void)
1057{
1058 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1059}
1060
/* Cast between vector types */

/* Reinterprets the 256 bits of a double vector as a float vector.
   A pure bit-cast: no conversion and no instructions are generated. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

/* Reinterprets a double vector's bits as an integer vector (no-op). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

/* Reinterprets a float vector's bits as a double vector (no-op). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

/* Reinterprets a float vector's bits as an integer vector (no-op). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

/* Reinterprets an integer vector's bits as a float vector (no-op). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

/* Reinterprets an integer vector's bits as a double vector (no-op). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}
1097
/* Returns the lower 128 bits of a 256-bit double vector: the shuffle
   selects elements 0-1 (the low two doubles). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

/* Returns the lower 128 bits of a 256-bit float vector: the shuffle
   selects elements 0-3 (the low four floats). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

/* Returns the lower 128 bits of a 256-bit integer vector: the shuffle
   selects elements 0-1 (the low two 64-bit lanes). */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1115
/* Widens a 128-bit double vector to 256 bits.  The low 128 bits hold
   `in`; the shuffle's indices 2-3 select lanes from the zero vector,
   so the upper 128 bits are zeroed here.  NOTE(review): Intel
   documents the upper bits of the cast as undefined, so callers must
   not rely on the zeroing -- confirm against the intrinsics guide. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

/* Widens a 128-bit float vector to 256 bits; indices 0-3 take `in`'s
   four floats, and the repeated index 4 fills the upper lanes with
   zeros from the zero vector (upper bits undefined per Intel). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

/* Widens a 128-bit integer vector to 256 bits; low half is `in`,
   upper half filled with zeros (upper bits undefined per Intel). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}