/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

Benjamin Kramer6f35f3c2010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000027
Richard Smith49e56442013-07-14 05:41:45 +000028#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* 32-byte vector types with an explicit element type. The intrinsic bodies
 * cast their __m256* arguments to the matching type (for example __v4df or
 * __v8sf) before operating on them. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 32-byte vector types that appear in the intrinsic signatures. */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file. Both variants
 * request always-inline, no debug info and the "avx" target feature; they
 * differ only in the minimum vector width (256 or 128 bits). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(128)))

Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000056/* Arithmetic */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000057/// Adds two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000058///
59/// \headerfile <x86intrin.h>
60///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000061/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000062///
63/// \param __a
64/// A 256-bit vector of [4 x double] containing one of the source operands.
65/// \param __b
66/// A 256-bit vector of [4 x double] containing one of the source operands.
67/// \returns A 256-bit vector of [4 x double] containing the sums of both
68/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000069static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000070_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000071{
Craig Topper1aa231e2016-05-16 06:38:42 +000072 return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000073}
74
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000075/// Adds two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000076///
77/// \headerfile <x86intrin.h>
78///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000079/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000080///
81/// \param __a
82/// A 256-bit vector of [8 x float] containing one of the source operands.
83/// \param __b
84/// A 256-bit vector of [8 x float] containing one of the source operands.
85/// \returns A 256-bit vector of [8 x float] containing the sums of both
86/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000087static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000088_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000089{
Craig Topper1aa231e2016-05-16 06:38:42 +000090 return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000091}
92
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000093/// Subtracts two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000094///
95/// \headerfile <x86intrin.h>
96///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000097/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000098///
99/// \param __a
100/// A 256-bit vector of [4 x double] containing the minuend.
101/// \param __b
102/// A 256-bit vector of [4 x double] containing the subtrahend.
103/// \returns A 256-bit vector of [4 x double] containing the differences between
104/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000105static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000106_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000107{
Craig Topper1aa231e2016-05-16 06:38:42 +0000108 return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000109}
110
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000111/// Subtracts two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000112///
113/// \headerfile <x86intrin.h>
114///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000115/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000116///
117/// \param __a
118/// A 256-bit vector of [8 x float] containing the minuend.
119/// \param __b
120/// A 256-bit vector of [8 x float] containing the subtrahend.
121/// \returns A 256-bit vector of [8 x float] containing the differences between
122/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000123static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000124_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000125{
Craig Topper1aa231e2016-05-16 06:38:42 +0000126 return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000127}
128
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000129/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000130/// two 256-bit vectors of [4 x double].
131///
132/// \headerfile <x86intrin.h>
133///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000134/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000135///
136/// \param __a
137/// A 256-bit vector of [4 x double] containing the left source operand.
138/// \param __b
139/// A 256-bit vector of [4 x double] containing the right source operand.
140/// \returns A 256-bit vector of [4 x double] containing the alternating sums
141/// and differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000142static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000143_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000144{
David Blaikie3302f2b2013-01-16 23:08:36 +0000145 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000146}
147
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000148/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000149/// two 256-bit vectors of [8 x float].
150///
151/// \headerfile <x86intrin.h>
152///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000153/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000154///
155/// \param __a
156/// A 256-bit vector of [8 x float] containing the left source operand.
157/// \param __b
158/// A 256-bit vector of [8 x float] containing the right source operand.
159/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
160/// differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000161static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000162_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000163{
David Blaikie3302f2b2013-01-16 23:08:36 +0000164 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000165}
166
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000167/// Divides two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000168///
169/// \headerfile <x86intrin.h>
170///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000171/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000172///
173/// \param __a
174/// A 256-bit vector of [4 x double] containing the dividend.
175/// \param __b
176/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000177/// \returns A 256-bit vector of [4 x double] containing the quotients of both
178/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000179static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000180_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000181{
Craig Topper1aa231e2016-05-16 06:38:42 +0000182 return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000183}
184
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000185/// Divides two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000186///
187/// \headerfile <x86intrin.h>
188///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000189/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000190///
191/// \param __a
192/// A 256-bit vector of [8 x float] containing the dividend.
193/// \param __b
194/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000195/// \returns A 256-bit vector of [8 x float] containing the quotients of both
196/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000197static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000198_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000199{
Craig Topper1aa231e2016-05-16 06:38:42 +0000200 return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000201}
202
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000203/// Compares two 256-bit vectors of [4 x double] and returns the greater
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000204/// of each pair of values.
205///
206/// \headerfile <x86intrin.h>
207///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000208/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000209///
210/// \param __a
211/// A 256-bit vector of [4 x double] containing one of the operands.
212/// \param __b
213/// A 256-bit vector of [4 x double] containing one of the operands.
214/// \returns A 256-bit vector of [4 x double] containing the maximum values
215/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000216static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000217_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000218{
David Blaikie3302f2b2013-01-16 23:08:36 +0000219 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000220}
221
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000222/// Compares two 256-bit vectors of [8 x float] and returns the greater
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000223/// of each pair of values.
224///
225/// \headerfile <x86intrin.h>
226///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000227/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000228///
229/// \param __a
230/// A 256-bit vector of [8 x float] containing one of the operands.
231/// \param __b
232/// A 256-bit vector of [8 x float] containing one of the operands.
233/// \returns A 256-bit vector of [8 x float] containing the maximum values
234/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000235static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000236_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000237{
David Blaikie3302f2b2013-01-16 23:08:36 +0000238 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000239}
240
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000241/// Compares two 256-bit vectors of [4 x double] and returns the lesser
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000242/// of each pair of values.
243///
244/// \headerfile <x86intrin.h>
245///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000246/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000247///
248/// \param __a
249/// A 256-bit vector of [4 x double] containing one of the operands.
250/// \param __b
251/// A 256-bit vector of [4 x double] containing one of the operands.
252/// \returns A 256-bit vector of [4 x double] containing the minimum values
253/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000254static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000255_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000256{
David Blaikie3302f2b2013-01-16 23:08:36 +0000257 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000258}
259
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000260/// Compares two 256-bit vectors of [8 x float] and returns the lesser
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000261/// of each pair of values.
262///
263/// \headerfile <x86intrin.h>
264///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000265/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000266///
267/// \param __a
268/// A 256-bit vector of [8 x float] containing one of the operands.
269/// \param __b
270/// A 256-bit vector of [8 x float] containing one of the operands.
271/// \returns A 256-bit vector of [8 x float] containing the minimum values
272/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000273static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000274_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000275{
David Blaikie3302f2b2013-01-16 23:08:36 +0000276 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000277}
278
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000279/// Multiplies two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000280///
281/// \headerfile <x86intrin.h>
282///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000283/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000284///
285/// \param __a
286/// A 256-bit vector of [4 x double] containing one of the operands.
287/// \param __b
288/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000289/// \returns A 256-bit vector of [4 x double] containing the products of both
290/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000291static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000292_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000293{
Craig Topper1aa231e2016-05-16 06:38:42 +0000294 return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000295}
296
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000297/// Multiplies two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000298///
299/// \headerfile <x86intrin.h>
300///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000301/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000302///
303/// \param __a
304/// A 256-bit vector of [8 x float] containing one of the operands.
305/// \param __b
306/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000307/// \returns A 256-bit vector of [8 x float] containing the products of both
308/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000309static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000310_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000311{
Craig Topper1aa231e2016-05-16 06:38:42 +0000312 return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000313}
314
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000315/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000316/// [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000317///
318/// \headerfile <x86intrin.h>
319///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000320/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000321///
322/// \param __a
323/// A 256-bit vector of [4 x double].
324/// \returns A 256-bit vector of [4 x double] containing the square roots of the
325/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000326static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000327_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000328{
David Blaikie3302f2b2013-01-16 23:08:36 +0000329 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000330}
331
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000332/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000333/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000334///
335/// \headerfile <x86intrin.h>
336///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000337/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000338///
339/// \param __a
340/// A 256-bit vector of [8 x float].
341/// \returns A 256-bit vector of [8 x float] containing the square roots of the
342/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000343static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000344_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000345{
David Blaikie3302f2b2013-01-16 23:08:36 +0000346 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000347}
348
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000349/// Calculates the reciprocal square roots of the values in a 256-bit
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000350/// vector of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000351///
352/// \headerfile <x86intrin.h>
353///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000354/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000355///
356/// \param __a
357/// A 256-bit vector of [8 x float].
358/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
359/// roots of the values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000360static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000361_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000362{
David Blaikie3302f2b2013-01-16 23:08:36 +0000363 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000364}
365
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000366/// Calculates the reciprocals of the values in a 256-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000367/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000368///
369/// \headerfile <x86intrin.h>
370///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000371/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000372///
373/// \param __a
374/// A 256-bit vector of [8 x float].
375/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
376/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000377static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000378_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000379{
David Blaikie3302f2b2013-01-16 23:08:36 +0000380 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000381}
382
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
///
/// \note The expansion is wrapped in parentheses so that it behaves as a
///    single expression when followed by a postfix operator such as a
///    subscript.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
///
/// \note The expansion is wrapped in parentheses so that it behaves as a
///    single expression when followed by a postfix operator such as a
///    subscript.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. It is
///    implemented as _mm256_round_pd() with the _MM_FROUND_CEIL mode.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) (_mm256_round_pd((V), _MM_FROUND_CEIL))

/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. It is
///    implemented as _mm256_round_pd() with the _MM_FROUND_FLOOR mode.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) (_mm256_round_pd((V), _MM_FROUND_FLOOR))

/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. It is
///    implemented as _mm256_round_ps() with the _MM_FROUND_CEIL mode.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) (_mm256_round_ps((V), _MM_FROUND_CEIL))

/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. It is
///    implemented as _mm256_round_ps() with the _MM_FROUND_FLOOR mode.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) (_mm256_round_ps((V), _MM_FROUND_FLOOR))

516/* Logical */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000517/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000518///
519/// \headerfile <x86intrin.h>
520///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000521/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000522///
523/// \param __a
524/// A 256-bit vector of [4 x double] containing one of the source operands.
525/// \param __b
526/// A 256-bit vector of [4 x double] containing one of the source operands.
527/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
528/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000529static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000530_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000531{
Craig Topper6a77b622016-06-04 05:43:41 +0000532 return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000533}
534
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000535/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000536///
537/// \headerfile <x86intrin.h>
538///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000539/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000540///
541/// \param __a
542/// A 256-bit vector of [8 x float] containing one of the source operands.
543/// \param __b
544/// A 256-bit vector of [8 x float] containing one of the source operands.
545/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
546/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000547static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000548_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000549{
Craig Topper6a77b622016-06-04 05:43:41 +0000550 return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000551}
552
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000553/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000554/// the one's complement of the values contained in the first source operand.
555///
556/// \headerfile <x86intrin.h>
557///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000558/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000559///
560/// \param __a
561/// A 256-bit vector of [4 x double] containing the left source operand. The
562/// one's complement of this value is used in the bitwise AND.
563/// \param __b
564/// A 256-bit vector of [4 x double] containing the right source operand.
565/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
566/// values of the second operand and the one's complement of the first
567/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000568static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000569_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000570{
Craig Topper6a77b622016-06-04 05:43:41 +0000571 return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000572}
573
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000574/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000575/// the one's complement of the values contained in the first source operand.
576///
577/// \headerfile <x86intrin.h>
578///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000579/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000580///
581/// \param __a
582/// A 256-bit vector of [8 x float] containing the left source operand. The
583/// one's complement of this value is used in the bitwise AND.
584/// \param __b
585/// A 256-bit vector of [8 x float] containing the right source operand.
586/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
587/// values of the second operand and the one's complement of the first
588/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000589static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000590_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000591{
Craig Topper6a77b622016-06-04 05:43:41 +0000592 return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000593}
594
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000595/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000596///
597/// \headerfile <x86intrin.h>
598///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000599/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000600///
601/// \param __a
602/// A 256-bit vector of [4 x double] containing one of the source operands.
603/// \param __b
604/// A 256-bit vector of [4 x double] containing one of the source operands.
605/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
606/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000607static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000608_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000609{
Craig Topper6a77b622016-06-04 05:43:41 +0000610 return (__m256d)((__v4du)__a | (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000611}
612
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000613/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000614///
615/// \headerfile <x86intrin.h>
616///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000617/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000618///
619/// \param __a
620/// A 256-bit vector of [8 x float] containing one of the source operands.
621/// \param __b
622/// A 256-bit vector of [8 x float] containing one of the source operands.
623/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
624/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000625static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000626_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000627{
Craig Topper6a77b622016-06-04 05:43:41 +0000628 return (__m256)((__v8su)__a | (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000629}
630
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000631/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000632///
633/// \headerfile <x86intrin.h>
634///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000635/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000636///
637/// \param __a
638/// A 256-bit vector of [4 x double] containing one of the source operands.
639/// \param __b
640/// A 256-bit vector of [4 x double] containing one of the source operands.
641/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
642/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000643static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000644_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000645{
Craig Topper6a77b622016-06-04 05:43:41 +0000646 return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000647}
648
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000649/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000650///
651/// \headerfile <x86intrin.h>
652///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000653/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000654///
655/// \param __a
656/// A 256-bit vector of [8 x float] containing one of the source operands.
657/// \param __b
658/// A 256-bit vector of [8 x float] containing one of the source operands.
659/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
660/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000661static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000662_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000663{
Craig Topper6a77b622016-06-04 05:43:41 +0000664 return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000665}
666
/* Horizontal arithmetic */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000668/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000669/// 256-bit vectors of [4 x double].
670///
671/// \headerfile <x86intrin.h>
672///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000673/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000674///
675/// \param __a
676/// A 256-bit vector of [4 x double] containing one of the source operands.
677/// The horizontal sums of the values are returned in the even-indexed
678/// elements of a vector of [4 x double].
679/// \param __b
680/// A 256-bit vector of [4 x double] containing one of the source operands.
681/// The horizontal sums of the values are returned in the odd-indexed
682/// elements of a vector of [4 x double].
683/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
684/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000685static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000686_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000687{
David Blaikie3302f2b2013-01-16 23:08:36 +0000688 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000689}
690
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000691/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000692/// 256-bit vectors of [8 x float].
693///
694/// \headerfile <x86intrin.h>
695///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000696/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000697///
698/// \param __a
699/// A 256-bit vector of [8 x float] containing one of the source operands.
700/// The horizontal sums of the values are returned in the elements with
701/// index 0, 1, 4, 5 of a vector of [8 x float].
702/// \param __b
703/// A 256-bit vector of [8 x float] containing one of the source operands.
704/// The horizontal sums of the values are returned in the elements with
705/// index 2, 3, 6, 7 of a vector of [8 x float].
706/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
707/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000708static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000709_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000710{
David Blaikie3302f2b2013-01-16 23:08:36 +0000711 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000712}
713
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000714/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000715/// 256-bit vectors of [4 x double].
716///
717/// \headerfile <x86intrin.h>
718///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000719/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000720///
721/// \param __a
722/// A 256-bit vector of [4 x double] containing one of the source operands.
723/// The horizontal differences between the values are returned in the
724/// even-indexed elements of a vector of [4 x double].
725/// \param __b
726/// A 256-bit vector of [4 x double] containing one of the source operands.
727/// The horizontal differences between the values are returned in the
728/// odd-indexed elements of a vector of [4 x double].
729/// \returns A 256-bit vector of [4 x double] containing the horizontal
730/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000731static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000732_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000733{
David Blaikie3302f2b2013-01-16 23:08:36 +0000734 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000735}
736
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000737/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000738/// 256-bit vectors of [8 x float].
739///
740/// \headerfile <x86intrin.h>
741///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000742/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000743///
744/// \param __a
745/// A 256-bit vector of [8 x float] containing one of the source operands.
746/// The horizontal differences between the values are returned in the
747/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
748/// \param __b
749/// A 256-bit vector of [8 x float] containing one of the source operands.
750/// The horizontal differences between the values are returned in the
751/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
752/// \returns A 256-bit vector of [8 x float] containing the horizontal
753/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000754static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000755_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000756{
David Blaikie3302f2b2013-01-16 23:08:36 +0000757 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000758}
759
/* Vector permutations */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000761/// Copies the values in a 128-bit vector of [2 x double] as specified
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000762/// by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000763///
764/// \headerfile <x86intrin.h>
765///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000766/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000767///
768/// \param __a
769/// A 128-bit vector of [2 x double].
770/// \param __c
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000771/// A 128-bit integer vector operand specifying how the values are to be
772/// copied. \n
773/// Bit [1]: \n
774/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
775/// vector. \n
776/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
777/// returned vector. \n
778/// Bit [65]: \n
779/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
780/// returned vector. \n
781/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
782/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000783/// \returns A 128-bit vector of [2 x double] containing the copied values.
Craig Topper74c10e32018-07-09 19:00:16 +0000784static __inline __m128d __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +0000785_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000786{
David Blaikie3302f2b2013-01-16 23:08:36 +0000787 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000788}
789
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000790/// Copies the values in a 256-bit vector of [4 x double] as specified
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000791/// by the 256-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000792///
793/// \headerfile <x86intrin.h>
794///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000795/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000796///
797/// \param __a
798/// A 256-bit vector of [4 x double].
799/// \param __c
800/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000801/// copied. \n
802/// Bit [1]: \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000803/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
804/// vector. \n
805/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
806/// returned vector. \n
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000807/// Bit [65]: \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000808/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
809/// returned vector. \n
810/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
811/// returned vector. \n
812/// Bit [129]: \n
813/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
814/// returned vector. \n
815/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
816/// returned vector. \n
817/// Bit [193]: \n
818/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
819/// returned vector. \n
820/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000821/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000822/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000823static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000824_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000825{
David Blaikie3302f2b2013-01-16 23:08:36 +0000826 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000827}
828
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000829/// Copies the values stored in a 128-bit vector of [4 x float] as
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000830/// specified by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000831/// \headerfile <x86intrin.h>
832///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000833/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000834///
835/// \param __a
836/// A 128-bit vector of [4 x float].
837/// \param __c
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000838/// A 128-bit integer vector operand specifying how the values are to be
839/// copied. \n
840/// Bits [1:0]: \n
841/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
842/// returned vector. \n
843/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
844/// returned vector. \n
845/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
846/// returned vector. \n
847/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
848/// returned vector. \n
849/// Bits [33:32]: \n
850/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
851/// returned vector. \n
852/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
853/// returned vector. \n
854/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
855/// returned vector. \n
856/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
857/// returned vector. \n
858/// Bits [65:64]: \n
859/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
860/// returned vector. \n
861/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
862/// returned vector. \n
863/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
864/// returned vector. \n
865/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
866/// returned vector. \n
867/// Bits [97:96]: \n
868/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
869/// returned vector. \n
870/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
871/// returned vector. \n
872/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
873/// returned vector. \n
874/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
875/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000876/// \returns A 128-bit vector of [4 x float] containing the copied values.
Craig Topper74c10e32018-07-09 19:00:16 +0000877static __inline __m128 __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +0000878_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000879{
David Blaikie3302f2b2013-01-16 23:08:36 +0000880 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000881}
882
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000883/// Copies the values stored in a 256-bit vector of [8 x float] as
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000884/// specified by the 256-bit integer vector operand.
885///
886/// \headerfile <x86intrin.h>
887///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000888/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000889///
890/// \param __a
891/// A 256-bit vector of [8 x float].
892/// \param __c
893/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000894/// copied. \n
895/// Bits [1:0]: \n
896/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
897/// returned vector. \n
898/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
899/// returned vector. \n
900/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
901/// returned vector. \n
902/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
903/// returned vector. \n
904/// Bits [33:32]: \n
905/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
906/// returned vector. \n
907/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
908/// returned vector. \n
909/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
910/// returned vector. \n
911/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
912/// returned vector. \n
913/// Bits [65:64]: \n
914/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
915/// returned vector. \n
916/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
917/// returned vector. \n
918/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
919/// returned vector. \n
920/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
921/// returned vector. \n
922/// Bits [97:96]: \n
923/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
924/// returned vector. \n
925/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
926/// returned vector. \n
927/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
928/// returned vector. \n
929/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
930/// returned vector. \n
931/// Bits [129:128]: \n
932/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
933/// returned vector. \n
934/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
935/// returned vector. \n
936/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
937/// returned vector. \n
938/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
939/// returned vector. \n
940/// Bits [161:160]: \n
941/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
942/// returned vector. \n
943/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
944/// returned vector. \n
945/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
946/// returned vector. \n
947/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
948/// returned vector. \n
949/// Bits [193:192]: \n
950/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
951/// returned vector. \n
952/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
953/// returned vector. \n
954/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
955/// returned vector. \n
956/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
957/// returned vector. \n
958/// Bits [225:224]: \n
959/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
960/// returned vector. \n
961/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
962/// returned vector. \n
963/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
964/// returned vector. \n
965/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
966/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000967/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000968static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000969_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000970{
Craig Topper9fee8ab2015-01-31 06:33:59 +0000971 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000972}
973
/// Copies the values in a 128-bit vector of [2 x double] as specified
///    by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001003
/// Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001043
/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001099
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001100/// Copies the values in a 256-bit vector of [8 x float] as specified by
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001101/// the immediate integer operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +00001102///
1103/// \headerfile <x86intrin.h>
1104///
1105/// \code
1106/// __m256 _mm256_permute_ps(__m256 A, const int C);
1107/// \endcode
1108///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001109/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +00001110///
1111/// \param A
1112/// A 256-bit vector of [8 x float].
1113/// \param C
Douglas Yung7ff91422018-01-08 21:21:17 +00001114/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001115/// copied. \n
1116/// Bits [1:0]: \n
1117/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1118/// returned vector. \n
1119/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1120/// returned vector. \n
1121/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1122/// returned vector. \n
1123/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1124/// returned vector. \n
1125/// Bits [3:2]: \n
1126/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1127/// returned vector. \n
1128/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1129/// returned vector. \n
1130/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1131/// returned vector. \n
1132/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1133/// returned vector. \n
1134/// Bits [5:4]: \n
1135/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1136/// returned vector. \n
1137/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1138/// returned vector. \n
1139/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1140/// returned vector. \n
1141/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1142/// returned vector. \n
1143/// Bits [7:6]: \n
Douglas Yung7ff91422018-01-08 21:21:17 +00001144/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001145/// returned vector. \n
1146/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1147/// returned vector. \n
1148/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1149/// returned vector. \n
1150/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1151/// returned vector. \n
1152/// Bits [1:0]: \n
1153/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1154/// returned vector. \n
1155/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1156/// returned vector. \n
1157/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1158/// returned vector. \n
1159/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1160/// returned vector. \n
1161/// Bits [3:2]: \n
1162/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1163/// returned vector. \n
1164/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1165/// returned vector. \n
1166/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1167/// returned vector. \n
1168/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1169/// returned vector. \n
1170/// Bits [5:4]: \n
1171/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1172/// returned vector. \n
1173/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1174/// returned vector. \n
1175/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1176/// returned vector. \n
1177/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1178/// returned vector. \n
1179/// Bits [7:6]: \n
1180/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1181/// returned vector. \n
1182/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1183/// returned vector. \n
1184/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1185/// returned vector. \n
1186/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1187/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +00001188/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001191
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001192/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001193/// [4 x double], as specified by the immediate integer operand.
1194///
1195/// \headerfile <x86intrin.h>
1196///
1197/// \code
1198/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1199/// \endcode
1200///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001201/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001202///
1203/// \param V1
1204/// A 256-bit vector of [4 x double].
1205/// \param V2
///    A 256-bit vector of [4 x double].
1207/// \param M
1208/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001209/// permuted. \n
1210/// Bits [1:0]: \n
1211/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1212/// destination. \n
1213/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1214/// destination. \n
1215/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1216/// destination. \n
1217/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1218/// destination. \n
1219/// Bits [5:4]: \n
1220/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1221/// destination. \n
1222/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1223/// destination. \n
1224/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1225/// destination. \n
1226/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1227/// destination.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001228/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001232
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001233/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001234/// [8 x float], as specified by the immediate integer operand.
1235///
1236/// \headerfile <x86intrin.h>
1237///
1238/// \code
1239/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1240/// \endcode
1241///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001242/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001243///
1244/// \param V1
1245/// A 256-bit vector of [8 x float].
1246/// \param V2
1247/// A 256-bit vector of [8 x float].
1248/// \param M
1249/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001250/// permuted. \n
1251/// Bits [1:0]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001252/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001253/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001254/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001255/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001256/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001257/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001258/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001259/// destination. \n
1260/// Bits [5:4]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001261/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001262/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001263/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001264/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001265/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001266/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001267/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001268/// destination.
1269/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001273
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001274/// Permutes 128-bit data values stored in two 256-bit integer vectors,
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001275/// as specified by the immediate integer operand.
1276///
1277/// \headerfile <x86intrin.h>
1278///
1279/// \code
1280/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1281/// \endcode
1282///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001283/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001284///
1285/// \param V1
1286/// A 256-bit integer vector.
1287/// \param V2
1288/// A 256-bit integer vector.
1289/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001291/// Bits [1:0]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001292/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001293/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001294/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001295/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001296/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001297/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001298/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001299/// destination. \n
1300/// Bits [5:4]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001301/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001302/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001303/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001304/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001305/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001306/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001307/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001308/// destination.
1309/// \returns A 256-bit integer vector containing the copied values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001313
1314/* Vector Blend */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001315/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001316/// two 256-bit vectors of [4 x double], as specified by the immediate
1317/// integer operand.
1318///
1319/// \headerfile <x86intrin.h>
1320///
1321/// \code
1322/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1323/// \endcode
1324///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001325/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001326///
1327/// \param V1
1328/// A 256-bit vector of [4 x double].
1329/// \param V2
1330/// A 256-bit vector of [4 x double].
1331/// \param M
1332/// An immediate integer operand, with mask bits [3:0] specifying how the
1333/// values are to be copied. The position of the mask bit corresponds to the
1334/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001335/// element in operand \a V1 is copied to the same position in the
1336/// destination. When a mask bit is 1, the corresponding 64-bit element in
1337/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001338/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001342
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001343/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001344/// two 256-bit vectors of [8 x float], as specified by the immediate
1345/// integer operand.
1346///
1347/// \headerfile <x86intrin.h>
1348///
1349/// \code
1350/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1351/// \endcode
1352///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001353/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001354///
1355/// \param V1
1356/// A 256-bit vector of [8 x float].
1357/// \param V2
1358/// A 256-bit vector of [8 x float].
1359/// \param M
1360/// An immediate integer operand, with mask bits [7:0] specifying how the
1361/// values are to be copied. The position of the mask bit corresponds to the
1362/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001363/// element in operand \a V1 is copied to the same position in the
1364/// destination. When a mask bit is 1, the corresponding 32-bit element in
1365/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001366/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001370
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001371/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001372/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1373/// operand.
1374///
1375/// \headerfile <x86intrin.h>
1376///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001377/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001378///
1379/// \param __a
1380/// A 256-bit vector of [4 x double].
1381/// \param __b
1382/// A 256-bit vector of [4 x double].
1383/// \param __c
1384/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1385/// how the values are to be copied. The position of the mask bit corresponds
1386/// to the most significant bit of a copied value. When a mask bit is 0, the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001387/// corresponding 64-bit element in operand \a __a is copied to the same
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001388/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001389/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001390/// destination.
1391/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001392static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001393_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001394{
David Blaikie3302f2b2013-01-16 23:08:36 +00001395 return (__m256d)__builtin_ia32_blendvpd256(
1396 (__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001397}
1398
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001399/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001400/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1401/// operand.
1402///
1403/// \headerfile <x86intrin.h>
1404///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001405/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001406///
1407/// \param __a
1408/// A 256-bit vector of [8 x float].
1409/// \param __b
1410/// A 256-bit vector of [8 x float].
1411/// \param __c
1412/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1413/// and 31 specifying how the values are to be copied. The position of the
1414/// mask bit corresponds to the most significant bit of a copied value. When
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001415/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001416/// copied to the same position in the destination. When a mask bit is 1, the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001417/// corresponding 32-bit element in operand \a __b is copied to the same
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001418/// position in the destination.
1419/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001420static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001421_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001422{
David Blaikie5bb70032013-01-16 23:13:42 +00001423 return (__m256)__builtin_ia32_blendvps256(
David Blaikie3302f2b2013-01-16 23:08:36 +00001424 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001425}
1426
1427/* Vector Dot Product */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001428/// Computes two dot products in parallel, using the lower and upper
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001429/// halves of two [8 x float] vectors as input to the two computations, and
1430/// returning the two dot products in the lower and upper halves of the
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00001431/// [8 x float] result.
1432///
1433/// The immediate integer operand controls which input elements will
1434/// contribute to the dot product, and where the final results are returned.
1435/// In general, for each dot product, the four corresponding elements of the
1436/// input vectors are multiplied; the first two and second two products are
1437/// summed, then the two sums are added to form the final result.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001438///
1439/// \headerfile <x86intrin.h>
1440///
1441/// \code
1442/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1443/// \endcode
1444///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001445/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001446///
1447/// \param V1
1448/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1449/// \param V2
1450/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1451/// \param M
1452/// An immediate integer argument. Bits [7:4] determine which elements of
1453/// the input vectors are used, with bit [4] corresponding to the lowest
1454/// element and bit [7] corresponding to the highest element of each [4 x
1455/// float] subvector. If a bit is set, the corresponding elements from the
1456/// two input vectors are used as an input for dot product; otherwise that
1457/// input is treated as zero. Bits [3:0] determine which elements of the
1458/// result will receive a copy of the final dot product, with bit [0]
1459/// corresponding to the lowest element and bit [3] corresponding to the
1460/// highest element of each [4 x float] subvector. If a bit is set, the dot
1461/// product is returned in the corresponding element; otherwise that element
1462/// is set to zero. The bitmask is applied in the same way to each of the
1463/// two parallel dot product computations.
1464/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. The
 * immediate M is forwarded as written, without an extra cast. */
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001468
1469/* Vector shuffle */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001470/// Selects 8 float values from the 256-bit operands of [8 x float], as
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00001471/// specified by the immediate value operand.
1472///
1473/// The four selected elements in each operand are copied to the destination
1474/// according to the bits specified in the immediate operand. The selected
1475/// elements from the first 256-bit operand are copied to bits [63:0] and
1476/// bits [191:128] of the destination, and the selected elements from the
1477/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1478/// the destination. For example, if bits [7:0] of the immediate operand
1479/// contain a value of 0xFF, the 256-bit destination vector would contain the
1480/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001481///
1482/// \headerfile <x86intrin.h>
1483///
1484/// \code
1485/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1486/// \endcode
1487///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001488/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001489///
1490/// \param a
1491/// A 256-bit vector of [8 x float]. The four selected elements in this
1492/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1493/// according to the bits specified in the immediate operand.
1494/// \param b
1495/// A 256-bit vector of [8 x float]. The four selected elements in this
1496/// operand are copied to bits [127:64] and bits [255:192] in the
1497/// destination, according to the bits specified in the immediate operand.
1498/// \param mask
1499/// An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
1501/// Bits [3:0] specify the values copied from operand \a a. \n
1502/// Bits [7:4] specify the values copied from operand \a b. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001503/// The destinations within the 256-bit destination are assigned values as
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001504/// follows, according to the bit value assignments described below: \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001505/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001506/// destination. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001507/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001508/// destination. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001509/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001510/// destination. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001511/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001512/// the destination. \n
1513/// Bit value assignments: \n
1514/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1515/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1516/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001517/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
1518/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001522
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001523/// Selects four double-precision values from the 256-bit operands of
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00001524/// [4 x double], as specified by the immediate value operand.
1525///
1526/// The selected elements from the first 256-bit operand are copied to bits
1527/// [63:0] and bits [191:128] in the destination, and the selected elements
1528/// from the second 256-bit operand are copied to bits [127:64] and bits
1529/// [255:192] in the destination. For example, if bits [3:0] of the immediate
1530/// operand contain a value of 0xF, the 256-bit destination vector would
1531/// contain the following values: b[3], a[3], b[1], a[1].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001532///
1533/// \headerfile <x86intrin.h>
1534///
1535/// \code
1536/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1537/// \endcode
1538///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001539/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001540///
1541/// \param a
1542/// A 256-bit vector of [4 x double].
1543/// \param b
1544/// A 256-bit vector of [4 x double].
1545/// \param mask
1546/// An immediate value containing 8-bit values specifying which elements to
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001547/// copy from \a a and \a b: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001548/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001549/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001550/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001551/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001552/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001553/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001554/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001555/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001556/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001557/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001558/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001559/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001560/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001561/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001562/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001563/// destination.
1564/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
/* The whole expansion is parenthesized so the leading cast stays bound to
 * the builtin call when the macro is followed by a postfix operator. */
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001568
/* Compare */
/* Comparison predicate codes, 0x00 through 0x1f. Each code 0x10-0x1f names
 * the same relation as the code 0x10 below it, with the signaling /
 * non-signaling attribute swapped (e.g. _CMP_EQ_OQ 0x00 vs _CMP_EQ_OS 0x10).
 */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1602
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001662
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001722
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001782
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001842
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001901
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001960
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001978
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001997
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002016
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2036
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002058
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002059
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002081
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002103
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2127
/* Conversion */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002129/// Converts a vector of [4 x i32] into a vector of [4 x double].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002130///
2131/// \headerfile <x86intrin.h>
2132///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002133/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002134///
2135/// \param __a
2136/// A 128-bit integer vector of [4 x i32].
2137/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002138static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002139_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002140{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002141 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002142}
2143
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002144/// Converts a vector of [8 x i32] into a vector of [8 x float].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002145///
2146/// \headerfile <x86intrin.h>
2147///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002148/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002149///
2150/// \param __a
2151/// A 256-bit integer vector.
2152/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002153static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002154_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002155{
Craig Topper842171d2018-05-21 20:19:17 +00002156 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002157}
2158
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002159/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002160/// [4 x float].
2161///
2162/// \headerfile <x86intrin.h>
2163///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002164/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002165///
2166/// \param __a
2167/// A 256-bit vector of [4 x double].
2168/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002169static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002170_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002171{
David Blaikie3302f2b2013-01-16 23:08:36 +00002172 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002173}
2174
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002175/// Converts a vector of [8 x float] into a vector of [8 x i32].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002176///
2177/// \headerfile <x86intrin.h>
2178///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002179/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002180///
2181/// \param __a
2182/// A 256-bit vector of [8 x float].
2183/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002184static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002185_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002186{
David Blaikie3302f2b2013-01-16 23:08:36 +00002187 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002188}
2189
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002190/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002191/// x double].
2192///
2193/// \headerfile <x86intrin.h>
2194///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002195/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002196///
2197/// \param __a
2198/// A 128-bit vector of [4 x float].
2199/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002200static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002201_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002202{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002203 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002204}
2205
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002206/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002207/// x i32], truncating the result by rounding towards zero when it is
2208/// inexact.
2209///
2210/// \headerfile <x86intrin.h>
2211///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002212/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002213///
2214/// \param __a
2215/// A 256-bit vector of [4 x double].
2216/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002217static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002218_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002219{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002220 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002221}
2222
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002223/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002224/// x i32]. When a conversion is inexact, the value returned is rounded
2225/// according to the rounding control bits in the MXCSR register.
2226///
2227/// \headerfile <x86intrin.h>
2228///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002229/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002230///
2231/// \param __a
2232/// A 256-bit vector of [4 x double].
2233/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002234static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002235_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002236{
David Blaikie3302f2b2013-01-16 23:08:36 +00002237 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002238}
2239
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002240/// Converts a vector of [8 x float] into a vector of [8 x i32],
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002241/// truncating the result by rounding towards zero when it is inexact.
2242///
2243/// \headerfile <x86intrin.h>
2244///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002245/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002246///
2247/// \param __a
2248/// A 256-bit vector of [8 x float].
2249/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002250static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002251_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002252{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002253 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002254}
2255
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002256/// Returns the first element of the input vector of [4 x double].
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002257///
2258/// \headerfile <avxintrin.h>
2259///
2260/// This intrinsic is a utility function and does not correspond to a specific
2261/// instruction.
2262///
2263/// \param __a
2264/// A 256-bit vector of [4 x double].
2265/// \returns A 64 bit double containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002266static __inline double __DEFAULT_FN_ATTRS
2267_mm256_cvtsd_f64(__m256d __a)
2268{
2269 return __a[0];
2270}
2271
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002272/// Returns the first element of the input vector of [8 x i32].
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002273///
2274/// \headerfile <avxintrin.h>
2275///
2276/// This intrinsic is a utility function and does not correspond to a specific
2277/// instruction.
2278///
2279/// \param __a
2280/// A 256-bit vector of [8 x i32].
2281/// \returns A 32 bit integer containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002282static __inline int __DEFAULT_FN_ATTRS
2283_mm256_cvtsi256_si32(__m256i __a)
2284{
2285 __v8si __b = (__v8si)__a;
2286 return __b[0];
2287}
2288
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002289/// Returns the first element of the input vector of [8 x float].
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002290///
2291/// \headerfile <avxintrin.h>
2292///
2293/// This intrinsic is a utility function and does not correspond to a specific
2294/// instruction.
2295///
2296/// \param __a
2297/// A 256-bit vector of [8 x float].
2298/// \returns A 32 bit float containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002299static __inline float __DEFAULT_FN_ATTRS
2300_mm256_cvtss_f32(__m256 __a)
2301{
2302 return __a[0];
2303}
2304
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002305/* Vector replicate */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002306/// Moves and duplicates odd-indexed values from a 256-bit vector of
Douglas Yung7ff91422018-01-08 21:21:17 +00002307/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002308///
2309/// \headerfile <x86intrin.h>
2310///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002311/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002312///
2313/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002314/// A 256-bit vector of [8 x float]. \n
2315/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2316/// the return value. \n
2317/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2318/// the return value. \n
2319/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2320/// return value. \n
2321/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2322/// return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002323/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2324/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002325static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002326_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002327{
Craig Topper1aa231e2016-05-16 06:38:42 +00002328 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002329}
2330
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002331/// Moves and duplicates even-indexed values from a 256-bit vector of
Douglas Yung7ff91422018-01-08 21:21:17 +00002332/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002333///
2334/// \headerfile <x86intrin.h>
2335///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002336/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002337///
2338/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002339/// A 256-bit vector of [8 x float]. \n
2340/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2341/// the return value. \n
2342/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2343/// the return value. \n
2344/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2345/// return value. \n
2346/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2347/// return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002348/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2349/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002350static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002351_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002352{
Craig Topper1aa231e2016-05-16 06:38:42 +00002353 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002354}
2355
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002356/// Moves and duplicates double-precision floating point values from a
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002357/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2358/// vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002359///
2360/// \headerfile <x86intrin.h>
2361///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002362/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002363///
2364/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002365/// A 256-bit vector of [4 x double]. \n
2366/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2367/// return value. \n
2368/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2369/// the return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002370/// \returns A 256-bit vector of [4 x double] containing the moved and
2371/// duplicated values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002372static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002373_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002374{
Craig Topper1aa231e2016-05-16 06:38:42 +00002375 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002376}
2377
2378/* Unpack and Interleave */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002379/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002380/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2381///
2382/// \headerfile <x86intrin.h>
2383///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002384/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002385///
2386/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002387/// A 256-bit floating-point vector of [4 x double]. \n
2388/// Bits [127:64] are written to bits [63:0] of the return value. \n
2389/// Bits [255:192] are written to bits [191:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002390/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002391/// A 256-bit floating-point vector of [4 x double]. \n
2392/// Bits [127:64] are written to bits [127:64] of the return value. \n
2393/// Bits [255:192] are written to bits [255:192] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002394/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002395static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002396_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002397{
Craig Topper1aa231e2016-05-16 06:38:42 +00002398 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002399}
2400
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002401/// Unpacks the even-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002402/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2403///
2404/// \headerfile <x86intrin.h>
2405///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002406/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002407///
2408/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002409/// A 256-bit floating-point vector of [4 x double]. \n
2410/// Bits [63:0] are written to bits [63:0] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002411/// Bits [191:128] are written to bits [191:128] of the return value.
2412/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002413/// A 256-bit floating-point vector of [4 x double]. \n
2414/// Bits [63:0] are written to bits [127:64] of the return value. \n
2415/// Bits [191:128] are written to bits [255:192] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002416/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002417static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002418_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002419{
Craig Topper1aa231e2016-05-16 06:38:42 +00002420 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002421}
2422
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002423/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002424/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2425/// vector of [8 x float].
2426///
2427/// \headerfile <x86intrin.h>
2428///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002429/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002430///
2431/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002432/// A 256-bit vector of [8 x float]. \n
2433/// Bits [95:64] are written to bits [31:0] of the return value. \n
2434/// Bits [127:96] are written to bits [95:64] of the return value. \n
2435/// Bits [223:192] are written to bits [159:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002436/// Bits [255:224] are written to bits [223:192] of the return value.
2437/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002438/// A 256-bit vector of [8 x float]. \n
2439/// Bits [95:64] are written to bits [63:32] of the return value. \n
2440/// Bits [127:96] are written to bits [127:96] of the return value. \n
2441/// Bits [223:192] are written to bits [191:160] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002442/// Bits [255:224] are written to bits [255:224] of the return value.
2443/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002444static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002445_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002446{
Craig Topper1aa231e2016-05-16 06:38:42 +00002447 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002448}
2449
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002450/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002451/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2452/// vector of [8 x float].
2453///
2454/// \headerfile <x86intrin.h>
2455///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002456/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002457///
2458/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002459/// A 256-bit vector of [8 x float]. \n
2460/// Bits [31:0] are written to bits [31:0] of the return value. \n
2461/// Bits [63:32] are written to bits [95:64] of the return value. \n
2462/// Bits [159:128] are written to bits [159:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002463/// Bits [191:160] are written to bits [223:192] of the return value.
2464/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002465/// A 256-bit vector of [8 x float]. \n
2466/// Bits [31:0] are written to bits [63:32] of the return value. \n
2467/// Bits [63:32] are written to bits [127:96] of the return value. \n
2468/// Bits [159:128] are written to bits [191:160] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002469/// Bits [191:160] are written to bits [255:224] of the return value.
2470/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002471static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002472_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002473{
Craig Topper1aa231e2016-05-16 06:38:42 +00002474 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002475}
2476
2477/* Bit Test */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002478/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002479/// element-by-element comparison of the double-precision element in the
2480/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002481/// vector.
2482///
2483/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002484/// If there is at least one pair of double-precision elements where the
2485/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002486/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002487/// If there is at least one pair of double-precision elements where the
2488/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002489/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002490/// This intrinsic returns the value of the ZF flag.
2491///
2492/// \headerfile <x86intrin.h>
2493///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002494/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002495///
2496/// \param __a
2497/// A 128-bit vector of [2 x double].
2498/// \param __b
2499/// A 128-bit vector of [2 x double].
2500/// \returns the ZF flag in the EFLAGS register.
Craig Topper74c10e32018-07-09 19:00:16 +00002501static __inline int __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002502_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002503{
David Blaikie3302f2b2013-01-16 23:08:36 +00002504 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002505}
2506
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002507/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002508/// element-by-element comparison of the double-precision element in the
2509/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002510/// vector.
2511///
2512/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002513/// If there is at least one pair of double-precision elements where the
2514/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002515/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002516/// If there is at least one pair of double-precision elements where the
2517/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002518/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002519/// This intrinsic returns the value of the CF flag.
2520///
2521/// \headerfile <x86intrin.h>
2522///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002523/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002524///
2525/// \param __a
2526/// A 128-bit vector of [2 x double].
2527/// \param __b
2528/// A 128-bit vector of [2 x double].
2529/// \returns the CF flag in the EFLAGS register.
Craig Topper74c10e32018-07-09 19:00:16 +00002530static __inline int __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002531_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002532{
David Blaikie3302f2b2013-01-16 23:08:36 +00002533 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002534}
2535
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002536/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002537/// element-by-element comparison of the double-precision element in the
2538/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002539/// vector.
2540///
2541/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002542/// If there is at least one pair of double-precision elements where the
2543/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002544/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002545/// If there is at least one pair of double-precision elements where the
2546/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002547/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002548/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2549/// otherwise it returns 0.
2550///
2551/// \headerfile <x86intrin.h>
2552///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002553/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002554///
2555/// \param __a
2556/// A 128-bit vector of [2 x double].
2557/// \param __b
2558/// A 128-bit vector of [2 x double].
2559/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Craig Topper74c10e32018-07-09 19:00:16 +00002560static __inline int __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002561_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002562{
David Blaikie3302f2b2013-01-16 23:08:36 +00002563 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002564}
2565
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002566/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002567/// element-by-element comparison of the single-precision element in the
2568/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002569/// vector.
2570///
2571/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002572/// If there is at least one pair of single-precision elements where the
2573/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002574/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002575/// If there is at least one pair of single-precision elements where the
2576/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002577/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002578/// This intrinsic returns the value of the ZF flag.
2579///
2580/// \headerfile <x86intrin.h>
2581///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002582/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002583///
2584/// \param __a
2585/// A 128-bit vector of [4 x float].
2586/// \param __b
2587/// A 128-bit vector of [4 x float].
2588/// \returns the ZF flag.
Craig Topper74c10e32018-07-09 19:00:16 +00002589static __inline int __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002590_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002591{
David Blaikie3302f2b2013-01-16 23:08:36 +00002592 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002593}
2594
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002595/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002596/// element-by-element comparison of the single-precision element in the
2597/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002598/// vector.
2599///
2600/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002601/// If there is at least one pair of single-precision elements where the
2602/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002603/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002604/// If there is at least one pair of single-precision elements where the
2605/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002606/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002607/// This intrinsic returns the value of the CF flag.
2608///
2609/// \headerfile <x86intrin.h>
2610///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002611/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002612///
2613/// \param __a
2614/// A 128-bit vector of [4 x float].
2615/// \param __b
2616/// A 128-bit vector of [4 x float].
2617/// \returns the CF flag.
Craig Topper74c10e32018-07-09 19:00:16 +00002618static __inline int __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002619_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002620{
David Blaikie3302f2b2013-01-16 23:08:36 +00002621 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002622}
2623
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002624/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002625/// element-by-element comparison of the single-precision element in the
2626/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002627/// vector.
2628///
2629/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002630/// If there is at least one pair of single-precision elements where the
2631/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002632/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002633/// If there is at least one pair of single-precision elements where the
2634/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002635/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002636/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2637/// otherwise it returns 0.
2638///
2639/// \headerfile <x86intrin.h>
2640///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002641/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002642///
2643/// \param __a
2644/// A 128-bit vector of [4 x float].
2645/// \param __b
2646/// A 128-bit vector of [4 x float].
2647/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Craig Topper74c10e32018-07-09 19:00:16 +00002648static __inline int __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002649_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002650{
David Blaikie3302f2b2013-01-16 23:08:36 +00002651 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002652}
2653
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002654/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002655/// element-by-element comparison of the double-precision elements in the
2656/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002657/// vector.
2658///
2659/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002660/// If there is at least one pair of double-precision elements where the
2661/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002662/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002663/// If there is at least one pair of double-precision elements where the
2664/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002665/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002666/// This intrinsic returns the value of the ZF flag.
2667///
2668/// \headerfile <x86intrin.h>
2669///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002670/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002671///
2672/// \param __a
2673/// A 256-bit vector of [4 x double].
2674/// \param __b
2675/// A 256-bit vector of [4 x double].
2676/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002677static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002678_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002679{
David Blaikie3302f2b2013-01-16 23:08:36 +00002680 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002681}
2682
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002683/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002684/// element-by-element comparison of the double-precision elements in the
2685/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002686/// vector.
2687///
2688/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002689/// If there is at least one pair of double-precision elements where the
2690/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002691/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002692/// If there is at least one pair of double-precision elements where the
2693/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002694/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002695/// This intrinsic returns the value of the CF flag.
2696///
2697/// \headerfile <x86intrin.h>
2698///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002699/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002700///
2701/// \param __a
2702/// A 256-bit vector of [4 x double].
2703/// \param __b
2704/// A 256-bit vector of [4 x double].
2705/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002706static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002707_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002708{
David Blaikie3302f2b2013-01-16 23:08:36 +00002709 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002710}
2711
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002712/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002713/// element-by-element comparison of the double-precision elements in the
2714/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002715/// vector.
2716///
2717/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002718/// If there is at least one pair of double-precision elements where the
2719/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002720/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002721/// If there is at least one pair of double-precision elements where the
2722/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002723/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002724/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2725/// otherwise it returns 0.
2726///
2727/// \headerfile <x86intrin.h>
2728///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002729/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002730///
2731/// \param __a
2732/// A 256-bit vector of [4 x double].
2733/// \param __b
2734/// A 256-bit vector of [4 x double].
2735/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002736static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002737_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002738{
David Blaikie3302f2b2013-01-16 23:08:36 +00002739 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002740}
2741
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002742/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002743/// element-by-element comparison of the single-precision element in the
2744/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002745/// vector.
2746///
2747/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002748/// If there is at least one pair of single-precision elements where the
2749/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002750/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002751/// If there is at least one pair of single-precision elements where the
2752/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002753/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002754/// This intrinsic returns the value of the ZF flag.
2755///
2756/// \headerfile <x86intrin.h>
2757///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002758/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002759///
2760/// \param __a
2761/// A 256-bit vector of [8 x float].
2762/// \param __b
2763/// A 256-bit vector of [8 x float].
2764/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002765static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002766_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002767{
David Blaikie3302f2b2013-01-16 23:08:36 +00002768 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002769}
2770
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002771/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002772/// element-by-element comparison of the single-precision element in the
2773/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002774/// vector.
2775///
2776/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002777/// If there is at least one pair of single-precision elements where the
2778/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002779/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002780/// If there is at least one pair of single-precision elements where the
2781/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002782/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002783/// This intrinsic returns the value of the CF flag.
2784///
2785/// \headerfile <x86intrin.h>
2786///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002787/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002788///
2789/// \param __a
2790/// A 256-bit vector of [8 x float].
2791/// \param __b
2792/// A 256-bit vector of [8 x float].
2793/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002794static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002795_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002796{
David Blaikie3302f2b2013-01-16 23:08:36 +00002797 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002798}
2799
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002800/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002801/// element-by-element comparison of the single-precision elements in the
2802/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002803/// vector.
2804///
2805/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002806/// If there is at least one pair of single-precision elements where the
2807/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002808/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002809/// If there is at least one pair of single-precision elements where the
2810/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002811/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002812/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2813/// otherwise it returns 0.
2814///
2815/// \headerfile <x86intrin.h>
2816///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002817/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002818///
2819/// \param __a
2820/// A 256-bit vector of [8 x float].
2821/// \param __b
2822/// A 256-bit vector of [8 x float].
2823/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002824static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002825_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002826{
David Blaikie3302f2b2013-01-16 23:08:36 +00002827 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002828}
2829
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002830/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002831/// of the two source vectors.
2832///
2833/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002834/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002835/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002836/// If there is at least one pair of bits where the bit from the first source
2837/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002838/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002839/// This intrinsic returns the value of the ZF flag.
2840///
2841/// \headerfile <x86intrin.h>
2842///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002843/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002844///
2845/// \param __a
2846/// A 256-bit integer vector.
2847/// \param __b
2848/// A 256-bit integer vector.
2849/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002850static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002851_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002852{
David Blaikie3302f2b2013-01-16 23:08:36 +00002853 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002854}
2855
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002856/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002857/// of the two source vectors.
2858///
2859/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002860/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002861/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002862/// If there is at least one pair of bits where the bit from the first source
2863/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002864/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002865/// This intrinsic returns the value of the CF flag.
2866///
2867/// \headerfile <x86intrin.h>
2868///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002869/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002870///
2871/// \param __a
2872/// A 256-bit integer vector.
2873/// \param __b
2874/// A 256-bit integer vector.
2875/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002876static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002877_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002878{
David Blaikie3302f2b2013-01-16 23:08:36 +00002879 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002880}
2881
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002882/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002883/// of the two source vectors.
2884///
2885/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002886/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002887/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002888/// If there is at least one pair of bits where the bit from the first source
2889/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002890/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002891/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2892/// otherwise it returns 0.
2893///
2894/// \headerfile <x86intrin.h>
2895///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002896/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002897///
2898/// \param __a
2899/// A 256-bit integer vector.
2900/// \param __b
2901/// A 256-bit integer vector.
2902/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002903static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002904_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002905{
David Blaikie3302f2b2013-01-16 23:08:36 +00002906 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002907}
2908
2909/* Vector extract sign mask */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002910/// Extracts the sign bits of double-precision floating point elements
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002911/// in a 256-bit vector of [4 x double] and writes them to the lower order
2912/// bits of the return value.
2913///
2914/// \headerfile <x86intrin.h>
2915///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002916/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002917///
2918/// \param __a
2919/// A 256-bit vector of [4 x double] containing the double-precision
2920/// floating point values with sign bits to be extracted.
2921/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002922static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002923_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002924{
David Blaikie3302f2b2013-01-16 23:08:36 +00002925 return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002926}
2927
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002928/// Extracts the sign bits of single-precision floating point elements
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002929/// in a 256-bit vector of [8 x float] and writes them to the lower order
2930/// bits of the return value.
2931///
2932/// \headerfile <x86intrin.h>
2933///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002934/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002935///
2936/// \param __a
Douglas Yung7ff91422018-01-08 21:21:17 +00002937/// A 256-bit vector of [8 x float] containing the single-precision floating
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002938/// point values with sign bits to be extracted.
2939/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002940static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002941_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002942{
David Blaikie3302f2b2013-01-16 23:08:36 +00002943 return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002944}
2945
/* Vector zero */
/// Clears the contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
/* NOTE(review): the attribute list is spelled out here instead of using
   __DEFAULT_FN_ATTRS like the neighbouring intrinsics; presumably this is
   deliberate - confirm before unifying. */
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
2957
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
/* NOTE(review): the attribute list is spelled out here instead of using
   __DEFAULT_FN_ATTRS like the neighbouring intrinsics; presumably this is
   deliberate - confirm before unifying. */
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
2968
2969/* Vector load with broadcast */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002970/// Loads a scalar single-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002971/// specified address pointed to by \a __a and broadcasts it to the elements
2972/// of a [4 x float] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002973///
2974/// \headerfile <x86intrin.h>
2975///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002976/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002977///
2978/// \param __a
2979/// The single-precision floating point value to be broadcast.
2980/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2981/// equal to the broadcast value.
Craig Topper74c10e32018-07-09 19:00:16 +00002982static __inline __m128 __DEFAULT_FN_ATTRS128
David Blaikie3302f2b2013-01-16 23:08:36 +00002983_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002984{
Adam Nemet286ae082014-05-29 20:47:29 +00002985 float __f = *__a;
Craig Topper63ec0ea2018-05-30 21:08:27 +00002986 return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002987}
2988
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002989/// Loads a scalar double-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002990/// specified address pointed to by \a __a and broadcasts it to the elements
2991/// of a [4 x double] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002992///
2993/// \headerfile <x86intrin.h>
2994///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002995/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002996///
2997/// \param __a
2998/// The double-precision floating point value to be broadcast.
2999/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3000/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003001static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003002_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003003{
Adam Nemet286ae082014-05-29 20:47:29 +00003004 double __d = *__a;
Craig Topper63ec0ea2018-05-30 21:08:27 +00003005 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003006}
3007
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003008/// Loads a scalar single-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003009/// specified address pointed to by \a __a and broadcasts it to the elements
3010/// of a [8 x float] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003011///
3012/// \headerfile <x86intrin.h>
3013///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003014/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003015///
3016/// \param __a
3017/// The single-precision floating point value to be broadcast.
3018/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3019/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003020static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003021_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003022{
Adam Nemet286ae082014-05-29 20:47:29 +00003023 float __f = *__a;
Craig Topper63ec0ea2018-05-30 21:08:27 +00003024 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003025}
3026
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003027/// Loads the data from a 128-bit vector of [2 x double] from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003028/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003029/// elements in a 256-bit vector of [4 x double].
3030///
3031/// \headerfile <x86intrin.h>
3032///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003033/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003034///
3035/// \param __a
3036/// The 128-bit vector of [2 x double] to be broadcast.
3037/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3038/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003039static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003040_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003041{
Craig Topper6fb26f92018-06-03 19:42:59 +00003042 __m128d __b = _mm_loadu_pd((const double *)__a);
3043 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3044 0, 1, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003045}
3046
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003047/// Loads the data from a 128-bit vector of [4 x float] from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003048/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003049/// elements in a 256-bit vector of [8 x float].
3050///
3051/// \headerfile <x86intrin.h>
3052///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003053/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003054///
3055/// \param __a
3056/// The 128-bit vector of [4 x float] to be broadcast.
3057/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3058/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003059static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003060_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003061{
Craig Topper6fb26f92018-06-03 19:42:59 +00003062 __m128 __b = _mm_loadu_ps((const float *)__a);
3063 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3064 0, 1, 2, 3, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003065}
3066
3067/* SIMD load ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003068/// Loads 4 double-precision floating point values from a 32-byte aligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003069/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003070///
3071/// \headerfile <x86intrin.h>
3072///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003073/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003074///
3075/// \param __p
3076/// A 32-byte aligned pointer to a memory location containing
3077/// double-precision floating point values.
3078/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003079static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003080_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003081{
David Blaikie3302f2b2013-01-16 23:08:36 +00003082 return *(__m256d *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003083}
3084
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003085/// Loads 8 single-precision floating point values from a 32-byte aligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003086/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003087///
3088/// \headerfile <x86intrin.h>
3089///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003090/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003091///
3092/// \param __p
3093/// A 32-byte aligned pointer to a memory location containing float values.
3094/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003095static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003096_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003097{
David Blaikie3302f2b2013-01-16 23:08:36 +00003098 return *(__m256 *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003099}
3100
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003101/// Loads 4 double-precision floating point values from an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003102/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003103///
3104/// \headerfile <x86intrin.h>
3105///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003106/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003107///
3108/// \param __p
3109/// A pointer to a memory location containing double-precision floating
3110/// point values.
3111/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003112static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003113_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003114{
Craig Topper9e9301a2012-01-25 04:26:17 +00003115 struct __loadu_pd {
David Blaikie3302f2b2013-01-16 23:08:36 +00003116 __m256d __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003117 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003118 return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003119}
3120
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003121/// Loads 8 single-precision floating point values from an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003122/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003123///
3124/// \headerfile <x86intrin.h>
3125///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003126/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003127///
3128/// \param __p
3129/// A pointer to a memory location containing single-precision floating
3130/// point values.
3131/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003132static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003133_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003134{
Craig Topper9e9301a2012-01-25 04:26:17 +00003135 struct __loadu_ps {
David Blaikie3302f2b2013-01-16 23:08:36 +00003136 __m256 __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003137 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003138 return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003139}
3140
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003141/// Loads 256 bits of integer data from a 32-byte aligned memory
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003142/// location pointed to by \a __p into elements of a 256-bit integer vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003143///
3144/// \headerfile <x86intrin.h>
3145///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003146/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003147///
3148/// \param __p
3149/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3150/// values.
3151/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003152static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003153_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003154{
David Blaikie3302f2b2013-01-16 23:08:36 +00003155 return *__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003156}
3157
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003158/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003159/// pointed to by \a __p into a 256-bit integer vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003160///
3161/// \headerfile <x86intrin.h>
3162///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003163/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003164///
3165/// \param __p
3166/// A pointer to a 256-bit integer vector containing integer values.
3167/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003168static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003169_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003170{
Craig Topper9e9301a2012-01-25 04:26:17 +00003171 struct __loadu_si256 {
David Blaikie3302f2b2013-01-16 23:08:36 +00003172 __m256i __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003173 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003174 return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003175}
3176
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003177/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003178/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3179/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003180/// line boundary.
3181///
3182/// \headerfile <x86intrin.h>
3183///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003184/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003185///
3186/// \param __p
3187/// A pointer to a 256-bit integer vector containing integer values.
3188/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003189static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003190_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003191{
David Blaikie3302f2b2013-01-16 23:08:36 +00003192 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003193}
3194
3195/* SIMD store ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003196/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003197/// of [4 x double] to a 32-byte aligned memory location pointed to by
3198/// \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003199///
3200/// \headerfile <x86intrin.h>
3201///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003202/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003203///
3204/// \param __p
3205/// A 32-byte aligned pointer to a memory location that will receive the
3206/// double-precision floaing point values.
3207/// \param __a
3208/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003209static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003210_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003211{
David Blaikie3302f2b2013-01-16 23:08:36 +00003212 *(__m256d *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003213}
3214
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003215/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003216/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003217///
3218/// \headerfile <x86intrin.h>
3219///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003220/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003221///
3222/// \param __p
3223/// A 32-byte aligned pointer to a memory location that will receive the
3224/// float values.
3225/// \param __a
3226/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003227static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003228_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003229{
David Blaikie3302f2b2013-01-16 23:08:36 +00003230 *(__m256 *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003231}
3232
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003233/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003234/// of [4 x double] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003235///
3236/// \headerfile <x86intrin.h>
3237///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003238/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003239///
3240/// \param __p
3241/// A pointer to a memory location that will receive the double-precision
3242/// floating point values.
3243/// \param __a
3244/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003245static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003246_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003247{
Craig Topper09175da2016-05-30 17:10:30 +00003248 struct __storeu_pd {
3249 __m256d __v;
3250 } __attribute__((__packed__, __may_alias__));
3251 ((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003252}
3253
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003254/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003255/// of [8 x float] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003256///
3257/// \headerfile <x86intrin.h>
3258///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003259/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003260///
3261/// \param __p
3262/// A pointer to a memory location that will receive the float values.
3263/// \param __a
3264/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003265static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003266_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003267{
Craig Topper09175da2016-05-30 17:10:30 +00003268 struct __storeu_ps {
3269 __m256 __v;
3270 } __attribute__((__packed__, __may_alias__));
3271 ((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003272}
3273
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003274/// Stores integer values from a 256-bit integer vector to a 32-byte
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003275/// aligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003276///
3277/// \headerfile <x86intrin.h>
3278///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003279/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003280///
3281/// \param __p
3282/// A 32-byte aligned pointer to a memory location that will receive the
3283/// integer values.
3284/// \param __a
3285/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003286static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003287_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003288{
David Blaikie3302f2b2013-01-16 23:08:36 +00003289 *__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003290}
3291
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003292/// Stores integer values from a 256-bit integer vector to an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003293/// memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003294///
3295/// \headerfile <x86intrin.h>
3296///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003297/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003298///
3299/// \param __p
3300/// A pointer to a memory location that will receive the integer values.
3301/// \param __a
3302/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003303static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003304_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003305{
Craig Topper09175da2016-05-30 17:10:30 +00003306 struct __storeu_si256 {
3307 __m256i __v;
3308 } __attribute__((__packed__, __may_alias__));
3309 ((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003310}
3311
3312/* Conditional load ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003313/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003314/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003315/// [2 x double], depending on the mask bits associated with each data
3316/// element.
3317///
3318/// \headerfile <x86intrin.h>
3319///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003320/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003321///
3322/// \param __p
3323/// A pointer to a memory location that contains the double-precision
3324/// floating point values.
3325/// \param __m
3326/// A 128-bit integer vector containing the mask. The most significant bit of
3327/// each data element represents the mask bits. If a mask bit is zero, the
3328/// corresponding value in the memory location is not loaded and the
3329/// corresponding field in the return value is set to zero.
3330/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Craig Topper74c10e32018-07-09 19:00:16 +00003331static __inline __m128d __DEFAULT_FN_ATTRS128
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003332_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003333{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003334 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003335}
3336
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003337/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003338/// memory location pointed to by \a __p into a 256-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003339/// [4 x double], depending on the mask bits associated with each data
3340/// element.
3341///
3342/// \headerfile <x86intrin.h>
3343///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003344/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003345///
3346/// \param __p
3347/// A pointer to a memory location that contains the double-precision
3348/// floating point values.
3349/// \param __m
3350/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3351/// significant bit of each quadword element represents the mask bits. If a
3352/// mask bit is zero, the corresponding value in the memory location is not
3353/// loaded and the corresponding field in the return value is set to zero.
3354/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003355static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003356_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003357{
David Blaikie3302f2b2013-01-16 23:08:36 +00003358 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003359 (__v4di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003360}
3361
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003362/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003363/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003364/// [4 x float], depending on the mask bits associated with each data
3365/// element.
3366///
3367/// \headerfile <x86intrin.h>
3368///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003369/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003370///
3371/// \param __p
3372/// A pointer to a memory location that contains the single-precision
3373/// floating point values.
3374/// \param __m
3375/// A 128-bit integer vector containing the mask. The most significant bit of
3376/// each data element represents the mask bits. If a mask bit is zero, the
3377/// corresponding value in the memory location is not loaded and the
3378/// corresponding field in the return value is set to zero.
3379/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Craig Topper74c10e32018-07-09 19:00:16 +00003380static __inline __m128 __DEFAULT_FN_ATTRS128
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003381_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003382{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003383 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003384}
3385
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003386/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003387/// memory location pointed to by \a __p into a 256-bit vector of
3388/// [8 x float], depending on the mask bits associated with each data
3389/// element.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003390///
3391/// \headerfile <x86intrin.h>
3392///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003393/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003394///
3395/// \param __p
3396/// A pointer to a memory location that contains the single-precision
3397/// floating point values.
3398/// \param __m
3399/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3400/// significant bit of each dword element represents the mask bits. If a mask
3401/// bit is zero, the corresponding value in the memory location is not loaded
3402/// and the corresponding field in the return value is set to zero.
3403/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003404static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003405_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003406{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003407 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003408}
3409
3410/* Conditional store ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003411/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003412/// of [8 x float] to a memory location pointed to by \a __p, according to
3413/// the specified mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003414///
3415/// \headerfile <x86intrin.h>
3416///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003417/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003418///
3419/// \param __p
3420/// A pointer to a memory location that will receive the float values.
3421/// \param __m
3422/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3423/// significant bit of each dword element in the mask vector represents the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003424/// mask bits. If a mask bit is zero, the corresponding value from vector
3425/// \a __a is not stored and the corresponding field in the memory location
3426/// pointed to by \a __p is not changed.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003427/// \param __a
3428/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003429static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003430_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003431{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003432 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003433}
3434
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003435/// Moves double-precision values from a 128-bit vector of [2 x double]
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003436/// to a memory location pointed to by \a __p, according to the specified
3437/// mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003438///
3439/// \headerfile <x86intrin.h>
3440///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003441/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003442///
3443/// \param __p
3444/// A pointer to a memory location that will receive the float values.
3445/// \param __m
3446/// A 128-bit integer vector containing the mask. The most significant bit of
3447/// each field in the mask vector represents the mask bits. If a mask bit is
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003448/// zero, the corresponding value from vector \a __a is not stored and the
3449/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003450/// changed.
3451/// \param __a
3452/// A 128-bit vector of [2 x double] containing the values to be stored.
Craig Topper74c10e32018-07-09 19:00:16 +00003453static __inline void __DEFAULT_FN_ATTRS128
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003454_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003455{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003456 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003457}
3458
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003459/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003460/// to a memory location pointed to by \a __p, according to the specified
3461/// mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003462///
3463/// \headerfile <x86intrin.h>
3464///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003465/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003466///
3467/// \param __p
3468/// A pointer to a memory location that will receive the float values.
3469/// \param __m
3470/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3471/// significant bit of each quadword element in the mask vector represents
3472/// the mask bits. If a mask bit is zero, the corresponding value from vector
3473/// __a is not stored and the corresponding field in the memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003474/// pointed to by \a __p is not changed.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003475/// \param __a
3476/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003477static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003478_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003479{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003480 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003481}
3482
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003483/// Moves single-precision floating point values from a 128-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003484/// of [4 x float] to a memory location pointed to by \a __p, according to
3485/// the specified mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003486///
3487/// \headerfile <x86intrin.h>
3488///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003489/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003490///
3491/// \param __p
3492/// A pointer to a memory location that will receive the float values.
3493/// \param __m
3494/// A 128-bit integer vector containing the mask. The most significant bit of
3495/// each field in the mask vector represents the mask bits. If a mask bit is
3496/// zero, the corresponding value from vector __a is not stored and the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003497/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003498/// changed.
3499/// \param __a
3500/// A 128-bit vector of [4 x float] containing the values to be stored.
Craig Topper74c10e32018-07-09 19:00:16 +00003501static __inline void __DEFAULT_FN_ATTRS128
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003502_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003503{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003504 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003505}
3506
3507/* Cacheability support ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003508/// Moves integer data from a 256-bit integer vector to a 32-byte
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003509/// aligned memory location. To minimize caching, the data is flagged as
3510/// non-temporal (unlikely to be used again soon).
3511///
3512/// \headerfile <x86intrin.h>
3513///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003514/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003515///
3516/// \param __a
3517/// A pointer to a 32-byte aligned memory location that will receive the
3518/// integer values.
3519/// \param __b
3520/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003521static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003522_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003523{
Reid Kleckner89fbd552018-06-04 21:39:20 +00003524 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3525 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003526}
3527
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003528/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003529/// to a 32-byte aligned memory location. To minimize caching, the data is
3530/// flagged as non-temporal (unlikely to be used again soon).
3531///
3532/// \headerfile <x86intrin.h>
3533///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003534/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003535///
3536/// \param __a
3537/// A pointer to a 32-byte aligned memory location that will receive the
Ekaterina Romanovacb3603a2017-06-06 22:58:01 +00003538/// double-precision floating-point values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003539/// \param __b
3540/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003541static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003542_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003543{
Reid Kleckner89fbd552018-06-04 21:39:20 +00003544 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3545 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003546}
3547
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003548/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003549/// of [8 x float] to a 32-byte aligned memory location. To minimize
3550/// caching, the data is flagged as non-temporal (unlikely to be used again
3551/// soon).
3552///
3553/// \headerfile <x86intrin.h>
3554///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003555/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003556///
3557/// \param __p
3558/// A pointer to a 32-byte aligned memory location that will receive the
3559/// single-precision floating point values.
3560/// \param __a
3561/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003562static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003563_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003564{
Reid Kleckner89fbd552018-06-04 21:39:20 +00003565 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3566 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003567}
3568
3569/* Create vectors */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003570/// Create a 256-bit vector of [4 x double] with undefined values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003571///
3572/// \headerfile <x86intrin.h>
3573///
3574/// This intrinsic has no corresponding instruction.
3575///
3576/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003577static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003578_mm256_undefined_pd(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003579{
3580 return (__m256d)__builtin_ia32_undef256();
3581}
3582
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003583/// Create a 256-bit vector of [8 x float] with undefined values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003584///
3585/// \headerfile <x86intrin.h>
3586///
3587/// This intrinsic has no corresponding instruction.
3588///
3589/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003590static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003591_mm256_undefined_ps(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003592{
3593 return (__m256)__builtin_ia32_undef256();
3594}
3595
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003596/// Create a 256-bit integer vector with undefined values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003597///
3598/// \headerfile <x86intrin.h>
3599///
3600/// This intrinsic has no corresponding instruction.
3601///
3602/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003603static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003604_mm256_undefined_si256(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003605{
3606 return (__m256i)__builtin_ia32_undef256();
3607}
3608
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003609/// Constructs a 256-bit floating-point vector of [4 x double]
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003610/// initialized with the specified double-precision floating-point values.
3611///
3612/// \headerfile <x86intrin.h>
3613///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003614/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3615/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003616///
3617/// \param __a
3618/// A double-precision floating-point value used to initialize bits [255:192]
3619/// of the result.
3620/// \param __b
3621/// A double-precision floating-point value used to initialize bits [191:128]
3622/// of the result.
3623/// \param __c
3624/// A double-precision floating-point value used to initialize bits [127:64]
3625/// of the result.
3626/// \param __d
3627/// A double-precision floating-point value used to initialize bits [63:0]
3628/// of the result.
3629/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003630static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003631_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003632{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003633 return __extension__ (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003634}
3635
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003636/// Constructs a 256-bit floating-point vector of [8 x float] initialized
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003637/// with the specified single-precision floating-point values.
3638///
3639/// \headerfile <x86intrin.h>
3640///
3641/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003642/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003643///
3644/// \param __a
3645/// A single-precision floating-point value used to initialize bits [255:224]
3646/// of the result.
3647/// \param __b
3648/// A single-precision floating-point value used to initialize bits [223:192]
3649/// of the result.
3650/// \param __c
3651/// A single-precision floating-point value used to initialize bits [191:160]
3652/// of the result.
3653/// \param __d
3654/// A single-precision floating-point value used to initialize bits [159:128]
3655/// of the result.
3656/// \param __e
3657/// A single-precision floating-point value used to initialize bits [127:96]
3658/// of the result.
3659/// \param __f
3660/// A single-precision floating-point value used to initialize bits [95:64]
3661/// of the result.
3662/// \param __g
3663/// A single-precision floating-point value used to initialize bits [63:32]
3664/// of the result.
3665/// \param __h
3666/// A single-precision floating-point value used to initialize bits [31:0]
3667/// of the result.
3668/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003669static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003670_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003671 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003672{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003673 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003674}
3675
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003676/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003677/// 32-bit integral values.
3678///
3679/// \headerfile <x86intrin.h>
3680///
3681/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003682/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003683///
3684/// \param __i0
3685/// A 32-bit integral value used to initialize bits [255:224] of the result.
3686/// \param __i1
3687/// A 32-bit integral value used to initialize bits [223:192] of the result.
3688/// \param __i2
3689/// A 32-bit integral value used to initialize bits [191:160] of the result.
3690/// \param __i3
3691/// A 32-bit integral value used to initialize bits [159:128] of the result.
3692/// \param __i4
3693/// A 32-bit integral value used to initialize bits [127:96] of the result.
3694/// \param __i5
3695/// A 32-bit integral value used to initialize bits [95:64] of the result.
3696/// \param __i6
3697/// A 32-bit integral value used to initialize bits [63:32] of the result.
3698/// \param __i7
3699/// A 32-bit integral value used to initialize bits [31:0] of the result.
3700/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003701static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003702_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003703 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003704{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003705 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003706}
3707
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003708/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003709/// 16-bit integral values.
3710///
3711/// \headerfile <x86intrin.h>
3712///
3713/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003714/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003715///
3716/// \param __w15
3717/// A 16-bit integral value used to initialize bits [255:240] of the result.
3718/// \param __w14
3719/// A 16-bit integral value used to initialize bits [239:224] of the result.
3720/// \param __w13
3721/// A 16-bit integral value used to initialize bits [223:208] of the result.
3722/// \param __w12
3723/// A 16-bit integral value used to initialize bits [207:192] of the result.
3724/// \param __w11
3725/// A 16-bit integral value used to initialize bits [191:176] of the result.
3726/// \param __w10
3727/// A 16-bit integral value used to initialize bits [175:160] of the result.
3728/// \param __w09
3729/// A 16-bit integral value used to initialize bits [159:144] of the result.
3730/// \param __w08
3731/// A 16-bit integral value used to initialize bits [143:128] of the result.
3732/// \param __w07
3733/// A 16-bit integral value used to initialize bits [127:112] of the result.
3734/// \param __w06
3735/// A 16-bit integral value used to initialize bits [111:96] of the result.
3736/// \param __w05
3737/// A 16-bit integral value used to initialize bits [95:80] of the result.
3738/// \param __w04
3739/// A 16-bit integral value used to initialize bits [79:64] of the result.
3740/// \param __w03
3741/// A 16-bit integral value used to initialize bits [63:48] of the result.
3742/// \param __w02
3743/// A 16-bit integral value used to initialize bits [47:32] of the result.
3744/// \param __w01
3745/// A 16-bit integral value used to initialize bits [31:16] of the result.
3746/// \param __w00
3747/// A 16-bit integral value used to initialize bits [15:0] of the result.
3748/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003749static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003750_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003751 short __w11, short __w10, short __w09, short __w08,
3752 short __w07, short __w06, short __w05, short __w04,
3753 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003754{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003755 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
David Blaikie3302f2b2013-01-16 23:08:36 +00003756 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003757}
3758
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003759/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003760/// 8-bit integral values.
3761///
3762/// \headerfile <x86intrin.h>
3763///
3764/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003765/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003766///
3767/// \param __b31
3768/// An 8-bit integral value used to initialize bits [255:248] of the result.
3769/// \param __b30
3770/// An 8-bit integral value used to initialize bits [247:240] of the result.
3771/// \param __b29
3772/// An 8-bit integral value used to initialize bits [239:232] of the result.
3773/// \param __b28
3774/// An 8-bit integral value used to initialize bits [231:224] of the result.
3775/// \param __b27
3776/// An 8-bit integral value used to initialize bits [223:216] of the result.
3777/// \param __b26
3778/// An 8-bit integral value used to initialize bits [215:208] of the result.
3779/// \param __b25
3780/// An 8-bit integral value used to initialize bits [207:200] of the result.
3781/// \param __b24
3782/// An 8-bit integral value used to initialize bits [199:192] of the result.
3783/// \param __b23
3784/// An 8-bit integral value used to initialize bits [191:184] of the result.
3785/// \param __b22
3786/// An 8-bit integral value used to initialize bits [183:176] of the result.
3787/// \param __b21
3788/// An 8-bit integral value used to initialize bits [175:168] of the result.
3789/// \param __b20
3790/// An 8-bit integral value used to initialize bits [167:160] of the result.
3791/// \param __b19
3792/// An 8-bit integral value used to initialize bits [159:152] of the result.
3793/// \param __b18
3794/// An 8-bit integral value used to initialize bits [151:144] of the result.
3795/// \param __b17
3796/// An 8-bit integral value used to initialize bits [143:136] of the result.
3797/// \param __b16
3798/// An 8-bit integral value used to initialize bits [135:128] of the result.
3799/// \param __b15
3800/// An 8-bit integral value used to initialize bits [127:120] of the result.
3801/// \param __b14
3802/// An 8-bit integral value used to initialize bits [119:112] of the result.
3803/// \param __b13
3804/// An 8-bit integral value used to initialize bits [111:104] of the result.
3805/// \param __b12
3806/// An 8-bit integral value used to initialize bits [103:96] of the result.
3807/// \param __b11
3808/// An 8-bit integral value used to initialize bits [95:88] of the result.
3809/// \param __b10
3810/// An 8-bit integral value used to initialize bits [87:80] of the result.
3811/// \param __b09
3812/// An 8-bit integral value used to initialize bits [79:72] of the result.
3813/// \param __b08
3814/// An 8-bit integral value used to initialize bits [71:64] of the result.
3815/// \param __b07
3816/// An 8-bit integral value used to initialize bits [63:56] of the result.
3817/// \param __b06
3818/// An 8-bit integral value used to initialize bits [55:48] of the result.
3819/// \param __b05
3820/// An 8-bit integral value used to initialize bits [47:40] of the result.
3821/// \param __b04
3822/// An 8-bit integral value used to initialize bits [39:32] of the result.
3823/// \param __b03
3824/// An 8-bit integral value used to initialize bits [31:24] of the result.
3825/// \param __b02
3826/// An 8-bit integral value used to initialize bits [23:16] of the result.
3827/// \param __b01
3828/// An 8-bit integral value used to initialize bits [15:8] of the result.
3829/// \param __b00
3830/// An 8-bit integral value used to initialize bits [7:0] of the result.
3831/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003832static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003833_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003834 char __b27, char __b26, char __b25, char __b24,
3835 char __b23, char __b22, char __b21, char __b20,
3836 char __b19, char __b18, char __b17, char __b16,
3837 char __b15, char __b14, char __b13, char __b12,
3838 char __b11, char __b10, char __b09, char __b08,
3839 char __b07, char __b06, char __b05, char __b04,
3840 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003841{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003842 return __extension__ (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00003843 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3844 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3845 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3846 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003847 };
3848}
3849
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003850/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003851/// 64-bit integral values.
3852///
3853/// \headerfile <x86intrin.h>
3854///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003855/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3856/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003857///
3858/// \param __a
3859/// A 64-bit integral value used to initialize bits [255:192] of the result.
3860/// \param __b
3861/// A 64-bit integral value used to initialize bits [191:128] of the result.
3862/// \param __c
3863/// A 64-bit integral value used to initialize bits [127:64] of the result.
3864/// \param __d
3865/// A 64-bit integral value used to initialize bits [63:0] of the result.
3866/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003867static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003868_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003869{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003870 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003871}
3872
3873/* Create vectors with elements in reverse order */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003874/// Constructs a 256-bit floating-point vector of [4 x double],
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003875/// initialized in reverse order with the specified double-precision
3876/// floating-point values.
3877///
3878/// \headerfile <x86intrin.h>
3879///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003880/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3881/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003882///
3883/// \param __a
3884/// A double-precision floating-point value used to initialize bits [63:0]
3885/// of the result.
3886/// \param __b
3887/// A double-precision floating-point value used to initialize bits [127:64]
3888/// of the result.
3889/// \param __c
3890/// A double-precision floating-point value used to initialize bits [191:128]
3891/// of the result.
3892/// \param __d
3893/// A double-precision floating-point value used to initialize bits [255:192]
3894/// of the result.
3895/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003896static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003897_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003898{
Tim Shenf811de42018-05-31 01:51:07 +00003899 return _mm256_set_pd(__d, __c, __b, __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003900}
3901
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003902/// Constructs a 256-bit floating-point vector of [8 x float],
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003903/// initialized in reverse order with the specified single-precision
3904/// float-point values.
3905///
3906/// \headerfile <x86intrin.h>
3907///
3908/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003909/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003910///
3911/// \param __a
3912/// A single-precision floating-point value used to initialize bits [31:0]
3913/// of the result.
3914/// \param __b
3915/// A single-precision floating-point value used to initialize bits [63:32]
3916/// of the result.
3917/// \param __c
3918/// A single-precision floating-point value used to initialize bits [95:64]
3919/// of the result.
3920/// \param __d
3921/// A single-precision floating-point value used to initialize bits [127:96]
3922/// of the result.
3923/// \param __e
3924/// A single-precision floating-point value used to initialize bits [159:128]
3925/// of the result.
3926/// \param __f
3927/// A single-precision floating-point value used to initialize bits [191:160]
3928/// of the result.
3929/// \param __g
3930/// A single-precision floating-point value used to initialize bits [223:192]
3931/// of the result.
3932/// \param __h
3933/// A single-precision floating-point value used to initialize bits [255:224]
3934/// of the result.
3935/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003936static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003937_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003938 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003939{
Tim Shenf811de42018-05-31 01:51:07 +00003940 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003941}
3942
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003943/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003944/// with the specified 32-bit integral values.
3945///
3946/// \headerfile <x86intrin.h>
3947///
3948/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003949/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003950///
3951/// \param __i0
3952/// A 32-bit integral value used to initialize bits [31:0] of the result.
3953/// \param __i1
3954/// A 32-bit integral value used to initialize bits [63:32] of the result.
3955/// \param __i2
3956/// A 32-bit integral value used to initialize bits [95:64] of the result.
3957/// \param __i3
3958/// A 32-bit integral value used to initialize bits [127:96] of the result.
3959/// \param __i4
3960/// A 32-bit integral value used to initialize bits [159:128] of the result.
3961/// \param __i5
3962/// A 32-bit integral value used to initialize bits [191:160] of the result.
3963/// \param __i6
3964/// A 32-bit integral value used to initialize bits [223:192] of the result.
3965/// \param __i7
3966/// A 32-bit integral value used to initialize bits [255:224] of the result.
3967/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003968static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003969_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003970 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003971{
Tim Shenf811de42018-05-31 01:51:07 +00003972 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003973}
3974
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003975/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003976/// with the specified 16-bit integral values.
3977///
3978/// \headerfile <x86intrin.h>
3979///
3980/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003981/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003982///
3983/// \param __w15
3984/// A 16-bit integral value used to initialize bits [15:0] of the result.
3985/// \param __w14
3986/// A 16-bit integral value used to initialize bits [31:16] of the result.
3987/// \param __w13
3988/// A 16-bit integral value used to initialize bits [47:32] of the result.
3989/// \param __w12
3990/// A 16-bit integral value used to initialize bits [63:48] of the result.
3991/// \param __w11
3992/// A 16-bit integral value used to initialize bits [79:64] of the result.
3993/// \param __w10
3994/// A 16-bit integral value used to initialize bits [95:80] of the result.
3995/// \param __w09
3996/// A 16-bit integral value used to initialize bits [111:96] of the result.
3997/// \param __w08
3998/// A 16-bit integral value used to initialize bits [127:112] of the result.
3999/// \param __w07
4000/// A 16-bit integral value used to initialize bits [143:128] of the result.
4001/// \param __w06
4002/// A 16-bit integral value used to initialize bits [159:144] of the result.
4003/// \param __w05
4004/// A 16-bit integral value used to initialize bits [175:160] of the result.
4005/// \param __w04
4006/// A 16-bit integral value used to initialize bits [191:176] of the result.
4007/// \param __w03
4008/// A 16-bit integral value used to initialize bits [207:192] of the result.
4009/// \param __w02
4010/// A 16-bit integral value used to initialize bits [223:208] of the result.
4011/// \param __w01
4012/// A 16-bit integral value used to initialize bits [239:224] of the result.
4013/// \param __w00
4014/// A 16-bit integral value used to initialize bits [255:240] of the result.
4015/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004016static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004017_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004018 short __w11, short __w10, short __w09, short __w08,
4019 short __w07, short __w06, short __w05, short __w04,
4020 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004021{
Tim Shenf811de42018-05-31 01:51:07 +00004022 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4023 __w04, __w05, __w06, __w07,
4024 __w08, __w09, __w10, __w11,
4025 __w12, __w13, __w14, __w15);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004026}
4027
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004028/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004029/// with the specified 8-bit integral values.
4030///
4031/// \headerfile <x86intrin.h>
4032///
4033/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004034/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004035///
4036/// \param __b31
4037/// An 8-bit integral value used to initialize bits [7:0] of the result.
4038/// \param __b30
4039/// An 8-bit integral value used to initialize bits [15:8] of the result.
4040/// \param __b29
4041/// An 8-bit integral value used to initialize bits [23:16] of the result.
4042/// \param __b28
4043/// An 8-bit integral value used to initialize bits [31:24] of the result.
4044/// \param __b27
4045/// An 8-bit integral value used to initialize bits [39:32] of the result.
4046/// \param __b26
4047/// An 8-bit integral value used to initialize bits [47:40] of the result.
4048/// \param __b25
4049/// An 8-bit integral value used to initialize bits [55:48] of the result.
4050/// \param __b24
4051/// An 8-bit integral value used to initialize bits [63:56] of the result.
4052/// \param __b23
4053/// An 8-bit integral value used to initialize bits [71:64] of the result.
4054/// \param __b22
4055/// An 8-bit integral value used to initialize bits [79:72] of the result.
4056/// \param __b21
4057/// An 8-bit integral value used to initialize bits [87:80] of the result.
4058/// \param __b20
4059/// An 8-bit integral value used to initialize bits [95:88] of the result.
4060/// \param __b19
4061/// An 8-bit integral value used to initialize bits [103:96] of the result.
4062/// \param __b18
4063/// An 8-bit integral value used to initialize bits [111:104] of the result.
4064/// \param __b17
4065/// An 8-bit integral value used to initialize bits [119:112] of the result.
4066/// \param __b16
4067/// An 8-bit integral value used to initialize bits [127:120] of the result.
4068/// \param __b15
4069/// An 8-bit integral value used to initialize bits [135:128] of the result.
4070/// \param __b14
4071/// An 8-bit integral value used to initialize bits [143:136] of the result.
4072/// \param __b13
4073/// An 8-bit integral value used to initialize bits [151:144] of the result.
4074/// \param __b12
4075/// An 8-bit integral value used to initialize bits [159:152] of the result.
4076/// \param __b11
4077/// An 8-bit integral value used to initialize bits [167:160] of the result.
4078/// \param __b10
4079/// An 8-bit integral value used to initialize bits [175:168] of the result.
4080/// \param __b09
4081/// An 8-bit integral value used to initialize bits [183:176] of the result.
4082/// \param __b08
4083/// An 8-bit integral value used to initialize bits [191:184] of the result.
4084/// \param __b07
4085/// An 8-bit integral value used to initialize bits [199:192] of the result.
4086/// \param __b06
4087/// An 8-bit integral value used to initialize bits [207:200] of the result.
4088/// \param __b05
4089/// An 8-bit integral value used to initialize bits [215:208] of the result.
4090/// \param __b04
4091/// An 8-bit integral value used to initialize bits [223:216] of the result.
4092/// \param __b03
4093/// An 8-bit integral value used to initialize bits [231:224] of the result.
4094/// \param __b02
4095/// An 8-bit integral value used to initialize bits [239:232] of the result.
4096/// \param __b01
4097/// An 8-bit integral value used to initialize bits [247:240] of the result.
4098/// \param __b00
4099/// An 8-bit integral value used to initialize bits [255:248] of the result.
4100/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004101static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004102_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004103 char __b27, char __b26, char __b25, char __b24,
4104 char __b23, char __b22, char __b21, char __b20,
4105 char __b19, char __b18, char __b17, char __b16,
4106 char __b15, char __b14, char __b13, char __b12,
4107 char __b11, char __b10, char __b09, char __b08,
4108 char __b07, char __b06, char __b05, char __b04,
4109 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004110{
Tim Shenf811de42018-05-31 01:51:07 +00004111 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4112 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4113 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4114 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004115}
4116
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004117/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004118/// with the specified 64-bit integral values.
4119///
4120/// \headerfile <x86intrin.h>
4121///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004122/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4123/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004124///
4125/// \param __a
4126/// A 64-bit integral value used to initialize bits [63:0] of the result.
4127/// \param __b
4128/// A 64-bit integral value used to initialize bits [127:64] of the result.
4129/// \param __c
4130/// A 64-bit integral value used to initialize bits [191:128] of the result.
4131/// \param __d
4132/// A 64-bit integral value used to initialize bits [255:192] of the result.
4133/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004134static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004135_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004136{
Tim Shenf811de42018-05-31 01:51:07 +00004137 return _mm256_set_epi64x(__d, __c, __b, __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004138}
4139
4140/* Create vectors with repeated elements */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004141/// Constructs a 256-bit floating-point vector of [4 x double], with each
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004142/// of the four double-precision floating-point vector elements set to the
4143/// specified double-precision floating-point value.
4144///
4145/// \headerfile <x86intrin.h>
4146///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004147/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004148///
4149/// \param __w
4150/// A double-precision floating-point value used to initialize each vector
4151/// element of the result.
4152/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004153static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004154_mm256_set1_pd(double __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004155{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004156 return _mm256_set_pd(__w, __w, __w, __w);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004157}
4158
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004159/// Constructs a 256-bit floating-point vector of [8 x float], with each
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004160/// of the eight single-precision floating-point vector elements set to the
4161/// specified single-precision floating-point value.
4162///
4163/// \headerfile <x86intrin.h>
4164///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004165/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4166/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004167///
4168/// \param __w
4169/// A single-precision floating-point value used to initialize each vector
4170/// element of the result.
4171/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004172static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004173_mm256_set1_ps(float __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004174{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004175 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004176}
4177
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004178/// Constructs a 256-bit integer vector of [8 x i32], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004179/// 32-bit integral vector elements set to the specified 32-bit integral
4180/// value.
4181///
4182/// \headerfile <x86intrin.h>
4183///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004184/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4185/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004186///
4187/// \param __i
4188/// A 32-bit integral value used to initialize each vector element of the
4189/// result.
4190/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kupersteine45af542015-06-30 13:36:19 +00004191static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004192_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004193{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004194 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004195}
4196
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004197/// Constructs a 256-bit integer vector of [16 x i16], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004198/// 16-bit integral vector elements set to the specified 16-bit integral
4199/// value.
4200///
4201/// \headerfile <x86intrin.h>
4202///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004203/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004204///
4205/// \param __w
4206/// A 16-bit integral value used to initialize each vector element of the
4207/// result.
4208/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kupersteine45af542015-06-30 13:36:19 +00004209static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004210_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004211{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004212 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4213 __w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004214}
4215
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004216/// Constructs a 256-bit integer vector of [32 x i8], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004217/// 8-bit integral vector elements set to the specified 8-bit integral value.
4218///
4219/// \headerfile <x86intrin.h>
4220///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004221/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004222///
4223/// \param __b
4224/// An 8-bit integral value used to initialize each vector element of the
4225/// result.
4226/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kupersteine45af542015-06-30 13:36:19 +00004227static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004228_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004229{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004230 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4231 __b, __b, __b, __b, __b, __b, __b, __b,
4232 __b, __b, __b, __b, __b, __b, __b, __b,
4233 __b, __b, __b, __b, __b, __b, __b, __b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004234}
4235
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004236/// Constructs a 256-bit integer vector of [4 x i64], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004237/// 64-bit integral vector elements set to the specified 64-bit integral
4238/// value.
4239///
4240/// \headerfile <x86intrin.h>
4241///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004242/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004243///
4244/// \param __q
4245/// A 64-bit integral value used to initialize each vector element of the
4246/// result.
4247/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kupersteine45af542015-06-30 13:36:19 +00004248static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004249_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004250{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004251 return _mm256_set_epi64x(__q, __q, __q, __q);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004252}
4253
/* Create zeroed vectors */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004255/// Constructs a 256-bit floating-point vector of [4 x double] with all
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004256/// vector elements initialized to zero.
4257///
4258/// \headerfile <x86intrin.h>
4259///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004260/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004261///
4262/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004263static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004264_mm256_setzero_pd(void)
4265{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004266 return __extension__ (__m256d){ 0, 0, 0, 0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004267}
4268
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004269/// Constructs a 256-bit floating-point vector of [8 x float] with all
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004270/// vector elements initialized to zero.
4271///
4272/// \headerfile <x86intrin.h>
4273///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004274/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004275///
4276/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004277static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004278_mm256_setzero_ps(void)
4279{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004280 return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004281}
4282
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004283/// Constructs a 256-bit integer vector initialized to zero.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004284///
4285/// \headerfile <x86intrin.h>
4286///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004287/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004288///
4289/// \returns A 256-bit integer vector initialized to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004290static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004291_mm256_setzero_si256(void)
4292{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004293 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004294}
4295
4296/* Cast between vector types */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004297/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004298/// floating-point vector of [8 x float].
4299///
4300/// \headerfile <x86intrin.h>
4301///
4302/// This intrinsic has no corresponding instruction.
4303///
4304/// \param __a
4305/// A 256-bit floating-point vector of [4 x double].
4306/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4307/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004308static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004309_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004310{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004311 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004312}
4313
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004314/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004315/// integer vector.
4316///
4317/// \headerfile <x86intrin.h>
4318///
4319/// This intrinsic has no corresponding instruction.
4320///
4321/// \param __a
4322/// A 256-bit floating-point vector of [4 x double].
4323/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4324/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004325static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004326_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004327{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004328 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004329}
4330
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004331/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004332/// floating-point vector of [4 x double].
4333///
4334/// \headerfile <x86intrin.h>
4335///
4336/// This intrinsic has no corresponding instruction.
4337///
4338/// \param __a
4339/// A 256-bit floating-point vector of [8 x float].
4340/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4341/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004342static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004343_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004344{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004345 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004346}
4347
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004348/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004349/// integer vector.
4350///
4351/// \headerfile <x86intrin.h>
4352///
4353/// This intrinsic has no corresponding instruction.
4354///
4355/// \param __a
4356/// A 256-bit floating-point vector of [8 x float].
4357/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4358/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004359static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004360_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004361{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004362 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004363}
4364
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004365/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004366/// of [8 x float].
4367///
4368/// \headerfile <x86intrin.h>
4369///
4370/// This intrinsic has no corresponding instruction.
4371///
4372/// \param __a
4373/// A 256-bit integer vector.
4374/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4375/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004376static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004377_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004378{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004379 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004380}
4381
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004382/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004383/// of [4 x double].
4384///
4385/// \headerfile <x86intrin.h>
4386///
4387/// This intrinsic has no corresponding instruction.
4388///
4389/// \param __a
4390/// A 256-bit integer vector.
4391/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4392/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004393static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004394_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004395{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004396 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004397}
4398
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004399/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004400/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4401///
4402/// \headerfile <x86intrin.h>
4403///
4404/// This intrinsic has no corresponding instruction.
4405///
4406/// \param __a
4407/// A 256-bit floating-point vector of [4 x double].
4408/// \returns A 128-bit floating-point vector of [2 x double] containing the
4409/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004410static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004411_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004412{
Craig Topper1aa231e2016-05-16 06:38:42 +00004413 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004414}
4415
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004416/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004417/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4418///
4419/// \headerfile <x86intrin.h>
4420///
4421/// This intrinsic has no corresponding instruction.
4422///
4423/// \param __a
4424/// A 256-bit floating-point vector of [8 x float].
4425/// \returns A 128-bit floating-point vector of [4 x float] containing the
4426/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004427static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004428_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004429{
Craig Topper1aa231e2016-05-16 06:38:42 +00004430 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004431}
4432
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004433/// Truncates a 256-bit integer vector into a 128-bit integer vector.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004434///
4435/// \headerfile <x86intrin.h>
4436///
4437/// This intrinsic has no corresponding instruction.
4438///
4439/// \param __a
4440/// A 256-bit integer vector.
4441/// \returns A 128-bit integer vector containing the lower 128 bits of the
4442/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004443static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004444_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004445{
Craig Topper1aa231e2016-05-16 06:38:42 +00004446 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004447}
4448
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004449/// Constructs a 256-bit floating-point vector of [4 x double] from a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004450/// 128-bit floating-point vector of [2 x double].
4451///
4452/// The lower 128 bits contain the value of the source vector. The contents
4453/// of the upper 128 bits are undefined.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004454///
4455/// \headerfile <x86intrin.h>
4456///
4457/// This intrinsic has no corresponding instruction.
4458///
4459/// \param __a
4460/// A 128-bit vector of [2 x double].
4461/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4462/// contain the value of the parameter. The contents of the upper 128 bits
4463/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004464static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004465_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004466{
Craig Topper1aa231e2016-05-16 06:38:42 +00004467 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004468}
4469
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004470/// Constructs a 256-bit floating-point vector of [8 x float] from a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004471/// 128-bit floating-point vector of [4 x float].
4472///
4473/// The lower 128 bits contain the value of the source vector. The contents
4474/// of the upper 128 bits are undefined.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004475///
4476/// \headerfile <x86intrin.h>
4477///
4478/// This intrinsic has no corresponding instruction.
4479///
4480/// \param __a
4481/// A 128-bit vector of [4 x float].
4482/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4483/// contain the value of the parameter. The contents of the upper 128 bits
4484/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004485static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004486_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004487{
Craig Topper1aa231e2016-05-16 06:38:42 +00004488 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004489}
4490
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004491/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004492///
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004493/// The lower 128 bits contain the value of the source vector. The contents
4494/// of the upper 128 bits are undefined.
4495///
4496/// \headerfile <x86intrin.h>
4497///
4498/// This intrinsic has no corresponding instruction.
4499///
4500/// \param __a
4501/// A 128-bit integer vector.
4502/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4503/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004504static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004505_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004506{
Craig Topper1aa231e2016-05-16 06:38:42 +00004507 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004508}
Chad Rosierf8df4f42012-03-20 16:40:00 +00004509
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004510/// Constructs a 256-bit floating-point vector of [4 x double] from a
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004511/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4512/// contain the value of the source vector. The upper 128 bits are set
4513/// to zero.
4514///
4515/// \headerfile <x86intrin.h>
4516///
4517/// This intrinsic has no corresponding instruction.
4518///
4519/// \param __a
4520/// A 128-bit vector of [2 x double].
4521/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4522/// contain the value of the parameter. The upper 128 bits are set to zero.
4523static __inline __m256d __DEFAULT_FN_ATTRS
4524_mm256_zextpd128_pd256(__m128d __a)
4525{
4526 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4527}
4528
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004529/// Constructs a 256-bit floating-point vector of [8 x float] from a
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004530/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4531/// the value of the source vector. The upper 128 bits are set to zero.
4532///
4533/// \headerfile <x86intrin.h>
4534///
4535/// This intrinsic has no corresponding instruction.
4536///
4537/// \param __a
4538/// A 128-bit vector of [4 x float].
4539/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4540/// contain the value of the parameter. The upper 128 bits are set to zero.
4541static __inline __m256 __DEFAULT_FN_ATTRS
4542_mm256_zextps128_ps256(__m128 __a)
4543{
4544 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4545}
4546
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004547/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004548/// The lower 128 bits contain the value of the source vector. The upper
4549/// 128 bits are set to zero.
4550///
4551/// \headerfile <x86intrin.h>
4552///
4553/// This intrinsic has no corresponding instruction.
4554///
4555/// \param __a
4556/// A 128-bit integer vector.
4557/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4558/// the parameter. The upper 128 bits are set to zero.
4559static __inline __m256i __DEFAULT_FN_ATTRS
4560_mm256_zextsi128_si256(__m128i __a)
4561{
4562 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4563}
4564
Sean Silvae4c37602015-09-12 02:55:19 +00004565/*
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004566 Vector insert.
4567 We use macros rather than inlines because we only want to accept
4568 invocations where the immediate M is a constant expression.
4569*/
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004570/// Constructs a new 256-bit vector of [8 x float] by first duplicating
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004571/// a 256-bit vector of [8 x float] given in the first parameter, and then
4572/// replacing either the upper or the lower 128 bits with the contents of a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004573/// 128-bit vector of [4 x float] in the second parameter.
4574///
4575/// The immediate integer parameter determines between the upper or the lower
4576/// 128 bits.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004577///
4578/// \headerfile <x86intrin.h>
4579///
4580/// \code
4581/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4582/// \endcode
4583///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004584/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004585///
4586/// \param V1
4587/// A 256-bit vector of [8 x float]. This vector is copied to the result
4588/// first, and then either the upper or the lower 128 bits of the result will
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004589/// be replaced by the contents of \a V2.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004590/// \param V2
4591/// A 128-bit vector of [4 x float]. The contents of this parameter are
4592/// written to either the upper or the lower 128 bits of the result depending
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004593/// on the value of parameter \a M.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004594/// \param M
4595/// An immediate integer. The least significant bit determines how the values
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004596/// from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004598/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4599/// result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
4601/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4602/// result.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004603/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
/* The whole expansion is parenthesized so that the leading cast cannot be
   split from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004607
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004608/// Constructs a new 256-bit vector of [4 x double] by first duplicating
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004609/// a 256-bit vector of [4 x double] given in the first parameter, and then
4610/// replacing either the upper or the lower 128 bits with the contents of a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004611/// 128-bit vector of [2 x double] in the second parameter.
4612///
4613/// The immediate integer parameter determines between the upper or the lower
4614/// 128 bits.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004615///
4616/// \headerfile <x86intrin.h>
4617///
4618/// \code
4619/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4620/// \endcode
4621///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004622/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004623///
4624/// \param V1
4625/// A 256-bit vector of [4 x double]. This vector is copied to the result
4626/// first, and then either the upper or the lower 128 bits of the result will
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004627/// be replaced by the contents of \a V2.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004628/// \param V2
4629/// A 128-bit vector of [2 x double]. The contents of this parameter are
4630/// written to either the upper or the lower 128 bits of the result depending
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004631/// on the value of parameter \a M.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004632/// \param M
4633/// An immediate integer. The least significant bit determines how the values
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004634/// from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004636/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4637/// result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
4639/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4640/// result.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004641/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
/* The whole expansion is parenthesized so that the leading cast cannot be
   split from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004645
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004646/// Constructs a new 256-bit integer vector by first duplicating a
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004647/// 256-bit integer vector given in the first parameter, and then replacing
4648/// either the upper or the lower 128 bits with the contents of a 128-bit
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004649/// integer vector in the second parameter.
4650///
4651/// The immediate integer parameter determines between the upper or the lower
4652/// 128 bits.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004653///
4654/// \headerfile <x86intrin.h>
4655///
4656/// \code
4657/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
4658/// \endcode
4659///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004660/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004661///
4662/// \param V1
4663/// A 256-bit integer vector. This vector is copied to the result first, and
4664/// then either the upper or the lower 128 bits of the result will be
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004665/// replaced by the contents of \a V2.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004666/// \param V2
4667/// A 128-bit integer vector. The contents of this parameter are written to
4668/// either the upper or the lower 128 bits of the result depending on the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004669/// value of parameter \a M.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004670/// \param M
4671/// An immediate integer. The least significant bit determines how the values
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004672/// from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004674/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4675/// result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
4677/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4678/// result.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004679/// \returns A 256-bit integer vector containing the interleaved values.
/* The whole expansion is parenthesized so that the leading cast cannot be
   split from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004683
Sean Silvae4c37602015-09-12 02:55:19 +00004684/*
Sanjay Patel0c351ab2015-03-12 15:50:36 +00004685 Vector extract.
4686 We use macros rather than inlines because we only want to accept
4687 invocations where the immediate M is a constant expression.
4688*/
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004689/// Extracts either the upper or the lower 128 bits from a 256-bit vector
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004690/// of [8 x float], as determined by the immediate integer parameter, and
4691/// returns the extracted bits as a 128-bit vector of [4 x float].
4692///
4693/// \headerfile <x86intrin.h>
4694///
4695/// \code
4696/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
4697/// \endcode
4698///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004699/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004700///
4701/// \param V
4702/// A 256-bit vector of [8 x float].
4703/// \param M
4704/// An immediate integer. The least significant bit determines which bits are
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004705/// extracted from the first parameter: \n
4706/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4707/// result. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004708/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004709/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
/* The whole expansion is parenthesized so that the leading cast cannot be
   split from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004712
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004713/// Extracts either the upper or the lower 128 bits from a 256-bit vector
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004714/// of [4 x double], as determined by the immediate integer parameter, and
4715/// returns the extracted bits as a 128-bit vector of [2 x double].
4716///
4717/// \headerfile <x86intrin.h>
4718///
4719/// \code
4720/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
4721/// \endcode
4722///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004723/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004724///
4725/// \param V
4726/// A 256-bit vector of [4 x double].
4727/// \param M
4728/// An immediate integer. The least significant bit determines which bits are
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004729/// extracted from the first parameter: \n
4730/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4731/// result. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004732/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004733/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
/* The whole expansion is parenthesized so that the leading cast cannot be
   split from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
Sanjay Patel0c351ab2015-03-12 15:50:36 +00004736
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004737/// Extracts either the upper or the lower 128 bits from a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004738/// integer vector, as determined by the immediate integer parameter, and
4739/// returns the extracted bits as a 128-bit integer vector.
4740///
4741/// \headerfile <x86intrin.h>
4742///
4743/// \code
4744/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
4745/// \endcode
4746///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004747/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004748///
4749/// \param V
4750/// A 256-bit integer vector.
4751/// \param M
4752/// An immediate integer. The least significant bit determines which bits are
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004753/// extracted from the first parameter: \n
4754/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4755/// result. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004756/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004757/// \returns A 128-bit integer vector containing the extracted bits.
/* The whole expansion is parenthesized so that the leading cast cannot be
   split from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
Sanjay Patel0c351ab2015-03-12 15:50:36 +00004760
Chad Rosierf8df4f42012-03-20 16:40:00 +00004761/* SIMD load ops (unaligned) */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004762/// Loads two 128-bit floating-point vectors of [4 x float] from
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004763/// unaligned memory locations and constructs a 256-bit floating-point vector
4764/// of [8 x float] by concatenating the two 128-bit vectors.
4765///
4766/// \headerfile <x86intrin.h>
4767///
4768/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004769/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004770///
4771/// \param __addr_hi
4772/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004773/// single-precision floating-point values. These values are to be copied to
4774/// bits[255:128] of the result. The address of the memory location does not
4775/// have to be aligned.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004776/// \param __addr_lo
4777/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004778/// single-precision floating-point values. These values are to be copied to
4779/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004780/// have to be aligned.
4781/// \returns A 256-bit floating-point vector of [8 x float] containing the
4782/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004783static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004784_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004785{
Craig Topper74b59482016-05-31 05:49:13 +00004786 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4787 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004788}
4789
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004790/// Loads two 128-bit floating-point vectors of [2 x double] from
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004791/// unaligned memory locations and constructs a 256-bit floating-point vector
4792/// of [4 x double] by concatenating the two 128-bit vectors.
4793///
4794/// \headerfile <x86intrin.h>
4795///
4796/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004797/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004798///
4799/// \param __addr_hi
4800/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004801/// double-precision floating-point values. These values are to be copied to
4802/// bits[255:128] of the result. The address of the memory location does not
4803/// have to be aligned.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004804/// \param __addr_lo
4805/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004806/// double-precision floating-point values. These values are to be copied to
4807/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004808/// have to be aligned.
4809/// \returns A 256-bit floating-point vector of [4 x double] containing the
4810/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004811static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004812_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004813{
Craig Topper74b59482016-05-31 05:49:13 +00004814 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4815 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004816}
4817
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004818/// Loads two 128-bit integer vectors from unaligned memory locations and
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004819/// constructs a 256-bit integer vector by concatenating the two 128-bit
4820/// vectors.
4821///
4822/// \headerfile <x86intrin.h>
4823///
4824/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004825/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004826///
4827/// \param __addr_hi
4828/// A pointer to a 128-bit memory location containing a 128-bit integer
4829/// vector. This vector is to be copied to bits[255:128] of the result. The
4830/// address of the memory location does not have to be aligned.
4831/// \param __addr_lo
4832/// A pointer to a 128-bit memory location containing a 128-bit integer
4833/// vector. This vector is to be copied to bits[127:0] of the result. The
4834/// address of the memory location does not have to be aligned.
4835/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004836static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004837_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004838{
Craig Topper74b59482016-05-31 05:49:13 +00004839 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4840 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004841}
4842
/* SIMD store ops (unaligned) */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004844/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004845/// vector of [8 x float] into two different unaligned memory locations.
4846///
4847/// \headerfile <x86intrin.h>
4848///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004849/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4850/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004851///
4852/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004853/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004854/// copied to this memory location. The address of this memory location does
4855/// not have to be aligned.
4856/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004857/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004858/// copied to this memory location. The address of this memory location does
4859/// not have to be aligned.
4860/// \param __a
4861/// A 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004862static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004863_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004864{
David Blaikie3302f2b2013-01-16 23:08:36 +00004865 __m128 __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004866
David Blaikie3302f2b2013-01-16 23:08:36 +00004867 __v128 = _mm256_castps256_ps128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004868 _mm_storeu_ps(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004869 __v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004870 _mm_storeu_ps(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004871}
4872
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004873/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004874/// vector of [4 x double] into two different unaligned memory locations.
4875///
4876/// \headerfile <x86intrin.h>
4877///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004878/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4879/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004880///
4881/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004882/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004883/// copied to this memory location. The address of this memory location does
4884/// not have to be aligned.
4885/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004886/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004887/// copied to this memory location. The address of this memory location does
4888/// not have to be aligned.
4889/// \param __a
4890/// A 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004891static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004892_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004893{
David Blaikie3302f2b2013-01-16 23:08:36 +00004894 __m128d __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004895
David Blaikie3302f2b2013-01-16 23:08:36 +00004896 __v128 = _mm256_castpd256_pd128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004897 _mm_storeu_pd(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004898 __v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004899 _mm_storeu_pd(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004900}
4901
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004902/// Stores the upper and lower 128 bits of a 256-bit integer vector into
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004903/// two different unaligned memory locations.
4904///
4905/// \headerfile <x86intrin.h>
4906///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004907/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4908/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004909///
4910/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004911/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004912/// copied to this memory location. The address of this memory location does
4913/// not have to be aligned.
4914/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004915/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004916/// copied to this memory location. The address of this memory location does
4917/// not have to be aligned.
4918/// \param __a
4919/// A 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004920static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004921_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004922{
David Blaikie3302f2b2013-01-16 23:08:36 +00004923 __m128i __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004924
David Blaikie3302f2b2013-01-16 23:08:36 +00004925 __v128 = _mm256_castsi256_si128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004926 _mm_storeu_si128(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004927 __v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004928 _mm_storeu_si128(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004929}
Richard Smith49e56442013-07-14 05:41:45 +00004930
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004931/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004932/// concatenating two 128-bit floating-point vectors of [4 x float].
4933///
4934/// \headerfile <x86intrin.h>
4935///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004936/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004937///
4938/// \param __hi
4939/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4940/// 128 bits of the result.
4941/// \param __lo
4942/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4943/// 128 bits of the result.
4944/// \returns A 256-bit floating-point vector of [8 x float] containing the
4945/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004946static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004947_mm256_set_m128 (__m128 __hi, __m128 __lo)
4948{
Craig Topper1aa231e2016-05-16 06:38:42 +00004949 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein76190042015-05-20 07:46:52 +00004950}
4951
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004952/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004953/// concatenating two 128-bit floating-point vectors of [2 x double].
4954///
4955/// \headerfile <x86intrin.h>
4956///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004957/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004958///
4959/// \param __hi
4960/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4961/// 128 bits of the result.
4962/// \param __lo
4963/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4964/// 128 bits of the result.
4965/// \returns A 256-bit floating-point vector of [4 x double] containing the
4966/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004967static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004968_mm256_set_m128d (__m128d __hi, __m128d __lo)
4969{
Craig Topper5cbeeed2018-07-07 17:03:32 +00004970 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
Michael Kuperstein76190042015-05-20 07:46:52 +00004971}
4972
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004973/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004974/// integer vectors.
4975///
4976/// \headerfile <x86intrin.h>
4977///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004978/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004979///
4980/// \param __hi
4981/// A 128-bit integer vector to be copied to the upper 128 bits of the
4982/// result.
4983/// \param __lo
4984/// A 128-bit integer vector to be copied to the lower 128 bits of the
4985/// result.
4986/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004987static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004988_mm256_set_m128i (__m128i __hi, __m128i __lo)
4989{
Craig Topper5cbeeed2018-07-07 17:03:32 +00004990 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
Michael Kuperstein76190042015-05-20 07:46:52 +00004991}
4992
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004993/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004994/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4995/// similar to _mm256_set_m128, but the order of the input parameters is
4996/// swapped.
4997///
4998/// \headerfile <x86intrin.h>
4999///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005000/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005001///
5002/// \param __lo
5003/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
5004/// 128 bits of the result.
5005/// \param __hi
5006/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
5007/// 128 bits of the result.
5008/// \returns A 256-bit floating-point vector of [8 x float] containing the
5009/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005010static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005011_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5012{
Michael Kuperstein76190042015-05-20 07:46:52 +00005013 return _mm256_set_m128(__hi, __lo);
5014}
5015
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00005016/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005017/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
5018/// similar to _mm256_set_m128d, but the order of the input parameters is
5019/// swapped.
5020///
5021/// \headerfile <x86intrin.h>
5022///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005023/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005024///
5025/// \param __lo
5026/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5027/// 128 bits of the result.
5028/// \param __hi
5029/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5030/// 128 bits of the result.
5031/// \returns A 256-bit floating-point vector of [4 x double] containing the
5032/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005033static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005034_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5035{
Craig Topper5cbeeed2018-07-07 17:03:32 +00005036 return (__m256d)_mm256_set_m128d(__hi, __lo);
Michael Kuperstein76190042015-05-20 07:46:52 +00005037}
5038
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00005039/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005040/// integer vectors. This is similar to _mm256_set_m128i, but the order of
5041/// the input parameters is swapped.
5042///
5043/// \headerfile <x86intrin.h>
5044///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005045/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005046///
5047/// \param __lo
5048/// A 128-bit integer vector to be copied to the lower 128 bits of the
5049/// result.
5050/// \param __hi
5051/// A 128-bit integer vector to be copied to the upper 128 bits of the
5052/// result.
5053/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005054static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005055_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5056{
Craig Topper5cbeeed2018-07-07 17:03:32 +00005057 return (__m256i)_mm256_set_m128i(__hi, __lo);
Michael Kuperstein76190042015-05-20 07:46:52 +00005058}
5059
Michael Kupersteine45af542015-06-30 13:36:19 +00005060#undef __DEFAULT_FN_ATTRS
Craig Topper74c10e32018-07-09 19:00:16 +00005061#undef __DEFAULT_FN_ATTRS128
Eric Christopher4d1851682015-06-17 07:09:20 +00005062
Richard Smith49e56442013-07-14 05:41:45 +00005063#endif /* __AVXINTRIN_H */