blob: 0b7813526eafd7a581b5215e081702f65b156195 [file] [log] [blame]
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer6f35f3c2010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000027
Richard Smith49e56442013-07-14 05:41:45 +000028#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 256-bit (32-byte) vector types, one per element type. These are
 * the types the intrinsics cast to before operating on individual lanes. */
typedef double    __v4df  __attribute__((__vector_size__(32))); /* double lanes    */
typedef float     __v8sf  __attribute__((__vector_size__(32))); /* float lanes     */
typedef long long __v4di  __attribute__((__vector_size__(32))); /* long long lanes */
typedef int       __v8si  __attribute__((__vector_size__(32))); /* int lanes       */
typedef short     __v16hi __attribute__((__vector_size__(32))); /* short lanes     */
typedef char      __v32qi __attribute__((__vector_size__(32))); /* char lanes      */
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000037
/* Unsigned-element counterparts of the internal 256-bit vector types. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));
Craig Topper6a77b622016-06-04 05:43:41 +000043
/* Explicitly signed char vector. Plain char has implementation-defined
 * signedness, so a dedicated signed variant is required. This type is for
 * internal use and should not appear in the public interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));
Chandler Carruthcbe64112015-10-01 23:40:12 +000047
/* Public 256-bit vector types used in the intrinsic signatures. */
typedef float     __m256  __attribute__((__vector_size__(32))); /* float data   */
typedef double    __m256d __attribute__((__vector_size__(32))); /* double data  */
typedef long long __m256i __attribute__((__vector_size__(32))); /* integer data */
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000051
/* Attributes applied to every intrinsic function defined in this file:
 * always inlined, no debug info, and compiled for the "avx" target feature. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher4d1851682015-06-17 07:09:20 +000054
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000055/* Arithmetic */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000056/// Adds two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000057///
58/// \headerfile <x86intrin.h>
59///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000060/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000061///
62/// \param __a
63/// A 256-bit vector of [4 x double] containing one of the source operands.
64/// \param __b
65/// A 256-bit vector of [4 x double] containing one of the source operands.
66/// \returns A 256-bit vector of [4 x double] containing the sums of both
67/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000068static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000069_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000070{
Craig Topper1aa231e2016-05-16 06:38:42 +000071 return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000072}
73
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000074/// Adds two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000075///
76/// \headerfile <x86intrin.h>
77///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000078/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000079///
80/// \param __a
81/// A 256-bit vector of [8 x float] containing one of the source operands.
82/// \param __b
83/// A 256-bit vector of [8 x float] containing one of the source operands.
84/// \returns A 256-bit vector of [8 x float] containing the sums of both
85/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000086static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000087_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000088{
Craig Topper1aa231e2016-05-16 06:38:42 +000089 return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000090}
91
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000092/// Subtracts two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000093///
94/// \headerfile <x86intrin.h>
95///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000096/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000097///
98/// \param __a
99/// A 256-bit vector of [4 x double] containing the minuend.
100/// \param __b
101/// A 256-bit vector of [4 x double] containing the subtrahend.
102/// \returns A 256-bit vector of [4 x double] containing the differences between
103/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000104static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000105_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000106{
Craig Topper1aa231e2016-05-16 06:38:42 +0000107 return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000108}
109
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000110/// Subtracts two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000111///
112/// \headerfile <x86intrin.h>
113///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000115///
116/// \param __a
117/// A 256-bit vector of [8 x float] containing the minuend.
118/// \param __b
119/// A 256-bit vector of [8 x float] containing the subtrahend.
120/// \returns A 256-bit vector of [8 x float] containing the differences between
121/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000122static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000123_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000124{
Craig Topper1aa231e2016-05-16 06:38:42 +0000125 return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000126}
127
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000128/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000129/// two 256-bit vectors of [4 x double].
130///
131/// \headerfile <x86intrin.h>
132///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000134///
135/// \param __a
136/// A 256-bit vector of [4 x double] containing the left source operand.
137/// \param __b
138/// A 256-bit vector of [4 x double] containing the right source operand.
139/// \returns A 256-bit vector of [4 x double] containing the alternating sums
140/// and differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000141static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000142_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000143{
David Blaikie3302f2b2013-01-16 23:08:36 +0000144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000145}
146
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000147/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000148/// two 256-bit vectors of [8 x float].
149///
150/// \headerfile <x86intrin.h>
151///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000153///
154/// \param __a
155/// A 256-bit vector of [8 x float] containing the left source operand.
156/// \param __b
157/// A 256-bit vector of [8 x float] containing the right source operand.
158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159/// differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000160static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000161_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000162{
David Blaikie3302f2b2013-01-16 23:08:36 +0000163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000164}
165
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000166/// Divides two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000167///
168/// \headerfile <x86intrin.h>
169///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000171///
172/// \param __a
173/// A 256-bit vector of [4 x double] containing the dividend.
174/// \param __b
175/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000176/// \returns A 256-bit vector of [4 x double] containing the quotients of both
177/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000178static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000179_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000180{
Craig Topper1aa231e2016-05-16 06:38:42 +0000181 return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000182}
183
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000184/// Divides two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000185///
186/// \headerfile <x86intrin.h>
187///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000189///
190/// \param __a
191/// A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000196static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000197_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000198{
Craig Topper1aa231e2016-05-16 06:38:42 +0000199 return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000200}
201
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000202/// Compares two 256-bit vectors of [4 x double] and returns the greater
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000203/// of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000208///
209/// \param __a
210/// A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212/// A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000215static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000216_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000217{
David Blaikie3302f2b2013-01-16 23:08:36 +0000218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000219}
220
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000221/// Compares two 256-bit vectors of [8 x float] and returns the greater
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000222/// of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000227///
228/// \param __a
229/// A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231/// A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000234static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000235_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000236{
David Blaikie3302f2b2013-01-16 23:08:36 +0000237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000238}
239
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000240/// Compares two 256-bit vectors of [4 x double] and returns the lesser
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000241/// of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000245/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000246///
247/// \param __a
248/// A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250/// A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000253static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000254_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000255{
David Blaikie3302f2b2013-01-16 23:08:36 +0000256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000257}
258
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000259/// Compares two 256-bit vectors of [8 x float] and returns the lesser
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000260/// of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000264/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000265///
266/// \param __a
267/// A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269/// A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000272static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000273_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000274{
David Blaikie3302f2b2013-01-16 23:08:36 +0000275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000276}
277
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000278/// Multiplies two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000279///
280/// \headerfile <x86intrin.h>
281///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000282/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000283///
284/// \param __a
285/// A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000288/// \returns A 256-bit vector of [4 x double] containing the products of both
289/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000290static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000291_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000292{
Craig Topper1aa231e2016-05-16 06:38:42 +0000293 return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000294}
295
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000296/// Multiplies two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000297///
298/// \headerfile <x86intrin.h>
299///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000300/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000301///
302/// \param __a
303/// A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000306/// \returns A 256-bit vector of [8 x float] containing the products of both
307/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000308static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000309_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000310{
Craig Topper1aa231e2016-05-16 06:38:42 +0000311 return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000312}
313
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000314/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000315/// [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000316///
317/// \headerfile <x86intrin.h>
318///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000320///
321/// \param __a
322/// A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000325static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000326_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000327{
David Blaikie3302f2b2013-01-16 23:08:36 +0000328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000329}
330
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000331/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000332/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000333///
334/// \headerfile <x86intrin.h>
335///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000337///
338/// \param __a
339/// A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000342static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000343_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000344{
David Blaikie3302f2b2013-01-16 23:08:36 +0000345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000346}
347
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000348/// Calculates the reciprocal square roots of the values in a 256-bit
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000349/// vector of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000350///
351/// \headerfile <x86intrin.h>
352///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000354///
355/// \param __a
356/// A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358/// roots of the values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000359static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000360_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000361{
David Blaikie3302f2b2013-01-16 23:08:36 +0000362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000363}
364
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000365/// Calculates the reciprocals of the values in a 256-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000366/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000367///
368/// \headerfile <x86intrin.h>
369///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000371///
372/// \param __a
373/// A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000376static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000377_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000378{
David Blaikie3302f2b2013-01-16 23:08:36 +0000379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000380}
381
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* The entire expansion is parenthesized so the result is a single primary
 * expression: without the outer parentheses, a trailing postfix operator
 * (e.g. a subscript) or a leading sizeof would bind to only part of the
 * cast expression. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000413
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* The entire expansion is parenthesized so the result is a single primary
 * expression: without the outer parentheses, a trailing postfix operator
 * (e.g. a subscript) or a leading sizeof would bind to only part of the
 * cast expression. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000445
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Forwards to _mm256_round_pd with _MM_FROUND_CEIL. The expansion is wrapped
 * in parentheses so it is a single primary expression no matter how the
 * underlying rounding macro expands. */
#define _mm256_ceil_pd(V) (_mm256_round_pd((V), _MM_FROUND_CEIL))
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000462
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/* Forwards to _mm256_round_pd with _MM_FROUND_FLOOR. The expansion is wrapped
 * in parentheses so it is a single primary expression no matter how the
 * underlying rounding macro expands. */
#define _mm256_floor_pd(V) (_mm256_round_pd((V), _MM_FROUND_FLOOR))
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000480
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Forwards to _mm256_round_ps with _MM_FROUND_CEIL. The expansion is wrapped
 * in parentheses so it is a single primary expression no matter how the
 * underlying rounding macro expands. */
#define _mm256_ceil_ps(V) (_mm256_round_ps((V), _MM_FROUND_CEIL))
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000497
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Forwards to _mm256_round_ps with _MM_FROUND_FLOOR. The expansion is wrapped
 * in parentheses so it is a single primary expression no matter how the
 * underlying rounding macro expands. */
#define _mm256_floor_ps(V) (_mm256_round_ps((V), _MM_FROUND_FLOOR))
514
515/* Logical */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000516/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000517///
518/// \headerfile <x86intrin.h>
519///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000520/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000521///
522/// \param __a
523/// A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525/// A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000528static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000529_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000530{
Craig Topper6a77b622016-06-04 05:43:41 +0000531 return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000532}
533
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000534/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000535///
536/// \headerfile <x86intrin.h>
537///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000538/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000539///
540/// \param __a
541/// A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543/// A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000546static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000547_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000548{
Craig Topper6a77b622016-06-04 05:43:41 +0000549 return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000550}
551
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000552/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000553/// the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000558///
559/// \param __a
560/// A 256-bit vector of [4 x double] containing the left source operand. The
561/// one's complement of this value is used in the bitwise AND.
562/// \param __b
563/// A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565/// values of the second operand and the one's complement of the first
566/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000567static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000568_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000569{
Craig Topper6a77b622016-06-04 05:43:41 +0000570 return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000571}
572
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000573/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000574/// the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000579///
580/// \param __a
581/// A 256-bit vector of [8 x float] containing the left source operand. The
582/// one's complement of this value is used in the bitwise AND.
583/// \param __b
584/// A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586/// values of the second operand and the one's complement of the first
587/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000588static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000589_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000590{
Craig Topper6a77b622016-06-04 05:43:41 +0000591 return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000592}
593
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000594/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000595///
596/// \headerfile <x86intrin.h>
597///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000598/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000599///
600/// \param __a
601/// A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603/// A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000606static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000607_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000608{
Craig Topper6a77b622016-06-04 05:43:41 +0000609 return (__m256d)((__v4du)__a | (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000610}
611
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000612/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000613///
614/// \headerfile <x86intrin.h>
615///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000616/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000617///
618/// \param __a
619/// A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621/// A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000624static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000625_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000626{
Craig Topper6a77b622016-06-04 05:43:41 +0000627 return (__m256)((__v8su)__a | (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000628}
629
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000630/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000631///
632/// \headerfile <x86intrin.h>
633///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000634/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000635///
636/// \param __a
637/// A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639/// A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000642static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000643_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000644{
Craig Topper6a77b622016-06-04 05:43:41 +0000645 return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000646}
647
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000648/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000649///
650/// \headerfile <x86intrin.h>
651///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000652/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000653///
654/// \param __a
655/// A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657/// A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000660static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000661_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000662{
Craig Topper6a77b622016-06-04 05:43:41 +0000663 return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000664}
665
666/* Horizontal arithmetic */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000667/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000668/// 256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000673///
674/// \param __a
675/// A 256-bit vector of [4 x double] containing one of the source operands.
676/// The horizontal sums of the values are returned in the even-indexed
677/// elements of a vector of [4 x double].
678/// \param __b
679/// A 256-bit vector of [4 x double] containing one of the source operands.
680/// The horizontal sums of the values are returned in the odd-indexed
681/// elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000684static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000685_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000686{
David Blaikie3302f2b2013-01-16 23:08:36 +0000687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000688}
689
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000690/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000691/// 256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000696///
697/// \param __a
698/// A 256-bit vector of [8 x float] containing one of the source operands.
699/// The horizontal sums of the values are returned in the elements with
700/// index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702/// A 256-bit vector of [8 x float] containing one of the source operands.
703/// The horizontal sums of the values are returned in the elements with
704/// index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000707static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000708_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000709{
David Blaikie3302f2b2013-01-16 23:08:36 +0000710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000711}
712
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000713/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000714/// 256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000719///
720/// \param __a
721/// A 256-bit vector of [4 x double] containing one of the source operands.
722/// The horizontal differences between the values are returned in the
723/// even-indexed elements of a vector of [4 x double].
724/// \param __b
725/// A 256-bit vector of [4 x double] containing one of the source operands.
726/// The horizontal differences between the values are returned in the
727/// odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000730static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000731_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000732{
David Blaikie3302f2b2013-01-16 23:08:36 +0000733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000734}
735
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000736/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000737/// 256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000742///
743/// \param __a
744/// A 256-bit vector of [8 x float] containing one of the source operands.
745/// The horizontal differences between the values are returned in the
746/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748/// A 256-bit vector of [8 x float] containing one of the source operands.
749/// The horizontal differences between the values are returned in the
750/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000753static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000754_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000755{
David Blaikie3302f2b2013-01-16 23:08:36 +0000756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000757}
758
759/* Vector permutations */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000760/// Copies the values in a 128-bit vector of [2 x double] as specified
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000761/// by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000762///
763/// \headerfile <x86intrin.h>
764///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000766///
767/// \param __a
768/// A 128-bit vector of [2 x double].
769/// \param __c
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000770/// A 128-bit integer vector operand specifying how the values are to be
771/// copied. \n
772/// Bit [1]: \n
773/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774/// vector. \n
775/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
776/// returned vector. \n
777/// Bit [65]: \n
778/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
779/// returned vector. \n
780/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
781/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000782/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000783static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000784_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000785{
David Blaikie3302f2b2013-01-16 23:08:36 +0000786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000787}
788
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000789/// Copies the values in a 256-bit vector of [4 x double] as specified
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000790/// by the 256-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000791///
792/// \headerfile <x86intrin.h>
793///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000795///
796/// \param __a
797/// A 256-bit vector of [4 x double].
798/// \param __c
799/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000800/// copied. \n
801/// Bit [1]: \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000802/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803/// vector. \n
804/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
805/// returned vector. \n
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000806/// Bit [65]: \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000807/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
808/// returned vector. \n
809/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
810/// returned vector. \n
811/// Bit [129]: \n
812/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
813/// returned vector. \n
814/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
815/// returned vector. \n
816/// Bit [193]: \n
817/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
818/// returned vector. \n
819/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000820/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000821/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000822static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000823_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000824{
David Blaikie3302f2b2013-01-16 23:08:36 +0000825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000826}
827
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000828/// Copies the values stored in a 128-bit vector of [4 x float] as
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000829/// specified by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000830/// \headerfile <x86intrin.h>
831///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000833///
834/// \param __a
835/// A 128-bit vector of [4 x float].
836/// \param __c
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000837/// A 128-bit integer vector operand specifying how the values are to be
838/// copied. \n
839/// Bits [1:0]: \n
840/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
841/// returned vector. \n
842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
843/// returned vector. \n
844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
845/// returned vector. \n
846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
847/// returned vector. \n
848/// Bits [33:32]: \n
849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
850/// returned vector. \n
851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
852/// returned vector. \n
853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
854/// returned vector. \n
855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
856/// returned vector. \n
857/// Bits [65:64]: \n
858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
859/// returned vector. \n
860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
861/// returned vector. \n
862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
863/// returned vector. \n
864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
865/// returned vector. \n
866/// Bits [97:96]: \n
867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
868/// returned vector. \n
869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
870/// returned vector. \n
871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
872/// returned vector. \n
873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
874/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000875/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000876static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000877_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000878{
David Blaikie3302f2b2013-01-16 23:08:36 +0000879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000880}
881
Adrian Prantl9fc8faf2018-05-09 01:00:01 +0000882/// Copies the values stored in a 256-bit vector of [8 x float] as
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000883/// specified by the 256-bit integer vector operand.
884///
885/// \headerfile <x86intrin.h>
886///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000888///
889/// \param __a
890/// A 256-bit vector of [8 x float].
891/// \param __c
892/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000893/// copied. \n
894/// Bits [1:0]: \n
895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
896/// returned vector. \n
897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
898/// returned vector. \n
899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
900/// returned vector. \n
901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
902/// returned vector. \n
903/// Bits [33:32]: \n
904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
905/// returned vector. \n
906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
907/// returned vector. \n
908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
909/// returned vector. \n
910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
911/// returned vector. \n
912/// Bits [65:64]: \n
913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
914/// returned vector. \n
915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
916/// returned vector. \n
917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
918/// returned vector. \n
919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
920/// returned vector. \n
921/// Bits [97:96]: \n
922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
923/// returned vector. \n
924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
925/// returned vector. \n
926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
927/// returned vector. \n
928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
929/// returned vector. \n
930/// Bits [129:128]: \n
931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
932/// returned vector. \n
933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
934/// returned vector. \n
935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
936/// returned vector. \n
937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
938/// returned vector. \n
939/// Bits [161:160]: \n
940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
941/// returned vector. \n
942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
943/// returned vector. \n
944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
945/// returned vector. \n
946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
947/// returned vector. \n
948/// Bits [193:192]: \n
949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
950/// returned vector. \n
951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
952/// returned vector. \n
953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
954/// returned vector. \n
955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
956/// returned vector. \n
957/// Bits [225:224]: \n
958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
959/// returned vector. \n
960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
961/// returned vector. \n
962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
963/// returned vector. \n
964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
965/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000966/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000967static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000968_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000969{
Craig Topper9fee8ab2015-01-31 06:33:59 +0000970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000971}
972
/// Selects elements of a 128-bit vector of [2 x double] under control of
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double] supplying the source elements.
/// \param C
///    An immediate integer operand whose control bits choose the value
///    copied into each element of the returned vector. \n
///    Bit [0] controls bits [63:0] of the returned vector. \n
///    Bit [1] controls bits [127:64] of the returned vector. \n
///    Each control bit is interpreted as follows: \n
///    0: Bits [63:0] of the source are copied. \n
///    1: Bits [127:64] of the source are copied.
/// \returns A 128-bit vector of [2 x double] holding the selected values.
/* The expansion is wrapped in parentheses so that it stays a single
   expression when the macro call is followed by a postfix operator. */
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001002
/// Selects elements of a 256-bit vector of [4 x double] under control of
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double] supplying the source elements.
/// \param C
///    An immediate integer operand whose control bits choose the value
///    copied into each element of the returned vector. \n
///    Bit [0] controls bits [63:0] of the returned vector. \n
///    Bit [1] controls bits [127:64] of the returned vector. \n
///    For both of these control bits: \n
///    0: Bits [63:0] of the source are copied. \n
///    1: Bits [127:64] of the source are copied. \n
///    Bit [2] controls bits [191:128] of the returned vector. \n
///    Bit [3] controls bits [255:192] of the returned vector. \n
///    For both of these control bits: \n
///    0: Bits [191:128] of the source are copied. \n
///    1: Bits [255:192] of the source are copied.
/// \returns A 256-bit vector of [4 x double] holding the selected values.
/* The expansion is wrapped in parentheses so that it stays a single
   expression when the macro call is followed by a postfix operator. */
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001042
/// Selects elements of a 128-bit vector of [4 x float] under control of
///    an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float] supplying the source elements.
/// \param C
///    An immediate integer operand whose two-bit control fields choose the
///    value copied into each element of the returned vector. \n
///    Bits [1:0] control bits [31:0] of the returned vector. \n
///    Bits [3:2] control bits [63:32] of the returned vector. \n
///    Bits [5:4] control bits [95:64] of the returned vector. \n
///    Bits [7:6] control bits [127:96] of the returned vector. \n
///    Each control field is interpreted as follows: \n
///    00: Bits [31:0] of the source are copied. \n
///    01: Bits [63:32] of the source are copied. \n
///    10: Bits [95:64] of the source are copied. \n
///    11: Bits [127:96] of the source are copied.
/// \returns A 128-bit vector of [4 x float] holding the selected values.
/* The expansion is wrapped in parentheses so that it stays a single
   expression when the macro call is followed by a postfix operator. */
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001098
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001099/// Copies the values in a 256-bit vector of [8 x float] as specified by
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001100/// the immediate integer operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +00001101///
1102/// \headerfile <x86intrin.h>
1103///
1104/// \code
1105/// __m256 _mm256_permute_ps(__m256 A, const int C);
1106/// \endcode
1107///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001108/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +00001109///
1110/// \param A
1111/// A 256-bit vector of [8 x float].
1112/// \param C
Douglas Yung7ff91422018-01-08 21:21:17 +00001113/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001114/// copied. \n
1115/// Bits [1:0]: \n
1116/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1117/// returned vector. \n
1118/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1119/// returned vector. \n
1120/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1121/// returned vector. \n
1122/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1123/// returned vector. \n
1124/// Bits [3:2]: \n
1125/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1126/// returned vector. \n
1127/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1128/// returned vector. \n
1129/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1130/// returned vector. \n
1131/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1132/// returned vector. \n
1133/// Bits [5:4]: \n
1134/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1135/// returned vector. \n
1136/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1137/// returned vector. \n
1138/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1139/// returned vector. \n
1140/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1141/// returned vector. \n
1142/// Bits [7:6]: \n
Douglas Yung7ff91422018-01-08 21:21:17 +00001143/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001144/// returned vector. \n
1145/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1146/// returned vector. \n
1147/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1148/// returned vector. \n
1149/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1150/// returned vector. \n
1151/// Bits [1:0]: \n
1152/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1153/// returned vector. \n
1154/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1155/// returned vector. \n
1156/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1157/// returned vector. \n
1158/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1159/// returned vector. \n
1160/// Bits [3:2]: \n
1161/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1162/// returned vector. \n
1163/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1164/// returned vector. \n
1165/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1166/// returned vector. \n
1167/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1168/// returned vector. \n
1169/// Bits [5:4]: \n
1170/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1171/// returned vector. \n
1172/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1173/// returned vector. \n
1174/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1175/// returned vector. \n
1176/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1177/// returned vector. \n
1178/// Bits [7:6]: \n
1179/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1180/// returned vector. \n
1181/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1182/// returned vector. \n
1183/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1184/// returned vector. \n
1185/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1186/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +00001187/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Outer parentheses keep the (__m256) cast attached to the builtin call when
   the macro is expanded next to a postfix or binary operator. */
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001190
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001191/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001192/// [4 x double], as specified by the immediate integer operand.
1193///
1194/// \headerfile <x86intrin.h>
1195///
1196/// \code
1197/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1198/// \endcode
1199///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001200/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001201///
1202/// \param V1
1203/// A 256-bit vector of [4 x double].
1204/// \param V2
///    A 256-bit vector of [4 x double].
1206/// \param M
1207/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001208/// permuted. \n
1209/// Bits [1:0]: \n
1210/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1211/// destination. \n
1212/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1213/// destination. \n
1214/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1215/// destination. \n
1216/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1217/// destination. \n
1218/// Bits [5:4]: \n
1219/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1220/// destination. \n
1221/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1222/// destination. \n
1223/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1224/// destination. \n
1225/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1226/// destination.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001227/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The whole expansion is parenthesized so the (__m256d) cast always applies
   to the builtin's result, whatever expression surrounds the macro. */
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001231
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001232/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001233/// [8 x float], as specified by the immediate integer operand.
1234///
1235/// \headerfile <x86intrin.h>
1236///
1237/// \code
1238/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1239/// \endcode
1240///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001241/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001242///
1243/// \param V1
1244/// A 256-bit vector of [8 x float].
1245/// \param V2
1246/// A 256-bit vector of [8 x float].
1247/// \param M
1248/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001249/// permuted. \n
1250/// Bits [1:0]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001251/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001252/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001253/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001254/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001255/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001256/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001257/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001258/// destination. \n
1259/// Bits [5:4]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001260/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001261/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001262/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001263/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001264/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001265/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001266/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001267/// destination.
1268/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the (__m256) cast always applies
   to the builtin's result, whatever expression surrounds the macro. */
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001272
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001273/// Permutes 128-bit data values stored in two 256-bit integer vectors,
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001274/// as specified by the immediate integer operand.
1275///
1276/// \headerfile <x86intrin.h>
1277///
1278/// \code
1279/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1280/// \endcode
1281///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001282/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001283///
1284/// \param V1
1285/// A 256-bit integer vector.
1286/// \param V2
1287/// A 256-bit integer vector.
1288/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001290/// Bits [1:0]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001291/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001292/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001293/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001294/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001295/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001296/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001297/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001298/// destination. \n
1299/// Bits [5:4]: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001300/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001301/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001302/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001303/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001304/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001305/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001306/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001307/// destination.
1308/// \returns A 256-bit integer vector containing the copied values.
/* The whole expansion is parenthesized so the (__m256i) cast always applies
   to the builtin's result, whatever expression surrounds the macro. */
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001312
1313/* Vector Blend */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001314/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001315/// two 256-bit vectors of [4 x double], as specified by the immediate
1316/// integer operand.
1317///
1318/// \headerfile <x86intrin.h>
1319///
1320/// \code
1321/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1322/// \endcode
1323///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001324/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001325///
1326/// \param V1
1327/// A 256-bit vector of [4 x double].
1328/// \param V2
1329/// A 256-bit vector of [4 x double].
1330/// \param M
1331/// An immediate integer operand, with mask bits [3:0] specifying how the
1332/// values are to be copied. The position of the mask bit corresponds to the
1333/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001334/// element in operand \a V1 is copied to the same position in the
1335/// destination. When a mask bit is 1, the corresponding 64-bit element in
1336/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001337/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Parenthesized as a unit so the (__m256d) result cast cannot be separated
   from the builtin call by operators at the point of use. */
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001341
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001342/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001343/// two 256-bit vectors of [8 x float], as specified by the immediate
1344/// integer operand.
1345///
1346/// \headerfile <x86intrin.h>
1347///
1348/// \code
1349/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1350/// \endcode
1351///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001352/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001353///
1354/// \param V1
1355/// A 256-bit vector of [8 x float].
1356/// \param V2
1357/// A 256-bit vector of [8 x float].
1358/// \param M
1359/// An immediate integer operand, with mask bits [7:0] specifying how the
1360/// values are to be copied. The position of the mask bit corresponds to the
1361/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001362/// element in operand \a V1 is copied to the same position in the
1363/// destination. When a mask bit is 1, the corresponding 32-bit element in
1364/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001365/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Parenthesized as a unit so the (__m256) result cast cannot be separated
   from the builtin call by operators at the point of use. */
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001369
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001370/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001371/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1372/// operand.
1373///
1374/// \headerfile <x86intrin.h>
1375///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001376/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001377///
1378/// \param __a
1379/// A 256-bit vector of [4 x double].
1380/// \param __b
1381/// A 256-bit vector of [4 x double].
1382/// \param __c
1383/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1384/// how the values are to be copied. The position of the mask bit corresponds
1385/// to the most significant bit of a copied value. When a mask bit is 0, the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001386/// corresponding 64-bit element in operand \a __a is copied to the same
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001387/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001388/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001389/// destination.
1390/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001391static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001392_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001393{
David Blaikie3302f2b2013-01-16 23:08:36 +00001394 return (__m256d)__builtin_ia32_blendvpd256(
1395 (__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001396}
1397
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001398/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001399/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1400/// operand.
1401///
1402/// \headerfile <x86intrin.h>
1403///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001404/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001405///
1406/// \param __a
1407/// A 256-bit vector of [8 x float].
1408/// \param __b
1409/// A 256-bit vector of [8 x float].
1410/// \param __c
1411/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1412/// and 31 specifying how the values are to be copied. The position of the
1413/// mask bit corresponds to the most significant bit of a copied value. When
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001414/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001415/// copied to the same position in the destination. When a mask bit is 1, the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001416/// corresponding 32-bit element in operand \a __b is copied to the same
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001417/// position in the destination.
1418/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001419static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001420_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001421{
David Blaikie5bb70032013-01-16 23:13:42 +00001422 return (__m256)__builtin_ia32_blendvps256(
David Blaikie3302f2b2013-01-16 23:08:36 +00001423 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001424}
1425
1426/* Vector Dot Product */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001427/// Computes two dot products in parallel, using the lower and upper
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001428/// halves of two [8 x float] vectors as input to the two computations, and
1429/// returning the two dot products in the lower and upper halves of the
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00001430/// [8 x float] result.
1431///
1432/// The immediate integer operand controls which input elements will
1433/// contribute to the dot product, and where the final results are returned.
1434/// In general, for each dot product, the four corresponding elements of the
1435/// input vectors are multiplied; the first two and second two products are
1436/// summed, then the two sums are added to form the final result.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001437///
1438/// \headerfile <x86intrin.h>
1439///
1440/// \code
1441/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1442/// \endcode
1443///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001444/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001445///
1446/// \param V1
1447/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1448/// \param V2
1449/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1450/// \param M
1451/// An immediate integer argument. Bits [7:4] determine which elements of
1452/// the input vectors are used, with bit [4] corresponding to the lowest
1453/// element and bit [7] corresponding to the highest element of each [4 x
1454/// float] subvector. If a bit is set, the corresponding elements from the
1455/// two input vectors are used as an input for dot product; otherwise that
1456/// input is treated as zero. Bits [3:0] determine which elements of the
1457/// result will receive a copy of the final dot product, with bit [0]
1458/// corresponding to the lowest element and bit [3] corresponding to the
1459/// highest element of each [4 x float] subvector. If a bit is set, the dot
1460/// product is returned in the corresponding element; otherwise that element
1461/// is set to zero. The bitmask is applied in the same way to each of the
1462/// two parallel dot product computations.
1463/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Parenthesized as a unit so the (__m256) result cast cannot be separated
   from the builtin call by operators at the point of use. */
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001467
1468/* Vector shuffle */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001469/// Selects 8 float values from the 256-bit operands of [8 x float], as
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00001470/// specified by the immediate value operand.
1471///
1472/// The four selected elements in each operand are copied to the destination
1473/// according to the bits specified in the immediate operand. The selected
1474/// elements from the first 256-bit operand are copied to bits [63:0] and
1475/// bits [191:128] of the destination, and the selected elements from the
1476/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1477/// the destination. For example, if bits [7:0] of the immediate operand
1478/// contain a value of 0xFF, the 256-bit destination vector would contain the
1479/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001480///
1481/// \headerfile <x86intrin.h>
1482///
1483/// \code
1484/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1485/// \endcode
1486///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001487/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001488///
1489/// \param a
1490/// A 256-bit vector of [8 x float]. The four selected elements in this
1491/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1492/// according to the bits specified in the immediate operand.
1493/// \param b
1494/// A 256-bit vector of [8 x float]. The four selected elements in this
1495/// operand are copied to bits [127:64] and bits [255:192] in the
1496/// destination, according to the bits specified in the immediate operand.
1497/// \param mask
1498/// An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
1500/// Bits [3:0] specify the values copied from operand \a a. \n
1501/// Bits [7:4] specify the values copied from operand \a b. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001502/// The destinations within the 256-bit destination are assigned values as
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001503/// follows, according to the bit value assignments described below: \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001504/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001505/// destination. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001506/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001507/// destination. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001508/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001509/// destination. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001510/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001511/// the destination. \n
1512/// Bit value assignments: \n
1513/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1514/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1515/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001516/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
1517/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Craig Topperc6338672018-05-31 00:51:20 +00001518#define _mm256_shuffle_ps(a, b, mask) \
Craig Topper2a383c92016-07-04 22:18:01 +00001519 (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
1520 (__v8sf)(__m256)(b), \
1521 0 + (((mask) >> 0) & 0x3), \
1522 0 + (((mask) >> 2) & 0x3), \
1523 8 + (((mask) >> 4) & 0x3), \
1524 8 + (((mask) >> 6) & 0x3), \
1525 4 + (((mask) >> 0) & 0x3), \
1526 4 + (((mask) >> 2) & 0x3), \
1527 12 + (((mask) >> 4) & 0x3), \
Craig Topperc6338672018-05-31 00:51:20 +00001528 12 + (((mask) >> 6) & 0x3))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001529
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001530/// Selects four double-precision values from the 256-bit operands of
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00001531/// [4 x double], as specified by the immediate value operand.
1532///
1533/// The selected elements from the first 256-bit operand are copied to bits
1534/// [63:0] and bits [191:128] in the destination, and the selected elements
1535/// from the second 256-bit operand are copied to bits [127:64] and bits
1536/// [255:192] in the destination. For example, if bits [3:0] of the immediate
1537/// operand contain a value of 0xF, the 256-bit destination vector would
1538/// contain the following values: b[3], a[3], b[1], a[1].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001539///
1540/// \headerfile <x86intrin.h>
1541///
1542/// \code
1543/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1544/// \endcode
1545///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001546/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001547///
1548/// \param a
1549/// A 256-bit vector of [4 x double].
1550/// \param b
1551/// A 256-bit vector of [4 x double].
1552/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy
///    from \a a and \a b: \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001555/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001556/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001557/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001558/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001559/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001560/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001561/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001562/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001563/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001564/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001565/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001566/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001567/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00001568/// destination. \n
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001569/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001570/// destination.
1571/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Craig Topperc6338672018-05-31 00:51:20 +00001572#define _mm256_shuffle_pd(a, b, mask) \
Craig Topper2a383c92016-07-04 22:18:01 +00001573 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
1574 (__v4df)(__m256d)(b), \
1575 0 + (((mask) >> 0) & 0x1), \
1576 4 + (((mask) >> 1) & 0x1), \
1577 2 + (((mask) >> 2) & 0x1), \
Craig Topperc6338672018-05-31 00:51:20 +00001578 6 + (((mask) >> 3) & 0x1))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001579
/* Compare */
/*
 * Comparison predicates, encoded in 5 bits (0x00-0x1f).
 *
 * Name suffixes: O = ordered, U = unordered, S = signaling,
 * Q = non-signaling. Values 0x10-0x1f repeat the predicates of 0x00-0x0f
 * with the opposite signaling behavior (e.g. _CMP_EQ_OQ / _CMP_EQ_OS).
 */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1613
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling) \n
///    The \c _CMP_* macros in this header (for example, \c _CMP_LE_OS for
///    0x02) provide names for these values.
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is followed by a postfix operator such as []. */
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001673
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling) \n
///    The \c _CMP_* macros in this header (for example, \c _CMP_LE_OS for
///    0x02) provide names for these values.
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is followed by a postfix operator such as []. */
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001733
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling) \n
///    The \c _CMP_* macros in this header (for example, \c _CMP_LE_OS for
///    0x02) provide names for these values.
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is followed by a postfix operator such as []. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001793
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling) \n
///    The \c _CMP_* macros in this header (for example, \c _CMP_LE_OS for
///    0x02) provide names for these values.
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is followed by a postfix operator such as []. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001853
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling) \n
///    The \c _CMP_* macros in this header (for example, \c _CMP_LE_OS for
///    0x02) provide names for these values.
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is followed by a postfix operator such as []. */
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001912
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling) \n
///    The \c _CMP_* macros in this header (for example, \c _CMP_LE_OS for
///    0x02) provide names for these values.
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is followed by a postfix operator such as []. */
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001971
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is used inside a larger expression. */
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001989
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
/* The element is converted to unsigned short before widening to int, which
   zero-extends it. Outer parentheses keep the casts bound to the builtin
   call when the macro is used inside a larger expression. */
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002008
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
/* The element is converted to unsigned char before widening to int, which
   zero-extends it. Outer parentheses keep the casts bound to the builtin
   call when the macro is used inside a larger expression. */
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002027
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
///    Only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is used inside a larger expression. */
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2047
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is used inside a larger expression. */
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002069
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002070
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is used inside a larger expression. */
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002092
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is used inside a larger expression. */
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002114
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
///    Only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
/* Outer parentheses keep the result cast bound to the builtin call even when
   the macro is used inside a larger expression. */
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2138
2139/* Conversion */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002140/// Converts a vector of [4 x i32] into a vector of [4 x double].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002141///
2142/// \headerfile <x86intrin.h>
2143///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002144/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002145///
2146/// \param __a
2147/// A 128-bit integer vector of [4 x i32].
2148/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002149static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002150_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002151{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002152 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002153}
2154
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002155/// Converts a vector of [8 x i32] into a vector of [8 x float].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002156///
2157/// \headerfile <x86intrin.h>
2158///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002159/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002160///
2161/// \param __a
2162/// A 256-bit integer vector.
2163/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002164static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002165_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002166{
Craig Topper842171d2018-05-21 20:19:17 +00002167 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002168}
2169
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002170/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002171/// [4 x float].
2172///
2173/// \headerfile <x86intrin.h>
2174///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002175/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002176///
2177/// \param __a
2178/// A 256-bit vector of [4 x double].
2179/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002180static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002181_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002182{
David Blaikie3302f2b2013-01-16 23:08:36 +00002183 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002184}
2185
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002186/// Converts a vector of [8 x float] into a vector of [8 x i32].
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002187///
2188/// \headerfile <x86intrin.h>
2189///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002190/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002191///
2192/// \param __a
2193/// A 256-bit vector of [8 x float].
2194/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002195static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002196_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002197{
David Blaikie3302f2b2013-01-16 23:08:36 +00002198 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002199}
2200
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002201/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002202/// x double].
2203///
2204/// \headerfile <x86intrin.h>
2205///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002206/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002207///
2208/// \param __a
2209/// A 128-bit vector of [4 x float].
2210/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002211static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002212_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002213{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002214 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002215}
2216
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002217/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002218/// x i32], truncating the result by rounding towards zero when it is
2219/// inexact.
2220///
2221/// \headerfile <x86intrin.h>
2222///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002223/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002224///
2225/// \param __a
2226/// A 256-bit vector of [4 x double].
2227/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002228static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002229_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002230{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002231 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002232}
2233
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002234/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002235/// x i32]. When a conversion is inexact, the value returned is rounded
2236/// according to the rounding control bits in the MXCSR register.
2237///
2238/// \headerfile <x86intrin.h>
2239///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002240/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002241///
2242/// \param __a
2243/// A 256-bit vector of [4 x double].
2244/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002245static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002246_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002247{
David Blaikie3302f2b2013-01-16 23:08:36 +00002248 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002249}
2250
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002251/// Converts a vector of [8 x float] into a vector of [8 x i32],
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002252/// truncating the result by rounding towards zero when it is inexact.
2253///
2254/// \headerfile <x86intrin.h>
2255///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002256/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002257///
2258/// \param __a
2259/// A 256-bit vector of [8 x float].
2260/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002261static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002262_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002263{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002264 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002265}
2266
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002267/// Returns the first element of the input vector of [4 x double].
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002268///
2269/// \headerfile <avxintrin.h>
2270///
2271/// This intrinsic is a utility function and does not correspond to a specific
2272/// instruction.
2273///
2274/// \param __a
2275/// A 256-bit vector of [4 x double].
2276/// \returns A 64 bit double containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002277static __inline double __DEFAULT_FN_ATTRS
2278_mm256_cvtsd_f64(__m256d __a)
2279{
2280 return __a[0];
2281}
2282
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002283/// Returns the first element of the input vector of [8 x i32].
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002284///
2285/// \headerfile <avxintrin.h>
2286///
2287/// This intrinsic is a utility function and does not correspond to a specific
2288/// instruction.
2289///
2290/// \param __a
2291/// A 256-bit vector of [8 x i32].
2292/// \returns A 32 bit integer containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002293static __inline int __DEFAULT_FN_ATTRS
2294_mm256_cvtsi256_si32(__m256i __a)
2295{
2296 __v8si __b = (__v8si)__a;
2297 return __b[0];
2298}
2299
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002300/// Returns the first element of the input vector of [8 x float].
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002301///
2302/// \headerfile <avxintrin.h>
2303///
2304/// This intrinsic is a utility function and does not correspond to a specific
2305/// instruction.
2306///
2307/// \param __a
2308/// A 256-bit vector of [8 x float].
2309/// \returns A 32 bit float containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002310static __inline float __DEFAULT_FN_ATTRS
2311_mm256_cvtss_f32(__m256 __a)
2312{
2313 return __a[0];
2314}
2315
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002316/* Vector replicate */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002317/// Moves and duplicates odd-indexed values from a 256-bit vector of
Douglas Yung7ff91422018-01-08 21:21:17 +00002318/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002319///
2320/// \headerfile <x86intrin.h>
2321///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002322/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002323///
2324/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002325/// A 256-bit vector of [8 x float]. \n
2326/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2327/// the return value. \n
2328/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2329/// the return value. \n
2330/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2331/// return value. \n
2332/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2333/// return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002334/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2335/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002336static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002337_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002338{
Craig Topper1aa231e2016-05-16 06:38:42 +00002339 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002340}
2341
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002342/// Moves and duplicates even-indexed values from a 256-bit vector of
Douglas Yung7ff91422018-01-08 21:21:17 +00002343/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002344///
2345/// \headerfile <x86intrin.h>
2346///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002347/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002348///
2349/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002350/// A 256-bit vector of [8 x float]. \n
2351/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2352/// the return value. \n
2353/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2354/// the return value. \n
2355/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2356/// return value. \n
2357/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2358/// return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002359/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2360/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002361static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002362_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002363{
Craig Topper1aa231e2016-05-16 06:38:42 +00002364 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002365}
2366
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002367/// Moves and duplicates double-precision floating point values from a
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002368/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2369/// vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002370///
2371/// \headerfile <x86intrin.h>
2372///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002373/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002374///
2375/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002376/// A 256-bit vector of [4 x double]. \n
2377/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2378/// return value. \n
2379/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2380/// the return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002381/// \returns A 256-bit vector of [4 x double] containing the moved and
2382/// duplicated values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002383static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002384_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002385{
Craig Topper1aa231e2016-05-16 06:38:42 +00002386 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002387}
2388
2389/* Unpack and Interleave */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002390/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002391/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2392///
2393/// \headerfile <x86intrin.h>
2394///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002395/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002396///
2397/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002398/// A 256-bit floating-point vector of [4 x double]. \n
2399/// Bits [127:64] are written to bits [63:0] of the return value. \n
2400/// Bits [255:192] are written to bits [191:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002401/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002402/// A 256-bit floating-point vector of [4 x double]. \n
2403/// Bits [127:64] are written to bits [127:64] of the return value. \n
2404/// Bits [255:192] are written to bits [255:192] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002405/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002406static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002407_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002408{
Craig Topper1aa231e2016-05-16 06:38:42 +00002409 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002410}
2411
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002412/// Unpacks the even-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002413/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2414///
2415/// \headerfile <x86intrin.h>
2416///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002417/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002418///
2419/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002420/// A 256-bit floating-point vector of [4 x double]. \n
2421/// Bits [63:0] are written to bits [63:0] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002422/// Bits [191:128] are written to bits [191:128] of the return value.
2423/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002424/// A 256-bit floating-point vector of [4 x double]. \n
2425/// Bits [63:0] are written to bits [127:64] of the return value. \n
2426/// Bits [191:128] are written to bits [255:192] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002427/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002428static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002429_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002430{
Craig Topper1aa231e2016-05-16 06:38:42 +00002431 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002432}
2433
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002434/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002435/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2436/// vector of [8 x float].
2437///
2438/// \headerfile <x86intrin.h>
2439///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002440/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002441///
2442/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002443/// A 256-bit vector of [8 x float]. \n
2444/// Bits [95:64] are written to bits [31:0] of the return value. \n
2445/// Bits [127:96] are written to bits [95:64] of the return value. \n
2446/// Bits [223:192] are written to bits [159:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002447/// Bits [255:224] are written to bits [223:192] of the return value.
2448/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002449/// A 256-bit vector of [8 x float]. \n
2450/// Bits [95:64] are written to bits [63:32] of the return value. \n
2451/// Bits [127:96] are written to bits [127:96] of the return value. \n
2452/// Bits [223:192] are written to bits [191:160] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002453/// Bits [255:224] are written to bits [255:224] of the return value.
2454/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002455static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002456_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002457{
Craig Topper1aa231e2016-05-16 06:38:42 +00002458 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002459}
2460
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002461/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002462/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2463/// vector of [8 x float].
2464///
2465/// \headerfile <x86intrin.h>
2466///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002467/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002468///
2469/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002470/// A 256-bit vector of [8 x float]. \n
2471/// Bits [31:0] are written to bits [31:0] of the return value. \n
2472/// Bits [63:32] are written to bits [95:64] of the return value. \n
2473/// Bits [159:128] are written to bits [159:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002474/// Bits [191:160] are written to bits [223:192] of the return value.
2475/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002476/// A 256-bit vector of [8 x float]. \n
2477/// Bits [31:0] are written to bits [63:32] of the return value. \n
2478/// Bits [63:32] are written to bits [127:96] of the return value. \n
2479/// Bits [159:128] are written to bits [191:160] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002480/// Bits [191:160] are written to bits [255:224] of the return value.
2481/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002482static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002483_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002484{
Craig Topper1aa231e2016-05-16 06:38:42 +00002485 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002486}
2487
2488/* Bit Test */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002489/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002490/// element-by-element comparison of the double-precision element in the
2491/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002492/// vector.
2493///
2494/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002495/// If there is at least one pair of double-precision elements where the
2496/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002497/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002498/// If there is at least one pair of double-precision elements where the
2499/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002500/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002501/// This intrinsic returns the value of the ZF flag.
2502///
2503/// \headerfile <x86intrin.h>
2504///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002505/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002506///
2507/// \param __a
2508/// A 128-bit vector of [2 x double].
2509/// \param __b
2510/// A 128-bit vector of [2 x double].
2511/// \returns the ZF flag in the EFLAGS register.
Michael Kupersteine45af542015-06-30 13:36:19 +00002512static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002513_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002514{
David Blaikie3302f2b2013-01-16 23:08:36 +00002515 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002516}
2517
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002518/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002519/// element-by-element comparison of the double-precision element in the
2520/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002521/// vector.
2522///
2523/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002524/// If there is at least one pair of double-precision elements where the
2525/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002526/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002527/// If there is at least one pair of double-precision elements where the
2528/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002529/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002530/// This intrinsic returns the value of the CF flag.
2531///
2532/// \headerfile <x86intrin.h>
2533///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002534/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002535///
2536/// \param __a
2537/// A 128-bit vector of [2 x double].
2538/// \param __b
2539/// A 128-bit vector of [2 x double].
2540/// \returns the CF flag in the EFLAGS register.
Michael Kupersteine45af542015-06-30 13:36:19 +00002541static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002542_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002543{
David Blaikie3302f2b2013-01-16 23:08:36 +00002544 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002545}
2546
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002547/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002548/// element-by-element comparison of the double-precision element in the
2549/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002550/// vector.
2551///
2552/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002553/// If there is at least one pair of double-precision elements where the
2554/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002555/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002556/// If there is at least one pair of double-precision elements where the
2557/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002558/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002559/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2560/// otherwise it returns 0.
2561///
2562/// \headerfile <x86intrin.h>
2563///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002564/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002565///
2566/// \param __a
2567/// A 128-bit vector of [2 x double].
2568/// \param __b
2569/// A 128-bit vector of [2 x double].
2570/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002571static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002572_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002573{
David Blaikie3302f2b2013-01-16 23:08:36 +00002574 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002575}
2576
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002577/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002578/// element-by-element comparison of the single-precision element in the
2579/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002580/// vector.
2581///
2582/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002583/// If there is at least one pair of single-precision elements where the
2584/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002585/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002586/// If there is at least one pair of single-precision elements where the
2587/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002588/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002589/// This intrinsic returns the value of the ZF flag.
2590///
2591/// \headerfile <x86intrin.h>
2592///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002593/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002594///
2595/// \param __a
2596/// A 128-bit vector of [4 x float].
2597/// \param __b
2598/// A 128-bit vector of [4 x float].
2599/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002600static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002601_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002602{
David Blaikie3302f2b2013-01-16 23:08:36 +00002603 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002604}
2605
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002606/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002607/// element-by-element comparison of the single-precision element in the
2608/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002609/// vector.
2610///
2611/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002612/// If there is at least one pair of single-precision elements where the
2613/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002614/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002615/// If there is at least one pair of single-precision elements where the
2616/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002617/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002618/// This intrinsic returns the value of the CF flag.
2619///
2620/// \headerfile <x86intrin.h>
2621///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002622/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002623///
2624/// \param __a
2625/// A 128-bit vector of [4 x float].
2626/// \param __b
2627/// A 128-bit vector of [4 x float].
2628/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002629static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002630_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002631{
David Blaikie3302f2b2013-01-16 23:08:36 +00002632 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002633}
2634
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002635/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002636/// element-by-element comparison of the single-precision element in the
2637/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002638/// vector.
2639///
2640/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002641/// If there is at least one pair of single-precision elements where the
2642/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002643/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002644/// If there is at least one pair of single-precision elements where the
2645/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002646/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002647/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2648/// otherwise it returns 0.
2649///
2650/// \headerfile <x86intrin.h>
2651///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002652/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002653///
2654/// \param __a
2655/// A 128-bit vector of [4 x float].
2656/// \param __b
2657/// A 128-bit vector of [4 x float].
2658/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002659static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002660_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002661{
David Blaikie3302f2b2013-01-16 23:08:36 +00002662 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002663}
2664
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002665/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002666/// element-by-element comparison of the double-precision elements in the
2667/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002668/// vector.
2669///
2670/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002671/// If there is at least one pair of double-precision elements where the
2672/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002673/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002674/// If there is at least one pair of double-precision elements where the
2675/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002676/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002677/// This intrinsic returns the value of the ZF flag.
2678///
2679/// \headerfile <x86intrin.h>
2680///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002681/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002682///
2683/// \param __a
2684/// A 256-bit vector of [4 x double].
2685/// \param __b
2686/// A 256-bit vector of [4 x double].
2687/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002688static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002689_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002690{
David Blaikie3302f2b2013-01-16 23:08:36 +00002691 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002692}
2693
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002694/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002695/// element-by-element comparison of the double-precision elements in the
2696/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002697/// vector.
2698///
2699/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002700/// If there is at least one pair of double-precision elements where the
2701/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002702/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002703/// If there is at least one pair of double-precision elements where the
2704/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002705/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002706/// This intrinsic returns the value of the CF flag.
2707///
2708/// \headerfile <x86intrin.h>
2709///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002710/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002711///
2712/// \param __a
2713/// A 256-bit vector of [4 x double].
2714/// \param __b
2715/// A 256-bit vector of [4 x double].
2716/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002717static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002718_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002719{
David Blaikie3302f2b2013-01-16 23:08:36 +00002720 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002721}
2722
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002723/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002724/// element-by-element comparison of the double-precision elements in the
2725/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002726/// vector.
2727///
2728/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002729/// If there is at least one pair of double-precision elements where the
2730/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002731/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002732/// If there is at least one pair of double-precision elements where the
2733/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002734/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002735/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2736/// otherwise it returns 0.
2737///
2738/// \headerfile <x86intrin.h>
2739///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002740/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002741///
2742/// \param __a
2743/// A 256-bit vector of [4 x double].
2744/// \param __b
2745/// A 256-bit vector of [4 x double].
2746/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002747static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002748_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002749{
David Blaikie3302f2b2013-01-16 23:08:36 +00002750 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002751}
2752
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002753/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002754/// element-by-element comparison of the single-precision element in the
2755/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002756/// vector.
2757///
2758/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002759/// If there is at least one pair of single-precision elements where the
2760/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002761/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002762/// If there is at least one pair of single-precision elements where the
2763/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002764/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002765/// This intrinsic returns the value of the ZF flag.
2766///
2767/// \headerfile <x86intrin.h>
2768///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002769/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002770///
2771/// \param __a
2772/// A 256-bit vector of [8 x float].
2773/// \param __b
2774/// A 256-bit vector of [8 x float].
2775/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002776static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002777_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002778{
David Blaikie3302f2b2013-01-16 23:08:36 +00002779 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002780}
2781
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002782/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002783/// element-by-element comparison of the single-precision element in the
2784/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002785/// vector.
2786///
2787/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002788/// If there is at least one pair of single-precision elements where the
2789/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002790/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002791/// If there is at least one pair of single-precision elements where the
2792/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002793/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002794/// This intrinsic returns the value of the CF flag.
2795///
2796/// \headerfile <x86intrin.h>
2797///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002798/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002799///
2800/// \param __a
2801/// A 256-bit vector of [8 x float].
2802/// \param __b
2803/// A 256-bit vector of [8 x float].
2804/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002805static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002806_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002807{
David Blaikie3302f2b2013-01-16 23:08:36 +00002808 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002809}
2810
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002811/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002812/// element-by-element comparison of the single-precision elements in the
2813/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002814/// vector.
2815///
2816/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002817/// If there is at least one pair of single-precision elements where the
2818/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002819/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002820/// If there is at least one pair of single-precision elements where the
2821/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002822/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002823/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2824/// otherwise it returns 0.
2825///
2826/// \headerfile <x86intrin.h>
2827///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002828/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002829///
2830/// \param __a
2831/// A 256-bit vector of [8 x float].
2832/// \param __b
2833/// A 256-bit vector of [8 x float].
2834/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002835static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002836_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002837{
David Blaikie3302f2b2013-01-16 23:08:36 +00002838 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002839}
2840
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002841/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002842/// of the two source vectors.
2843///
2844/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002845/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002846/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002847/// If there is at least one pair of bits where the bit from the first source
2848/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002849/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002850/// This intrinsic returns the value of the ZF flag.
2851///
2852/// \headerfile <x86intrin.h>
2853///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002854/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002855///
2856/// \param __a
2857/// A 256-bit integer vector.
2858/// \param __b
2859/// A 256-bit integer vector.
2860/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002861static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002862_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002863{
David Blaikie3302f2b2013-01-16 23:08:36 +00002864 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002865}
2866
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002867/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002868/// of the two source vectors.
2869///
2870/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002871/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002872/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002873/// If there is at least one pair of bits where the bit from the first source
2874/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002875/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002876/// This intrinsic returns the value of the CF flag.
2877///
2878/// \headerfile <x86intrin.h>
2879///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002880/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002881///
2882/// \param __a
2883/// A 256-bit integer vector.
2884/// \param __b
2885/// A 256-bit integer vector.
2886/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002887static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002888_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002889{
David Blaikie3302f2b2013-01-16 23:08:36 +00002890 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002891}
2892
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002893/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002894/// of the two source vectors.
2895///
2896/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002897/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002898/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002899/// If there is at least one pair of bits where the bit from the first source
2900/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002901/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002902/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2903/// otherwise it returns 0.
2904///
2905/// \headerfile <x86intrin.h>
2906///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002907/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002908///
2909/// \param __a
2910/// A 256-bit integer vector.
2911/// \param __b
2912/// A 256-bit integer vector.
2913/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002914static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002915_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002916{
David Blaikie3302f2b2013-01-16 23:08:36 +00002917 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002918}
2919
2920/* Vector extract sign mask */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002921/// Extracts the sign bits of double-precision floating point elements
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002922/// in a 256-bit vector of [4 x double] and writes them to the lower order
2923/// bits of the return value.
2924///
2925/// \headerfile <x86intrin.h>
2926///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002927/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002928///
2929/// \param __a
2930/// A 256-bit vector of [4 x double] containing the double-precision
2931/// floating point values with sign bits to be extracted.
2932/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002933static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002934_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002935{
David Blaikie3302f2b2013-01-16 23:08:36 +00002936 return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002937}
2938
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002939/// Extracts the sign bits of single-precision floating point elements
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002940/// in a 256-bit vector of [8 x float] and writes them to the lower order
2941/// bits of the return value.
2942///
2943/// \headerfile <x86intrin.h>
2944///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002945/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002946///
2947/// \param __a
Douglas Yung7ff91422018-01-08 21:21:17 +00002948/// A 256-bit vector of [8 x float] containing the single-precision floating
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002949/// point values with sign bits to be extracted.
2950/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002951static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002952_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002953{
David Blaikie3302f2b2013-01-16 23:08:36 +00002954 return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002955}
2956
/* Vector zero */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002958/// Zeroes the contents of all XMM or YMM registers.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002959///
2960/// \headerfile <x86intrin.h>
2961///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002962/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
Michael Kupersteine45af542015-06-30 13:36:19 +00002963static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002964_mm256_zeroall(void)
2965{
2966 __builtin_ia32_vzeroall();
2967}
2968
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002969/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002970///
2971/// \headerfile <x86intrin.h>
2972///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002973/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
Michael Kupersteine45af542015-06-30 13:36:19 +00002974static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002975_mm256_zeroupper(void)
2976{
2977 __builtin_ia32_vzeroupper();
2978}
2979
2980/* Vector load with broadcast */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00002981/// Loads a scalar single-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002982/// specified address pointed to by \a __a and broadcasts it to the elements
2983/// of a [4 x float] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002984///
2985/// \headerfile <x86intrin.h>
2986///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002987/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002988///
2989/// \param __a
2990/// The single-precision floating point value to be broadcast.
2991/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2992/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00002993static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002994_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002995{
Adam Nemet286ae082014-05-29 20:47:29 +00002996 float __f = *__a;
Craig Topper63ec0ea2018-05-30 21:08:27 +00002997 return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002998}
2999
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003000/// Loads a scalar double-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003001/// specified address pointed to by \a __a and broadcasts it to the elements
3002/// of a [4 x double] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003003///
3004/// \headerfile <x86intrin.h>
3005///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003006/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003007///
3008/// \param __a
3009/// The double-precision floating point value to be broadcast.
3010/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3011/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003012static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003013_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003014{
Adam Nemet286ae082014-05-29 20:47:29 +00003015 double __d = *__a;
Craig Topper63ec0ea2018-05-30 21:08:27 +00003016 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003017}
3018
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003019/// Loads a scalar single-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003020/// specified address pointed to by \a __a and broadcasts it to the elements
3021/// of a [8 x float] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003022///
3023/// \headerfile <x86intrin.h>
3024///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003025/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003026///
3027/// \param __a
3028/// The single-precision floating point value to be broadcast.
3029/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3030/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003031static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003032_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003033{
Adam Nemet286ae082014-05-29 20:47:29 +00003034 float __f = *__a;
Craig Topper63ec0ea2018-05-30 21:08:27 +00003035 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003036}
3037
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003038/// Loads the data from a 128-bit vector of [2 x double] from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003039/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003040/// elements in a 256-bit vector of [4 x double].
3041///
3042/// \headerfile <x86intrin.h>
3043///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003044/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003045///
3046/// \param __a
3047/// The 128-bit vector of [2 x double] to be broadcast.
3048/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3049/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003050static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003051_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003052{
Craig Topper6fb26f92018-06-03 19:42:59 +00003053 __m128d __b = _mm_loadu_pd((const double *)__a);
3054 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3055 0, 1, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003056}
3057
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003058/// Loads the data from a 128-bit vector of [4 x float] from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003059/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003060/// elements in a 256-bit vector of [8 x float].
3061///
3062/// \headerfile <x86intrin.h>
3063///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003064/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003065///
3066/// \param __a
3067/// The 128-bit vector of [4 x float] to be broadcast.
3068/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3069/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003070static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003071_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003072{
Craig Topper6fb26f92018-06-03 19:42:59 +00003073 __m128 __b = _mm_loadu_ps((const float *)__a);
3074 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3075 0, 1, 2, 3, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003076}
3077
3078/* SIMD load ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003079/// Loads 4 double-precision floating point values from a 32-byte aligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003080/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003081///
3082/// \headerfile <x86intrin.h>
3083///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003084/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003085///
3086/// \param __p
3087/// A 32-byte aligned pointer to a memory location containing
3088/// double-precision floating point values.
3089/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003090static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003091_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003092{
David Blaikie3302f2b2013-01-16 23:08:36 +00003093 return *(__m256d *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003094}
3095
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003096/// Loads 8 single-precision floating point values from a 32-byte aligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003097/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003098///
3099/// \headerfile <x86intrin.h>
3100///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003101/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003102///
3103/// \param __p
3104/// A 32-byte aligned pointer to a memory location containing float values.
3105/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003106static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003107_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003108{
David Blaikie3302f2b2013-01-16 23:08:36 +00003109 return *(__m256 *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003110}
3111
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003112/// Loads 4 double-precision floating point values from an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003113/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003114///
3115/// \headerfile <x86intrin.h>
3116///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003117/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003118///
3119/// \param __p
3120/// A pointer to a memory location containing double-precision floating
3121/// point values.
3122/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003123static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003124_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003125{
Craig Topper9e9301a2012-01-25 04:26:17 +00003126 struct __loadu_pd {
David Blaikie3302f2b2013-01-16 23:08:36 +00003127 __m256d __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003128 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003129 return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003130}
3131
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003132/// Loads 8 single-precision floating point values from an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003133/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003134///
3135/// \headerfile <x86intrin.h>
3136///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003137/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003138///
3139/// \param __p
3140/// A pointer to a memory location containing single-precision floating
3141/// point values.
3142/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003143static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003144_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003145{
Craig Topper9e9301a2012-01-25 04:26:17 +00003146 struct __loadu_ps {
David Blaikie3302f2b2013-01-16 23:08:36 +00003147 __m256 __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003148 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003149 return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003150}
3151
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003152/// Loads 256 bits of integer data from a 32-byte aligned memory
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003153/// location pointed to by \a __p into elements of a 256-bit integer vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003154///
3155/// \headerfile <x86intrin.h>
3156///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003157/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003158///
3159/// \param __p
3160/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3161/// values.
3162/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003163static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003164_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003165{
David Blaikie3302f2b2013-01-16 23:08:36 +00003166 return *__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003167}
3168
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003169/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003170/// pointed to by \a __p into a 256-bit integer vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003171///
3172/// \headerfile <x86intrin.h>
3173///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003174/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003175///
3176/// \param __p
3177/// A pointer to a 256-bit integer vector containing integer values.
3178/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003179static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003180_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003181{
Craig Topper9e9301a2012-01-25 04:26:17 +00003182 struct __loadu_si256 {
David Blaikie3302f2b2013-01-16 23:08:36 +00003183 __m256i __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003184 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003185 return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003186}
3187
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003188/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003189/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3190/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003191/// line boundary.
3192///
3193/// \headerfile <x86intrin.h>
3194///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003195/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003196///
3197/// \param __p
3198/// A pointer to a 256-bit integer vector containing integer values.
3199/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003200static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003201_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003202{
David Blaikie3302f2b2013-01-16 23:08:36 +00003203 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003204}
3205
3206/* SIMD store ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003207/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003208/// of [4 x double] to a 32-byte aligned memory location pointed to by
3209/// \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003210///
3211/// \headerfile <x86intrin.h>
3212///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003213/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003214///
3215/// \param __p
3216/// A 32-byte aligned pointer to a memory location that will receive the
3217/// double-precision floaing point values.
3218/// \param __a
3219/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003220static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003221_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003222{
David Blaikie3302f2b2013-01-16 23:08:36 +00003223 *(__m256d *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003224}
3225
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003226/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003227/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003228///
3229/// \headerfile <x86intrin.h>
3230///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003231/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003232///
3233/// \param __p
3234/// A 32-byte aligned pointer to a memory location that will receive the
3235/// float values.
3236/// \param __a
3237/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003238static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003239_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003240{
David Blaikie3302f2b2013-01-16 23:08:36 +00003241 *(__m256 *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003242}
3243
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003244/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003245/// of [4 x double] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003246///
3247/// \headerfile <x86intrin.h>
3248///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003249/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003250///
3251/// \param __p
3252/// A pointer to a memory location that will receive the double-precision
3253/// floating point values.
3254/// \param __a
3255/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003256static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003257_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003258{
Craig Topper09175da2016-05-30 17:10:30 +00003259 struct __storeu_pd {
3260 __m256d __v;
3261 } __attribute__((__packed__, __may_alias__));
3262 ((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003263}
3264
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003265/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003266/// of [8 x float] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003267///
3268/// \headerfile <x86intrin.h>
3269///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003270/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003271///
3272/// \param __p
3273/// A pointer to a memory location that will receive the float values.
3274/// \param __a
3275/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003276static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003277_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003278{
Craig Topper09175da2016-05-30 17:10:30 +00003279 struct __storeu_ps {
3280 __m256 __v;
3281 } __attribute__((__packed__, __may_alias__));
3282 ((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003283}
3284
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003285/// Stores integer values from a 256-bit integer vector to a 32-byte
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003286/// aligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003287///
3288/// \headerfile <x86intrin.h>
3289///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003290/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003291///
3292/// \param __p
3293/// A 32-byte aligned pointer to a memory location that will receive the
3294/// integer values.
3295/// \param __a
3296/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003297static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003298_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003299{
David Blaikie3302f2b2013-01-16 23:08:36 +00003300 *__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003301}
3302
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003303/// Stores integer values from a 256-bit integer vector to an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003304/// memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003305///
3306/// \headerfile <x86intrin.h>
3307///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003308/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003309///
3310/// \param __p
3311/// A pointer to a memory location that will receive the integer values.
3312/// \param __a
3313/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003314static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003315_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003316{
Craig Topper09175da2016-05-30 17:10:30 +00003317 struct __storeu_si256 {
3318 __m256i __v;
3319 } __attribute__((__packed__, __may_alias__));
3320 ((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003321}
3322
3323/* Conditional load ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003324/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003325/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003326/// [2 x double], depending on the mask bits associated with each data
3327/// element.
3328///
3329/// \headerfile <x86intrin.h>
3330///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003331/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003332///
3333/// \param __p
3334/// A pointer to a memory location that contains the double-precision
3335/// floating point values.
3336/// \param __m
3337/// A 128-bit integer vector containing the mask. The most significant bit of
3338/// each data element represents the mask bits. If a mask bit is zero, the
3339/// corresponding value in the memory location is not loaded and the
3340/// corresponding field in the return value is set to zero.
3341/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003342static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003343_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003344{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003345 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003346}
3347
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003348/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003349/// memory location pointed to by \a __p into a 256-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003350/// [4 x double], depending on the mask bits associated with each data
3351/// element.
3352///
3353/// \headerfile <x86intrin.h>
3354///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003355/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003356///
3357/// \param __p
3358/// A pointer to a memory location that contains the double-precision
3359/// floating point values.
3360/// \param __m
3361/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3362/// significant bit of each quadword element represents the mask bits. If a
3363/// mask bit is zero, the corresponding value in the memory location is not
3364/// loaded and the corresponding field in the return value is set to zero.
3365/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003366static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003367_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003368{
David Blaikie3302f2b2013-01-16 23:08:36 +00003369 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003370 (__v4di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003371}
3372
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003373/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003374/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003375/// [4 x float], depending on the mask bits associated with each data
3376/// element.
3377///
3378/// \headerfile <x86intrin.h>
3379///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003380/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003381///
3382/// \param __p
3383/// A pointer to a memory location that contains the single-precision
3384/// floating point values.
3385/// \param __m
3386/// A 128-bit integer vector containing the mask. The most significant bit of
3387/// each data element represents the mask bits. If a mask bit is zero, the
3388/// corresponding value in the memory location is not loaded and the
3389/// corresponding field in the return value is set to zero.
3390/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003391static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003392_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003393{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003394 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003395}
3396
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003397/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003398/// memory location pointed to by \a __p into a 256-bit vector of
3399/// [8 x float], depending on the mask bits associated with each data
3400/// element.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003401///
3402/// \headerfile <x86intrin.h>
3403///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003404/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003405///
3406/// \param __p
3407/// A pointer to a memory location that contains the single-precision
3408/// floating point values.
3409/// \param __m
3410/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3411/// significant bit of each dword element represents the mask bits. If a mask
3412/// bit is zero, the corresponding value in the memory location is not loaded
3413/// and the corresponding field in the return value is set to zero.
3414/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003415static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003416_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003417{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003418 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003419}
3420
3421/* Conditional store ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003422/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003423/// of [8 x float] to a memory location pointed to by \a __p, according to
3424/// the specified mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003425///
3426/// \headerfile <x86intrin.h>
3427///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003428/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003429///
3430/// \param __p
3431/// A pointer to a memory location that will receive the float values.
3432/// \param __m
3433/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3434/// significant bit of each dword element in the mask vector represents the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003435/// mask bits. If a mask bit is zero, the corresponding value from vector
3436/// \a __a is not stored and the corresponding field in the memory location
3437/// pointed to by \a __p is not changed.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003438/// \param __a
3439/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003440static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003441_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003442{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003443 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003444}
3445
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003446/// Moves double-precision values from a 128-bit vector of [2 x double]
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003447/// to a memory location pointed to by \a __p, according to the specified
3448/// mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003449///
3450/// \headerfile <x86intrin.h>
3451///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003452/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003453///
3454/// \param __p
3455/// A pointer to a memory location that will receive the float values.
3456/// \param __m
3457/// A 128-bit integer vector containing the mask. The most significant bit of
3458/// each field in the mask vector represents the mask bits. If a mask bit is
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003459/// zero, the corresponding value from vector \a __a is not stored and the
3460/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003461/// changed.
3462/// \param __a
3463/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003464static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003465_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003466{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003467 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003468}
3469
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003470/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003471/// to a memory location pointed to by \a __p, according to the specified
3472/// mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003473///
3474/// \headerfile <x86intrin.h>
3475///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003476/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003477///
3478/// \param __p
3479/// A pointer to a memory location that will receive the float values.
3480/// \param __m
3481/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3482/// significant bit of each quadword element in the mask vector represents
3483/// the mask bits. If a mask bit is zero, the corresponding value from vector
3484/// __a is not stored and the corresponding field in the memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003485/// pointed to by \a __p is not changed.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003486/// \param __a
3487/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003488static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003489_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003490{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003491 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003492}
3493
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003494/// Moves single-precision floating point values from a 128-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003495/// of [4 x float] to a memory location pointed to by \a __p, according to
3496/// the specified mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003497///
3498/// \headerfile <x86intrin.h>
3499///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003500/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003501///
3502/// \param __p
3503/// A pointer to a memory location that will receive the float values.
3504/// \param __m
3505/// A 128-bit integer vector containing the mask. The most significant bit of
3506/// each field in the mask vector represents the mask bits. If a mask bit is
3507/// zero, the corresponding value from vector __a is not stored and the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003508/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003509/// changed.
3510/// \param __a
3511/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003512static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003513_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003514{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003515 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003516}
3517
3518/* Cacheability support ops */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003519/// Moves integer data from a 256-bit integer vector to a 32-byte
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003520/// aligned memory location. To minimize caching, the data is flagged as
3521/// non-temporal (unlikely to be used again soon).
3522///
3523/// \headerfile <x86intrin.h>
3524///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003525/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003526///
3527/// \param __a
3528/// A pointer to a 32-byte aligned memory location that will receive the
3529/// integer values.
3530/// \param __b
3531/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003532static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003533_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003534{
Reid Kleckner89fbd552018-06-04 21:39:20 +00003535 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3536 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003537}
3538
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003539/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003540/// to a 32-byte aligned memory location. To minimize caching, the data is
3541/// flagged as non-temporal (unlikely to be used again soon).
3542///
3543/// \headerfile <x86intrin.h>
3544///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003545/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003546///
3547/// \param __a
3548/// A pointer to a 32-byte aligned memory location that will receive the
Ekaterina Romanovacb3603a2017-06-06 22:58:01 +00003549/// double-precision floating-point values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003550/// \param __b
3551/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003552static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003553_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003554{
Reid Kleckner89fbd552018-06-04 21:39:20 +00003555 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3556 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003557}
3558
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003559/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003560/// of [8 x float] to a 32-byte aligned memory location. To minimize
3561/// caching, the data is flagged as non-temporal (unlikely to be used again
3562/// soon).
3563///
3564/// \headerfile <x86intrin.h>
3565///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003566/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003567///
3568/// \param __p
3569/// A pointer to a 32-byte aligned memory location that will receive the
3570/// single-precision floating point values.
3571/// \param __a
3572/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003573static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003574_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003575{
Reid Kleckner89fbd552018-06-04 21:39:20 +00003576 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3577 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003578}
3579
3580/* Create vectors */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003581/// Create a 256-bit vector of [4 x double] with undefined values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003582///
3583/// \headerfile <x86intrin.h>
3584///
3585/// This intrinsic has no corresponding instruction.
3586///
3587/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003588static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003589_mm256_undefined_pd(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003590{
3591 return (__m256d)__builtin_ia32_undef256();
3592}
3593
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003594/// Create a 256-bit vector of [8 x float] with undefined values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003595///
3596/// \headerfile <x86intrin.h>
3597///
3598/// This intrinsic has no corresponding instruction.
3599///
3600/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003601static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003602_mm256_undefined_ps(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003603{
3604 return (__m256)__builtin_ia32_undef256();
3605}
3606
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003607/// Create a 256-bit integer vector with undefined values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003608///
3609/// \headerfile <x86intrin.h>
3610///
3611/// This intrinsic has no corresponding instruction.
3612///
3613/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003614static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003615_mm256_undefined_si256(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003616{
3617 return (__m256i)__builtin_ia32_undef256();
3618}
3619
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003620/// Constructs a 256-bit floating-point vector of [4 x double]
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003621/// initialized with the specified double-precision floating-point values.
3622///
3623/// \headerfile <x86intrin.h>
3624///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003625/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3626/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003627///
3628/// \param __a
3629/// A double-precision floating-point value used to initialize bits [255:192]
3630/// of the result.
3631/// \param __b
3632/// A double-precision floating-point value used to initialize bits [191:128]
3633/// of the result.
3634/// \param __c
3635/// A double-precision floating-point value used to initialize bits [127:64]
3636/// of the result.
3637/// \param __d
3638/// A double-precision floating-point value used to initialize bits [63:0]
3639/// of the result.
3640/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003641static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003642_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003643{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003644 return __extension__ (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003645}
3646
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003647/// Constructs a 256-bit floating-point vector of [8 x float] initialized
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003648/// with the specified single-precision floating-point values.
3649///
3650/// \headerfile <x86intrin.h>
3651///
3652/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003653/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003654///
3655/// \param __a
3656/// A single-precision floating-point value used to initialize bits [255:224]
3657/// of the result.
3658/// \param __b
3659/// A single-precision floating-point value used to initialize bits [223:192]
3660/// of the result.
3661/// \param __c
3662/// A single-precision floating-point value used to initialize bits [191:160]
3663/// of the result.
3664/// \param __d
3665/// A single-precision floating-point value used to initialize bits [159:128]
3666/// of the result.
3667/// \param __e
3668/// A single-precision floating-point value used to initialize bits [127:96]
3669/// of the result.
3670/// \param __f
3671/// A single-precision floating-point value used to initialize bits [95:64]
3672/// of the result.
3673/// \param __g
3674/// A single-precision floating-point value used to initialize bits [63:32]
3675/// of the result.
3676/// \param __h
3677/// A single-precision floating-point value used to initialize bits [31:0]
3678/// of the result.
3679/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003680static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003681_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003682 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003683{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003684 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003685}
3686
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003687/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003688/// 32-bit integral values.
3689///
3690/// \headerfile <x86intrin.h>
3691///
3692/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003693/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003694///
3695/// \param __i0
3696/// A 32-bit integral value used to initialize bits [255:224] of the result.
3697/// \param __i1
3698/// A 32-bit integral value used to initialize bits [223:192] of the result.
3699/// \param __i2
3700/// A 32-bit integral value used to initialize bits [191:160] of the result.
3701/// \param __i3
3702/// A 32-bit integral value used to initialize bits [159:128] of the result.
3703/// \param __i4
3704/// A 32-bit integral value used to initialize bits [127:96] of the result.
3705/// \param __i5
3706/// A 32-bit integral value used to initialize bits [95:64] of the result.
3707/// \param __i6
3708/// A 32-bit integral value used to initialize bits [63:32] of the result.
3709/// \param __i7
3710/// A 32-bit integral value used to initialize bits [31:0] of the result.
3711/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003712static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003713_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003714 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003715{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003716 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003717}
3718
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003719/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003720/// 16-bit integral values.
3721///
3722/// \headerfile <x86intrin.h>
3723///
3724/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003725/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003726///
3727/// \param __w15
3728/// A 16-bit integral value used to initialize bits [255:240] of the result.
3729/// \param __w14
3730/// A 16-bit integral value used to initialize bits [239:224] of the result.
3731/// \param __w13
3732/// A 16-bit integral value used to initialize bits [223:208] of the result.
3733/// \param __w12
3734/// A 16-bit integral value used to initialize bits [207:192] of the result.
3735/// \param __w11
3736/// A 16-bit integral value used to initialize bits [191:176] of the result.
3737/// \param __w10
3738/// A 16-bit integral value used to initialize bits [175:160] of the result.
3739/// \param __w09
3740/// A 16-bit integral value used to initialize bits [159:144] of the result.
3741/// \param __w08
3742/// A 16-bit integral value used to initialize bits [143:128] of the result.
3743/// \param __w07
3744/// A 16-bit integral value used to initialize bits [127:112] of the result.
3745/// \param __w06
3746/// A 16-bit integral value used to initialize bits [111:96] of the result.
3747/// \param __w05
3748/// A 16-bit integral value used to initialize bits [95:80] of the result.
3749/// \param __w04
3750/// A 16-bit integral value used to initialize bits [79:64] of the result.
3751/// \param __w03
3752/// A 16-bit integral value used to initialize bits [63:48] of the result.
3753/// \param __w02
3754/// A 16-bit integral value used to initialize bits [47:32] of the result.
3755/// \param __w01
3756/// A 16-bit integral value used to initialize bits [31:16] of the result.
3757/// \param __w00
3758/// A 16-bit integral value used to initialize bits [15:0] of the result.
3759/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003760static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003761_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003762 short __w11, short __w10, short __w09, short __w08,
3763 short __w07, short __w06, short __w05, short __w04,
3764 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003765{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003766 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
David Blaikie3302f2b2013-01-16 23:08:36 +00003767 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003768}
3769
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003770/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003771/// 8-bit integral values.
3772///
3773/// \headerfile <x86intrin.h>
3774///
3775/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003776/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003777///
3778/// \param __b31
3779/// An 8-bit integral value used to initialize bits [255:248] of the result.
3780/// \param __b30
3781/// An 8-bit integral value used to initialize bits [247:240] of the result.
3782/// \param __b29
3783/// An 8-bit integral value used to initialize bits [239:232] of the result.
3784/// \param __b28
3785/// An 8-bit integral value used to initialize bits [231:224] of the result.
3786/// \param __b27
3787/// An 8-bit integral value used to initialize bits [223:216] of the result.
3788/// \param __b26
3789/// An 8-bit integral value used to initialize bits [215:208] of the result.
3790/// \param __b25
3791/// An 8-bit integral value used to initialize bits [207:200] of the result.
3792/// \param __b24
3793/// An 8-bit integral value used to initialize bits [199:192] of the result.
3794/// \param __b23
3795/// An 8-bit integral value used to initialize bits [191:184] of the result.
3796/// \param __b22
3797/// An 8-bit integral value used to initialize bits [183:176] of the result.
3798/// \param __b21
3799/// An 8-bit integral value used to initialize bits [175:168] of the result.
3800/// \param __b20
3801/// An 8-bit integral value used to initialize bits [167:160] of the result.
3802/// \param __b19
3803/// An 8-bit integral value used to initialize bits [159:152] of the result.
3804/// \param __b18
3805/// An 8-bit integral value used to initialize bits [151:144] of the result.
3806/// \param __b17
3807/// An 8-bit integral value used to initialize bits [143:136] of the result.
3808/// \param __b16
3809/// An 8-bit integral value used to initialize bits [135:128] of the result.
3810/// \param __b15
3811/// An 8-bit integral value used to initialize bits [127:120] of the result.
3812/// \param __b14
3813/// An 8-bit integral value used to initialize bits [119:112] of the result.
3814/// \param __b13
3815/// An 8-bit integral value used to initialize bits [111:104] of the result.
3816/// \param __b12
3817/// An 8-bit integral value used to initialize bits [103:96] of the result.
3818/// \param __b11
3819/// An 8-bit integral value used to initialize bits [95:88] of the result.
3820/// \param __b10
3821/// An 8-bit integral value used to initialize bits [87:80] of the result.
3822/// \param __b09
3823/// An 8-bit integral value used to initialize bits [79:72] of the result.
3824/// \param __b08
3825/// An 8-bit integral value used to initialize bits [71:64] of the result.
3826/// \param __b07
3827/// An 8-bit integral value used to initialize bits [63:56] of the result.
3828/// \param __b06
3829/// An 8-bit integral value used to initialize bits [55:48] of the result.
3830/// \param __b05
3831/// An 8-bit integral value used to initialize bits [47:40] of the result.
3832/// \param __b04
3833/// An 8-bit integral value used to initialize bits [39:32] of the result.
3834/// \param __b03
3835/// An 8-bit integral value used to initialize bits [31:24] of the result.
3836/// \param __b02
3837/// An 8-bit integral value used to initialize bits [23:16] of the result.
3838/// \param __b01
3839/// An 8-bit integral value used to initialize bits [15:8] of the result.
3840/// \param __b00
3841/// An 8-bit integral value used to initialize bits [7:0] of the result.
3842/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003843static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003844_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003845 char __b27, char __b26, char __b25, char __b24,
3846 char __b23, char __b22, char __b21, char __b20,
3847 char __b19, char __b18, char __b17, char __b16,
3848 char __b15, char __b14, char __b13, char __b12,
3849 char __b11, char __b10, char __b09, char __b08,
3850 char __b07, char __b06, char __b05, char __b04,
3851 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003852{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003853 return __extension__ (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00003854 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3855 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3856 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3857 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003858 };
3859}
3860
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003861/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003862/// 64-bit integral values.
3863///
3864/// \headerfile <x86intrin.h>
3865///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003866/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3867/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003868///
3869/// \param __a
3870/// A 64-bit integral value used to initialize bits [255:192] of the result.
3871/// \param __b
3872/// A 64-bit integral value used to initialize bits [191:128] of the result.
3873/// \param __c
3874/// A 64-bit integral value used to initialize bits [127:64] of the result.
3875/// \param __d
3876/// A 64-bit integral value used to initialize bits [63:0] of the result.
3877/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003878static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003879_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003880{
Craig Topper63ec0ea2018-05-30 21:08:27 +00003881 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003882}
3883
3884/* Create vectors with elements in reverse order */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003885/// Constructs a 256-bit floating-point vector of [4 x double],
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003886/// initialized in reverse order with the specified double-precision
3887/// floating-point values.
3888///
3889/// \headerfile <x86intrin.h>
3890///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003891/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3892/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003893///
3894/// \param __a
3895/// A double-precision floating-point value used to initialize bits [63:0]
3896/// of the result.
3897/// \param __b
3898/// A double-precision floating-point value used to initialize bits [127:64]
3899/// of the result.
3900/// \param __c
3901/// A double-precision floating-point value used to initialize bits [191:128]
3902/// of the result.
3903/// \param __d
3904/// A double-precision floating-point value used to initialize bits [255:192]
3905/// of the result.
3906/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003907static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003908_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003909{
Tim Shenf811de42018-05-31 01:51:07 +00003910 return _mm256_set_pd(__d, __c, __b, __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003911}
3912
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003913/// Constructs a 256-bit floating-point vector of [8 x float],
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003914/// initialized in reverse order with the specified single-precision
///    floating-point values.
3916///
3917/// \headerfile <x86intrin.h>
3918///
3919/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003920/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003921///
3922/// \param __a
3923/// A single-precision floating-point value used to initialize bits [31:0]
3924/// of the result.
3925/// \param __b
3926/// A single-precision floating-point value used to initialize bits [63:32]
3927/// of the result.
3928/// \param __c
3929/// A single-precision floating-point value used to initialize bits [95:64]
3930/// of the result.
3931/// \param __d
3932/// A single-precision floating-point value used to initialize bits [127:96]
3933/// of the result.
3934/// \param __e
3935/// A single-precision floating-point value used to initialize bits [159:128]
3936/// of the result.
3937/// \param __f
3938/// A single-precision floating-point value used to initialize bits [191:160]
3939/// of the result.
3940/// \param __g
3941/// A single-precision floating-point value used to initialize bits [223:192]
3942/// of the result.
3943/// \param __h
3944/// A single-precision floating-point value used to initialize bits [255:224]
3945/// of the result.
3946/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003947static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003948_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003949 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003950{
Tim Shenf811de42018-05-31 01:51:07 +00003951 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003952}
3953
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003954/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003955/// with the specified 32-bit integral values.
3956///
3957/// \headerfile <x86intrin.h>
3958///
3959/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003960/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003961///
3962/// \param __i0
3963/// A 32-bit integral value used to initialize bits [31:0] of the result.
3964/// \param __i1
3965/// A 32-bit integral value used to initialize bits [63:32] of the result.
3966/// \param __i2
3967/// A 32-bit integral value used to initialize bits [95:64] of the result.
3968/// \param __i3
3969/// A 32-bit integral value used to initialize bits [127:96] of the result.
3970/// \param __i4
3971/// A 32-bit integral value used to initialize bits [159:128] of the result.
3972/// \param __i5
3973/// A 32-bit integral value used to initialize bits [191:160] of the result.
3974/// \param __i6
3975/// A 32-bit integral value used to initialize bits [223:192] of the result.
3976/// \param __i7
3977/// A 32-bit integral value used to initialize bits [255:224] of the result.
3978/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003979static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003980_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003981 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003982{
Tim Shenf811de42018-05-31 01:51:07 +00003983 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003984}
3985
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00003986/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003987/// with the specified 16-bit integral values.
3988///
3989/// \headerfile <x86intrin.h>
3990///
3991/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003992/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003993///
3994/// \param __w15
3995/// A 16-bit integral value used to initialize bits [15:0] of the result.
3996/// \param __w14
3997/// A 16-bit integral value used to initialize bits [31:16] of the result.
3998/// \param __w13
3999/// A 16-bit integral value used to initialize bits [47:32] of the result.
4000/// \param __w12
4001/// A 16-bit integral value used to initialize bits [63:48] of the result.
4002/// \param __w11
4003/// A 16-bit integral value used to initialize bits [79:64] of the result.
4004/// \param __w10
4005/// A 16-bit integral value used to initialize bits [95:80] of the result.
4006/// \param __w09
4007/// A 16-bit integral value used to initialize bits [111:96] of the result.
4008/// \param __w08
4009/// A 16-bit integral value used to initialize bits [127:112] of the result.
4010/// \param __w07
4011/// A 16-bit integral value used to initialize bits [143:128] of the result.
4012/// \param __w06
4013/// A 16-bit integral value used to initialize bits [159:144] of the result.
4014/// \param __w05
4015/// A 16-bit integral value used to initialize bits [175:160] of the result.
4016/// \param __w04
4017/// A 16-bit integral value used to initialize bits [191:176] of the result.
4018/// \param __w03
4019/// A 16-bit integral value used to initialize bits [207:192] of the result.
4020/// \param __w02
4021/// A 16-bit integral value used to initialize bits [223:208] of the result.
4022/// \param __w01
4023/// A 16-bit integral value used to initialize bits [239:224] of the result.
4024/// \param __w00
4025/// A 16-bit integral value used to initialize bits [255:240] of the result.
4026/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004027static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004028_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004029 short __w11, short __w10, short __w09, short __w08,
4030 short __w07, short __w06, short __w05, short __w04,
4031 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004032{
Tim Shenf811de42018-05-31 01:51:07 +00004033 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4034 __w04, __w05, __w06, __w07,
4035 __w08, __w09, __w10, __w11,
4036 __w12, __w13, __w14, __w15);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004037}
4038
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004039/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004040/// with the specified 8-bit integral values.
4041///
4042/// \headerfile <x86intrin.h>
4043///
4044/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004045/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004046///
4047/// \param __b31
4048/// An 8-bit integral value used to initialize bits [7:0] of the result.
4049/// \param __b30
4050/// An 8-bit integral value used to initialize bits [15:8] of the result.
4051/// \param __b29
4052/// An 8-bit integral value used to initialize bits [23:16] of the result.
4053/// \param __b28
4054/// An 8-bit integral value used to initialize bits [31:24] of the result.
4055/// \param __b27
4056/// An 8-bit integral value used to initialize bits [39:32] of the result.
4057/// \param __b26
4058/// An 8-bit integral value used to initialize bits [47:40] of the result.
4059/// \param __b25
4060/// An 8-bit integral value used to initialize bits [55:48] of the result.
4061/// \param __b24
4062/// An 8-bit integral value used to initialize bits [63:56] of the result.
4063/// \param __b23
4064/// An 8-bit integral value used to initialize bits [71:64] of the result.
4065/// \param __b22
4066/// An 8-bit integral value used to initialize bits [79:72] of the result.
4067/// \param __b21
4068/// An 8-bit integral value used to initialize bits [87:80] of the result.
4069/// \param __b20
4070/// An 8-bit integral value used to initialize bits [95:88] of the result.
4071/// \param __b19
4072/// An 8-bit integral value used to initialize bits [103:96] of the result.
4073/// \param __b18
4074/// An 8-bit integral value used to initialize bits [111:104] of the result.
4075/// \param __b17
4076/// An 8-bit integral value used to initialize bits [119:112] of the result.
4077/// \param __b16
4078/// An 8-bit integral value used to initialize bits [127:120] of the result.
4079/// \param __b15
4080/// An 8-bit integral value used to initialize bits [135:128] of the result.
4081/// \param __b14
4082/// An 8-bit integral value used to initialize bits [143:136] of the result.
4083/// \param __b13
4084/// An 8-bit integral value used to initialize bits [151:144] of the result.
4085/// \param __b12
4086/// An 8-bit integral value used to initialize bits [159:152] of the result.
4087/// \param __b11
4088/// An 8-bit integral value used to initialize bits [167:160] of the result.
4089/// \param __b10
4090/// An 8-bit integral value used to initialize bits [175:168] of the result.
4091/// \param __b09
4092/// An 8-bit integral value used to initialize bits [183:176] of the result.
4093/// \param __b08
4094/// An 8-bit integral value used to initialize bits [191:184] of the result.
4095/// \param __b07
4096/// An 8-bit integral value used to initialize bits [199:192] of the result.
4097/// \param __b06
4098/// An 8-bit integral value used to initialize bits [207:200] of the result.
4099/// \param __b05
4100/// An 8-bit integral value used to initialize bits [215:208] of the result.
4101/// \param __b04
4102/// An 8-bit integral value used to initialize bits [223:216] of the result.
4103/// \param __b03
4104/// An 8-bit integral value used to initialize bits [231:224] of the result.
4105/// \param __b02
4106/// An 8-bit integral value used to initialize bits [239:232] of the result.
4107/// \param __b01
4108/// An 8-bit integral value used to initialize bits [247:240] of the result.
4109/// \param __b00
4110/// An 8-bit integral value used to initialize bits [255:248] of the result.
4111/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004112static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004113_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004114 char __b27, char __b26, char __b25, char __b24,
4115 char __b23, char __b22, char __b21, char __b20,
4116 char __b19, char __b18, char __b17, char __b16,
4117 char __b15, char __b14, char __b13, char __b12,
4118 char __b11, char __b10, char __b09, char __b08,
4119 char __b07, char __b06, char __b05, char __b04,
4120 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004121{
Tim Shenf811de42018-05-31 01:51:07 +00004122 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4123 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4124 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4125 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004126}
4127
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004128/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004129/// with the specified 64-bit integral values.
4130///
4131/// \headerfile <x86intrin.h>
4132///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004133/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4134/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004135///
4136/// \param __a
4137/// A 64-bit integral value used to initialize bits [63:0] of the result.
4138/// \param __b
4139/// A 64-bit integral value used to initialize bits [127:64] of the result.
4140/// \param __c
4141/// A 64-bit integral value used to initialize bits [191:128] of the result.
4142/// \param __d
4143/// A 64-bit integral value used to initialize bits [255:192] of the result.
4144/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004145static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004146_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004147{
Tim Shenf811de42018-05-31 01:51:07 +00004148 return _mm256_set_epi64x(__d, __c, __b, __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004149}
4150
4151/* Create vectors with repeated elements */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004152/// Constructs a 256-bit floating-point vector of [4 x double], with each
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004153/// of the four double-precision floating-point vector elements set to the
4154/// specified double-precision floating-point value.
4155///
4156/// \headerfile <x86intrin.h>
4157///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004158/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004159///
4160/// \param __w
4161/// A double-precision floating-point value used to initialize each vector
4162/// element of the result.
4163/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004164static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004165_mm256_set1_pd(double __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004166{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004167 return _mm256_set_pd(__w, __w, __w, __w);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004168}
4169
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004170/// Constructs a 256-bit floating-point vector of [8 x float], with each
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004171/// of the eight single-precision floating-point vector elements set to the
4172/// specified single-precision floating-point value.
4173///
4174/// \headerfile <x86intrin.h>
4175///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004176/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4177/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004178///
4179/// \param __w
4180/// A single-precision floating-point value used to initialize each vector
4181/// element of the result.
4182/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004183static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004184_mm256_set1_ps(float __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004185{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004186 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004187}
4188
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004189/// Constructs a 256-bit integer vector of [8 x i32], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004190/// 32-bit integral vector elements set to the specified 32-bit integral
4191/// value.
4192///
4193/// \headerfile <x86intrin.h>
4194///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004195/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4196/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004197///
4198/// \param __i
4199/// A 32-bit integral value used to initialize each vector element of the
4200/// result.
4201/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kupersteine45af542015-06-30 13:36:19 +00004202static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004203_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004204{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004205 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004206}
4207
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004208/// Constructs a 256-bit integer vector of [16 x i16], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004209/// 16-bit integral vector elements set to the specified 16-bit integral
4210/// value.
4211///
4212/// \headerfile <x86intrin.h>
4213///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004214/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004215///
4216/// \param __w
4217/// A 16-bit integral value used to initialize each vector element of the
4218/// result.
4219/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kupersteine45af542015-06-30 13:36:19 +00004220static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004221_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004222{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004223 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4224 __w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004225}
4226
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004227/// Constructs a 256-bit integer vector of [32 x i8], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004228/// 8-bit integral vector elements set to the specified 8-bit integral value.
4229///
4230/// \headerfile <x86intrin.h>
4231///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004232/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004233///
4234/// \param __b
4235/// An 8-bit integral value used to initialize each vector element of the
4236/// result.
4237/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kupersteine45af542015-06-30 13:36:19 +00004238static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004239_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004240{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004241 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4242 __b, __b, __b, __b, __b, __b, __b, __b,
4243 __b, __b, __b, __b, __b, __b, __b, __b,
4244 __b, __b, __b, __b, __b, __b, __b, __b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004245}
4246
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004247/// Constructs a 256-bit integer vector of [4 x i64], with each of the
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004248/// 64-bit integral vector elements set to the specified 64-bit integral
4249/// value.
4250///
4251/// \headerfile <x86intrin.h>
4252///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004253/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004254///
4255/// \param __q
4256/// A 64-bit integral value used to initialize each vector element of the
4257/// result.
4258/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kupersteine45af542015-06-30 13:36:19 +00004259static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004260_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004261{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004262 return _mm256_set_epi64x(__q, __q, __q, __q);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004263}
4264
/* Create zeroed vectors */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004266/// Constructs a 256-bit floating-point vector of [4 x double] with all
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004267/// vector elements initialized to zero.
4268///
4269/// \headerfile <x86intrin.h>
4270///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004271/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004272///
4273/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004274static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004275_mm256_setzero_pd(void)
4276{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004277 return __extension__ (__m256d){ 0, 0, 0, 0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004278}
4279
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004280/// Constructs a 256-bit floating-point vector of [8 x float] with all
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004281/// vector elements initialized to zero.
4282///
4283/// \headerfile <x86intrin.h>
4284///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004285/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004286///
4287/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004288static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004289_mm256_setzero_ps(void)
4290{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004291 return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004292}
4293
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004294/// Constructs a 256-bit integer vector initialized to zero.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004295///
4296/// \headerfile <x86intrin.h>
4297///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004298/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004299///
4300/// \returns A 256-bit integer vector initialized to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004301static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004302_mm256_setzero_si256(void)
4303{
Craig Topper63ec0ea2018-05-30 21:08:27 +00004304 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004305}
4306
4307/* Cast between vector types */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004308/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004309/// floating-point vector of [8 x float].
4310///
4311/// \headerfile <x86intrin.h>
4312///
4313/// This intrinsic has no corresponding instruction.
4314///
4315/// \param __a
4316/// A 256-bit floating-point vector of [4 x double].
4317/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4318/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004319static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004320_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004321{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004322 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004323}
4324
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004325/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004326/// integer vector.
4327///
4328/// \headerfile <x86intrin.h>
4329///
4330/// This intrinsic has no corresponding instruction.
4331///
4332/// \param __a
4333/// A 256-bit floating-point vector of [4 x double].
4334/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4335/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004336static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004337_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004338{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004339 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004340}
4341
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004342/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004343/// floating-point vector of [4 x double].
4344///
4345/// \headerfile <x86intrin.h>
4346///
4347/// This intrinsic has no corresponding instruction.
4348///
4349/// \param __a
4350/// A 256-bit floating-point vector of [8 x float].
4351/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4352/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004353static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004354_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004355{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004356 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004357}
4358
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004359/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004360/// integer vector.
4361///
4362/// \headerfile <x86intrin.h>
4363///
4364/// This intrinsic has no corresponding instruction.
4365///
4366/// \param __a
4367/// A 256-bit floating-point vector of [8 x float].
4368/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4369/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004370static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004371_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004372{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004373 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004374}
4375
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004376/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004377/// of [8 x float].
4378///
4379/// \headerfile <x86intrin.h>
4380///
4381/// This intrinsic has no corresponding instruction.
4382///
4383/// \param __a
4384/// A 256-bit integer vector.
4385/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4386/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004387static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004388_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004389{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004390 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004391}
4392
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004393/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004394/// of [4 x double].
4395///
4396/// \headerfile <x86intrin.h>
4397///
4398/// This intrinsic has no corresponding instruction.
4399///
4400/// \param __a
4401/// A 256-bit integer vector.
4402/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4403/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004404static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004405_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004406{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004407 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004408}
4409
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004410/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004411/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4412///
4413/// \headerfile <x86intrin.h>
4414///
4415/// This intrinsic has no corresponding instruction.
4416///
4417/// \param __a
4418/// A 256-bit floating-point vector of [4 x double].
4419/// \returns A 128-bit floating-point vector of [2 x double] containing the
4420/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004421static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004422_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004423{
Craig Topper1aa231e2016-05-16 06:38:42 +00004424 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004425}
4426
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004427/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004428/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4429///
4430/// \headerfile <x86intrin.h>
4431///
4432/// This intrinsic has no corresponding instruction.
4433///
4434/// \param __a
4435/// A 256-bit floating-point vector of [8 x float].
4436/// \returns A 128-bit floating-point vector of [4 x float] containing the
4437/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004438static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004439_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004440{
Craig Topper1aa231e2016-05-16 06:38:42 +00004441 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004442}
4443
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004444/// Truncates a 256-bit integer vector into a 128-bit integer vector.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004445///
4446/// \headerfile <x86intrin.h>
4447///
4448/// This intrinsic has no corresponding instruction.
4449///
4450/// \param __a
4451/// A 256-bit integer vector.
4452/// \returns A 128-bit integer vector containing the lower 128 bits of the
4453/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004454static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004455_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004456{
Craig Topper1aa231e2016-05-16 06:38:42 +00004457 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004458}
4459
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004460/// Constructs a 256-bit floating-point vector of [4 x double] from a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004461/// 128-bit floating-point vector of [2 x double].
4462///
4463/// The lower 128 bits contain the value of the source vector. The contents
4464/// of the upper 128 bits are undefined.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004465///
4466/// \headerfile <x86intrin.h>
4467///
4468/// This intrinsic has no corresponding instruction.
4469///
4470/// \param __a
4471/// A 128-bit vector of [2 x double].
4472/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4473/// contain the value of the parameter. The contents of the upper 128 bits
4474/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004475static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004476_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004477{
Craig Topper1aa231e2016-05-16 06:38:42 +00004478 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004479}
4480
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004481/// Constructs a 256-bit floating-point vector of [8 x float] from a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004482/// 128-bit floating-point vector of [4 x float].
4483///
4484/// The lower 128 bits contain the value of the source vector. The contents
4485/// of the upper 128 bits are undefined.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004486///
4487/// \headerfile <x86intrin.h>
4488///
4489/// This intrinsic has no corresponding instruction.
4490///
4491/// \param __a
4492/// A 128-bit vector of [4 x float].
4493/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4494/// contain the value of the parameter. The contents of the upper 128 bits
4495/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004496static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004497_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004498{
Craig Topper1aa231e2016-05-16 06:38:42 +00004499 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004500}
4501
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004502/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004503///
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004504/// The lower 128 bits contain the value of the source vector. The contents
4505/// of the upper 128 bits are undefined.
4506///
4507/// \headerfile <x86intrin.h>
4508///
4509/// This intrinsic has no corresponding instruction.
4510///
4511/// \param __a
4512/// A 128-bit integer vector.
4513/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4514/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004515static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004516_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004517{
Craig Topper1aa231e2016-05-16 06:38:42 +00004518 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004519}
Chad Rosierf8df4f42012-03-20 16:40:00 +00004520
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004521/// Constructs a 256-bit floating-point vector of [4 x double] from a
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004522/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4523/// contain the value of the source vector. The upper 128 bits are set
4524/// to zero.
4525///
4526/// \headerfile <x86intrin.h>
4527///
4528/// This intrinsic has no corresponding instruction.
4529///
4530/// \param __a
4531/// A 128-bit vector of [2 x double].
4532/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4533/// contain the value of the parameter. The upper 128 bits are set to zero.
4534static __inline __m256d __DEFAULT_FN_ATTRS
4535_mm256_zextpd128_pd256(__m128d __a)
4536{
4537 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4538}
4539
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004540/// Constructs a 256-bit floating-point vector of [8 x float] from a
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004541/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4542/// the value of the source vector. The upper 128 bits are set to zero.
4543///
4544/// \headerfile <x86intrin.h>
4545///
4546/// This intrinsic has no corresponding instruction.
4547///
4548/// \param __a
4549/// A 128-bit vector of [4 x float].
4550/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4551/// contain the value of the parameter. The upper 128 bits are set to zero.
4552static __inline __m256 __DEFAULT_FN_ATTRS
4553_mm256_zextps128_ps256(__m128 __a)
4554{
4555 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4556}
4557
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004558/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004559/// The lower 128 bits contain the value of the source vector. The upper
4560/// 128 bits are set to zero.
4561///
4562/// \headerfile <x86intrin.h>
4563///
4564/// This intrinsic has no corresponding instruction.
4565///
4566/// \param __a
4567/// A 128-bit integer vector.
4568/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4569/// the parameter. The upper 128 bits are set to zero.
4570static __inline __m256i __DEFAULT_FN_ATTRS
4571_mm256_zextsi128_si256(__m128i __a)
4572{
4573 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4574}
4575
Sean Silvae4c37602015-09-12 02:55:19 +00004576/*
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004577 Vector insert.
4578 We use macros rather than inlines because we only want to accept
4579 invocations where the immediate M is a constant expression.
4580*/
/// Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value rather than to the builtin call. */
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004618
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value rather than to the builtin call. */
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004656
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value rather than to the builtin call. */
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
Sanjay Patel7f6aa522015-03-10 15:19:26 +00004694
Sean Silvae4c37602015-09-12 02:55:19 +00004695/*
Sanjay Patel0c351ab2015-03-12 15:50:36 +00004696 Vector extract.
4697 We use macros rather than inlines because we only want to accept
4698 invocations where the immediate M is a constant expression.
4699*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value rather than to the builtin call. */
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004723
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value rather than to the builtin call. */
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
Sanjay Patel0c351ab2015-03-12 15:50:36 +00004747
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value rather than to the builtin call. */
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
Sanjay Patel0c351ab2015-03-12 15:50:36 +00004771
Chad Rosierf8df4f42012-03-20 16:40:00 +00004772/* SIMD load ops (unaligned) */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004773/// Loads two 128-bit floating-point vectors of [4 x float] from
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004774/// unaligned memory locations and constructs a 256-bit floating-point vector
4775/// of [8 x float] by concatenating the two 128-bit vectors.
4776///
4777/// \headerfile <x86intrin.h>
4778///
4779/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004780/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004781///
4782/// \param __addr_hi
4783/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004784/// single-precision floating-point values. These values are to be copied to
4785/// bits[255:128] of the result. The address of the memory location does not
4786/// have to be aligned.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004787/// \param __addr_lo
4788/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004789/// single-precision floating-point values. These values are to be copied to
4790/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004791/// have to be aligned.
4792/// \returns A 256-bit floating-point vector of [8 x float] containing the
4793/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004794static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004795_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004796{
Craig Topper74b59482016-05-31 05:49:13 +00004797 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4798 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004799}
4800
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004801/// Loads two 128-bit floating-point vectors of [2 x double] from
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004802/// unaligned memory locations and constructs a 256-bit floating-point vector
4803/// of [4 x double] by concatenating the two 128-bit vectors.
4804///
4805/// \headerfile <x86intrin.h>
4806///
4807/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004808/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004809///
4810/// \param __addr_hi
4811/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004812/// double-precision floating-point values. These values are to be copied to
4813/// bits[255:128] of the result. The address of the memory location does not
4814/// have to be aligned.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004815/// \param __addr_lo
4816/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004817/// double-precision floating-point values. These values are to be copied to
4818/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004819/// have to be aligned.
4820/// \returns A 256-bit floating-point vector of [4 x double] containing the
4821/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004822static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004823_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004824{
Craig Topper74b59482016-05-31 05:49:13 +00004825 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4826 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004827}
4828
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004829/// Loads two 128-bit integer vectors from unaligned memory locations and
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004830/// constructs a 256-bit integer vector by concatenating the two 128-bit
4831/// vectors.
4832///
4833/// \headerfile <x86intrin.h>
4834///
4835/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004836/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004837///
4838/// \param __addr_hi
4839/// A pointer to a 128-bit memory location containing a 128-bit integer
4840/// vector. This vector is to be copied to bits[255:128] of the result. The
4841/// address of the memory location does not have to be aligned.
4842/// \param __addr_lo
4843/// A pointer to a 128-bit memory location containing a 128-bit integer
4844/// vector. This vector is to be copied to bits[127:0] of the result. The
4845/// address of the memory location does not have to be aligned.
4846/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004847static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004848_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004849{
Craig Topper74b59482016-05-31 05:49:13 +00004850 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4851 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004852}
4853
/* SIMD store ops (unaligned) */
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004855/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004856/// vector of [8 x float] into two different unaligned memory locations.
4857///
4858/// \headerfile <x86intrin.h>
4859///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004860/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4861/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004862///
4863/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004864/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004865/// copied to this memory location. The address of this memory location does
4866/// not have to be aligned.
4867/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004868/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004869/// copied to this memory location. The address of this memory location does
4870/// not have to be aligned.
4871/// \param __a
4872/// A 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004873static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004874_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004875{
David Blaikie3302f2b2013-01-16 23:08:36 +00004876 __m128 __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004877
David Blaikie3302f2b2013-01-16 23:08:36 +00004878 __v128 = _mm256_castps256_ps128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004879 _mm_storeu_ps(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004880 __v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004881 _mm_storeu_ps(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004882}
4883
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004884/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004885/// vector of [4 x double] into two different unaligned memory locations.
4886///
4887/// \headerfile <x86intrin.h>
4888///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004889/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4890/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004891///
4892/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004893/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004894/// copied to this memory location. The address of this memory location does
4895/// not have to be aligned.
4896/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004897/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004898/// copied to this memory location. The address of this memory location does
4899/// not have to be aligned.
4900/// \param __a
4901/// A 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004902static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004903_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004904{
David Blaikie3302f2b2013-01-16 23:08:36 +00004905 __m128d __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004906
David Blaikie3302f2b2013-01-16 23:08:36 +00004907 __v128 = _mm256_castpd256_pd128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004908 _mm_storeu_pd(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004909 __v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004910 _mm_storeu_pd(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004911}
4912
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004913/// Stores the upper and lower 128 bits of a 256-bit integer vector into
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004914/// two different unaligned memory locations.
4915///
4916/// \headerfile <x86intrin.h>
4917///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004918/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4919/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004920///
4921/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004922/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004923/// copied to this memory location. The address of this memory location does
4924/// not have to be aligned.
4925/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004926/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004927/// copied to this memory location. The address of this memory location does
4928/// not have to be aligned.
4929/// \param __a
4930/// A 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004931static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004932_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004933{
David Blaikie3302f2b2013-01-16 23:08:36 +00004934 __m128i __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004935
David Blaikie3302f2b2013-01-16 23:08:36 +00004936 __v128 = _mm256_castsi256_si128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004937 _mm_storeu_si128(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004938 __v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004939 _mm_storeu_si128(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004940}
Richard Smith49e56442013-07-14 05:41:45 +00004941
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004942/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004943/// concatenating two 128-bit floating-point vectors of [4 x float].
4944///
4945/// \headerfile <x86intrin.h>
4946///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004947/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004948///
4949/// \param __hi
4950/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4951/// 128 bits of the result.
4952/// \param __lo
4953/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4954/// 128 bits of the result.
4955/// \returns A 256-bit floating-point vector of [8 x float] containing the
4956/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004957static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004958_mm256_set_m128 (__m128 __hi, __m128 __lo)
4959{
Craig Topper1aa231e2016-05-16 06:38:42 +00004960 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein76190042015-05-20 07:46:52 +00004961}
4962
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004963/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004964/// concatenating two 128-bit floating-point vectors of [2 x double].
4965///
4966/// \headerfile <x86intrin.h>
4967///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004968/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004969///
4970/// \param __hi
4971/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4972/// 128 bits of the result.
4973/// \param __lo
4974/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4975/// 128 bits of the result.
4976/// \returns A 256-bit floating-point vector of [4 x double] containing the
4977/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004978static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004979_mm256_set_m128d (__m128d __hi, __m128d __lo)
4980{
Michael Kuperstein76190042015-05-20 07:46:52 +00004981 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4982}
4983
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00004984/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004985/// integer vectors.
4986///
4987/// \headerfile <x86intrin.h>
4988///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004989/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004990///
4991/// \param __hi
4992/// A 128-bit integer vector to be copied to the upper 128 bits of the
4993/// result.
4994/// \param __lo
4995/// A 128-bit integer vector to be copied to the lower 128 bits of the
4996/// result.
4997/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004998static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004999_mm256_set_m128i (__m128i __hi, __m128i __lo)
5000{
Michael Kuperstein76190042015-05-20 07:46:52 +00005001 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5002}
5003
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00005004/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005005/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
5006/// similar to _mm256_set_m128, but the order of the input parameters is
5007/// swapped.
5008///
5009/// \headerfile <x86intrin.h>
5010///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005011/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005012///
5013/// \param __lo
5014/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
5015/// 128 bits of the result.
5016/// \param __hi
5017/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
5018/// 128 bits of the result.
5019/// \returns A 256-bit floating-point vector of [8 x float] containing the
5020/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005021static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005022_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5023{
Michael Kuperstein76190042015-05-20 07:46:52 +00005024 return _mm256_set_m128(__hi, __lo);
5025}
5026
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00005027/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005028/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
5029/// similar to _mm256_set_m128d, but the order of the input parameters is
5030/// swapped.
5031///
5032/// \headerfile <x86intrin.h>
5033///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005034/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005035///
5036/// \param __lo
5037/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5038/// 128 bits of the result.
5039/// \param __hi
5040/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5041/// 128 bits of the result.
5042/// \returns A 256-bit floating-point vector of [4 x double] containing the
5043/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005044static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005045_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5046{
Michael Kuperstein76190042015-05-20 07:46:52 +00005047 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5048}
5049
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00005050/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005051/// integer vectors. This is similar to _mm256_set_m128i, but the order of
5052/// the input parameters is swapped.
5053///
5054/// \headerfile <x86intrin.h>
5055///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005056/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005057///
5058/// \param __lo
5059/// A 128-bit integer vector to be copied to the lower 128 bits of the
5060/// result.
5061/// \param __hi
5062/// A 128-bit integer vector to be copied to the upper 128 bits of the
5063/// result.
5064/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005065static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005066_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5067{
Michael Kuperstein76190042015-05-20 07:46:52 +00005068 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5069}
5070
Michael Kupersteine45af542015-06-30 13:36:19 +00005071#undef __DEFAULT_FN_ATTRS
Eric Christopher4d1851682015-06-17 07:09:20 +00005072
Richard Smith49e56442013-07-14 05:41:45 +00005073#endif /* __AVXINTRIN_H */