blob: 811a2e07498f1364cdf0faffc9017a0e57e34464 [file] [log] [blame]
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer6f35f3c2010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000027
Richard Smith49e56442013-07-14 05:41:45 +000028#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 256-bit (32-byte) vector types, one per signed/floating element
 * type. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned counterparts of the integer vector types. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* Byte vector with explicitly signed elements. It is meant for internal use
 * only and should not show up in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Vector types that appear in the signatures of the intrinsics below. */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Attribute set applied to every function defined in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher4d1851682015-06-17 07:09:20 +000054
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000055/* Arithmetic */
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000056/// \brief Adds two 256-bit vectors of [4 x double].
57///
58/// \headerfile <x86intrin.h>
59///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +000060/// This intrinsic corresponds to the \c VADDPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000061///
62/// \param __a
63/// A 256-bit vector of [4 x double] containing one of the source operands.
64/// \param __b
65/// A 256-bit vector of [4 x double] containing one of the source operands.
66/// \returns A 256-bit vector of [4 x double] containing the sums of both
67/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000068static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000069_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000070{
Craig Topper1aa231e2016-05-16 06:38:42 +000071 return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000072}
73
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000074/// \brief Adds two 256-bit vectors of [8 x float].
75///
76/// \headerfile <x86intrin.h>
77///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +000078/// This intrinsic corresponds to the \c VADDPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000079///
80/// \param __a
81/// A 256-bit vector of [8 x float] containing one of the source operands.
82/// \param __b
83/// A 256-bit vector of [8 x float] containing one of the source operands.
84/// \returns A 256-bit vector of [8 x float] containing the sums of both
85/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000086static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000087_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000088{
Craig Topper1aa231e2016-05-16 06:38:42 +000089 return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000090}
91
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000092/// \brief Subtracts two 256-bit vectors of [4 x double].
93///
94/// \headerfile <x86intrin.h>
95///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +000096/// This intrinsic corresponds to the \c VSUBPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000097///
98/// \param __a
99/// A 256-bit vector of [4 x double] containing the minuend.
100/// \param __b
101/// A 256-bit vector of [4 x double] containing the subtrahend.
102/// \returns A 256-bit vector of [4 x double] containing the differences between
103/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000104static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000105_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000106{
Craig Topper1aa231e2016-05-16 06:38:42 +0000107 return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000108}
109
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000110/// \brief Subtracts two 256-bit vectors of [8 x float].
111///
112/// \headerfile <x86intrin.h>
113///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000114/// This intrinsic corresponds to the \c VSUBPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000115///
116/// \param __a
117/// A 256-bit vector of [8 x float] containing the minuend.
118/// \param __b
119/// A 256-bit vector of [8 x float] containing the subtrahend.
120/// \returns A 256-bit vector of [8 x float] containing the differences between
121/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000122static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000123_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000124{
Craig Topper1aa231e2016-05-16 06:38:42 +0000125 return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000126}
127
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
129/// two 256-bit vectors of [4 x double].
130///
131/// \headerfile <x86intrin.h>
132///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000133/// This intrinsic corresponds to the \c VADDSUBPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000134///
135/// \param __a
136/// A 256-bit vector of [4 x double] containing the left source operand.
137/// \param __b
138/// A 256-bit vector of [4 x double] containing the right source operand.
139/// \returns A 256-bit vector of [4 x double] containing the alternating sums
140/// and differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000141static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000142_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000143{
David Blaikie3302f2b2013-01-16 23:08:36 +0000144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000145}
146
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
148/// two 256-bit vectors of [8 x float].
149///
150/// \headerfile <x86intrin.h>
151///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000152/// This intrinsic corresponds to the \c VADDSUBPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000153///
154/// \param __a
155/// A 256-bit vector of [8 x float] containing the left source operand.
156/// \param __b
157/// A 256-bit vector of [8 x float] containing the right source operand.
158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159/// differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000160static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000161_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000162{
David Blaikie3302f2b2013-01-16 23:08:36 +0000163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000164}
165
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000166/// \brief Divides two 256-bit vectors of [4 x double].
167///
168/// \headerfile <x86intrin.h>
169///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000170/// This intrinsic corresponds to the \c VDIVPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000171///
172/// \param __a
173/// A 256-bit vector of [4 x double] containing the dividend.
174/// \param __b
175/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000176/// \returns A 256-bit vector of [4 x double] containing the quotients of both
177/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000178static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000179_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000180{
Craig Topper1aa231e2016-05-16 06:38:42 +0000181 return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000182}
183
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000184/// \brief Divides two 256-bit vectors of [8 x float].
185///
186/// \headerfile <x86intrin.h>
187///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000188/// This intrinsic corresponds to the \c VDIVPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000189///
190/// \param __a
191/// A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000196static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000197_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000198{
Craig Topper1aa231e2016-05-16 06:38:42 +0000199 return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000200}
201
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203/// of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000207/// This intrinsic corresponds to the \c VMAXPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000208///
209/// \param __a
210/// A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212/// A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000215static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000216_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000217{
David Blaikie3302f2b2013-01-16 23:08:36 +0000218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000219}
220
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222/// of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000226/// This intrinsic corresponds to the \c VMAXPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000227///
228/// \param __a
229/// A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231/// A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000234static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000235_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000236{
David Blaikie3302f2b2013-01-16 23:08:36 +0000237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000238}
239
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241/// of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000245/// This intrinsic corresponds to the \c VMINPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000246///
247/// \param __a
248/// A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250/// A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000253static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000254_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000255{
David Blaikie3302f2b2013-01-16 23:08:36 +0000256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000257}
258
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260/// of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000264/// This intrinsic corresponds to the \c VMINPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000265///
266/// \param __a
267/// A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269/// A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000272static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000273_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000274{
David Blaikie3302f2b2013-01-16 23:08:36 +0000275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000276}
277
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000278/// \brief Multiplies two 256-bit vectors of [4 x double].
279///
280/// \headerfile <x86intrin.h>
281///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000282/// This intrinsic corresponds to the \c VMULPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000283///
284/// \param __a
285/// A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000288/// \returns A 256-bit vector of [4 x double] containing the products of both
289/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000290static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000291_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000292{
Craig Topper1aa231e2016-05-16 06:38:42 +0000293 return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000294}
295
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000296/// \brief Multiplies two 256-bit vectors of [8 x float].
297///
298/// \headerfile <x86intrin.h>
299///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000300/// This intrinsic corresponds to the \c VMULPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000301///
302/// \param __a
303/// A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000306/// \returns A 256-bit vector of [8 x float] containing the products of both
307/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000308static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000309_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000310{
Craig Topper1aa231e2016-05-16 06:38:42 +0000311 return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000312}
313
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000314/// \brief Calculates the square roots of the values in a 256-bit vector of
315/// [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000316///
317/// \headerfile <x86intrin.h>
318///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000319/// This intrinsic corresponds to the \c VSQRTPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000320///
321/// \param __a
322/// A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000325static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000326_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000327{
David Blaikie3302f2b2013-01-16 23:08:36 +0000328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000329}
330
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000331/// \brief Calculates the square roots of the values in a 256-bit vector of
332/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000333///
334/// \headerfile <x86intrin.h>
335///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000336/// This intrinsic corresponds to the \c VSQRTPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000337///
338/// \param __a
339/// A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000342static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000343_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000344{
David Blaikie3302f2b2013-01-16 23:08:36 +0000345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000346}
347
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000348/// \brief Calculates the reciprocal square roots of the values in a 256-bit
349/// vector of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000350///
351/// \headerfile <x86intrin.h>
352///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000353/// This intrinsic corresponds to the \c VRSQRTPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000354///
355/// \param __a
356/// A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358/// roots of the values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000359static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000360_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000361{
David Blaikie3302f2b2013-01-16 23:08:36 +0000362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000363}
364
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000365/// \brief Calculates the reciprocals of the values in a 256-bit vector of
366/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000367///
368/// \headerfile <x86intrin.h>
369///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000370/// This intrinsic corresponds to the \c VRCPPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000371///
372/// \param __a
373/// A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000376static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000377_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000378{
David Blaikie3302f2b2013-01-16 23:08:36 +0000379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000380}
381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Implemented as a macro so that M is passed to the builtin directly. A
 * plain parenthesized expression is used instead of a GNU statement
 * expression, so the macro is also valid where statement expressions are
 * rejected. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000413
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Implemented as a macro so that M is passed to the builtin directly. A
 * plain parenthesized expression is used instead of a GNU statement
 * expression, so the macro is also valid where statement expressions are
 * rejected. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000445
/// \brief Rounds each value in a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Shorthand for _mm256_round_pd with the _MM_FROUND_CEIL rounding mode. */
#define _mm256_ceil_pd(V) (_mm256_round_pd((V), (_MM_FROUND_CEIL)))
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000462
/// \brief Rounds each value in a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/* Shorthand for _mm256_round_pd with the _MM_FROUND_FLOOR rounding mode. */
#define _mm256_floor_pd(V) (_mm256_round_pd((V), (_MM_FROUND_FLOOR)))
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000480
/// \brief Rounds each value in a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Shorthand for _mm256_round_ps with the _MM_FROUND_CEIL rounding mode. */
#define _mm256_ceil_ps(V) (_mm256_round_ps((V), (_MM_FROUND_CEIL)))
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000497
/// \brief Rounds each value in a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Shorthand for _mm256_round_ps with the _MM_FROUND_FLOOR rounding mode. */
#define _mm256_floor_ps(V) (_mm256_round_ps((V), (_MM_FROUND_FLOOR)))
514
515/* Logical */
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517///
518/// \headerfile <x86intrin.h>
519///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000520/// This intrinsic corresponds to the \c VANDPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000521///
522/// \param __a
523/// A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525/// A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000528static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000529_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000530{
Craig Topper6a77b622016-06-04 05:43:41 +0000531 return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000532}
533
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535///
536/// \headerfile <x86intrin.h>
537///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000538/// This intrinsic corresponds to the \c VANDPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000539///
540/// \param __a
541/// A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543/// A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000546static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000547_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000548{
Craig Topper6a77b622016-06-04 05:43:41 +0000549 return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000550}
551
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553/// the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000557/// This intrinsic corresponds to the \c VANDNPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000558///
559/// \param __a
560/// A 256-bit vector of [4 x double] containing the left source operand. The
561/// one's complement of this value is used in the bitwise AND.
562/// \param __b
563/// A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565/// values of the second operand and the one's complement of the first
566/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000567static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000568_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000569{
Craig Topper6a77b622016-06-04 05:43:41 +0000570 return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000571}
572
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574/// the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000578/// This intrinsic corresponds to the \c VANDNPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000579///
580/// \param __a
581/// A 256-bit vector of [8 x float] containing the left source operand. The
582/// one's complement of this value is used in the bitwise AND.
583/// \param __b
584/// A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586/// values of the second operand and the one's complement of the first
587/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000588static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000589_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000590{
Craig Topper6a77b622016-06-04 05:43:41 +0000591 return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000592}
593
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595///
596/// \headerfile <x86intrin.h>
597///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000598/// This intrinsic corresponds to the \c VORPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000599///
600/// \param __a
601/// A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603/// A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000606static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000607_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000608{
Craig Topper6a77b622016-06-04 05:43:41 +0000609 return (__m256d)((__v4du)__a | (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000610}
611
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613///
614/// \headerfile <x86intrin.h>
615///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000616/// This intrinsic corresponds to the \c VORPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000617///
618/// \param __a
619/// A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621/// A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000624static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000625_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000626{
Craig Topper6a77b622016-06-04 05:43:41 +0000627 return (__m256)((__v8su)__a | (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000628}
629
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631///
632/// \headerfile <x86intrin.h>
633///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000634/// This intrinsic corresponds to the \c VXORPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000635///
636/// \param __a
637/// A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639/// A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000642static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000643_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000644{
Craig Topper6a77b622016-06-04 05:43:41 +0000645 return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000646}
647
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649///
650/// \headerfile <x86intrin.h>
651///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000652/// This intrinsic corresponds to the \c VXORPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000653///
654/// \param __a
655/// A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657/// A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000660static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000661_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000662{
Craig Topper6a77b622016-06-04 05:43:41 +0000663 return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000664}
665
666/* Horizontal arithmetic */
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000667/// \brief Horizontally adds the adjacent pairs of values contained in two
668/// 256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000672/// This intrinsic corresponds to the \c VHADDPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000673///
674/// \param __a
675/// A 256-bit vector of [4 x double] containing one of the source operands.
676/// The horizontal sums of the values are returned in the even-indexed
677/// elements of a vector of [4 x double].
678/// \param __b
679/// A 256-bit vector of [4 x double] containing one of the source operands.
680/// The horizontal sums of the values are returned in the odd-indexed
681/// elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000684static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000685_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000686{
David Blaikie3302f2b2013-01-16 23:08:36 +0000687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000688}
689
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000690/// \brief Horizontally adds the adjacent pairs of values contained in two
691/// 256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000695/// This intrinsic corresponds to the \c VHADDPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000696///
697/// \param __a
698/// A 256-bit vector of [8 x float] containing one of the source operands.
699/// The horizontal sums of the values are returned in the elements with
700/// index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702/// A 256-bit vector of [8 x float] containing one of the source operands.
703/// The horizontal sums of the values are returned in the elements with
704/// index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000707static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000708_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000709{
David Blaikie3302f2b2013-01-16 23:08:36 +0000710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000711}
712
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000713/// \brief Horizontally subtracts the adjacent pairs of values contained in two
714/// 256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000718/// This intrinsic corresponds to the \c VHSUBPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000719///
720/// \param __a
721/// A 256-bit vector of [4 x double] containing one of the source operands.
722/// The horizontal differences between the values are returned in the
723/// even-indexed elements of a vector of [4 x double].
724/// \param __b
725/// A 256-bit vector of [4 x double] containing one of the source operands.
726/// The horizontal differences between the values are returned in the
727/// odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000730static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000731_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000732{
David Blaikie3302f2b2013-01-16 23:08:36 +0000733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000734}
735
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000736/// \brief Horizontally subtracts the adjacent pairs of values contained in two
737/// 256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000741/// This intrinsic corresponds to the \c VHSUBPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000742///
743/// \param __a
744/// A 256-bit vector of [8 x float] containing one of the source operands.
745/// The horizontal differences between the values are returned in the
746/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748/// A 256-bit vector of [8 x float] containing one of the source operands.
749/// The horizontal differences between the values are returned in the
750/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000753static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000754_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000755{
David Blaikie3302f2b2013-01-16 23:08:36 +0000756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000757}
758
759/* Vector permutations */
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761/// by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000762///
763/// \headerfile <x86intrin.h>
764///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000765/// This intrinsic corresponds to the \c VPERMILPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000766///
767/// \param __a
768/// A 128-bit vector of [2 x double].
769/// \param __c
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000770/// \li A 128-bit integer vector operand specifying how the values are to be
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000771/// copied.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000772/// \li Bit [1]:
773/// \li 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000774/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000775/// \li 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000776/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000777/// \li Bit [65]:
778/// \li 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000779/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000780/// \li 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000781/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000782/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000783static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000784_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000785{
David Blaikie3302f2b2013-01-16 23:08:36 +0000786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000787}
788
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000789/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000790/// specified by the 256-bit integer vector operand.
791///
792/// \headerfile <x86intrin.h>
793///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000794/// This intrinsic corresponds to the \c VPERMILPD instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000795///
796/// \param __a
797/// A 256-bit vector of [4 x double].
798/// \param __c
799/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000800/// copied. \n
801/// Bit [1]: \n
802/// \li 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000803/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000804/// \li 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000805/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000806/// Bit [65]: \n
807/// \li 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000808/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000809/// \li 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000810/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000811/// Bit [129]: \n
812/// \li 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000813/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000814/// \li 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000815/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000816/// Bit [193]: \n
817/// \li 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000818/// returned vector.
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000819/// \li 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000820/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000821/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000822static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000823_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000824{
David Blaikie3302f2b2013-01-16 23:08:36 +0000825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000826}
827
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
829/// specified by the 128-bit integer vector operand.
830///
831/// \headerfile <x86intrin.h>
832///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000833/// This intrinsic corresponds to the \c VPERMILPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000834///
835/// \param __a
836/// A 128-bit vector of [4 x float].
837/// \param __c
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000838/// <ul>
839/// <li> A 128-bit integer vector operand specifying how the values are to be
840/// copied.
841/// </li>
842/// <li> Bits [1:0]:
843/// <ul>
844/// <li> 00: Bits [31:0] of the source are copied to bits [31:0] of the
845/// returned vector. </li>
846/// <li> 01: Bits [63:32] of the source are copied to bits [31:0] of the
847/// returned vector. </li>
848/// <li> 10: Bits [95:64] of the source are copied to bits [31:0] of the
849/// returned vector. </li>
850/// <li> 11: Bits [127:96] of the source are copied to bits [31:0] of the
851/// returned vector. </li>
852/// </ul>
853/// </li>
854/// <li> Bits [33:32]:
855/// <ul>
856/// <li> 00: Bits [31:0] of the source are copied to bits [63:32] of the
857/// returned vector. </li>
858/// <li> 01: Bits [63:32] of the source are copied to bits [63:32] of the
859/// returned vector. </li>
860/// <li> 10: Bits [95:64] of the source are copied to bits [63:32] of the
861/// returned vector. </li>
862/// <li> 11: Bits [127:96] of the source are copied to bits [63:32] of the
863/// returned vector. </li>
864/// </ul>
865/// </li>
866/// <li> Bits [65:64]:
867/// <ul>
868/// <li> 00: Bits [31:0] of the source are copied to bits [95:64] of the
869/// returned vector. </li>
870/// <li> 01: Bits [63:32] of the source are copied to bits [95:64] of the
871/// returned vector. </li>
872/// <li> 10: Bits [95:64] of the source are copied to bits [95:64] of the
873/// returned vector. </li>
874/// <li> 11: Bits [127:96] of the source are copied to bits [95:64] of the
875/// returned vector. </li>
876/// </ul>
877/// </li>
878/// <li> Bits [97:96]:
879/// <ul>
880/// <li> 00: Bits [31:0] of the source are copied to bits [127:96] of the
881/// returned vector. </li>
882/// <li> 01: Bits [63:32] of the source are copied to bits [127:96] of the
883/// returned vector. </li>
884/// <li> 10: Bits [95:64] of the source are copied to bits [127:96] of the
885/// returned vector. </li>
886/// <li> 11: Bits [127:96] of the source are copied to bits [127:96] of the
887/// returned vector. </li>
888/// <li> 11: Bits [127:96] of the source are copied to bits [95:64] of the
889/// returned vector. </li>
890/// </ul>
891/// </li>
892/// </ul>
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000893/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000894static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000895_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000896{
David Blaikie3302f2b2013-01-16 23:08:36 +0000897 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000898}
899
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000900/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
901/// specified by the 256-bit integer vector operand.
902///
903/// \headerfile <x86intrin.h>
904///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000905/// This intrinsic corresponds to the \c VPERMILPS instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000906///
907/// \param __a
908/// A 256-bit vector of [8 x float].
909/// \param __c
910/// A 256-bit integer vector operand specifying how the values are to be
911/// copied.
912/// Bits [1:0]:
913/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000914/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000915/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000916/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000917/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000918/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000919/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000920/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000921/// Bits [33:32]:
922/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000923/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000924/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000925/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000926/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000927/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000928/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000929/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000930/// Bits [65:64]:
931/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000932/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000933/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000934/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000935/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000936/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000937/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000938/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000939/// Bits [97:96]:
940/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000941/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000942/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000943/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000944/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000945/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000946/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000947/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000948/// Bits [129:128]:
949/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000950/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000951/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000952/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000953/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000954/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000955/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000956/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000957/// Bits [161:160]:
958/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000959/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000960/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000961/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000962/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000963/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000964/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000965/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000966/// Bits [193:192]:
967/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000968/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000969/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000970/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000971/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000972/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000973/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000974/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000975/// Bits [225:224]:
976/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000977/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000978/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000979/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000980/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000981/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000982/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000983/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000984/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000985static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000986_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000987{
Craig Topper9fee8ab2015-01-31 06:33:59 +0000988 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000989}
990
/// \brief Copies values from a 128-bit vector of [2 x double] to the result,
///    with an immediate integer operand choosing the source of each copy.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
///
/// \param A
///    A 128-bit vector of [2 x double] providing the values to copy.
/// \param C
///    An immediate integer operand that controls the copy. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] holding the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
      (__v2df)(__m128d)(A), \
      (__v2df)_mm_undefined_pd(), \
      (C) & 0x1, \
      ((C) >> 1) & 0x1); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001023
/// \brief Copies values from a 256-bit vector of [4 x double] to the result,
///    with an immediate integer operand choosing the source of each copy.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
///
/// \param A
///    A 256-bit vector of [4 x double] providing the values to copy.
/// \param C
///    An immediate integer operand that controls the copy. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] holding the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(A), \
      (__v4df)_mm256_undefined_pd(), \
      ((C) & 0x1), \
      (((C) >> 1) & 0x1), \
      2 + (((C) >> 2) & 0x1), \
      2 + (((C) >> 3) & 0x1)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001067
/// \brief Copies the values in a 128-bit vector of [4 x float] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Bits [1:0]:
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector.
///    Bits [3:2]:
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector.
///    Bits [5:4]:
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector.
///    Bits [7:6]:
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  /* Each 2-bit field of C is an element index in [0, 3], so only A is    */ \
  /* ever read and the second shuffle operand may be left undefined.      */ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
                                  (__v4sf)_mm_undefined_ps(), \
                                  (((C) >> 0) & 0x3), \
                                  (((C) >> 2) & 0x3), \
                                  (((C) >> 4) & 0x3), \
                                  (((C) >> 6) & 0x3)); })

/// \brief Copies the values in a 256-bit vector of [8 x float] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Each 2-bit field is applied twice: once to the lower 128 bits and once
///    to the upper 128 bits of the source.
///    For the lower 128 bits:
///    Bits [1:0]:
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector.
///    Bits [3:2]:
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector.
///    Bits [5:4]:
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector.
///    Bits [7:6]:
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
///    For the upper 128 bits, the same fields are used:
///    Bits [1:0]:
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector.
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector.
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector.
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector.
///    Bits [3:2]:
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector.
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector.
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector.
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector.
///    Bits [5:4]:
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector.
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector.
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector.
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector.
///    Bits [7:6]:
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector.
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector.
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector.
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  /* Result elements 0-3 index into source elements 0-3 and elements 4-7  */ \
  /* into source elements 4-7, reusing the same four 2-bit fields of C.   */ \
  /* Every index stays in [0, 7], so only A is ever read.                 */ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                  (__v8sf)_mm256_undefined_ps(), \
                                  0 + (((C) >> 0) & 0x3), \
                                  0 + (((C) >> 2) & 0x3), \
                                  0 + (((C) >> 4) & 0x3), \
                                  0 + (((C) >> 6) & 0x3), \
                                  4 + (((C) >> 0) & 0x3), \
                                  4 + (((C) >> 2) & 0x3), \
                                  4 + (((C) >> 4) & 0x3), \
                                  4 + (((C) >> 6) & 0x3)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted.
///    Bits [1:0]:
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination.
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination.
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination.
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination.
///    Bits [5:4]:
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///    In the \c VPERM2F128 instruction definition, bit [3], when set, zeroes
///    bits [127:0] of the destination and bit [7], when set, zeroes bits
///    [255:128] of the destination, overriding the selections above.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                           (__v4df)(__m256d)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted.
///    Bits [1:0]:
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination.
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination.
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination.
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination.
///    Bits [5:4]:
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///    In the \c VPERM2F128 instruction definition, bit [3], when set, zeroes
///    bits [127:0] of the destination and bit [7], when set, zeroes bits
///    [255:128] of the destination, overriding the selections above.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be copied.
///    Bits [1:0]:
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination.
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination.
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination.
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination.
///    Bits [5:4]:
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///    In the \c VPERM2F128 instruction definition, bit [3], when set, zeroes
///    bits [127:0] of the destination and bit [7], when set, zeroes bits
///    [255:128] of the destination, overriding the selections above.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (M)); })

/* Vector Blend */
/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VBLENDPD instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 name elements of V1 and 4-7 name elements of V2, */ \
  /* so mask bit i chooses between index i and index i + 4.               */ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                   (__v4df)(__m256d)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VBLENDPS instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-7 name elements of V1 and 8-15 name elements of V2, */ \
  /* so mask bit i chooses between index i and index i + 8.                */ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })

Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001417/// \brief Merges 64-bit double-precision data values stored in either of the
1418/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1419/// operand.
1420///
1421/// \headerfile <x86intrin.h>
1422///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001423/// This intrinsic corresponds to the \c VBLENDVPD instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001424///
1425/// \param __a
1426/// A 256-bit vector of [4 x double].
1427/// \param __b
1428/// A 256-bit vector of [4 x double].
1429/// \param __c
1430/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1431/// how the values are to be copied. The position of the mask bit corresponds
1432/// to the most significant bit of a copied value. When a mask bit is 0, the
1433/// corresponding 64-bit element in operand __a is copied to the same
1434/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001435/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001436/// destination.
1437/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001438static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001439_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001440{
David Blaikie3302f2b2013-01-16 23:08:36 +00001441 return (__m256d)__builtin_ia32_blendvpd256(
1442 (__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001443}
1444
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001445/// \brief Merges 32-bit single-precision data values stored in either of the
1446/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1447/// operand.
1448///
1449/// \headerfile <x86intrin.h>
1450///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001451/// This intrinsic corresponds to the \c VBLENDVPS instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001452///
1453/// \param __a
1454/// A 256-bit vector of [8 x float].
1455/// \param __b
1456/// A 256-bit vector of [8 x float].
1457/// \param __c
1458/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1459/// and 31 specifying how the values are to be copied. The position of the
1460/// mask bit corresponds to the most significant bit of a copied value. When
1461/// a mask bit is 0, the corresponding 32-bit element in operand __a is
1462/// copied to the same position in the destination. When a mask bit is 1, the
1463/// corresponding 32-bit element in operand __b is copied to the same
1464/// position in the destination.
1465/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001466static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001467_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001468{
David Blaikie5bb70032013-01-16 23:13:42 +00001469 return (__m256)__builtin_ia32_blendvps256(
David Blaikie3302f2b2013-01-16 23:08:36 +00001470 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001471}
1472
/* Vector Dot Product */
/// \brief Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result. The immediate integer operand controls which
///    input elements will contribute to the dot product, and where the final
///    results are returned. In general, for each dot product, the four
///    corresponding elements of the input vectors are multiplied; the first
///    two and second two products are summed, then the two sums are added to
///    form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VDPPS instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })

/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand. The four selected elements in
///    each operand are copied to the destination according to the bits
///    specified in the immediate operand. The selected elements from the first
///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
///    destination, and the selected elements from the second 256-bit operand
///    are copied to bits [127:64] and bits [255:192] of the destination. For
///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
///    the 256-bit destination vector would contain the following values: b[7],
///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b.
///    Bits [3:0] specify the values copied from operand \a a.
///    Bits [7:4] specify the values copied from operand \a b.
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below:
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///      destination.
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///      destination.
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///      destination.
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///      the destination.
///    Bit value assignments:
///    00: Bits [31:0] and [159:128] are copied from the selected operand.
///    01: Bits [63:32] and [191:160] are copied from the selected operand.
///    10: Bits [95:64] and [223:192] are copied from the selected operand.
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  /* Shuffle indices 0-7 name elements of a and 8-15 name elements of b.  */ \
  /* Bases 0 and 8 address the lower 128-bit halves, 4 and 12 the upper.  */ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), \
                                  0  + (((mask) >> 0) & 0x3), \
                                  0  + (((mask) >> 2) & 0x3), \
                                  8  + (((mask) >> 4) & 0x3), \
                                  8  + (((mask) >> 6) & 0x3), \
                                  4  + (((mask) >> 0) & 0x3), \
                                  4  + (((mask) >> 2) & 0x3), \
                                  12 + (((mask) >> 4) & 0x3), \
                                  12 + (((mask) >> 6) & 0x3)); })

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] in the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
///    the destination. For example, if bits [3:0] of the immediate operand
///    contain a value of 0xF, the 256-bit destination vector would contain the
///    following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b:
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///      destination.
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///      destination.
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///      destination.
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///      destination.
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///      destination.
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///      destination.
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///      destination.
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///      destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  /* Shuffle indices 0-3 name elements of a and 4-7 name elements of b;   */ \
  /* even result elements come from a, odd result elements from b.        */ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })

/* Compare */
/* Predicate encodings for the immediate operand of the _mm*_cmp_* macros. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1657
/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each line lists the two encodings of one predicate,
///    which differ only in signaling behavior (see the \c _CMP_* constants): \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001703
/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each line lists the two encodings of one predicate,
///    which differ only in signaling behavior (see the \c _CMP_* constants): \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001739
/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each line lists the two encodings of one predicate,
///    which differ only in signaling behavior (see the \c _CMP_* constants): \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001775
/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each line lists the two encodings of one predicate,
///    which differ only in signaling behavior (see the \c _CMP_* constants): \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001811
/// \brief Compares the corresponding scalar double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    scalar result are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each line lists the two encodings of one predicate,
///    which differ only in signaling behavior (see the \c _CMP_* constants): \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001846
/// \brief Compares the corresponding scalar values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. If the result is true, all 32 bits of the scalar result are
///    set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each line lists the two encodings of one predicate,
///    which differ only in signaling behavior (see the \c _CMP_* constants): \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less-than (ordered) \n
///    02h, 12h: Less-than-or-equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not-equal (unordered) \n
///    05h, 15h: Not-less-than (unordered) \n
///    06h, 16h: Not-less-than-or-equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not-greater-than-or-equal (unordered) \n
///    0Ah, 1Ah: Not-greater-than (unordered) \n
///    0Bh, 1Bh: False (ordered) \n
///    0Ch, 1Ch: Not-equal (ordered) \n
///    0Dh, 1Dh: Greater-than-or-equal (ordered) \n
///    0Eh, 1Eh: Greater-than (ordered) \n
///    0Fh, 1Fh: True (unordered)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001881
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001882/// \brief Takes a [8 x i32] vector and returns the vector element value
1883/// indexed by the immediate constant operand.
1884///
1885/// \headerfile <x86intrin.h>
1886///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001887/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001888///
1889/// \param __a
1890/// A 256-bit vector of [8 x i32].
1891/// \param __imm
1892/// An immediate integer operand with bits [2:0] determining which vector
1893/// element is extracted and returned.
1894/// \returns A 32-bit integer containing the extracted 32 bits of extended
1895/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00001896static __inline int __DEFAULT_FN_ATTRS
Craig Topper459554f2015-01-31 06:31:30 +00001897_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001898{
David Blaikie3302f2b2013-01-16 23:08:36 +00001899 __v8si __b = (__v8si)__a;
Manman Renc94122e2013-10-23 20:33:14 +00001900 return __b[__imm & 7];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001901}
1902
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001903/// \brief Takes a [16 x i16] vector and returns the vector element value
1904/// indexed by the immediate constant operand.
1905///
1906/// \headerfile <x86intrin.h>
1907///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001908/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001909///
1910/// \param __a
1911/// A 256-bit integer vector of [16 x i16].
1912/// \param __imm
1913/// An immediate integer operand with bits [3:0] determining which vector
1914/// element is extracted and returned.
Simon Pilgrim28666ce2016-05-21 21:14:35 +00001915/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001916/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00001917static __inline int __DEFAULT_FN_ATTRS
Craig Topper459554f2015-01-31 06:31:30 +00001918_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001919{
David Blaikie3302f2b2013-01-16 23:08:36 +00001920 __v16hi __b = (__v16hi)__a;
Simon Pilgrim28666ce2016-05-21 21:14:35 +00001921 return (unsigned short)__b[__imm & 15];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001922}
1923
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001924/// \brief Takes a [32 x i8] vector and returns the vector element value
1925/// indexed by the immediate constant operand.
1926///
1927/// \headerfile <x86intrin.h>
1928///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001929/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001930///
1931/// \param __a
1932/// A 256-bit integer vector of [32 x i8].
1933/// \param __imm
1934/// An immediate integer operand with bits [4:0] determining which vector
1935/// element is extracted and returned.
Simon Pilgrim28666ce2016-05-21 21:14:35 +00001936/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
1937/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00001938static __inline int __DEFAULT_FN_ATTRS
Craig Topper459554f2015-01-31 06:31:30 +00001939_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001940{
David Blaikie3302f2b2013-01-16 23:08:36 +00001941 __v32qi __b = (__v32qi)__a;
Simon Pilgrim28666ce2016-05-21 21:14:35 +00001942 return (unsigned char)__b[__imm & 31];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001943}
1944
1945#ifdef __x86_64__
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001946/// \brief Takes a [4 x i64] vector and returns the vector element value
1947/// indexed by the immediate constant operand.
1948///
1949/// \headerfile <x86intrin.h>
1950///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001951/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001952///
1953/// \param __a
1954/// A 256-bit integer vector of [4 x i64].
1955/// \param __imm
1956/// An immediate integer operand with bits [1:0] determining which vector
1957/// element is extracted and returned.
1958/// \returns A 64-bit integer containing the extracted 64 bits of extended
1959/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00001960static __inline long long __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001961_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001962{
David Blaikie3302f2b2013-01-16 23:08:36 +00001963 __v4di __b = (__v4di)__a;
Manman Renc94122e2013-10-23 20:33:14 +00001964 return __b[__imm & 3];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001965}
1966#endif
1967
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001968/// \brief Takes a [8 x i32] vector and replaces the vector element value
1969/// indexed by the immediate constant operand by a new value. Returns the
1970/// modified vector.
1971///
1972/// \headerfile <x86intrin.h>
1973///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001974/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001975///
1976/// \param __a
1977/// A vector of [8 x i32] to be used by the insert operation.
1978/// \param __b
1979/// An integer value. The replacement value for the insert operation.
1980/// \param __imm
1981/// An immediate integer specifying the index of the vector element to be
1982/// replaced.
1983/// \returns A copy of vector __a, after replacing its element indexed by __imm
1984/// with __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00001985static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001986_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001987{
David Blaikie3302f2b2013-01-16 23:08:36 +00001988 __v8si __c = (__v8si)__a;
1989 __c[__imm & 7] = __b;
1990 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001991}
1992
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001993
1994/// \brief Takes a [16 x i16] vector and replaces the vector element value
1995/// indexed by the immediate constant operand with a new value. Returns the
1996/// modified vector.
1997///
1998/// \headerfile <x86intrin.h>
1999///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002000/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002001///
2002/// \param __a
2003/// A vector of [16 x i16] to be used by the insert operation.
2004/// \param __b
2005/// An i16 integer value. The replacement value for the insert operation.
2006/// \param __imm
2007/// An immediate integer specifying the index of the vector element to be
2008/// replaced.
2009/// \returns A copy of vector __a, after replacing its element indexed by __imm
2010/// with __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002011static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002012_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002013{
David Blaikie3302f2b2013-01-16 23:08:36 +00002014 __v16hi __c = (__v16hi)__a;
2015 __c[__imm & 15] = __b;
2016 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002017}
2018
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002019/// \brief Takes a [32 x i8] vector and replaces the vector element value
2020/// indexed by the immediate constant operand with a new value. Returns the
2021/// modified vector.
2022///
2023/// \headerfile <x86intrin.h>
2024///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002025/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002026///
2027/// \param __a
2028/// A vector of [32 x i8] to be used by the insert operation.
2029/// \param __b
2030/// An i8 integer value. The replacement value for the insert operation.
2031/// \param __imm
2032/// An immediate integer specifying the index of the vector element to be
2033/// replaced.
2034/// \returns A copy of vector __a, after replacing its element indexed by __imm
2035/// with __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002036static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002037_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002038{
David Blaikie3302f2b2013-01-16 23:08:36 +00002039 __v32qi __c = (__v32qi)__a;
2040 __c[__imm & 31] = __b;
2041 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002042}
2043
2044#ifdef __x86_64__
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002045/// \brief Takes a [4 x i64] vector and replaces the vector element value
2046/// indexed by the immediate constant operand with a new value. Returns the
2047/// modified vector.
2048///
2049/// \headerfile <x86intrin.h>
2050///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002051/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002052///
2053/// \param __a
2054/// A vector of [4 x i64] to be used by the insert operation.
2055/// \param __b
2056/// A 64-bit integer value. The replacement value for the insert operation.
2057/// \param __imm
2058/// An immediate integer specifying the index of the vector element to be
2059/// replaced.
2060/// \returns A copy of vector __a, after replacing its element indexed by __imm
2061/// with __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002062static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhasd7400292015-02-19 19:00:33 +00002063_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002064{
David Blaikie3302f2b2013-01-16 23:08:36 +00002065 __v4di __c = (__v4di)__a;
2066 __c[__imm & 3] = __b;
2067 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002068}
2069#endif
2070
2071/* Conversion */
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002072/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2073///
2074/// \headerfile <x86intrin.h>
2075///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002076/// This intrinsic corresponds to the \c VCVTDQ2PD instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002077///
2078/// \param __a
2079/// A 128-bit integer vector of [4 x i32].
2080/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002081static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002082_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002083{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002084 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002085}
2086
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002087/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2088///
2089/// \headerfile <x86intrin.h>
2090///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002091/// This intrinsic corresponds to the \c VCVTDQ2PS instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002092///
2093/// \param __a
2094/// A 256-bit integer vector.
2095/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002096static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002097_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002098{
David Blaikie3302f2b2013-01-16 23:08:36 +00002099 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002100}
2101
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002102/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2103/// [4 x float].
2104///
2105/// \headerfile <x86intrin.h>
2106///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002107/// This intrinsic corresponds to the \c VCVTPD2PS instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002108///
2109/// \param __a
2110/// A 256-bit vector of [4 x double].
2111/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002112static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002113_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002114{
David Blaikie3302f2b2013-01-16 23:08:36 +00002115 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002116}
2117
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002118/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2119///
2120/// \headerfile <x86intrin.h>
2121///
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00002122/// This intrinsic corresponds to the \c VCVTPS2DQ instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002123///
2124/// \param __a
2125/// A 256-bit vector of [8 x float].
2126/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002127static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002128_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002129{
David Blaikie3302f2b2013-01-16 23:08:36 +00002130 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002131}
2132
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002133/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2134/// x double].
2135///
2136/// \headerfile <x86intrin.h>
2137///
2138/// This intrinsic corresponds to the \c VCVTPS2PD instruction.
2139///
2140/// \param __a
2141/// A 128-bit vector of [4 x float].
2142/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002143static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002144_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002145{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002146 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002147}
2148
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002149/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2150/// x i32], truncating the result by rounding towards zero when it is
2151/// inexact.
2152///
2153/// \headerfile <x86intrin.h>
2154///
2155/// This intrinsic corresponds to the \c VCVTTPD2DQ instruction.
2156///
2157/// \param __a
2158/// A 256-bit vector of [4 x double].
2159/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002160static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002161_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002162{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002163 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002164}
2165
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002166/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit integer
2167/// vector of [4 x i32]. When a conversion is inexact, the value returned is
2168/// rounded according to the rounding control bits in the MXCSR register.
2169///
2170/// \headerfile <x86intrin.h>
2171///
2172/// This intrinsic corresponds to the \c VCVTPD2DQ instruction.
2173///
2174/// \param __a
2175/// A 256-bit vector of [4 x double].
2176/// \returns A 128-bit integer vector of [4 x i32] containing the converted
/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002177static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002178_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002179{
David Blaikie3302f2b2013-01-16 23:08:36 +00002180 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002181}
2182
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002183/// \brief Converts a 256-bit vector of [8 x float] into a 256-bit integer
/// vector of [8 x i32],
2184/// truncating the result by rounding towards zero when it is inexact.
2185///
2186/// \headerfile <x86intrin.h>
2187///
2188/// This intrinsic corresponds to the \c VCVTTPS2DQ instruction.
2189///
2190/// \param __a
2191/// A 256-bit vector of [8 x float].
2192/// \returns A 256-bit integer vector of [8 x i32] containing the converted
/// (truncated) values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002193static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002194_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002195{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002196 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002197}
2198
/// \brief Returns element 0 of a 256-bit vector of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// Implemented as a plain vector subscript; no target-specific builtin is
/// called.
///
/// \param __a
/// A 256-bit vector of [4 x double].
/// \returns The double-precision value held in element 0 of \a __a.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002199static __inline double __DEFAULT_FN_ATTRS
2200_mm256_cvtsd_f64(__m256d __a)
2201{
2202 return __a[0];
2203}
2204
/// \brief Returns element 0 of a 256-bit integer vector, viewed as a vector of
/// [8 x i32].
///
/// \headerfile <x86intrin.h>
///
/// Implemented as a plain vector subscript; no target-specific builtin is
/// called.
///
/// \param __a
/// A 256-bit integer vector.
/// \returns The 32-bit integer held in element 0 of \a __a.
2205static __inline int __DEFAULT_FN_ATTRS
2206_mm256_cvtsi256_si32(__m256i __a)
2207{
  /* Reinterpret the generic integer vector as [8 x i32] so that the subscript
   * below selects a 32-bit element. */
2208 __v8si __b = (__v8si)__a;
2209 return __b[0];
2210}
2211
/// \brief Returns element 0 of a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// Implemented as a plain vector subscript; no target-specific builtin is
/// called.
///
/// \param __a
/// A 256-bit vector of [8 x float].
/// \returns The single-precision value held in element 0 of \a __a.
2212static __inline float __DEFAULT_FN_ATTRS
2213_mm256_cvtss_f32(__m256 __a)
2214{
2215 return __a[0];
2216}
2217
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002218/* Vector replicate */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002219/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
2220/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
2221/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192]
2222/// of the return value.
2223/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128]
2224/// of the return value.
2225/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of
2226/// the return value.
2227/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2228/// return value.
2229///
2230/// \headerfile <x86intrin.h>
2231///
2232/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
2233///
2234/// \param __a
2235/// A 256-bit vector of [8 x float].
2236/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2237/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002238static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002239_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002240{
  /* Each odd index is listed twice, so every odd-indexed element of __a fills
   * both its own slot and the even slot just below it. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002241 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002242}
2243
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002244/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
2245/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
2246/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192]
2247/// of the return value.
2248/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128]
2249/// of the return value.
2250/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of
2251/// the return value.
2252/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2253/// return value.
2254///
2255/// \headerfile <x86intrin.h>
2256///
2257/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
2258///
2259/// \param __a
2260/// A 256-bit vector of [8 x float].
2261/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2262/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002263static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002264_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002265{
  /* Each even index is listed twice, so every even-indexed element of __a
   * fills both its own slot and the odd slot just above it. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002266 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002267}
2268
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002269/// \brief Moves and duplicates double-precision floating-point values from a
2270/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2271/// vector of [4 x double].
2272/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2273/// return value.
2274/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128]
2275/// of the return value.
2276///
2277/// \headerfile <x86intrin.h>
2278///
2279/// This intrinsic corresponds to the \c VMOVDDUP instruction.
2280///
2281/// \param __a
2282/// A 256-bit vector of [4 x double].
2283/// \returns A 256-bit vector of [4 x double] containing the moved and
2284/// duplicated values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002285static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002286_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002287{
  /* Elements 0 and 2 of __a are each written to two adjacent result slots. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002288 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002289}
2290
2291/* Unpack and Interleave */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002292/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
2293/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
/// Within each 128-bit half of the result, the element taken from \a __a is
/// placed below the element taken from \a __b.
2294///
2295/// \headerfile <x86intrin.h>
2296///
2297/// This intrinsic corresponds to the \c VUNPCKHPD instruction.
2298///
2299/// \param __a
2300/// A 256-bit floating-point vector of [4 x double].
2301/// Bits [127:64] are written to bits [63:0] of the return value.
2302/// Bits [255:192] are written to bits [191:128] of the return value.
2303/// \param __b
2304/// A 256-bit floating-point vector of [4 x double].
2305/// Bits [127:64] are written to bits [127:64] of the return value.
2306/// Bits [255:192] are written to bits [255:192] of the return value.
2307/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002308static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002309_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002310{
  /* Shuffle indices 0-3 select from __a and 4-7 from __b; the "+2" entries
   * repeat the lower-half pattern for the upper 128 bits. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002311 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002312}
2313
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002314/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
2315/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
/// Within each 128-bit half of the result, the element taken from \a __a is
/// placed below the element taken from \a __b.
2316///
2317/// \headerfile <x86intrin.h>
2318///
2319/// This intrinsic corresponds to the \c VUNPCKLPD instruction.
2320///
2321/// \param __a
2322/// A 256-bit floating-point vector of [4 x double].
2323/// Bits [63:0] are written to bits [63:0] of the return value.
2324/// Bits [191:128] are written to bits [191:128] of the return value.
2325/// \param __b
2326/// A 256-bit floating-point vector of [4 x double].
2327/// Bits [63:0] are written to bits [127:64] of the return value.
2328/// Bits [191:128] are written to bits [255:192] of the return value.
2329/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002330static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002331_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002332{
  /* Shuffle indices 0-3 select from __a and 4-7 from __b; the "+2" entries
   * repeat the lower-half pattern for the upper 128 bits. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002333 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002334}
2335
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002336/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2337/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2338/// vector of [8 x float]. In the result, each element taken from \a __a is
/// immediately followed by the same-numbered element of \a __b.
2339///
2340/// \headerfile <x86intrin.h>
2341///
2342/// This intrinsic corresponds to the \c VUNPCKHPS instruction.
2343///
2344/// \param __a
2345/// A 256-bit vector of [8 x float].
2346/// Bits [95:64] are written to bits [31:0] of the return value.
2347/// Bits [127:96] are written to bits [95:64] of the return value.
2348/// Bits [223:192] are written to bits [159:128] of the return value.
2349/// Bits [255:224] are written to bits [223:192] of the return value.
2350/// \param __b
2351/// A 256-bit vector of [8 x float].
2352/// Bits [95:64] are written to bits [63:32] of the return value.
2353/// Bits [127:96] are written to bits [127:96] of the return value.
2354/// Bits [223:192] are written to bits [191:160] of the return value.
2355/// Bits [255:224] are written to bits [255:224] of the return value.
2356/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002357static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002358_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002359{
  /* Shuffle indices 0-7 select from __a and 8-15 from __b; each "+1" entry
   * picks the element right after the one listed two positions earlier. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002360 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002361}
2362
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002363/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2364/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2365/// vector of [8 x float]. In the result, each element taken from \a __a is
/// immediately followed by the same-numbered element of \a __b.
2366///
2367/// \headerfile <x86intrin.h>
2368///
2369/// This intrinsic corresponds to the \c VUNPCKLPS instruction.
2370///
2371/// \param __a
2372/// A 256-bit vector of [8 x float].
2373/// Bits [31:0] are written to bits [31:0] of the return value.
2374/// Bits [63:32] are written to bits [95:64] of the return value.
2375/// Bits [159:128] are written to bits [159:128] of the return value.
2376/// Bits [191:160] are written to bits [223:192] of the return value.
2377/// \param __b
2378/// A 256-bit vector of [8 x float].
2379/// Bits [31:0] are written to bits [63:32] of the return value.
2380/// Bits [63:32] are written to bits [127:96] of the return value.
2381/// Bits [159:128] are written to bits [191:160] of the return value.
2382/// Bits [191:160] are written to bits [255:224] of the return value.
2383/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002384static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002385_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002386{
  /* Shuffle indices 0-7 select from __a and 8-15 from __b; each "+1" entry
   * picks the element right after the one listed two positions earlier. */
Craig Topper1aa231e2016-05-16 06:38:42 +00002387 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002388}
2389
2390/* Bit Test */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002391/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2392/// element-by-element comparison of the double-precision elements in the
2393/// first source vector and the corresponding elements in the second source
2394/// vector. The EFLAGS register is updated as follows:
2395/// If there is at least one pair of double-precision elements where the
2396/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2397/// ZF flag is set to 1.
2398/// If there is at least one pair of double-precision elements where the
2399/// sign-bit of the first element is 0 and the sign-bit of the second element
2400/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2401/// This intrinsic returns the value of the ZF flag.
2402///
2403/// \headerfile <x86intrin.h>
2404///
2405/// This intrinsic corresponds to the \c VTESTPD instruction.
2406///
2407/// \param __a
2408/// A 128-bit vector of [2 x double]; the first source vector.
2409/// \param __b
2410/// A 128-bit vector of [2 x double]; the second source vector.
2411/// \returns the ZF flag in the EFLAGS register: 1 if no pair of corresponding
/// elements has both sign-bits set to 1, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002412static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002413_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002414{
David Blaikie3302f2b2013-01-16 23:08:36 +00002415 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002416}
2417
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002418/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2419/// element-by-element comparison of the double-precision elements in the
2420/// first source vector and the corresponding elements in the second source
2421/// vector. The EFLAGS register is updated as follows:
2422/// If there is at least one pair of double-precision elements where the
2423/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2424/// ZF flag is set to 1.
2425/// If there is at least one pair of double-precision elements where the
2426/// sign-bit of the first element is 0 and the sign-bit of the second element
2427/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2428/// This intrinsic returns the value of the CF flag.
2429///
2430/// \headerfile <x86intrin.h>
2431///
2432/// This intrinsic corresponds to the \c VTESTPD instruction.
2433///
2434/// \param __a
2435/// A 128-bit vector of [2 x double]; the first source vector.
2436/// \param __b
2437/// A 128-bit vector of [2 x double]; the second source vector.
2438/// \returns the CF flag in the EFLAGS register: 1 if no pair of corresponding
/// elements has a sign-bit of 0 in the first element and 1 in the second,
/// otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002439static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002440_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002441{
David Blaikie3302f2b2013-01-16 23:08:36 +00002442 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002443}
2444
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002445/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2446/// element-by-element comparison of the double-precision elements in the
2447/// first source vector and the corresponding elements in the second source
2448/// vector. The EFLAGS register is updated as follows:
2449/// If there is at least one pair of double-precision elements where the
2450/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2451/// ZF flag is set to 1.
2452/// If there is at least one pair of double-precision elements where the
2453/// sign-bit of the first element is 0 and the sign-bit of the second element
2454/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2455/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2456/// otherwise it returns 0.
2457///
2458/// \headerfile <x86intrin.h>
2459///
2460/// This intrinsic corresponds to the \c VTESTPD instruction.
2461///
2462/// \param __a
2463/// A 128-bit vector of [2 x double]; the first source vector.
2464/// \param __b
2465/// A 128-bit vector of [2 x double]; the second source vector.
2466/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002467static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002468_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002469{
David Blaikie3302f2b2013-01-16 23:08:36 +00002470 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002471}
2472
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002473/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2474/// element-by-element comparison of the single-precision elements in the
2475/// first source vector and the corresponding elements in the second source
2476/// vector. The EFLAGS register is updated as follows:
2477/// If there is at least one pair of single-precision elements where the
2478/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2479/// ZF flag is set to 1.
2480/// If there is at least one pair of single-precision elements where the
2481/// sign-bit of the first element is 0 and the sign-bit of the second element
2482/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2483/// This intrinsic returns the value of the ZF flag.
2484///
2485/// \headerfile <x86intrin.h>
2486///
2487/// This intrinsic corresponds to the \c VTESTPS instruction.
2488///
2489/// \param __a
2490/// A 128-bit vector of [4 x float]; the first source vector.
2491/// \param __b
2492/// A 128-bit vector of [4 x float]; the second source vector.
2493/// \returns the ZF flag: 1 if no pair of corresponding elements has both
/// sign-bits set to 1, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002494static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002495_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002496{
David Blaikie3302f2b2013-01-16 23:08:36 +00002497 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002498}
2499
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002500/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2501/// element-by-element comparison of the single-precision elements in the
2502/// first source vector and the corresponding elements in the second source
2503/// vector. The EFLAGS register is updated as follows:
2504/// If there is at least one pair of single-precision elements where the
2505/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2506/// ZF flag is set to 1.
2507/// If there is at least one pair of single-precision elements where the
2508/// sign-bit of the first element is 0 and the sign-bit of the second element
2509/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2510/// This intrinsic returns the value of the CF flag.
2511///
2512/// \headerfile <x86intrin.h>
2513///
2514/// This intrinsic corresponds to the \c VTESTPS instruction.
2515///
2516/// \param __a
2517/// A 128-bit vector of [4 x float]; the first source vector.
2518/// \param __b
2519/// A 128-bit vector of [4 x float]; the second source vector.
2520/// \returns the CF flag: 1 if no pair of corresponding elements has a
/// sign-bit of 0 in the first element and 1 in the second, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002521static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002522_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002523{
David Blaikie3302f2b2013-01-16 23:08:36 +00002524 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002525}
2526
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002527/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2528/// element-by-element comparison of the single-precision elements in the
2529/// first source vector and the corresponding elements in the second source
2530/// vector. The EFLAGS register is updated as follows:
2531/// If there is at least one pair of single-precision elements where the
2532/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2533/// ZF flag is set to 1.
2534/// If there is at least one pair of single-precision elements where the
2535/// sign-bit of the first element is 0 and the sign-bit of the second element
2536/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2537/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2538/// otherwise it returns 0.
2539///
2540/// \headerfile <x86intrin.h>
2541///
2542/// This intrinsic corresponds to the \c VTESTPS instruction.
2543///
2544/// \param __a
2545/// A 128-bit vector of [4 x float]; the first source vector.
2546/// \param __b
2547/// A 128-bit vector of [4 x float]; the second source vector.
2548/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002549static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002550_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002551{
David Blaikie3302f2b2013-01-16 23:08:36 +00002552 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002553}
2554
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002555/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2556/// element-by-element comparison of the double-precision elements in the
2557/// first source vector and the corresponding elements in the second source
2558/// vector. The EFLAGS register is updated as follows:
2559/// If there is at least one pair of double-precision elements where the
2560/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2561/// ZF flag is set to 1.
2562/// If there is at least one pair of double-precision elements where the
2563/// sign-bit of the first element is 0 and the sign-bit of the second element
2564/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2565/// This intrinsic returns the value of the ZF flag.
2566///
2567/// \headerfile <x86intrin.h>
2568///
2569/// This intrinsic corresponds to the \c VTESTPD instruction.
2570///
2571/// \param __a
2572/// A 256-bit vector of [4 x double]; the first source vector.
2573/// \param __b
2574/// A 256-bit vector of [4 x double]; the second source vector.
2575/// \returns the ZF flag: 1 if no pair of corresponding elements has both
/// sign-bits set to 1, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002576static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002577_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002578{
David Blaikie3302f2b2013-01-16 23:08:36 +00002579 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002580}
2581
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002582/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2583/// element-by-element comparison of the double-precision elements in the
2584/// first source vector and the corresponding elements in the second source
2585/// vector. The EFLAGS register is updated as follows:
2586/// If there is at least one pair of double-precision elements where the
2587/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2588/// ZF flag is set to 1.
2589/// If there is at least one pair of double-precision elements where the
2590/// sign-bit of the first element is 0 and the sign-bit of the second element
2591/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2592/// This intrinsic returns the value of the CF flag.
2593///
2594/// \headerfile <x86intrin.h>
2595///
2596/// This intrinsic corresponds to the \c VTESTPD instruction.
2597///
2598/// \param __a
2599/// A 256-bit vector of [4 x double]; the first source vector.
2600/// \param __b
2601/// A 256-bit vector of [4 x double]; the second source vector.
2602/// \returns the CF flag: 1 if no pair of corresponding elements has a
/// sign-bit of 0 in the first element and 1 in the second, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002603static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002604_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002605{
David Blaikie3302f2b2013-01-16 23:08:36 +00002606 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002607}
2608
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002609/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2610/// element-by-element comparison of the double-precision elements in the
2611/// first source vector and the corresponding elements in the second source
2612/// vector. The EFLAGS register is updated as follows:
2613/// If there is at least one pair of double-precision elements where the
2614/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2615/// ZF flag is set to 1.
2616/// If there is at least one pair of double-precision elements where the
2617/// sign-bit of the first element is 0 and the sign-bit of the second element
2618/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2619/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2620/// otherwise it returns 0.
2621///
2622/// \headerfile <x86intrin.h>
2623///
2624/// This intrinsic corresponds to the \c VTESTPD instruction.
2625///
2626/// \param __a
2627/// A 256-bit vector of [4 x double]; the first source vector.
2628/// \param __b
2629/// A 256-bit vector of [4 x double]; the second source vector.
2630/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002631static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002632_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002633{
David Blaikie3302f2b2013-01-16 23:08:36 +00002634 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002635}
2636
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002637/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2638/// element-by-element comparison of the single-precision elements in the
2639/// first source vector and the corresponding elements in the second source
2640/// vector. The EFLAGS register is updated as follows:
2641/// If there is at least one pair of single-precision elements where the
2642/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2643/// ZF flag is set to 1.
2644/// If there is at least one pair of single-precision elements where the
2645/// sign-bit of the first element is 0 and the sign-bit of the second element
2646/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2647/// This intrinsic returns the value of the ZF flag.
2648///
2649/// \headerfile <x86intrin.h>
2650///
2651/// This intrinsic corresponds to the \c VTESTPS instruction.
2652///
2653/// \param __a
2654/// A 256-bit vector of [8 x float]; the first source vector.
2655/// \param __b
2656/// A 256-bit vector of [8 x float]; the second source vector.
2657/// \returns the ZF flag: 1 if no pair of corresponding elements has both
/// sign-bits set to 1, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002658static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002659_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002660{
David Blaikie3302f2b2013-01-16 23:08:36 +00002661 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002662}
2663
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002664/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2665/// element-by-element comparison of the single-precision elements in the
2666/// first source vector and the corresponding elements in the second source
2667/// vector. The EFLAGS register is updated as follows:
2668/// If there is at least one pair of single-precision elements where the
2669/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2670/// ZF flag is set to 1.
2671/// If there is at least one pair of single-precision elements where the
2672/// sign-bit of the first element is 0 and the sign-bit of the second element
2673/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2674/// This intrinsic returns the value of the CF flag.
2675///
2676/// \headerfile <x86intrin.h>
2677///
2678/// This intrinsic corresponds to the \c VTESTPS instruction.
2679///
2680/// \param __a
2681/// A 256-bit vector of [8 x float]; the first source vector.
2682/// \param __b
2683/// A 256-bit vector of [8 x float]; the second source vector.
2684/// \returns the CF flag: 1 if no pair of corresponding elements has a
/// sign-bit of 0 in the first element and 1 in the second, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002685static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002686_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002687{
David Blaikie3302f2b2013-01-16 23:08:36 +00002688 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002689}
2690
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002691/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2692/// element-by-element comparison of the single-precision elements in the
2693/// first source vector and the corresponding elements in the second source
2694/// vector. The EFLAGS register is updated as follows:
2695/// If there is at least one pair of single-precision elements where the
2696/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2697/// ZF flag is set to 1.
2698/// If there is at least one pair of single-precision elements where the
2699/// sign-bit of the first element is 0 and the sign-bit of the second element
2700/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
2701/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2702/// otherwise it returns 0.
2703///
2704/// \headerfile <x86intrin.h>
2705///
2706/// This intrinsic corresponds to the \c VTESTPS instruction.
2707///
2708/// \param __a
2709/// A 256-bit vector of [8 x float]; the first source vector.
2710/// \param __b
2711/// A 256-bit vector of [8 x float]; the second source vector.
2712/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002713static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002714_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002715{
David Blaikie3302f2b2013-01-16 23:08:36 +00002716 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002717}
2718
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002719/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2720/// of the two source vectors and update the EFLAGS register as follows:
2721/// If there is at least one pair of bits where both bits are 1, the ZF flag
2722/// is set to 0. Otherwise the ZF flag is set to 1.
2723/// If there is at least one pair of bits where the bit from the first source
2724/// vector is 0 and the bit from the second source vector is 1, the CF flag
2725/// is set to 0. Otherwise the CF flag is set to 1.
2726/// This intrinsic returns the value of the ZF flag.
2727///
2728/// \headerfile <x86intrin.h>
2729///
2730/// This intrinsic corresponds to the \c VPTEST instruction.
2731///
2732/// \param __a
2733/// A 256-bit integer vector; the first source vector.
2734/// \param __b
2735/// A 256-bit integer vector; the second source vector.
2736/// \returns the ZF flag: 1 if no bit position holds a 1 in both source
/// vectors, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002737static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002738_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002739{
David Blaikie3302f2b2013-01-16 23:08:36 +00002740 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002741}
2742
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002743/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2744/// of the two source vectors and update the EFLAGS register as follows:
2745/// If there is at least one pair of bits where both bits are 1, the ZF flag
2746/// is set to 0. Otherwise the ZF flag is set to 1.
2747/// If there is at least one pair of bits where the bit from the first source
2748/// vector is 0 and the bit from the second source vector is 1, the CF flag
2749/// is set to 0. Otherwise the CF flag is set to 1.
2750/// This intrinsic returns the value of the CF flag.
2751///
2752/// \headerfile <x86intrin.h>
2753///
2754/// This intrinsic corresponds to the \c VPTEST instruction.
2755///
2756/// \param __a
2757/// A 256-bit integer vector; the first source vector.
2758/// \param __b
2759/// A 256-bit integer vector; the second source vector.
2760/// \returns the CF flag: 1 if no bit position holds a 0 in the first source
/// vector and a 1 in the second, otherwise 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002761static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002762_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002763{
David Blaikie3302f2b2013-01-16 23:08:36 +00002764 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002765}
2766
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002767/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
2768/// of the two source vectors and update the EFLAGS register as follows:
2769/// If there is at least one pair of bits where both bits are 1, the ZF flag
2770/// is set to 0. Otherwise the ZF flag is set to 1.
2771/// If there is at least one pair of bits where the bit from the first source
2772/// vector is 0 and the bit from the second source vector is 1, the CF flag
2773/// is set to 0. Otherwise the CF flag is set to 1.
2774/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2775/// otherwise it returns 0.
2776///
2777/// \headerfile <x86intrin.h>
2778///
2779/// This intrinsic corresponds to the \c VPTEST instruction.
2780///
2781/// \param __a
2782/// A 256-bit integer vector.
2783/// \param __b
2784/// A 256-bit integer vector.
2785/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002786static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002787_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002788{
David Blaikie3302f2b2013-01-16 23:08:36 +00002789 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002790}
2791
2792/* Vector extract sign mask */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002793/// \brief Extracts the sign bits of double-precision floating point elements
2794/// in a 256-bit vector of [4 x double] and writes them to the lower order
2795/// bits of the return value.
2796///
2797/// \headerfile <x86intrin.h>
2798///
2799/// This intrinsic corresponds to the \c VMOVMSKPD instruction.
2800///
2801/// \param __a
2802/// A 256-bit vector of [4 x double] containing the double-precision
2803/// floating point values with sign bits to be extracted.
2804/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002805static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002806_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002807{
David Blaikie3302f2b2013-01-16 23:08:36 +00002808 return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002809}
2810
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002811/// \brief Extracts the sign bits of double-precision floating point elements
2812/// in a 256-bit vector of [8 x float] and writes them to the lower order
2813/// bits of the return value.
2814///
2815/// \headerfile <x86intrin.h>
2816///
2817/// This intrinsic corresponds to the \c VMOVMSKPS instruction.
2818///
2819/// \param __a
2820/// A 256-bit vector of [8 x float] containing the double-precision floating
2821/// point values with sign bits to be extracted.
2822/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002823static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002824_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002825{
David Blaikie3302f2b2013-01-16 23:08:36 +00002826 return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002827}
2828
/* Vector zero */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002830/// \brief Zeroes the contents of all XMM or YMM registers.
2831///
2832/// \headerfile <x86intrin.h>
2833///
2834/// This intrinsic corresponds to the \c VZEROALL instruction.
Michael Kupersteine45af542015-06-30 13:36:19 +00002835static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002836_mm256_zeroall(void)
2837{
2838 __builtin_ia32_vzeroall();
2839}
2840
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002841/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2842///
2843/// \headerfile <x86intrin.h>
2844///
2845/// This intrinsic corresponds to the \c VZEROUPPER instruction.
Michael Kupersteine45af542015-06-30 13:36:19 +00002846static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002847_mm256_zeroupper(void)
2848{
2849 __builtin_ia32_vzeroupper();
2850}
2851
2852/* Vector load with broadcast */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002853/// \brief Loads a scalar single-precision floating point value from the
2854/// specified address pointed to by __a and broadcasts it to the elements of
2855/// a [4 x float] vector.
2856///
2857/// \headerfile <x86intrin.h>
2858///
2859/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2860///
2861/// \param __a
2862/// The single-precision floating point value to be broadcast.
2863/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2864/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00002865static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002866_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002867{
Adam Nemet286ae082014-05-29 20:47:29 +00002868 float __f = *__a;
2869 return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002870}
2871
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002872/// \brief Loads a scalar double-precision floating point value from the
2873/// specified address pointed to by __a and broadcasts it to the elements of
2874/// a [4 x double] vector.
2875///
2876/// \headerfile <x86intrin.h>
2877///
2878/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2879///
2880/// \param __a
2881/// The double-precision floating point value to be broadcast.
2882/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
2883/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00002884static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002885_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002886{
Adam Nemet286ae082014-05-29 20:47:29 +00002887 double __d = *__a;
2888 return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002889}
2890
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002891/// \brief Loads a scalar single-precision floating point value from the
2892/// specified address pointed to by __a and broadcasts it to the elements of
2893/// a [8 x float] vector.
2894///
2895/// \headerfile <x86intrin.h>
2896///
2897/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2898///
2899/// \param __a
2900/// The single-precision floating point value to be broadcast.
2901/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
2902/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00002903static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002904_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002905{
Adam Nemet286ae082014-05-29 20:47:29 +00002906 float __f = *__a;
2907 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002908}
2909
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002910/// \brief Loads the data from a 128-bit vector of [2 x double] from the
2911/// specified address pointed to by __a and broadcasts it to 128-bit
2912/// elements in a 256-bit vector of [4 x double].
2913///
2914/// \headerfile <x86intrin.h>
2915///
2916/// This intrinsic corresponds to the \c VBROADCASTF128 instruction.
2917///
2918/// \param __a
2919/// The 128-bit vector of [2 x double] to be broadcast.
2920/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
2921/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00002922static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002923_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002924{
Craig Topper1aa231e2016-05-16 06:38:42 +00002925 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002926}
2927
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002928/// \brief Loads the data from a 128-bit vector of [4 x float] from the
2929/// specified address pointed to by __a and broadcasts it to 128-bit
2930/// elements in a 256-bit vector of [8 x float].
2931///
2932/// \headerfile <x86intrin.h>
2933///
2934/// This intrinsic corresponds to the \c VBROADCASTF128 instruction.
2935///
2936/// \param __a
2937/// The 128-bit vector of [4 x float] to be broadcast.
2938/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
2939/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00002940static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002941_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002942{
Craig Topper1aa231e2016-05-16 06:38:42 +00002943 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002944}
2945
2946/* SIMD load ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002947/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
2948/// memory location pointed to by __p into a vector of [4 x double].
2949///
2950/// \headerfile <x86intrin.h>
2951///
2952/// This intrinsic corresponds to the \c VMOVAPD instruction.
2953///
2954/// \param __p
2955/// A 32-byte aligned pointer to a memory location containing
2956/// double-precision floating point values.
2957/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002958static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002959_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002960{
David Blaikie3302f2b2013-01-16 23:08:36 +00002961 return *(__m256d *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002962}
2963
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002964/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
2965/// memory location pointed to by __p into a vector of [8 x float].
2966///
2967/// \headerfile <x86intrin.h>
2968///
2969/// This intrinsic corresponds to the \c VMOVAPS instruction.
2970///
2971/// \param __p
2972/// A 32-byte aligned pointer to a memory location containing float values.
2973/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002974static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002975_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002976{
David Blaikie3302f2b2013-01-16 23:08:36 +00002977 return *(__m256 *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002978}
2979
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002980/// \brief Loads 4 double-precision floating point values from an unaligned
2981/// memory location pointed to by __p into a vector of [4 x double].
2982///
2983/// \headerfile <x86intrin.h>
2984///
2985/// This intrinsic corresponds to the \c VMOVUPD instruction.
2986///
2987/// \param __p
2988/// A pointer to a memory location containing double-precision floating
2989/// point values.
2990/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002991static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002992_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002993{
Craig Topper9e9301a2012-01-25 04:26:17 +00002994 struct __loadu_pd {
David Blaikie3302f2b2013-01-16 23:08:36 +00002995 __m256d __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00002996 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00002997 return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002998}
2999
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003000/// \brief Loads 8 single-precision floating point values from an unaligned
3001/// memory location pointed to by __p into a vector of [8 x float].
3002///
3003/// \headerfile <x86intrin.h>
3004///
3005/// This intrinsic corresponds to the \c VMOVUPS instruction.
3006///
3007/// \param __p
3008/// A pointer to a memory location containing single-precision floating
3009/// point values.
3010/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003011static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003012_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003013{
Craig Topper9e9301a2012-01-25 04:26:17 +00003014 struct __loadu_ps {
David Blaikie3302f2b2013-01-16 23:08:36 +00003015 __m256 __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003016 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003017 return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003018}
3019
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003020/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
3021/// location pointed to by __p into elements of a 256-bit integer vector.
3022///
3023/// \headerfile <x86intrin.h>
3024///
3025/// This intrinsic corresponds to the \c VMOVDQA instruction.
3026///
3027/// \param __p
3028/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3029/// values.
3030/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003031static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003032_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003033{
David Blaikie3302f2b2013-01-16 23:08:36 +00003034 return *__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003035}
3036
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003037/// \brief Loads 256 bits of integer data from an unaligned memory location
3038/// pointed to by __p into a 256-bit integer vector.
3039///
3040/// \headerfile <x86intrin.h>
3041///
3042/// This intrinsic corresponds to the \c VMOVDQU instruction.
3043///
3044/// \param __p
3045/// A pointer to a 256-bit integer vector containing integer values.
3046/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003047static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003048_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003049{
Craig Topper9e9301a2012-01-25 04:26:17 +00003050 struct __loadu_si256 {
David Blaikie3302f2b2013-01-16 23:08:36 +00003051 __m256i __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003052 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003053 return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003054}
3055
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003056/// \brief Loads 256 bits of integer data from an unaligned memory location
3057/// pointed to by __p into a 256-bit integer vector. This intrinsic may
3058/// perform better than _mm256_loadu_si256 when the data crosses a cache
3059/// line boundary.
3060///
3061/// \headerfile <x86intrin.h>
3062///
3063/// This intrinsic corresponds to the \c VLDDQU instruction.
3064///
3065/// \param __p
3066/// A pointer to a 256-bit integer vector containing integer values.
3067/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003068static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003069_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003070{
David Blaikie3302f2b2013-01-16 23:08:36 +00003071 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003072}
3073
3074/* SIMD store ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003075/// \brief Stores double-precision floating point values from a 256-bit vector
3076/// of [4 x double] to a 32-byte aligned memory location pointed to by __p.
3077///
3078/// \headerfile <x86intrin.h>
3079///
3080/// This intrinsic corresponds to the \c VMOVAPD instruction.
3081///
3082/// \param __p
3083/// A 32-byte aligned pointer to a memory location that will receive the
3084/// double-precision floaing point values.
3085/// \param __a
3086/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003087static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003088_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003089{
David Blaikie3302f2b2013-01-16 23:08:36 +00003090 *(__m256d *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003091}
3092
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003093/// \brief Stores single-precision floating point values from a 256-bit vector
3094/// of [8 x float] to a 32-byte aligned memory location pointed to by __p.
3095///
3096/// \headerfile <x86intrin.h>
3097///
3098/// This intrinsic corresponds to the \c VMOVAPS instruction.
3099///
3100/// \param __p
3101/// A 32-byte aligned pointer to a memory location that will receive the
3102/// float values.
3103/// \param __a
3104/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003105static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003106_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003107{
David Blaikie3302f2b2013-01-16 23:08:36 +00003108 *(__m256 *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003109}
3110
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003111/// \brief Stores double-precision floating point values from a 256-bit vector
3112/// of [4 x double] to an unaligned memory location pointed to by __p.
3113///
3114/// \headerfile <x86intrin.h>
3115///
3116/// This intrinsic corresponds to the \c VMOVUPD instruction.
3117///
3118/// \param __p
3119/// A pointer to a memory location that will receive the double-precision
3120/// floating point values.
3121/// \param __a
3122/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003123static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003124_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003125{
Craig Topper09175da2016-05-30 17:10:30 +00003126 struct __storeu_pd {
3127 __m256d __v;
3128 } __attribute__((__packed__, __may_alias__));
3129 ((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003130}
3131
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003132/// \brief Stores single-precision floating point values from a 256-bit vector
3133/// of [8 x float] to an unaligned memory location pointed to by __p.
3134///
3135/// \headerfile <x86intrin.h>
3136///
3137/// This intrinsic corresponds to the \c VMOVUPS instruction.
3138///
3139/// \param __p
3140/// A pointer to a memory location that will receive the float values.
3141/// \param __a
3142/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003143static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003144_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003145{
Craig Topper09175da2016-05-30 17:10:30 +00003146 struct __storeu_ps {
3147 __m256 __v;
3148 } __attribute__((__packed__, __may_alias__));
3149 ((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003150}
3151
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003152/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
3153/// aligned memory location pointed to by __p.
3154///
3155/// \headerfile <x86intrin.h>
3156///
3157/// This intrinsic corresponds to the \c VMOVDQA instruction.
3158///
3159/// \param __p
3160/// A 32-byte aligned pointer to a memory location that will receive the
3161/// integer values.
3162/// \param __a
3163/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003164static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003165_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003166{
David Blaikie3302f2b2013-01-16 23:08:36 +00003167 *__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003168}
3169
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003170/// \brief Stores integer values from a 256-bit integer vector to an unaligned
3171/// memory location pointed to by __p.
3172///
3173/// \headerfile <x86intrin.h>
3174///
3175/// This intrinsic corresponds to the \c VMOVDQU instruction.
3176///
3177/// \param __p
3178/// A pointer to a memory location that will receive the integer values.
3179/// \param __a
3180/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003181static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003182_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003183{
Craig Topper09175da2016-05-30 17:10:30 +00003184 struct __storeu_si256 {
3185 __m256i __v;
3186 } __attribute__((__packed__, __may_alias__));
3187 ((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003188}
3189
3190/* Conditional load ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003191/// \brief Conditionally loads double-precision floating point elements
3192/// from a memory location pointed to by __p into a 128-bit vector of
3193/// [2 x double], depending on the mask bits associated with each data
3194/// element.
3195///
3196/// \headerfile <x86intrin.h>
3197///
3198/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
3199///
3200/// \param __p
3201/// A pointer to a memory location that contains the double-precision
3202/// floating point values.
3203/// \param __m
3204/// A 128-bit integer vector containing the mask. The most significant bit of
3205/// each data element represents the mask bits. If a mask bit is zero, the
3206/// corresponding value in the memory location is not loaded and the
3207/// corresponding field in the return value is set to zero.
3208/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003209static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003210_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003211{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003212 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003213}
3214
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003215/// \brief Conditionally loads double-precision floating point elements
3216/// from a memory location pointed to by __p into a 256-bit vector of
3217/// [4 x double], depending on the mask bits associated with each data
3218/// element.
3219///
3220/// \headerfile <x86intrin.h>
3221///
3222/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
3223///
3224/// \param __p
3225/// A pointer to a memory location that contains the double-precision
3226/// floating point values.
3227/// \param __m
3228/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3229/// significant bit of each quadword element represents the mask bits. If a
3230/// mask bit is zero, the corresponding value in the memory location is not
3231/// loaded and the corresponding field in the return value is set to zero.
3232/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003233static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003234_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003235{
David Blaikie3302f2b2013-01-16 23:08:36 +00003236 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003237 (__v4di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003238}
3239
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003240/// \brief Conditionally loads single-precision floating point elements
3241/// from a memory location pointed to by __p into a 128-bit vector of
3242/// [4 x float], depending on the mask bits associated with each data
3243/// element.
3244///
3245/// \headerfile <x86intrin.h>
3246///
3247/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
3248///
3249/// \param __p
3250/// A pointer to a memory location that contains the single-precision
3251/// floating point values.
3252/// \param __m
3253/// A 128-bit integer vector containing the mask. The most significant bit of
3254/// each data element represents the mask bits. If a mask bit is zero, the
3255/// corresponding value in the memory location is not loaded and the
3256/// corresponding field in the return value is set to zero.
3257/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003258static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003259_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003260{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003261 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003262}
3263
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003264/// \brief Conditionally loads single-precision floating point elements from a
3265/// memory location pointed to by __p into a 256-bit vector of [8 x float],
3266/// depending on the mask bits associated with each data element.
3267///
3268/// \headerfile <x86intrin.h>
3269///
3270/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
3271///
3272/// \param __p
3273/// A pointer to a memory location that contains the single-precision
3274/// floating point values.
3275/// \param __m
3276/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3277/// significant bit of each dword element represents the mask bits. If a mask
3278/// bit is zero, the corresponding value in the memory location is not loaded
3279/// and the corresponding field in the return value is set to zero.
3280/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003281static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003282_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003283{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003284 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003285}
3286
3287/* Conditional store ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003288/// \brief Moves single-precision floating point values from a 256-bit vector
3289/// of [8 x float] to a memory location pointed to by __p, according to the
3290/// specified mask.
3291///
3292/// \headerfile <x86intrin.h>
3293///
3294/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
3295///
3296/// \param __p
3297/// A pointer to a memory location that will receive the float values.
3298/// \param __m
3299/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3300/// significant bit of each dword element in the mask vector represents the
3301/// mask bits. If a mask bit is zero, the corresponding value from vector __a
3302/// is not stored and the corresponding field in the memory location pointed
3303/// to by __p is not changed.
3304/// \param __a
3305/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003306static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003307_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003308{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003309 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003310}
3311
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003312/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
3313/// to a memory location pointed to by __p, according to the specified mask.
3314///
3315/// \headerfile <x86intrin.h>
3316///
3317/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
3318///
3319/// \param __p
3320/// A pointer to a memory location that will receive the float values.
3321/// \param __m
3322/// A 128-bit integer vector containing the mask. The most significant bit of
3323/// each field in the mask vector represents the mask bits. If a mask bit is
3324/// zero, the corresponding value from vector __a is not stored and the
3325/// corresponding field in the memory location pointed to by __p is not
3326/// changed.
3327/// \param __a
3328/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003329static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003330_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003331{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003332 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003333}
3334
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003335/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3336/// to a memory location pointed to by __p, according to the specified mask.
3337///
3338/// \headerfile <x86intrin.h>
3339///
3340/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
3341///
3342/// \param __p
3343/// A pointer to a memory location that will receive the float values.
3344/// \param __m
3345/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3346/// significant bit of each quadword element in the mask vector represents
3347/// the mask bits. If a mask bit is zero, the corresponding value from vector
3348/// __a is not stored and the corresponding field in the memory location
3349/// pointed to by __p is not changed.
3350/// \param __a
3351/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003352static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003353_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003354{
  /* Arguments are forwarded in the same order (destination, mask, data);
     each one is only cast to the vector type named in the call
     (pointer to __v4df, __v4di, __v4df). */
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003355 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003356}
3357
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003358/// \brief Moves single-precision floating point values from a 128-bit vector
3359/// of [4 x float] to a memory location pointed to by __p, according to the
3360/// specified mask.
3361///
3362/// \headerfile <x86intrin.h>
3363///
3364/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
3365///
3366/// \param __p
3367/// A pointer to a memory location that will receive the float values.
3368/// \param __m
3369/// A 128-bit integer vector containing the mask. The most significant bit of
3370/// each field in the mask vector represents the mask bits. If a mask bit is
3371/// zero, the corresponding value from vector __a is not stored and the
3372/// corresponding field in the memory location pointed to by __p is not
3373/// changed.
3374/// \param __a
3375/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003376static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003377_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003378{
  /* Arguments are forwarded in the same order (destination, mask, data);
     each one is only cast to the vector type named in the call
     (pointer to __v4sf, __v4si, __v4sf). */
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003379 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003380}
3381
3382/* Cacheability support ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003383/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
3384/// aligned memory location. To minimize caching, the data is flagged as
3385/// non-temporal (unlikely to be used again soon).
3386///
3387/// \headerfile <x86intrin.h>
3388///
3389/// This intrinsic corresponds to the \c VMOVNTDQ instruction.
3390///
3391/// \param __a
3392/// A pointer to a 32-byte aligned memory location that will receive the
3393/// integer values.
3394/// \param __b
3395/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003396static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003397_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003398{
  /* The value to store (__b) is passed first and the destination pointer
     (__a) second, i.e. the reverse of this function's parameter order. */
Simon Pilgrimbeca5f22016-06-13 09:57:52 +00003399 __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003400}
3401
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003402/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3403/// to a 32-byte aligned memory location. To minimize caching, the data is
3404/// flagged as non-temporal (unlikely to be used again soon).
3405///
3406/// \headerfile <x86intrin.h>
3407///
3408/// This intrinsic corresponds to the \c VMOVNTPD instruction.
3409///
3410/// \param __a
3411/// A pointer to a 32-byte aligned memory location that will receive the
3412/// double-precision floating-point values.
3413/// \param __b
3414/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003415static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003416_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003417{
  /* The value to store (__b) is passed first and the destination pointer
     (__a) second, i.e. the reverse of this function's parameter order.
     The scalar double pointer is reinterpreted as a pointer to __v4df. */
Simon Pilgrimbeca5f22016-06-13 09:57:52 +00003418 __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003419}
3420
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003421/// \brief Moves single-precision floating point values from a 256-bit vector
3422/// of [8 x float] to a 32-byte aligned memory location. To minimize
3423/// caching, the data is flagged as non-temporal (unlikely to be used again
3424/// soon).
3425///
3426/// \headerfile <x86intrin.h>
3427///
3428/// This intrinsic corresponds to the \c VMOVNTPS instruction.
3429///
3430/// \param __p
3431/// A pointer to a 32-byte aligned memory location that will receive the
3432/// single-precision floating point values.
3433/// \param __a
3434/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003435static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003436_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003437{
  /* The value to store (__a) is passed first and the destination pointer
     (__p) second, i.e. the reverse of this function's parameter order.
     The scalar float pointer is reinterpreted as a pointer to __v8sf. */
Simon Pilgrimbeca5f22016-06-13 09:57:52 +00003438 __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003439}
3440
3441/* Create vectors */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003442/// \brief Create a 256-bit vector of [4 x double] with undefined values.
3443///
3444/// \headerfile <x86intrin.h>
3445///
3446/// This intrinsic has no corresponding instruction.
3447///
3448/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003449static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003450_mm256_undefined_pd(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003451{
  /* The builtin's result is cast to the declared return type (__m256d).
     The documented contract is that the element values are undefined. */
3452 return (__m256d)__builtin_ia32_undef256();
3453}
3454
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003455/// \brief Create a 256-bit vector of [8 x float] with undefined values.
3456///
3457/// \headerfile <x86intrin.h>
3458///
3459/// This intrinsic has no corresponding instruction.
3460///
3461/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003462static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003463_mm256_undefined_ps(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003464{
  /* The builtin's result is cast to the declared return type (__m256).
     The documented contract is that the element values are undefined. */
3465 return (__m256)__builtin_ia32_undef256();
3466}
3467
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003468/// \brief Create a 256-bit integer vector with undefined values.
3469///
3470/// \headerfile <x86intrin.h>
3471///
3472/// This intrinsic has no corresponding instruction.
3473///
3474/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003475static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003476_mm256_undefined_si256(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003477{
  /* The builtin's result is cast to the declared return type (__m256i).
     The documented contract is that the element values are undefined. */
3478 return (__m256i)__builtin_ia32_undef256();
3479}
3480
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003481/// \brief Constructs a 256-bit floating-point vector of [4 x double]
3482/// initialized with the specified double-precision floating-point values.
3483///
3484/// \headerfile <x86intrin.h>
3485///
3486/// This intrinsic corresponds to the \c VUNPCKLPD+VINSERTF128 instruction.
3487///
3488/// \param __a
3489/// A double-precision floating-point value used to initialize bits [255:192]
3490/// of the result.
3491/// \param __b
3492/// A double-precision floating-point value used to initialize bits [191:128]
3493/// of the result.
3494/// \param __c
3495/// A double-precision floating-point value used to initialize bits [127:64]
3496/// of the result.
3497/// \param __d
3498/// A double-precision floating-point value used to initialize bits [63:0]
3499/// of the result.
3500/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003501static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003502_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003503{
  /* Initializer elements are listed lowest bits first, so the arguments
     appear in reverse: __d fills bits [63:0] and __a bits [255:192]. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003504 return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003505}
3506
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003507/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
3508/// with the specified single-precision floating-point values.
3509///
3510/// \headerfile <x86intrin.h>
3511///
3512/// This intrinsic is a utility function and does not correspond to a specific
3513/// instruction.
3514///
3515/// \param __a
3516/// A single-precision floating-point value used to initialize bits [255:224]
3517/// of the result.
3518/// \param __b
3519/// A single-precision floating-point value used to initialize bits [223:192]
3520/// of the result.
3521/// \param __c
3522/// A single-precision floating-point value used to initialize bits [191:160]
3523/// of the result.
3524/// \param __d
3525/// A single-precision floating-point value used to initialize bits [159:128]
3526/// of the result.
3527/// \param __e
3528/// A single-precision floating-point value used to initialize bits [127:96]
3529/// of the result.
3530/// \param __f
3531/// A single-precision floating-point value used to initialize bits [95:64]
3532/// of the result.
3533/// \param __g
3534/// A single-precision floating-point value used to initialize bits [63:32]
3535/// of the result.
3536/// \param __h
3537/// A single-precision floating-point value used to initialize bits [31:0]
3538/// of the result.
3539/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003540static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003541_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003542 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003543{
  /* Initializer elements are listed lowest bits first, so the arguments
     appear in reverse: __h fills bits [31:0] and __a bits [255:224]. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003544 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003545}
3546
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003547/// \brief Constructs a 256-bit integer vector initialized with the specified
3548/// 32-bit integral values.
3549///
3550/// \headerfile <x86intrin.h>
3551///
3552/// This intrinsic is a utility function and does not correspond to a specific
3553/// instruction.
3554///
3555/// \param __i0
3556/// A 32-bit integral value used to initialize bits [255:224] of the result.
3557/// \param __i1
3558/// A 32-bit integral value used to initialize bits [223:192] of the result.
3559/// \param __i2
3560/// A 32-bit integral value used to initialize bits [191:160] of the result.
3561/// \param __i3
3562/// A 32-bit integral value used to initialize bits [159:128] of the result.
3563/// \param __i4
3564/// A 32-bit integral value used to initialize bits [127:96] of the result.
3565/// \param __i5
3566/// A 32-bit integral value used to initialize bits [95:64] of the result.
3567/// \param __i6
3568/// A 32-bit integral value used to initialize bits [63:32] of the result.
3569/// \param __i7
3570/// A 32-bit integral value used to initialize bits [31:0] of the result.
3571/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003572static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003573_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003574 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003575{
  /* Initializer elements are listed lowest bits first, so the arguments
     appear in reverse: __i7 fills bits [31:0] and __i0 bits [255:224].
     The __v8si literal is then cast to the generic __m256i type. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003576 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003577}
3578
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003579/// \brief Constructs a 256-bit integer vector initialized with the specified
3580/// 16-bit integral values.
3581///
3582/// \headerfile <x86intrin.h>
3583///
3584/// This intrinsic is a utility function and does not correspond to a specific
3585/// instruction.
3586///
3587/// \param __w15
3588/// A 16-bit integral value used to initialize bits [255:240] of the result.
3589/// \param __w14
3590/// A 16-bit integral value used to initialize bits [239:224] of the result.
3591/// \param __w13
3592/// A 16-bit integral value used to initialize bits [223:208] of the result.
3593/// \param __w12
3594/// A 16-bit integral value used to initialize bits [207:192] of the result.
3595/// \param __w11
3596/// A 16-bit integral value used to initialize bits [191:176] of the result.
3597/// \param __w10
3598/// A 16-bit integral value used to initialize bits [175:160] of the result.
3599/// \param __w09
3600/// A 16-bit integral value used to initialize bits [159:144] of the result.
3601/// \param __w08
3602/// A 16-bit integral value used to initialize bits [143:128] of the result.
3603/// \param __w07
3604/// A 16-bit integral value used to initialize bits [127:112] of the result.
3605/// \param __w06
3606/// A 16-bit integral value used to initialize bits [111:96] of the result.
3607/// \param __w05
3608/// A 16-bit integral value used to initialize bits [95:80] of the result.
3609/// \param __w04
3610/// A 16-bit integral value used to initialize bits [79:64] of the result.
3611/// \param __w03
3612/// A 16-bit integral value used to initialize bits [63:48] of the result.
3613/// \param __w02
3614/// A 16-bit integral value used to initialize bits [47:32] of the result.
3615/// \param __w01
3616/// A 16-bit integral value used to initialize bits [31:16] of the result.
3617/// \param __w00
3618/// A 16-bit integral value used to initialize bits [15:0] of the result.
3619/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003620static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003621_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003622 short __w11, short __w10, short __w09, short __w08,
3623 short __w07, short __w06, short __w05, short __w04,
3624 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003625{
  /* Elements run from __w00 (bits [15:0]) up to __w15 (bits [255:240]),
     the reverse of the parameter order. The __v16hi literal is then cast
     to the generic __m256i type. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003626 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3627 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003628}
3629
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003630/// \brief Constructs a 256-bit integer vector initialized with the specified
3631/// 8-bit integral values.
3632///
3633/// \headerfile <x86intrin.h>
3634///
3635/// This intrinsic is a utility function and does not correspond to a specific
3636/// instruction.
3637///
3638/// \param __b31
3639/// An 8-bit integral value used to initialize bits [255:248] of the result.
3640/// \param __b30
3641/// An 8-bit integral value used to initialize bits [247:240] of the result.
3642/// \param __b29
3643/// An 8-bit integral value used to initialize bits [239:232] of the result.
3644/// \param __b28
3645/// An 8-bit integral value used to initialize bits [231:224] of the result.
3646/// \param __b27
3647/// An 8-bit integral value used to initialize bits [223:216] of the result.
3648/// \param __b26
3649/// An 8-bit integral value used to initialize bits [215:208] of the result.
3650/// \param __b25
3651/// An 8-bit integral value used to initialize bits [207:200] of the result.
3652/// \param __b24
3653/// An 8-bit integral value used to initialize bits [199:192] of the result.
3654/// \param __b23
3655/// An 8-bit integral value used to initialize bits [191:184] of the result.
3656/// \param __b22
3657/// An 8-bit integral value used to initialize bits [183:176] of the result.
3658/// \param __b21
3659/// An 8-bit integral value used to initialize bits [175:168] of the result.
3660/// \param __b20
3661/// An 8-bit integral value used to initialize bits [167:160] of the result.
3662/// \param __b19
3663/// An 8-bit integral value used to initialize bits [159:152] of the result.
3664/// \param __b18
3665/// An 8-bit integral value used to initialize bits [151:144] of the result.
3666/// \param __b17
3667/// An 8-bit integral value used to initialize bits [143:136] of the result.
3668/// \param __b16
3669/// An 8-bit integral value used to initialize bits [135:128] of the result.
3670/// \param __b15
3671/// An 8-bit integral value used to initialize bits [127:120] of the result.
3672/// \param __b14
3673/// An 8-bit integral value used to initialize bits [119:112] of the result.
3674/// \param __b13
3675/// An 8-bit integral value used to initialize bits [111:104] of the result.
3676/// \param __b12
3677/// An 8-bit integral value used to initialize bits [103:96] of the result.
3678/// \param __b11
3679/// An 8-bit integral value used to initialize bits [95:88] of the result.
3680/// \param __b10
3681/// An 8-bit integral value used to initialize bits [87:80] of the result.
3682/// \param __b09
3683/// An 8-bit integral value used to initialize bits [79:72] of the result.
3684/// \param __b08
3685/// An 8-bit integral value used to initialize bits [71:64] of the result.
3686/// \param __b07
3687/// An 8-bit integral value used to initialize bits [63:56] of the result.
3688/// \param __b06
3689/// An 8-bit integral value used to initialize bits [55:48] of the result.
3690/// \param __b05
3691/// An 8-bit integral value used to initialize bits [47:40] of the result.
3692/// \param __b04
3693/// An 8-bit integral value used to initialize bits [39:32] of the result.
3694/// \param __b03
3695/// An 8-bit integral value used to initialize bits [31:24] of the result.
3696/// \param __b02
3697/// An 8-bit integral value used to initialize bits [23:16] of the result.
3698/// \param __b01
3699/// An 8-bit integral value used to initialize bits [15:8] of the result.
3700/// \param __b00
3701/// An 8-bit integral value used to initialize bits [7:0] of the result.
3702/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003703static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003704_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003705 char __b27, char __b26, char __b25, char __b24,
3706 char __b23, char __b22, char __b21, char __b20,
3707 char __b19, char __b18, char __b17, char __b16,
3708 char __b15, char __b14, char __b13, char __b12,
3709 char __b11, char __b10, char __b09, char __b08,
3710 char __b07, char __b06, char __b05, char __b04,
3711 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003712{
  /* Elements run from __b00 (bits [7:0]) up to __b31 (bits [255:248]),
     the reverse of the parameter order. The __v32qi literal is then cast
     to the generic __m256i type. */
3713 return (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00003714 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3715 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3716 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3717 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003718 };
3719}
3720
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003721/// \brief Constructs a 256-bit integer vector initialized with the specified
3722/// 64-bit integral values.
3723///
3724/// \headerfile <x86intrin.h>
3725///
3726/// This intrinsic corresponds to the \c VPUNPCKLQDQ+VINSERTF128 instruction.
3727///
3728/// \param __a
3729/// A 64-bit integral value used to initialize bits [255:192] of the result.
3730/// \param __b
3731/// A 64-bit integral value used to initialize bits [191:128] of the result.
3732/// \param __c
3733/// A 64-bit integral value used to initialize bits [127:64] of the result.
3734/// \param __d
3735/// A 64-bit integral value used to initialize bits [63:0] of the result.
3736/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003737static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003738_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003739{
  /* Initializer elements are listed lowest bits first, so the arguments
     appear in reverse: __d fills bits [63:0] and __a bits [255:192].
     The __v4di literal is then cast to the generic __m256i type. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003740 return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003741}
3742
3743/* Create vectors with elements in reverse order */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003744/// \brief Constructs a 256-bit floating-point vector of [4 x double],
3745/// initialized in reverse order with the specified double-precision
3746/// floating-point values.
3747///
3748/// \headerfile <x86intrin.h>
3749///
3750/// This intrinsic corresponds to the \c VUNPCKLPD+VINSERTF128 instruction.
3751///
3752/// \param __a
3753/// A double-precision floating-point value used to initialize bits [63:0]
3754/// of the result.
3755/// \param __b
3756/// A double-precision floating-point value used to initialize bits [127:64]
3757/// of the result.
3758/// \param __c
3759/// A double-precision floating-point value used to initialize bits [191:128]
3760/// of the result.
3761/// \param __d
3762/// A double-precision floating-point value used to initialize bits [255:192]
3763/// of the result.
3764/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003765static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003766_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003767{
  /* Elements are listed in parameter order: __a fills bits [63:0] and
     __d bits [255:192]. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003768 return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003769}
3770
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003771/// \brief Constructs a 256-bit floating-point vector of [8 x float],
3772/// initialized in reverse order with the specified single-precision
3773/// floating-point values.
3774///
3775/// \headerfile <x86intrin.h>
3776///
3777/// This intrinsic is a utility function and does not correspond to a specific
3778/// instruction.
3779///
3780/// \param __a
3781/// A single-precision floating-point value used to initialize bits [31:0]
3782/// of the result.
3783/// \param __b
3784/// A single-precision floating-point value used to initialize bits [63:32]
3785/// of the result.
3786/// \param __c
3787/// A single-precision floating-point value used to initialize bits [95:64]
3788/// of the result.
3789/// \param __d
3790/// A single-precision floating-point value used to initialize bits [127:96]
3791/// of the result.
3792/// \param __e
3793/// A single-precision floating-point value used to initialize bits [159:128]
3794/// of the result.
3795/// \param __f
3796/// A single-precision floating-point value used to initialize bits [191:160]
3797/// of the result.
3798/// \param __g
3799/// A single-precision floating-point value used to initialize bits [223:192]
3800/// of the result.
3801/// \param __h
3802/// A single-precision floating-point value used to initialize bits [255:224]
3803/// of the result.
3804/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003805static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003806_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003807 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003808{
  /* Elements are listed in parameter order: __a fills bits [31:0] and
     __h bits [255:224]. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003809 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003810}
3811
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003812/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3813/// with the specified 32-bit integral values.
3814///
3815/// \headerfile <x86intrin.h>
3816///
3817/// This intrinsic is a utility function and does not correspond to a specific
3818/// instruction.
3819///
3820/// \param __i0
3821/// A 32-bit integral value used to initialize bits [31:0] of the result.
3822/// \param __i1
3823/// A 32-bit integral value used to initialize bits [63:32] of the result.
3824/// \param __i2
3825/// A 32-bit integral value used to initialize bits [95:64] of the result.
3826/// \param __i3
3827/// A 32-bit integral value used to initialize bits [127:96] of the result.
3828/// \param __i4
3829/// A 32-bit integral value used to initialize bits [159:128] of the result.
3830/// \param __i5
3831/// A 32-bit integral value used to initialize bits [191:160] of the result.
3832/// \param __i6
3833/// A 32-bit integral value used to initialize bits [223:192] of the result.
3834/// \param __i7
3835/// A 32-bit integral value used to initialize bits [255:224] of the result.
3836/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003837static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003838_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003839 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003840{
  /* Elements are listed in parameter order: __i0 fills bits [31:0] and
     __i7 bits [255:224]. The __v8si literal is then cast to the generic
     __m256i type. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003841 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003842}
3843
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003844/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3845/// with the specified 16-bit integral values.
3846///
3847/// \headerfile <x86intrin.h>
3848///
3849/// This intrinsic is a utility function and does not correspond to a specific
3850/// instruction.
3851///
3852/// \param __w15
3853/// A 16-bit integral value used to initialize bits [15:0] of the result.
3854/// \param __w14
3855/// A 16-bit integral value used to initialize bits [31:16] of the result.
3856/// \param __w13
3857/// A 16-bit integral value used to initialize bits [47:32] of the result.
3858/// \param __w12
3859/// A 16-bit integral value used to initialize bits [63:48] of the result.
3860/// \param __w11
3861/// A 16-bit integral value used to initialize bits [79:64] of the result.
3862/// \param __w10
3863/// A 16-bit integral value used to initialize bits [95:80] of the result.
3864/// \param __w09
3865/// A 16-bit integral value used to initialize bits [111:96] of the result.
3866/// \param __w08
3867/// A 16-bit integral value used to initialize bits [127:112] of the result.
3868/// \param __w07
3869/// A 16-bit integral value used to initialize bits [143:128] of the result.
3870/// \param __w06
3871/// A 16-bit integral value used to initialize bits [159:144] of the result.
3872/// \param __w05
3873/// A 16-bit integral value used to initialize bits [175:160] of the result.
3874/// \param __w04
3875/// A 16-bit integral value used to initialize bits [191:176] of the result.
3876/// \param __w03
3877/// A 16-bit integral value used to initialize bits [207:192] of the result.
3878/// \param __w02
3879/// A 16-bit integral value used to initialize bits [223:208] of the result.
3880/// \param __w01
3881/// A 16-bit integral value used to initialize bits [239:224] of the result.
3882/// \param __w00
3883/// A 16-bit integral value used to initialize bits [255:240] of the result.
3884/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003885static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003886_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003887 short __w11, short __w10, short __w09, short __w08,
3888 short __w07, short __w06, short __w05, short __w04,
3889 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003890{
  /* Elements follow parameter order, so the first parameter, __w15, fills
     bits [15:0] and the last, __w00, fills bits [255:240]. In this function
     the numeric suffix of a parameter runs opposite to its position in the
     result. */
David Blaikie3302f2b2013-01-16 23:08:36 +00003891 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
3892 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003893}
3894
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003895/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3896/// with the specified 8-bit integral values.
3897///
3898/// \headerfile <x86intrin.h>
3899///
3900/// This intrinsic is a utility function and does not correspond to a specific
3901/// instruction.
3902///
3903/// \param __b31
3904/// An 8-bit integral value used to initialize bits [7:0] of the result.
3905/// \param __b30
3906/// An 8-bit integral value used to initialize bits [15:8] of the result.
3907/// \param __b29
3908/// An 8-bit integral value used to initialize bits [23:16] of the result.
3909/// \param __b28
3910/// An 8-bit integral value used to initialize bits [31:24] of the result.
3911/// \param __b27
3912/// An 8-bit integral value used to initialize bits [39:32] of the result.
3913/// \param __b26
3914/// An 8-bit integral value used to initialize bits [47:40] of the result.
3915/// \param __b25
3916/// An 8-bit integral value used to initialize bits [55:48] of the result.
3917/// \param __b24
3918/// An 8-bit integral value used to initialize bits [63:56] of the result.
3919/// \param __b23
3920/// An 8-bit integral value used to initialize bits [71:64] of the result.
3921/// \param __b22
3922/// An 8-bit integral value used to initialize bits [79:72] of the result.
3923/// \param __b21
3924/// An 8-bit integral value used to initialize bits [87:80] of the result.
3925/// \param __b20
3926/// An 8-bit integral value used to initialize bits [95:88] of the result.
3927/// \param __b19
3928/// An 8-bit integral value used to initialize bits [103:96] of the result.
3929/// \param __b18
3930/// An 8-bit integral value used to initialize bits [111:104] of the result.
3931/// \param __b17
3932/// An 8-bit integral value used to initialize bits [119:112] of the result.
3933/// \param __b16
3934/// An 8-bit integral value used to initialize bits [127:120] of the result.
3935/// \param __b15
3936/// An 8-bit integral value used to initialize bits [135:128] of the result.
3937/// \param __b14
3938/// An 8-bit integral value used to initialize bits [143:136] of the result.
3939/// \param __b13
3940/// An 8-bit integral value used to initialize bits [151:144] of the result.
3941/// \param __b12
3942/// An 8-bit integral value used to initialize bits [159:152] of the result.
3943/// \param __b11
3944/// An 8-bit integral value used to initialize bits [167:160] of the result.
3945/// \param __b10
3946/// An 8-bit integral value used to initialize bits [175:168] of the result.
3947/// \param __b09
3948/// An 8-bit integral value used to initialize bits [183:176] of the result.
3949/// \param __b08
3950/// An 8-bit integral value used to initialize bits [191:184] of the result.
3951/// \param __b07
3952/// An 8-bit integral value used to initialize bits [199:192] of the result.
3953/// \param __b06
3954/// An 8-bit integral value used to initialize bits [207:200] of the result.
3955/// \param __b05
3956/// An 8-bit integral value used to initialize bits [215:208] of the result.
3957/// \param __b04
3958/// An 8-bit integral value used to initialize bits [223:216] of the result.
3959/// \param __b03
3960/// An 8-bit integral value used to initialize bits [231:224] of the result.
3961/// \param __b02
3962/// An 8-bit integral value used to initialize bits [239:232] of the result.
3963/// \param __b01
3964/// An 8-bit integral value used to initialize bits [247:240] of the result.
3965/// \param __b00
3966/// An 8-bit integral value used to initialize bits [255:248] of the result.
3967/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003968static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003969_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003970 char __b27, char __b26, char __b25, char __b24,
3971 char __b23, char __b22, char __b21, char __b20,
3972 char __b19, char __b18, char __b17, char __b16,
3973 char __b15, char __b14, char __b13, char __b12,
3974 char __b11, char __b10, char __b09, char __b08,
3975 char __b07, char __b06, char __b05, char __b04,
3976 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003977{
3978 return (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00003979 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003980 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
3981 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
3982 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003983}
3984
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003985/// \brief Constructs a 256-bit integer vector, initialized in reverse order
3986/// with the specified 64-bit integral values.
3987///
3988/// \headerfile <x86intrin.h>
3989///
3990/// This intrinsic corresponds to the \c VPUNPCKLQDQ+VINSERTF128 instruction.
3991///
3992/// \param __a
3993/// A 64-bit integral value used to initialize bits [63:0] of the result.
3994/// \param __b
3995/// A 64-bit integral value used to initialize bits [127:64] of the result.
3996/// \param __c
3997/// A 64-bit integral value used to initialize bits [191:128] of the result.
3998/// \param __d
3999/// A 64-bit integral value used to initialize bits [255:192] of the result.
4000/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004001static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004002_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004003{
David Blaikie3302f2b2013-01-16 23:08:36 +00004004 return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004005}
4006
/* Create vectors with repeated elements */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004008/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
4009/// of the four double-precision floating-point vector elements set to the
4010/// specified double-precision floating-point value.
4011///
4012/// \headerfile <x86intrin.h>
4013///
4014/// This intrinsic corresponds to the \c VMOVDDUP+VINSERTF128 instruction.
4015///
4016/// \param __w
4017/// A double-precision floating-point value used to initialize each vector
4018/// element of the result.
4019/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004020static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004021_mm256_set1_pd(double __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004022{
David Blaikie3302f2b2013-01-16 23:08:36 +00004023 return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004024}
4025
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004026/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
4027/// of the eight single-precision floating-point vector elements set to the
4028/// specified single-precision floating-point value.
4029///
4030/// \headerfile <x86intrin.h>
4031///
4032/// This intrinsic corresponds to the \c VPERMILPS+VINSERTF128 instruction.
4033///
4034/// \param __w
4035/// A single-precision floating-point value used to initialize each vector
4036/// element of the result.
4037/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004038static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004039_mm256_set1_ps(float __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004040{
David Blaikie3302f2b2013-01-16 23:08:36 +00004041 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004042}
4043
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004044/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
4045/// 32-bit integral vector elements set to the specified 32-bit integral
4046/// value.
4047///
4048/// \headerfile <x86intrin.h>
4049///
4050/// This intrinsic corresponds to the \c VPERMILPS+VINSERTF128 instruction.
4051///
4052/// \param __i
4053/// A 32-bit integral value used to initialize each vector element of the
4054/// result.
4055/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kupersteine45af542015-06-30 13:36:19 +00004056static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004057_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004058{
David Blaikie3302f2b2013-01-16 23:08:36 +00004059 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004060}
4061
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004062/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
4063/// 16-bit integral vector elements set to the specified 16-bit integral
4064/// value.
4065///
4066/// \headerfile <x86intrin.h>
4067///
4068/// This intrinsic corresponds to the \c VPSHUFB+VINSERTF128 instruction.
4069///
4070/// \param __w
4071/// A 16-bit integral value used to initialize each vector element of the
4072/// result.
4073/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kupersteine45af542015-06-30 13:36:19 +00004074static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004075_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004076{
David Blaikie3302f2b2013-01-16 23:08:36 +00004077 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
4078 __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004079}
4080
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004081/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
4082/// 8-bit integral vector elements set to the specified 8-bit integral value.
4083///
4084/// \headerfile <x86intrin.h>
4085///
4086/// This intrinsic corresponds to the \c VPSHUFB+VINSERTF128 instruction.
4087///
4088/// \param __b
4089/// An 8-bit integral value used to initialize each vector element of the
4090/// result.
4091/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kupersteine45af542015-06-30 13:36:19 +00004092static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004093_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004094{
David Blaikie3302f2b2013-01-16 23:08:36 +00004095 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4096 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4097 __b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004098}
4099
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004100/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
4101/// 64-bit integral vector elements set to the specified 64-bit integral
4102/// value.
4103///
4104/// \headerfile <x86intrin.h>
4105///
4106/// This intrinsic corresponds to the \c VMOVDDUP+VINSERTF128 instruction.
4107///
4108/// \param __q
4109/// A 64-bit integral value used to initialize each vector element of the
4110/// result.
4111/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kupersteine45af542015-06-30 13:36:19 +00004112static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004113_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004114{
David Blaikie3302f2b2013-01-16 23:08:36 +00004115 return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004116}
4117
/* Create zeroed vectors */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004119/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
4120/// vector elements initialized to zero.
4121///
4122/// \headerfile <x86intrin.h>
4123///
4124/// This intrinsic corresponds to the \c VXORPS instruction.
4125///
4126/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004127static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004128_mm256_setzero_pd(void)
4129{
4130 return (__m256d){ 0, 0, 0, 0 };
4131}
4132
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004133/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
4134/// vector elements initialized to zero.
4135///
4136/// \headerfile <x86intrin.h>
4137///
4138/// This intrinsic corresponds to the \c VXORPS instruction.
4139///
4140/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004141static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004142_mm256_setzero_ps(void)
4143{
4144 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4145}
4146
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004147/// \brief Constructs a 256-bit integer vector initialized to zero.
4148///
4149/// \headerfile <x86intrin.h>
4150///
4151/// This intrinsic corresponds to the \c VXORPS instruction.
4152///
4153/// \returns A 256-bit integer vector initialized to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004154static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004155_mm256_setzero_si256(void)
4156{
4157 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
4158}
4159
/* Cast between vector types */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004161/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4162/// floating-point vector of [8 x float].
4163///
4164/// \headerfile <x86intrin.h>
4165///
4166/// This intrinsic has no corresponding instruction.
4167///
4168/// \param __a
4169/// A 256-bit floating-point vector of [4 x double].
4170/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4171/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004172static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004173_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004174{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004175 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004176}
4177
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004178/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4179/// integer vector.
4180///
4181/// \headerfile <x86intrin.h>
4182///
4183/// This intrinsic has no corresponding instruction.
4184///
4185/// \param __a
4186/// A 256-bit floating-point vector of [4 x double].
4187/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4188/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004189static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004190_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004191{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004192 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004193}
4194
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004195/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4196/// floating-point vector of [4 x double].
4197///
4198/// \headerfile <x86intrin.h>
4199///
4200/// This intrinsic has no corresponding instruction.
4201///
4202/// \param __a
4203/// A 256-bit floating-point vector of [8 x float].
4204/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4205/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004206static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004207_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004208{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004209 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004210}
4211
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004212/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4213/// integer vector.
4214///
4215/// \headerfile <x86intrin.h>
4216///
4217/// This intrinsic has no corresponding instruction.
4218///
4219/// \param __a
4220/// A 256-bit floating-point vector of [8 x float].
4221/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4222/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004223static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004224_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004225{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004226 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004227}
4228
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004229/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4230/// of [8 x float].
4231///
4232/// \headerfile <x86intrin.h>
4233///
4234/// This intrinsic has no corresponding instruction.
4235///
4236/// \param __a
4237/// A 256-bit integer vector.
4238/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4239/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004240static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004241_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004242{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004243 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004244}
4245
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004246/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4247/// of [4 x double].
4248///
4249/// \headerfile <x86intrin.h>
4250///
4251/// This intrinsic has no corresponding instruction.
4252///
4253/// \param __a
4254/// A 256-bit integer vector.
4255/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4256/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004257static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004258_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004259{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004260 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004261}
4262
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004263/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4264/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4265///
4266/// \headerfile <x86intrin.h>
4267///
4268/// This intrinsic has no corresponding instruction.
4269///
4270/// \param __a
4271/// A 256-bit floating-point vector of [4 x double].
4272/// \returns A 128-bit floating-point vector of [2 x double] containing the
4273/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004274static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004275_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004276{
Craig Topper1aa231e2016-05-16 06:38:42 +00004277 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004278}
4279
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004280/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4281/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4282///
4283/// \headerfile <x86intrin.h>
4284///
4285/// This intrinsic has no corresponding instruction.
4286///
4287/// \param __a
4288/// A 256-bit floating-point vector of [8 x float].
4289/// \returns A 128-bit floating-point vector of [4 x float] containing the
4290/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004291static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004292_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004293{
Craig Topper1aa231e2016-05-16 06:38:42 +00004294 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004295}
4296
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004297/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
4298///
4299/// \headerfile <x86intrin.h>
4300///
4301/// This intrinsic has no corresponding instruction.
4302///
4303/// \param __a
4304/// A 256-bit integer vector.
4305/// \returns A 128-bit integer vector containing the lower 128 bits of the
4306/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004307static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004308_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004309{
Craig Topper1aa231e2016-05-16 06:38:42 +00004310 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004311}
4312
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004313/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4314/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4315/// contain the value of the source vector. The contents of the upper 128
4316/// bits are undefined.
4317///
4318/// \headerfile <x86intrin.h>
4319///
4320/// This intrinsic has no corresponding instruction.
4321///
4322/// \param __a
4323/// A 128-bit vector of [2 x double].
4324/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4325/// contain the value of the parameter. The contents of the upper 128 bits
4326/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004327static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004328_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004329{
Craig Topper1aa231e2016-05-16 06:38:42 +00004330 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004331}
4332
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004333/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4334/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4335/// the value of the source vector. The contents of the upper 128 bits are
4336/// undefined.
4337///
4338/// \headerfile <x86intrin.h>
4339///
4340/// This intrinsic has no corresponding instruction.
4341///
4342/// \param __a
4343/// A 128-bit vector of [4 x float].
4344/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4345/// contain the value of the parameter. The contents of the upper 128 bits
4346/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004347static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004348_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004349{
Craig Topper1aa231e2016-05-16 06:38:42 +00004350 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004351}
4352
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004353/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4354/// The lower 128 bits contain the value of the source vector. The contents
4355/// of the upper 128 bits are undefined.
4356///
4357/// \headerfile <x86intrin.h>
4358///
4359/// This intrinsic has no corresponding instruction.
4360///
4361/// \param __a
4362/// A 128-bit integer vector.
4363/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4364/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004365static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004366_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004367{
Craig Topper1aa231e2016-05-16 06:38:42 +00004368 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004369}
Chad Rosierf8df4f42012-03-20 16:40:00 +00004370
/*
   Vector insert.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter. The immediate
///    integer parameter selects whether the upper or the lower 128 bits are
///    replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter M.
/// \param M
///    An immediate integer. Only the least significant bit is used; it
///    determines how the values from the two parameters are interleaved:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-7 select elements of V1; indices 8-11 select the */ \
  /* four elements of V2 after it has been widened to 256 bits.         */ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    (((M) & 1) ?  0 :  8), \
    (((M) & 1) ?  1 :  9), \
    (((M) & 1) ?  2 : 10), \
    (((M) & 1) ?  3 : 11), \
    (((M) & 1) ?  8 :  4), \
    (((M) & 1) ?  9 :  5), \
    (((M) & 1) ? 10 :  6), \
    (((M) & 1) ? 11 :  7) );})
4418
/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter. The immediate
///    integer parameter selects whether the upper or the lower 128 bits are
///    replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter M.
/// \param M
///    An immediate integer. Only the least significant bit is used; it
///    determines how the values from the two parameters are interleaved:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 select elements of V1; indices 4-5 select the */ \
  /* two elements of V2 after it has been widened to 256 bits.         */ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})
4457
/// \brief Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter. The immediate integer parameter
///    selects whether the upper or the lower 128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter M.
/// \param M
///    An immediate integer. Only the least significant bit is used; it
///    determines how the values from the two parameters are interleaved:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 select 64-bit elements of V1; indices 4-5 select */ \
  /* the two 64-bit elements of V2 after it has been widened to 256 bits. */ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})
4496
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only the least significant bit is used; it
///    determines which bits are extracted from the first parameter:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  /* Every shuffle index is below 8, so only elements of V are selected; */ \
  /* the second operand is never read from.                              */ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) ? 4 : 0), \
    (((M) & 1) ? 5 : 1), \
    (((M) & 1) ? 6 : 2), \
    (((M) & 1) ? 7 : 3) );})
4530
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only the least significant bit is used; it
///    determines which bits are extracted from the first parameter:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  /* Every shuffle index is below 4, so only elements of V are selected; */ \
  /* the second operand is never read from.                              */ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})
4557
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only the least significant bit is used; it
///    determines which bits are extracted from the first parameter:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  /* Every shuffle index is below 4, so only 64-bit elements of V are */ \
  /* selected; the second operand is never read from.                 */ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})
4584
/* SIMD load ops (unaligned) */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004586/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
4587/// unaligned memory locations and constructs a 256-bit floating-point vector
4588/// of [8 x float] by concatenating the two 128-bit vectors.
4589///
4590/// \headerfile <x86intrin.h>
4591///
4592/// This intrinsic corresponds to load instructions followed by the
4593/// \c VINSERTF128 instruction.
4594///
4595/// \param __addr_hi
4596/// A pointer to a 128-bit memory location containing 4 consecutive
4597/// single-precision floating-point values. These values are to be copied
4598/// to bits[255:128] of the result. The address of the memory location does
4599/// not have to be aligned.
4600/// \param __addr_lo
4601/// A pointer to a 128-bit memory location containing 4 consecutive
4602/// single-precision floating-point values. These values are to be copied
4603/// to bits[127:0] of the result. The address of the memory location does not
4604/// have to be aligned.
4605/// \returns A 256-bit floating-point vector of [8 x float] containing the
4606/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004607static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004608_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004609{
Craig Topper74b59482016-05-31 05:49:13 +00004610 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4611 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004612}
4613
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004614/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
4615/// unaligned memory locations and constructs a 256-bit floating-point vector
4616/// of [4 x double] by concatenating the two 128-bit vectors.
4617///
4618/// \headerfile <x86intrin.h>
4619///
4620/// This intrinsic corresponds to load instructions followed by the
4621/// \c VINSERTF128 instruction.
4622///
4623/// \param __addr_hi
4624/// A pointer to a 128-bit memory location containing two consecutive
4625/// double-precision floating-point values. These values are to be copied
4626/// to bits[255:128] of the result. The address of the memory location does
4627/// not have to be aligned.
4628/// \param __addr_lo
4629/// A pointer to a 128-bit memory location containing two consecutive
4630/// double-precision floating-point values. These values are to be copied
4631/// to bits[127:0] of the result. The address of the memory location does not
4632/// have to be aligned.
4633/// \returns A 256-bit floating-point vector of [4 x double] containing the
4634/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004635static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004636_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004637{
Craig Topper74b59482016-05-31 05:49:13 +00004638 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4639 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004640}
4641
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004642/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
4643/// constructs a 256-bit integer vector by concatenating the two 128-bit
4644/// vectors.
4645///
4646/// \headerfile <x86intrin.h>
4647///
4648/// This intrinsic corresponds to load instructions followed by the
4649/// \c VINSERTF128 instruction.
4650///
4651/// \param __addr_hi
4652/// A pointer to a 128-bit memory location containing a 128-bit integer
4653/// vector. This vector is to be copied to bits[255:128] of the result. The
4654/// address of the memory location does not have to be aligned.
4655/// \param __addr_lo
4656/// A pointer to a 128-bit memory location containing a 128-bit integer
4657/// vector. This vector is to be copied to bits[127:0] of the result. The
4658/// address of the memory location does not have to be aligned.
4659/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004660static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004661_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004662{
Craig Topper74b59482016-05-31 05:49:13 +00004663 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4664 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004665}
4666
/* SIMD store ops (unaligned) */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004668/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4669/// vector of [8 x float] into two different unaligned memory locations.
4670///
4671/// \headerfile <x86intrin.h>
4672///
4673/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
4674/// instructions.
4675///
4676/// \param __addr_hi
4677/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
4678/// copied to this memory location. The address of this memory location does
4679/// not have to be aligned.
4680/// \param __addr_lo
4681/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
4682/// copied to this memory location. The address of this memory location does
4683/// not have to be aligned.
4684/// \param __a
4685/// A 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004686static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004687_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004688{
David Blaikie3302f2b2013-01-16 23:08:36 +00004689 __m128 __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004690
David Blaikie3302f2b2013-01-16 23:08:36 +00004691 __v128 = _mm256_castps256_ps128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004692 _mm_storeu_ps(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004693 __v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004694 _mm_storeu_ps(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004695}
4696
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004697/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4698/// vector of [4 x double] into two different unaligned memory locations.
4699///
4700/// \headerfile <x86intrin.h>
4701///
4702/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
4703/// instructions.
4704///
4705/// \param __addr_hi
4706/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
4707/// copied to this memory location. The address of this memory location does
4708/// not have to be aligned.
4709/// \param __addr_lo
4710/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
4711/// copied to this memory location. The address of this memory location does
4712/// not have to be aligned.
4713/// \param __a
4714/// A 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004715static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004716_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004717{
David Blaikie3302f2b2013-01-16 23:08:36 +00004718 __m128d __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004719
David Blaikie3302f2b2013-01-16 23:08:36 +00004720 __v128 = _mm256_castpd256_pd128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004721 _mm_storeu_pd(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004722 __v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004723 _mm_storeu_pd(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004724}
4725
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004726/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
4727/// two different unaligned memory locations.
4728///
4729/// \headerfile <x86intrin.h>
4730///
4731/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
4732/// instructions.
4733///
4734/// \param __addr_hi
4735/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
4736/// copied to this memory location. The address of this memory location does
4737/// not have to be aligned.
4738/// \param __addr_lo
4739/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
4740/// copied to this memory location. The address of this memory location does
4741/// not have to be aligned.
4742/// \param __a
4743/// A 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004744static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004745_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004746{
David Blaikie3302f2b2013-01-16 23:08:36 +00004747 __m128i __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004748
David Blaikie3302f2b2013-01-16 23:08:36 +00004749 __v128 = _mm256_castsi256_si128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004750 _mm_storeu_si128(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004751 __v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004752 _mm_storeu_si128(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004753}
Richard Smith49e56442013-07-14 05:41:45 +00004754
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004755/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
4756/// concatenating two 128-bit floating-point vectors of [4 x float].
4757///
4758/// \headerfile <x86intrin.h>
4759///
4760/// This intrinsic corresponds to the \c VINSERTF128 instruction.
4761///
4762/// \param __hi
4763/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4764/// 128 bits of the result.
4765/// \param __lo
4766/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4767/// 128 bits of the result.
4768/// \returns A 256-bit floating-point vector of [8 x float] containing the
4769/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004770static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004771_mm256_set_m128 (__m128 __hi, __m128 __lo)
4772{
Craig Topper1aa231e2016-05-16 06:38:42 +00004773 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein76190042015-05-20 07:46:52 +00004774}
4775
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004776/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
4777/// concatenating two 128-bit floating-point vectors of [2 x double].
4778///
4779/// \headerfile <x86intrin.h>
4780///
4781/// This intrinsic corresponds to the \c VINSERTF128 instruction.
4782///
4783/// \param __hi
4784/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4785/// 128 bits of the result.
4786/// \param __lo
4787/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4788/// 128 bits of the result.
4789/// \returns A 256-bit floating-point vector of [4 x double] containing the
4790/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004791static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004792_mm256_set_m128d (__m128d __hi, __m128d __lo)
4793{
Michael Kuperstein76190042015-05-20 07:46:52 +00004794 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4795}
4796
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004797/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
4798/// integer vectors.
4799///
4800/// \headerfile <x86intrin.h>
4801///
4802/// This intrinsic corresponds to the \c VINSERTF128 instruction.
4803///
4804/// \param __hi
4805/// A 128-bit integer vector to be copied to the upper 128 bits of the
4806/// result.
4807/// \param __lo
4808/// A 128-bit integer vector to be copied to the lower 128 bits of the
4809/// result.
4810/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004811static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004812_mm256_set_m128i (__m128i __hi, __m128i __lo)
4813{
Michael Kuperstein76190042015-05-20 07:46:52 +00004814 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4815}
4816
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004817/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
4818/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4819/// similar to _mm256_set_m128, but the order of the input parameters is
4820/// swapped.
4821///
4822/// \headerfile <x86intrin.h>
4823///
4824/// This intrinsic corresponds to the \c VINSERTF128 instruction.
4825///
4826/// \param __lo
4827/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4828/// 128 bits of the result.
4829/// \param __hi
4830/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4831/// 128 bits of the result.
4832/// \returns A 256-bit floating-point vector of [8 x float] containing the
4833/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004834static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004835_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4836{
Michael Kuperstein76190042015-05-20 07:46:52 +00004837 return _mm256_set_m128(__hi, __lo);
4838}
4839
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004840/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
4841/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4842/// similar to _mm256_set_m128d, but the order of the input parameters is
4843/// swapped.
4844///
4845/// \headerfile <x86intrin.h>
4846///
4847/// This intrinsic corresponds to the \c VINSERTF128 instruction.
4848///
4849/// \param __lo
4850/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4851/// 128 bits of the result.
4852/// \param __hi
4853/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4854/// 128 bits of the result.
4855/// \returns A 256-bit floating-point vector of [4 x double] containing the
4856/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004857static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004858_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4859{
Michael Kuperstein76190042015-05-20 07:46:52 +00004860 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4861}
4862
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004863/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
4864/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4865/// the input parameters is swapped.
4866///
4867/// \headerfile <x86intrin.h>
4868///
4869/// This intrinsic corresponds to the \c VINSERTF128 instruction.
4870///
4871/// \param __lo
4872/// A 128-bit integer vector to be copied to the lower 128 bits of the
4873/// result.
4874/// \param __hi
4875/// A 128-bit integer vector to be copied to the upper 128 bits of the
4876/// result.
4877/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004878static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00004879_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4880{
Michael Kuperstein76190042015-05-20 07:46:52 +00004881 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4882}
4883
Michael Kupersteine45af542015-06-30 13:36:19 +00004884#undef __DEFAULT_FN_ATTRS
Eric Christopher4d1851682015-06-17 07:09:20 +00004885
Richard Smith49e56442013-07-14 05:41:45 +00004886#endif /* __AVXINTRIN_H */