blob: 27dc64424ae076d9e7636ecafc69c72d44f2334b [file] [log] [blame]
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
Benjamin Kramer6f35f3c2010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000027
Richard Smith49e56442013-07-14 05:41:45 +000028#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 256-bit (32-byte) vector types, one per element type. They are
 * used for casts inside the intrinsics below. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned element variants. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* Plain char has implementation-defined signedness, so an explicitly signed
 * byte vector is needed as well. It is not meant to appear in the public
 * interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types used in the intrinsic signatures. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Attributes applied to every function defined in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher4d1851682015-06-17 07:09:20 +000054
/* Arithmetic */
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000056/// \brief Adds two 256-bit vectors of [4 x double].
57///
58/// \headerfile <x86intrin.h>
59///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000060/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000061///
62/// \param __a
63/// A 256-bit vector of [4 x double] containing one of the source operands.
64/// \param __b
65/// A 256-bit vector of [4 x double] containing one of the source operands.
66/// \returns A 256-bit vector of [4 x double] containing the sums of both
67/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000068static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000069_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000070{
Craig Topper1aa231e2016-05-16 06:38:42 +000071 return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000072}
73
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000074/// \brief Adds two 256-bit vectors of [8 x float].
75///
76/// \headerfile <x86intrin.h>
77///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000078/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000079///
80/// \param __a
81/// A 256-bit vector of [8 x float] containing one of the source operands.
82/// \param __b
83/// A 256-bit vector of [8 x float] containing one of the source operands.
84/// \returns A 256-bit vector of [8 x float] containing the sums of both
85/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +000086static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +000087_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000088{
Craig Topper1aa231e2016-05-16 06:38:42 +000089 return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000090}
91
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000092/// \brief Subtracts two 256-bit vectors of [4 x double].
93///
94/// \headerfile <x86intrin.h>
95///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +000096/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +000097///
98/// \param __a
99/// A 256-bit vector of [4 x double] containing the minuend.
100/// \param __b
101/// A 256-bit vector of [4 x double] containing the subtrahend.
102/// \returns A 256-bit vector of [4 x double] containing the differences between
103/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000104static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000105_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000106{
Craig Topper1aa231e2016-05-16 06:38:42 +0000107 return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000108}
109
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000110/// \brief Subtracts two 256-bit vectors of [8 x float].
111///
112/// \headerfile <x86intrin.h>
113///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000115///
116/// \param __a
117/// A 256-bit vector of [8 x float] containing the minuend.
118/// \param __b
119/// A 256-bit vector of [8 x float] containing the subtrahend.
120/// \returns A 256-bit vector of [8 x float] containing the differences between
121/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000122static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000123_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000124{
Craig Topper1aa231e2016-05-16 06:38:42 +0000125 return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000126}
127
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
129/// two 256-bit vectors of [4 x double].
130///
131/// \headerfile <x86intrin.h>
132///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000134///
135/// \param __a
136/// A 256-bit vector of [4 x double] containing the left source operand.
137/// \param __b
138/// A 256-bit vector of [4 x double] containing the right source operand.
139/// \returns A 256-bit vector of [4 x double] containing the alternating sums
140/// and differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000141static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000142_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000143{
David Blaikie3302f2b2013-01-16 23:08:36 +0000144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000145}
146
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
148/// two 256-bit vectors of [8 x float].
149///
150/// \headerfile <x86intrin.h>
151///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000153///
154/// \param __a
155/// A 256-bit vector of [8 x float] containing the left source operand.
156/// \param __b
157/// A 256-bit vector of [8 x float] containing the right source operand.
158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159/// differences between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000160static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000161_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000162{
David Blaikie3302f2b2013-01-16 23:08:36 +0000163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000164}
165
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000166/// \brief Divides two 256-bit vectors of [4 x double].
167///
168/// \headerfile <x86intrin.h>
169///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000171///
172/// \param __a
173/// A 256-bit vector of [4 x double] containing the dividend.
174/// \param __b
175/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000176/// \returns A 256-bit vector of [4 x double] containing the quotients of both
177/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000178static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000179_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000180{
Craig Topper1aa231e2016-05-16 06:38:42 +0000181 return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000182}
183
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000184/// \brief Divides two 256-bit vectors of [8 x float].
185///
186/// \headerfile <x86intrin.h>
187///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000189///
190/// \param __a
191/// A 256-bit vector of [8 x float] containing the dividend.
192/// \param __b
193/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000194/// \returns A 256-bit vector of [8 x float] containing the quotients of both
195/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000196static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000197_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000198{
Craig Topper1aa231e2016-05-16 06:38:42 +0000199 return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000200}
201
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
203/// of each pair of values.
204///
205/// \headerfile <x86intrin.h>
206///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000208///
209/// \param __a
210/// A 256-bit vector of [4 x double] containing one of the operands.
211/// \param __b
212/// A 256-bit vector of [4 x double] containing one of the operands.
213/// \returns A 256-bit vector of [4 x double] containing the maximum values
214/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000215static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000216_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000217{
David Blaikie3302f2b2013-01-16 23:08:36 +0000218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000219}
220
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
222/// of each pair of values.
223///
224/// \headerfile <x86intrin.h>
225///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000227///
228/// \param __a
229/// A 256-bit vector of [8 x float] containing one of the operands.
230/// \param __b
231/// A 256-bit vector of [8 x float] containing one of the operands.
232/// \returns A 256-bit vector of [8 x float] containing the maximum values
233/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000234static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000235_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000236{
David Blaikie3302f2b2013-01-16 23:08:36 +0000237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000238}
239
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
241/// of each pair of values.
242///
243/// \headerfile <x86intrin.h>
244///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000245/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000246///
247/// \param __a
248/// A 256-bit vector of [4 x double] containing one of the operands.
249/// \param __b
250/// A 256-bit vector of [4 x double] containing one of the operands.
251/// \returns A 256-bit vector of [4 x double] containing the minimum values
252/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000253static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000254_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000255{
David Blaikie3302f2b2013-01-16 23:08:36 +0000256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000257}
258
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
260/// of each pair of values.
261///
262/// \headerfile <x86intrin.h>
263///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000264/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000265///
266/// \param __a
267/// A 256-bit vector of [8 x float] containing one of the operands.
268/// \param __b
269/// A 256-bit vector of [8 x float] containing one of the operands.
270/// \returns A 256-bit vector of [8 x float] containing the minimum values
271/// between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000272static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000273_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000274{
David Blaikie3302f2b2013-01-16 23:08:36 +0000275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000276}
277
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000278/// \brief Multiplies two 256-bit vectors of [4 x double].
279///
280/// \headerfile <x86intrin.h>
281///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000282/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000283///
284/// \param __a
285/// A 256-bit vector of [4 x double] containing one of the operands.
286/// \param __b
287/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000288/// \returns A 256-bit vector of [4 x double] containing the products of both
289/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000290static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000291_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000292{
Craig Topper1aa231e2016-05-16 06:38:42 +0000293 return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000294}
295
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000296/// \brief Multiplies two 256-bit vectors of [8 x float].
297///
298/// \headerfile <x86intrin.h>
299///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000300/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000301///
302/// \param __a
303/// A 256-bit vector of [8 x float] containing one of the operands.
304/// \param __b
305/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000306/// \returns A 256-bit vector of [8 x float] containing the products of both
307/// operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000308static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000309_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000310{
Craig Topper1aa231e2016-05-16 06:38:42 +0000311 return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000312}
313
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000314/// \brief Calculates the square roots of the values in a 256-bit vector of
315/// [4 x double].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000316///
317/// \headerfile <x86intrin.h>
318///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000320///
321/// \param __a
322/// A 256-bit vector of [4 x double].
323/// \returns A 256-bit vector of [4 x double] containing the square roots of the
324/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000325static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000326_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000327{
David Blaikie3302f2b2013-01-16 23:08:36 +0000328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000329}
330
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000331/// \brief Calculates the square roots of the values in a 256-bit vector of
332/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000333///
334/// \headerfile <x86intrin.h>
335///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000337///
338/// \param __a
339/// A 256-bit vector of [8 x float].
340/// \returns A 256-bit vector of [8 x float] containing the square roots of the
341/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000342static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000343_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000344{
David Blaikie3302f2b2013-01-16 23:08:36 +0000345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000346}
347
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000348/// \brief Calculates the reciprocal square roots of the values in a 256-bit
349/// vector of [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000350///
351/// \headerfile <x86intrin.h>
352///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000354///
355/// \param __a
356/// A 256-bit vector of [8 x float].
357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358/// roots of the values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000359static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000360_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000361{
David Blaikie3302f2b2013-01-16 23:08:36 +0000362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000363}
364
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000365/// \brief Calculates the reciprocals of the values in a 256-bit vector of
366/// [8 x float].
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000367///
368/// \headerfile <x86intrin.h>
369///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000371///
372/// \param __a
373/// A 256-bit vector of [8 x float].
374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375/// values in the operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000376static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000377_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000378{
David Blaikie3302f2b2013-01-16 23:08:36 +0000379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000380}
381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Implemented as a macro so that M reaches the builtin as written. The
 * expansion is a plain parenthesized expression rather than a GNU statement
 * expression: it is a single expression anyway, and statement expressions
 * are rejected outside of function bodies. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000413
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Implemented as a macro so that M reaches the builtin as written. The
 * expansion is a plain parenthesized expression rather than a GNU statement
 * expression: it is a single expression anyway, and statement expressions
 * are rejected outside of function bodies. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000445
/// \brief Rounds each value in a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000462
/// \brief Rounds each value in a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000480
/// \brief Rounds each value in a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000497
/// \brief Rounds each value in a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
514
/* Logical */
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
517///
518/// \headerfile <x86intrin.h>
519///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000520/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000521///
522/// \param __a
523/// A 256-bit vector of [4 x double] containing one of the source operands.
524/// \param __b
525/// A 256-bit vector of [4 x double] containing one of the source operands.
526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000528static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000529_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000530{
Craig Topper6a77b622016-06-04 05:43:41 +0000531 return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000532}
533
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
535///
536/// \headerfile <x86intrin.h>
537///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000538/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000539///
540/// \param __a
541/// A 256-bit vector of [8 x float] containing one of the source operands.
542/// \param __b
543/// A 256-bit vector of [8 x float] containing one of the source operands.
544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000546static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000547_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000548{
Craig Topper6a77b622016-06-04 05:43:41 +0000549 return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000550}
551
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553/// the one's complement of the values contained in the first source operand.
554///
555/// \headerfile <x86intrin.h>
556///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000558///
559/// \param __a
560/// A 256-bit vector of [4 x double] containing the left source operand. The
561/// one's complement of this value is used in the bitwise AND.
562/// \param __b
563/// A 256-bit vector of [4 x double] containing the right source operand.
564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565/// values of the second operand and the one's complement of the first
566/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000567static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000568_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000569{
Craig Topper6a77b622016-06-04 05:43:41 +0000570 return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000571}
572
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574/// the one's complement of the values contained in the first source operand.
575///
576/// \headerfile <x86intrin.h>
577///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000579///
580/// \param __a
581/// A 256-bit vector of [8 x float] containing the left source operand. The
582/// one's complement of this value is used in the bitwise AND.
583/// \param __b
584/// A 256-bit vector of [8 x float] containing the right source operand.
585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586/// values of the second operand and the one's complement of the first
587/// operand.
Michael Kupersteine45af542015-06-30 13:36:19 +0000588static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000589_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000590{
Craig Topper6a77b622016-06-04 05:43:41 +0000591 return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000592}
593
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
595///
596/// \headerfile <x86intrin.h>
597///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000598/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000599///
600/// \param __a
601/// A 256-bit vector of [4 x double] containing one of the source operands.
602/// \param __b
603/// A 256-bit vector of [4 x double] containing one of the source operands.
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000606static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000607_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000608{
Craig Topper6a77b622016-06-04 05:43:41 +0000609 return (__m256d)((__v4du)__a | (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000610}
611
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
613///
614/// \headerfile <x86intrin.h>
615///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000616/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000617///
618/// \param __a
619/// A 256-bit vector of [8 x float] containing one of the source operands.
620/// \param __b
621/// A 256-bit vector of [8 x float] containing one of the source operands.
622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000624static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000625_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000626{
Craig Topper6a77b622016-06-04 05:43:41 +0000627 return (__m256)((__v8su)__a | (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000628}
629
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631///
632/// \headerfile <x86intrin.h>
633///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000634/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000635///
636/// \param __a
637/// A 256-bit vector of [4 x double] containing one of the source operands.
638/// \param __b
639/// A 256-bit vector of [4 x double] containing one of the source operands.
640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000642static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000643_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000644{
Craig Topper6a77b622016-06-04 05:43:41 +0000645 return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000646}
647
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649///
650/// \headerfile <x86intrin.h>
651///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000652/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000653///
654/// \param __a
655/// A 256-bit vector of [8 x float] containing one of the source operands.
656/// \param __b
657/// A 256-bit vector of [8 x float] containing one of the source operands.
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659/// values between both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000660static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000661_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000662{
Craig Topper6a77b622016-06-04 05:43:41 +0000663 return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000664}
665
/* Horizontal arithmetic */
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000667/// \brief Horizontally adds the adjacent pairs of values contained in two
668/// 256-bit vectors of [4 x double].
669///
670/// \headerfile <x86intrin.h>
671///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000673///
674/// \param __a
675/// A 256-bit vector of [4 x double] containing one of the source operands.
676/// The horizontal sums of the values are returned in the even-indexed
677/// elements of a vector of [4 x double].
678/// \param __b
679/// A 256-bit vector of [4 x double] containing one of the source operands.
680/// The horizontal sums of the values are returned in the odd-indexed
681/// elements of a vector of [4 x double].
682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000684static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000685_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000686{
David Blaikie3302f2b2013-01-16 23:08:36 +0000687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000688}
689
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000690/// \brief Horizontally adds the adjacent pairs of values contained in two
691/// 256-bit vectors of [8 x float].
692///
693/// \headerfile <x86intrin.h>
694///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000696///
697/// \param __a
698/// A 256-bit vector of [8 x float] containing one of the source operands.
699/// The horizontal sums of the values are returned in the elements with
700/// index 0, 1, 4, 5 of a vector of [8 x float].
701/// \param __b
702/// A 256-bit vector of [8 x float] containing one of the source operands.
703/// The horizontal sums of the values are returned in the elements with
704/// index 2, 3, 6, 7 of a vector of [8 x float].
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706/// both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000707static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000708_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000709{
David Blaikie3302f2b2013-01-16 23:08:36 +0000710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000711}
712
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000713/// \brief Horizontally subtracts the adjacent pairs of values contained in two
714/// 256-bit vectors of [4 x double].
715///
716/// \headerfile <x86intrin.h>
717///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000719///
720/// \param __a
721/// A 256-bit vector of [4 x double] containing one of the source operands.
722/// The horizontal differences between the values are returned in the
723/// even-indexed elements of a vector of [4 x double].
724/// \param __b
725/// A 256-bit vector of [4 x double] containing one of the source operands.
726/// The horizontal differences between the values are returned in the
727/// odd-indexed elements of a vector of [4 x double].
728/// \returns A 256-bit vector of [4 x double] containing the horizontal
729/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000730static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000731_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000732{
David Blaikie3302f2b2013-01-16 23:08:36 +0000733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000734}
735
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000736/// \brief Horizontally subtracts the adjacent pairs of values contained in two
737/// 256-bit vectors of [8 x float].
738///
739/// \headerfile <x86intrin.h>
740///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000742///
743/// \param __a
744/// A 256-bit vector of [8 x float] containing one of the source operands.
745/// The horizontal differences between the values are returned in the
746/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
747/// \param __b
748/// A 256-bit vector of [8 x float] containing one of the source operands.
749/// The horizontal differences between the values are returned in the
750/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
751/// \returns A 256-bit vector of [8 x float] containing the horizontal
752/// differences of both operands.
Michael Kupersteine45af542015-06-30 13:36:19 +0000753static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000754_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000755{
David Blaikie3302f2b2013-01-16 23:08:36 +0000756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000757}
758
/* Vector permutations */
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
761/// by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000762///
763/// \headerfile <x86intrin.h>
764///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000766///
767/// \param __a
768/// A 128-bit vector of [2 x double].
769/// \param __c
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000770/// A 128-bit integer vector operand specifying how the values are to be
771/// copied. \n
772/// Bit [1]: \n
773/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774/// vector. \n
775/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
776/// returned vector. \n
777/// Bit [65]: \n
778/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
779/// returned vector. \n
780/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
781/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000782/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000783static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000784_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000785{
David Blaikie3302f2b2013-01-16 23:08:36 +0000786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000787}
788
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
790/// by the 256-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000791///
792/// \headerfile <x86intrin.h>
793///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000795///
796/// \param __a
797/// A 256-bit vector of [4 x double].
798/// \param __c
799/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000800/// copied. \n
801/// Bit [1]: \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000802/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803/// vector. \n
804/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
805/// returned vector. \n
Ekaterina Romanova4c77e892016-11-26 19:38:19 +0000806/// Bit [65]: \n
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000807/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
808/// returned vector. \n
809/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
810/// returned vector. \n
811/// Bit [129]: \n
812/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
813/// returned vector. \n
814/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
815/// returned vector. \n
816/// Bit [193]: \n
817/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
818/// returned vector. \n
819/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +0000820/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000821/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000822static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000823_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000824{
David Blaikie3302f2b2013-01-16 23:08:36 +0000825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000826}
827
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
829/// specified by the 128-bit integer vector operand.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000830/// \headerfile <x86intrin.h>
831///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000833///
834/// \param __a
835/// A 128-bit vector of [4 x float].
836/// \param __c
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000837/// A 128-bit integer vector operand specifying how the values are to be
838/// copied. \n
839/// Bits [1:0]: \n
840/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
841/// returned vector. \n
842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
843/// returned vector. \n
844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
845/// returned vector. \n
846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
847/// returned vector. \n
848/// Bits [33:32]: \n
849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
850/// returned vector. \n
851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
852/// returned vector. \n
853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
854/// returned vector. \n
855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
856/// returned vector. \n
857/// Bits [65:64]: \n
858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
859/// returned vector. \n
860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
861/// returned vector. \n
862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
863/// returned vector. \n
864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
865/// returned vector. \n
866/// Bits [97:96]: \n
867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
868/// returned vector. \n
869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
870/// returned vector. \n
871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
872/// returned vector. \n
873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
874/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000875/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000876static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000877_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000878{
David Blaikie3302f2b2013-01-16 23:08:36 +0000879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000880}
881
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
883/// specified by the 256-bit integer vector operand.
884///
885/// \headerfile <x86intrin.h>
886///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +0000887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000888///
889/// \param __a
890/// A 256-bit vector of [8 x float].
891/// \param __c
892/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova16166a42016-12-23 23:36:26 +0000893/// copied. \n
894/// Bits [1:0]: \n
895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
896/// returned vector. \n
897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
898/// returned vector. \n
899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
900/// returned vector. \n
901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
902/// returned vector. \n
903/// Bits [33:32]: \n
904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
905/// returned vector. \n
906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
907/// returned vector. \n
908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
909/// returned vector. \n
910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
911/// returned vector. \n
912/// Bits [65:64]: \n
913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
914/// returned vector. \n
915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
916/// returned vector. \n
917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
918/// returned vector. \n
919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
920/// returned vector. \n
921/// Bits [97:96]: \n
922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
923/// returned vector. \n
924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
925/// returned vector. \n
926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
927/// returned vector. \n
928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
929/// returned vector. \n
930/// Bits [129:128]: \n
931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
932/// returned vector. \n
933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
934/// returned vector. \n
935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
936/// returned vector. \n
937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
938/// returned vector. \n
939/// Bits [161:160]: \n
940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
941/// returned vector. \n
942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
943/// returned vector. \n
944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
945/// returned vector. \n
946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
947/// returned vector. \n
948/// Bits [193:192]: \n
949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
950/// returned vector. \n
951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
952/// returned vector. \n
953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
954/// returned vector. \n
955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
956/// returned vector. \n
957/// Bits [225:224]: \n
958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
959/// returned vector. \n
960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
961/// returned vector. \n
962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
963/// returned vector. \n
964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
965/// returned vector.
Ekaterina Romanova13f189d2016-03-11 00:05:54 +0000966/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +0000967static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +0000968_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000969{
Craig Topper9fee8ab2015-01-31 06:33:59 +0000970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000971}
972
/// \brief Copies values from a 128-bit vector of [2 x double], with the
///    source of each copy chosen by an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand that selects which values are copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] holding the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
      (__v2df)(__m128d)(A), (__v2df)_mm_undefined_pd(), \
      (((C) >> 0) & 0x1), \
      (((C) >> 1) & 0x1)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001004
/// \brief Copies values from a 256-bit vector of [4 x double], with the
///    source of each copy chosen by an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand that selects which values are copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] holding the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), \
      (((C) >> 0) & 0x1) + 0, \
      (((C) >> 1) & 0x1) + 0, \
      (((C) >> 2) & 0x1) + 2, \
      (((C) >> 3) & 0x1) + 2); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001049
/// \brief Copies values from a 128-bit vector of [4 x float], with the source
///    of each copy chosen by an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand that selects which values are copied. It
///    holds four 2-bit selectors, one for each 32-bit element of the
///    returned vector: \n
///    Bits [1:0]: selects the value copied to bits [31:0] of the returned
///      vector. \n
///    Bits [3:2]: selects the value copied to bits [63:32] of the returned
///      vector. \n
///    Bits [5:4]: selects the value copied to bits [95:64] of the returned
///      vector. \n
///    Bits [7:6]: selects the value copied to bits [127:96] of the returned
///      vector. \n
///    Every selector is interpreted the same way: \n
///      00: Bits [31:0] of the source are copied. \n
///      01: Bits [63:32] of the source are copied. \n
///      10: Bits [95:64] of the source are copied. \n
///      11: Bits [127:96] of the source are copied.
/// \returns A 128-bit vector of [4 x float] holding the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
      (__v4sf)(__m128)(A), (__v4sf)_mm_undefined_ps(), \
      (((C) >> 0) & 0x3), \
      (((C) >> 2) & 0x3), \
      (((C) >> 4) & 0x3), \
      (((C) >> 6) & 0x3)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001108
/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. The same four 2-bit fields are applied twice: once to select
///    elements within the lower 128 bits of the source and once to select
///    elements within the upper 128 bits of the source. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                  (__v8sf)_mm256_undefined_ps(), \
                                  0 + (((C) >> 0) & 0x3), \
                                  0 + (((C) >> 2) & 0x3), \
                                  0 + (((C) >> 4) & 0x3), \
                                  0 + (((C) >> 6) & 0x3), \
                                  4 + (((C) >> 0) & 0x3), \
                                  4 + (((C) >> 2) & 0x3), \
                                  4 + (((C) >> 4) & 0x3), \
                                  4 + (((C) >> 6) & 0x3)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                           (__v4df)(__m256d)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (M)); })

/* Vector Blend */
/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                   (__v4df)(__m256d)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })

Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001401/// \brief Merges 64-bit double-precision data values stored in either of the
1402/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403/// operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001408///
1409/// \param __a
1410/// A 256-bit vector of [4 x double].
1411/// \param __b
1412/// A 256-bit vector of [4 x double].
1413/// \param __c
1414/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415/// how the values are to be copied. The position of the mask bit corresponds
1416/// to the most significant bit of a copied value. When a mask bit is 0, the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001417/// corresponding 64-bit element in operand \a __a is copied to the same
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001418/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova4c77e892016-11-26 19:38:19 +00001419/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001420/// destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001422static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001424{
David Blaikie3302f2b2013-01-16 23:08:36 +00001425 return (__m256d)__builtin_ia32_blendvpd256(
1426 (__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001427}
1428
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001429/// \brief Merges 32-bit single-precision data values stored in either of the
1430/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431/// operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00001435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001436///
1437/// \param __a
1438/// A 256-bit vector of [8 x float].
1439/// \param __b
1440/// A 256-bit vector of [8 x float].
1441/// \param __c
1442/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443/// and 31 specifying how the values are to be copied. The position of the
1444/// mask bit corresponds to the most significant bit of a copied value. When
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001445/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001446/// copied to the same position in the destination. When a mask bit is 1, the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00001447/// corresponding 32-bit element in operand \a __b is copied to the same
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00001448/// position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kupersteine45af542015-06-30 13:36:19 +00001450static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00001451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001452{
David Blaikie5bb70032013-01-16 23:13:42 +00001453 return (__m256)__builtin_ia32_blendvps256(
David Blaikie3302f2b2013-01-16 23:08:36 +00001454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001455}
1456
/* Vector Dot Product */
/// \brief Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result.
///
///    The immediate integer operand controls which input elements will
///    contribute to the dot product, and where the final results are returned.
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })

/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///      destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///      destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///      destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///      the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), \
                                  0  + (((mask) >> 0) & 0x3), \
                                  0  + (((mask) >> 2) & 0x3), \
                                  8  + (((mask) >> 4) & 0x3), \
                                  8  + (((mask) >> 6) & 0x3), \
                                  4  + (((mask) >> 0) & 0x3), \
                                  4  + (((mask) >> 2) & 0x3), \
                                  12 + (((mask) >> 4) & 0x3), \
                                  12 + (((mask) >> 6) & 0x3)); })

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///      destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///      destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///      destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///      destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///      destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///      destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///      destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///      destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })

/* Compare */
/* Comparison predicate encodings for the immediate operand of the compare
 * intrinsics. Each comment gives the relation tested followed by its
 * (ordered/unordered, signaling/non-signaling) qualifiers. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))

Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002003/// \brief Takes a [8 x i32] vector and returns the vector element value
2004/// indexed by the immediate constant operand.
2005///
2006/// \headerfile <x86intrin.h>
2007///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002008/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2009/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002010///
2011/// \param __a
2012/// A 256-bit vector of [8 x i32].
2013/// \param __imm
2014/// An immediate integer operand with bits [2:0] determining which vector
2015/// element is extracted and returned.
2016/// \returns A 32-bit integer containing the extracted 32 bits of extended
2017/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00002018static __inline int __DEFAULT_FN_ATTRS
Craig Topper459554f2015-01-31 06:31:30 +00002019_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002020{
David Blaikie3302f2b2013-01-16 23:08:36 +00002021 __v8si __b = (__v8si)__a;
Manman Renc94122e2013-10-23 20:33:14 +00002022 return __b[__imm & 7];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002023}
2024
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002025/// \brief Takes a [16 x i16] vector and returns the vector element value
2026/// indexed by the immediate constant operand.
2027///
2028/// \headerfile <x86intrin.h>
2029///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002030/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2031/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002032///
2033/// \param __a
2034/// A 256-bit integer vector of [16 x i16].
2035/// \param __imm
2036/// An immediate integer operand with bits [3:0] determining which vector
2037/// element is extracted and returned.
Simon Pilgrim28666ce2016-05-21 21:14:35 +00002038/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002039/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00002040static __inline int __DEFAULT_FN_ATTRS
Craig Topper459554f2015-01-31 06:31:30 +00002041_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002042{
David Blaikie3302f2b2013-01-16 23:08:36 +00002043 __v16hi __b = (__v16hi)__a;
Simon Pilgrim28666ce2016-05-21 21:14:35 +00002044 return (unsigned short)__b[__imm & 15];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002045}
2046
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002047/// \brief Takes a [32 x i8] vector and returns the vector element value
2048/// indexed by the immediate constant operand.
2049///
2050/// \headerfile <x86intrin.h>
2051///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002052/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2053/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002054///
2055/// \param __a
2056/// A 256-bit integer vector of [32 x i8].
2057/// \param __imm
2058/// An immediate integer operand with bits [4:0] determining which vector
2059/// element is extracted and returned.
Simon Pilgrim28666ce2016-05-21 21:14:35 +00002060/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
2061/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00002062static __inline int __DEFAULT_FN_ATTRS
Craig Topper459554f2015-01-31 06:31:30 +00002063_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002064{
David Blaikie3302f2b2013-01-16 23:08:36 +00002065 __v32qi __b = (__v32qi)__a;
Simon Pilgrim28666ce2016-05-21 21:14:35 +00002066 return (unsigned char)__b[__imm & 31];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002067}
2068
2069#ifdef __x86_64__
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002070/// \brief Takes a [4 x i64] vector and returns the vector element value
2071/// indexed by the immediate constant operand.
2072///
2073/// \headerfile <x86intrin.h>
2074///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002075/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2076/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002077///
2078/// \param __a
2079/// A 256-bit integer vector of [4 x i64].
2080/// \param __imm
2081/// An immediate integer operand with bits [1:0] determining which vector
2082/// element is extracted and returned.
2083/// \returns A 64-bit integer containing the extracted 64 bits of extended
2084/// packed data.
Michael Kupersteine45af542015-06-30 13:36:19 +00002085static __inline long long __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002086_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002087{
David Blaikie3302f2b2013-01-16 23:08:36 +00002088 __v4di __b = (__v4di)__a;
Manman Renc94122e2013-10-23 20:33:14 +00002089 return __b[__imm & 3];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002090}
2091#endif
2092
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002093/// \brief Takes a [8 x i32] vector and replaces the vector element value
2094/// indexed by the immediate constant operand by a new value. Returns the
2095/// modified vector.
2096///
2097/// \headerfile <x86intrin.h>
2098///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002099/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2100/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002101///
2102/// \param __a
2103/// A vector of [8 x i32] to be used by the insert operation.
2104/// \param __b
2105/// An integer value. The replacement value for the insert operation.
2106/// \param __imm
2107/// An immediate integer specifying the index of the vector element to be
2108/// replaced.
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002109/// \returns A copy of vector \a __a, after replacing its element indexed by
2110/// \a __imm with \a __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002111static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002112_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002113{
David Blaikie3302f2b2013-01-16 23:08:36 +00002114 __v8si __c = (__v8si)__a;
2115 __c[__imm & 7] = __b;
2116 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002117}
2118
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002119
2120/// \brief Takes a [16 x i16] vector and replaces the vector element value
2121/// indexed by the immediate constant operand with a new value. Returns the
2122/// modified vector.
2123///
2124/// \headerfile <x86intrin.h>
2125///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002126/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2127/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002128///
2129/// \param __a
2130/// A vector of [16 x i16] to be used by the insert operation.
2131/// \param __b
2132/// An i16 integer value. The replacement value for the insert operation.
2133/// \param __imm
2134/// An immediate integer specifying the index of the vector element to be
2135/// replaced.
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002136/// \returns A copy of vector \a __a, after replacing its element indexed by
2137/// \a __imm with \a __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002138static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002139_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002140{
David Blaikie3302f2b2013-01-16 23:08:36 +00002141 __v16hi __c = (__v16hi)__a;
2142 __c[__imm & 15] = __b;
2143 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002144}
2145
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002146/// \brief Takes a [32 x i8] vector and replaces the vector element value
2147/// indexed by the immediate constant operand with a new value. Returns the
2148/// modified vector.
2149///
2150/// \headerfile <x86intrin.h>
2151///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002152/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2153/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002154///
2155/// \param __a
2156/// A vector of [32 x i8] to be used by the insert operation.
2157/// \param __b
2158/// An i8 integer value. The replacement value for the insert operation.
2159/// \param __imm
2160/// An immediate integer specifying the index of the vector element to be
2161/// replaced.
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002162/// \returns A copy of vector \a __a, after replacing its element indexed by
2163/// \a __imm with \a __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002164static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002165_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002166{
David Blaikie3302f2b2013-01-16 23:08:36 +00002167 __v32qi __c = (__v32qi)__a;
2168 __c[__imm & 31] = __b;
2169 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002170}
2171
2172#ifdef __x86_64__
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002173/// \brief Takes a [4 x i64] vector and replaces the vector element value
2174/// indexed by the immediate constant operand with a new value. Returns the
2175/// modified vector.
2176///
2177/// \headerfile <x86intrin.h>
2178///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002179/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2180/// instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002181///
2182/// \param __a
2183/// A vector of [4 x i64] to be used by the insert operation.
2184/// \param __b
2185/// A 64-bit integer value. The replacement value for the insert operation.
2186/// \param __imm
2187/// An immediate integer specifying the index of the vector element to be
2188/// replaced.
Ekaterina Romanovad6042192016-12-08 04:09:17 +00002189/// \returns A copy of vector \a __a, after replacing its element indexed by
2190/// \a __imm with \a __b.
Michael Kupersteine45af542015-06-30 13:36:19 +00002191static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhasd7400292015-02-19 19:00:33 +00002192_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002193{
David Blaikie3302f2b2013-01-16 23:08:36 +00002194 __v4di __c = (__v4di)__a;
2195 __c[__imm & 3] = __b;
2196 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002197}
2198#endif
2199
/* Conversion */
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002201/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
2202///
2203/// \headerfile <x86intrin.h>
2204///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002205/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002206///
2207/// \param __a
2208/// A 128-bit integer vector of [4 x i32].
2209/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002210static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002211_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002212{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002213 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002214}
2215
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002216/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
2217///
2218/// \headerfile <x86intrin.h>
2219///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002220/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002221///
2222/// \param __a
2223/// A 256-bit integer vector.
2224/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002225static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002226_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002227{
David Blaikie3302f2b2013-01-16 23:08:36 +00002228 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002229}
2230
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002231/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2232/// [4 x float].
2233///
2234/// \headerfile <x86intrin.h>
2235///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002236/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002237///
2238/// \param __a
2239/// A 256-bit vector of [4 x double].
2240/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002241static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002242_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002243{
David Blaikie3302f2b2013-01-16 23:08:36 +00002244 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002245}
2246
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002247/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
2248///
2249/// \headerfile <x86intrin.h>
2250///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002251/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Ekaterina Romanova1168fdc2016-05-16 22:54:45 +00002252///
2253/// \param __a
2254/// A 256-bit vector of [8 x float].
2255/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002256static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002257_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002258{
David Blaikie3302f2b2013-01-16 23:08:36 +00002259 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002260}
2261
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002262/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2263/// x double].
2264///
2265/// \headerfile <x86intrin.h>
2266///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002267/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002268///
2269/// \param __a
2270/// A 128-bit vector of [4 x float].
2271/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002272static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002273_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002274{
Simon Pilgrim90770c72016-05-23 22:13:02 +00002275 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002276}
2277
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002278/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2279/// x i32], truncating the result by rounding towards zero when it is
2280/// inexact.
2281///
2282/// \headerfile <x86intrin.h>
2283///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002284/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002285///
2286/// \param __a
2287/// A 256-bit vector of [4 x double].
2288/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002289static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002290_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002291{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002292 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002293}
2294
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002295/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2296/// x i32]. When a conversion is inexact, the value returned is rounded
2297/// according to the rounding control bits in the MXCSR register.
2298///
2299/// \headerfile <x86intrin.h>
2300///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002301/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002302///
2303/// \param __a
2304/// A 256-bit vector of [4 x double].
2305/// \returns A 128-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002306static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002307_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002308{
David Blaikie3302f2b2013-01-16 23:08:36 +00002309 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002310}
2311
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002312/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
2313/// truncating the result by rounding towards zero when it is inexact.
2314///
2315/// \headerfile <x86intrin.h>
2316///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002317/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002318///
2319/// \param __a
2320/// A 256-bit vector of [8 x float].
2321/// \returns A 256-bit integer vector containing the converted values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002322static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002323_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002324{
Simon Pilgrime3b9ee02016-07-20 10:18:01 +00002325 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002326}
2327
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002328/// \brief Returns the first element of the input vector of [4 x double].
2329///
2330/// \headerfile <avxintrin.h>
2331///
2332/// This intrinsic is a utility function and does not correspond to a specific
2333/// instruction.
2334///
2335/// \param __a
2336/// A 256-bit vector of [4 x double].
2337/// \returns A 64 bit double containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002338static __inline double __DEFAULT_FN_ATTRS
2339_mm256_cvtsd_f64(__m256d __a)
2340{
2341 return __a[0];
2342}
2343
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002344/// \brief Returns the first element of the input vector of [8 x i32].
2345///
2346/// \headerfile <avxintrin.h>
2347///
2348/// This intrinsic is a utility function and does not correspond to a specific
2349/// instruction.
2350///
2351/// \param __a
2352/// A 256-bit vector of [8 x i32].
2353/// \returns A 32 bit integer containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002354static __inline int __DEFAULT_FN_ATTRS
2355_mm256_cvtsi256_si32(__m256i __a)
2356{
2357 __v8si __b = (__v8si)__a;
2358 return __b[0];
2359}
2360
Ekaterina Romanova2e041c92017-01-13 01:14:08 +00002361/// \brief Returns the first element of the input vector of [8 x float].
2362///
2363/// \headerfile <avxintrin.h>
2364///
2365/// This intrinsic is a utility function and does not correspond to a specific
2366/// instruction.
2367///
2368/// \param __a
2369/// A 256-bit vector of [8 x float].
2370/// \returns A 32 bit float containing the first element of the input vector.
Michael Zuckermane54093f2016-06-01 12:21:00 +00002371static __inline float __DEFAULT_FN_ATTRS
2372_mm256_cvtss_f32(__m256 __a)
2373{
2374 return __a[0];
2375}
2376
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002377/* Vector replicate */
Douglas Yung7ff91422018-01-08 21:21:17 +00002378/// \brief Moves and duplicates odd-indexed values from a 256-bit vector of
2379/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002380///
2381/// \headerfile <x86intrin.h>
2382///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002383/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002384///
2385/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002386/// A 256-bit vector of [8 x float]. \n
2387/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2388/// the return value. \n
2389/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2390/// the return value. \n
2391/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2392/// return value. \n
2393/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2394/// return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002395/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2396/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002397static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002398_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002399{
Craig Topper1aa231e2016-05-16 06:38:42 +00002400 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002401}
2402
Douglas Yung7ff91422018-01-08 21:21:17 +00002403/// \brief Moves and duplicates even-indexed values from a 256-bit vector of
2404/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002405///
2406/// \headerfile <x86intrin.h>
2407///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002408/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002409///
2410/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002411/// A 256-bit vector of [8 x float]. \n
2412/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2413/// the return value. \n
2414/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2415/// the return value. \n
2416/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2417/// return value. \n
2418/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2419/// return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002420/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2421/// values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002422static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002423_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002424{
Craig Topper1aa231e2016-05-16 06:38:42 +00002425 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002426}
2427
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002428/// \brief Moves and duplicates double-precision floating point values from a
2429/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2430/// vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002431///
2432/// \headerfile <x86intrin.h>
2433///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002434/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002435///
2436/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002437/// A 256-bit vector of [4 x double]. \n
2438/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2439/// return value. \n
2440/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2441/// the return value.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002442/// \returns A 256-bit vector of [4 x double] containing the moved and
2443/// duplicated values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002444static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002445_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002446{
Craig Topper1aa231e2016-05-16 06:38:42 +00002447 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002448}
2449
2450/* Unpack and Interleave */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002451/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
2452/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2453///
2454/// \headerfile <x86intrin.h>
2455///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002456/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002457///
2458/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002459/// A 256-bit floating-point vector of [4 x double]. \n
2460/// Bits [127:64] are written to bits [63:0] of the return value. \n
2461/// Bits [255:192] are written to bits [191:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002462/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002463/// A 256-bit floating-point vector of [4 x double]. \n
2464/// Bits [127:64] are written to bits [127:64] of the return value. \n
2465/// Bits [255:192] are written to bits [255:192] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002466/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002467static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002468_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002469{
Craig Topper1aa231e2016-05-16 06:38:42 +00002470 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002471}
2472
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002473/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
2474/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2475///
2476/// \headerfile <x86intrin.h>
2477///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002478/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002479///
2480/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002481/// A 256-bit floating-point vector of [4 x double]. \n
2482/// Bits [63:0] are written to bits [63:0] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002483/// Bits [191:128] are written to bits [191:128] of the return value.
2484/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002485/// A 256-bit floating-point vector of [4 x double]. \n
2486/// Bits [63:0] are written to bits [127:64] of the return value. \n
2487/// Bits [191:128] are written to bits [255:192] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002488/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002489static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002490_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002491{
Craig Topper1aa231e2016-05-16 06:38:42 +00002492 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002493}
2494
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002495/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2496/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2497/// vector of [8 x float].
2498///
2499/// \headerfile <x86intrin.h>
2500///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002501/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002502///
2503/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002504/// A 256-bit vector of [8 x float]. \n
2505/// Bits [95:64] are written to bits [31:0] of the return value. \n
2506/// Bits [127:96] are written to bits [95:64] of the return value. \n
2507/// Bits [223:192] are written to bits [159:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002508/// Bits [255:224] are written to bits [223:192] of the return value.
2509/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002510/// A 256-bit vector of [8 x float]. \n
2511/// Bits [95:64] are written to bits [63:32] of the return value. \n
2512/// Bits [127:96] are written to bits [127:96] of the return value. \n
2513/// Bits [223:192] are written to bits [191:160] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002514/// Bits [255:224] are written to bits [255:224] of the return value.
2515/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002516static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002517_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002518{
Craig Topper1aa231e2016-05-16 06:38:42 +00002519 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002520}
2521
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002522/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2523/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2524/// vector of [8 x float].
2525///
2526/// \headerfile <x86intrin.h>
2527///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002528/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002529///
2530/// \param __a
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002531/// A 256-bit vector of [8 x float]. \n
2532/// Bits [31:0] are written to bits [31:0] of the return value. \n
2533/// Bits [63:32] are written to bits [95:64] of the return value. \n
2534/// Bits [159:128] are written to bits [159:128] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002535/// Bits [191:160] are written to bits [223:192] of the return value.
2536/// \param __b
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002537/// A 256-bit vector of [8 x float]. \n
2538/// Bits [31:0] are written to bits [63:32] of the return value. \n
2539/// Bits [63:32] are written to bits [127:96] of the return value. \n
2540/// Bits [159:128] are written to bits [191:160] of the return value. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002541/// Bits [191:160] are written to bits [255:224] of the return value.
2542/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00002543static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002544_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002545{
Craig Topper1aa231e2016-05-16 06:38:42 +00002546 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002547}
2548
2549/* Bit Test */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002550/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2551/// element-by-element comparison of the double-precision element in the
2552/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002553/// vector.
2554///
2555/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002556/// If there is at least one pair of double-precision elements where the
2557/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002558/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002559/// If there is at least one pair of double-precision elements where the
2560/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002561/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002562/// This intrinsic returns the value of the ZF flag.
2563///
2564/// \headerfile <x86intrin.h>
2565///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002566/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002567///
2568/// \param __a
2569/// A 128-bit vector of [2 x double].
2570/// \param __b
2571/// A 128-bit vector of [2 x double].
2572/// \returns the ZF flag in the EFLAGS register.
Michael Kupersteine45af542015-06-30 13:36:19 +00002573static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002574_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002575{
David Blaikie3302f2b2013-01-16 23:08:36 +00002576 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002577}
2578
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002579/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2580/// element-by-element comparison of the double-precision element in the
2581/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002582/// vector.
2583///
2584/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002585/// If there is at least one pair of double-precision elements where the
2586/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002587/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002588/// If there is at least one pair of double-precision elements where the
2589/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002590/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002591/// This intrinsic returns the value of the CF flag.
2592///
2593/// \headerfile <x86intrin.h>
2594///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002595/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002596///
2597/// \param __a
2598/// A 128-bit vector of [2 x double].
2599/// \param __b
2600/// A 128-bit vector of [2 x double].
2601/// \returns the CF flag in the EFLAGS register.
Michael Kupersteine45af542015-06-30 13:36:19 +00002602static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002603_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002604{
David Blaikie3302f2b2013-01-16 23:08:36 +00002605 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002606}
2607
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002608/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
2609/// element-by-element comparison of the double-precision element in the
2610/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002611/// vector.
2612///
2613/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002614/// If there is at least one pair of double-precision elements where the
2615/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002616/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002617/// If there is at least one pair of double-precision elements where the
2618/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002619/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002620/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2621/// otherwise it returns 0.
2622///
2623/// \headerfile <x86intrin.h>
2624///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002625/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002626///
2627/// \param __a
2628/// A 128-bit vector of [2 x double].
2629/// \param __b
2630/// A 128-bit vector of [2 x double].
2631/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002632static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002633_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002634{
David Blaikie3302f2b2013-01-16 23:08:36 +00002635 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002636}
2637
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002638/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2639/// element-by-element comparison of the single-precision element in the
2640/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002641/// vector.
2642///
2643/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002644/// If there is at least one pair of single-precision elements where the
2645/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002646/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002647/// If there is at least one pair of single-precision elements where the
2648/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002649/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002650/// This intrinsic returns the value of the ZF flag.
2651///
2652/// \headerfile <x86intrin.h>
2653///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002654/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002655///
2656/// \param __a
2657/// A 128-bit vector of [4 x float].
2658/// \param __b
2659/// A 128-bit vector of [4 x float].
2660/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002661static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002662_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002663{
David Blaikie3302f2b2013-01-16 23:08:36 +00002664 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002665}
2666
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002667/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2668/// element-by-element comparison of the single-precision element in the
2669/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002670/// vector.
2671///
2672/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002673/// If there is at least one pair of single-precision elements where the
2674/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002675/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002676/// If there is at least one pair of single-precision elements where the
2677/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002678/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002679/// This intrinsic returns the value of the CF flag.
2680///
2681/// \headerfile <x86intrin.h>
2682///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002683/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002684///
2685/// \param __a
2686/// A 128-bit vector of [4 x float].
2687/// \param __b
2688/// A 128-bit vector of [4 x float].
2689/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002690static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002691_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002692{
David Blaikie3302f2b2013-01-16 23:08:36 +00002693 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002694}
2695
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002696/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
2697/// element-by-element comparison of the single-precision element in the
2698/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002699/// vector.
2700///
2701/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002702/// If there is at least one pair of single-precision elements where the
2703/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002704/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002705/// If there is at least one pair of single-precision elements where the
2706/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002707/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002708/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2709/// otherwise it returns 0.
2710///
2711/// \headerfile <x86intrin.h>
2712///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002713/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002714///
2715/// \param __a
2716/// A 128-bit vector of [4 x float].
2717/// \param __b
2718/// A 128-bit vector of [4 x float].
2719/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002720static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002721_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002722{
David Blaikie3302f2b2013-01-16 23:08:36 +00002723 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002724}
2725
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002726/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2727/// element-by-element comparison of the double-precision elements in the
2728/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002729/// vector.
2730///
2731/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002732/// If there is at least one pair of double-precision elements where the
2733/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002734/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002735/// If there is at least one pair of double-precision elements where the
2736/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002737/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002738/// This intrinsic returns the value of the ZF flag.
2739///
2740/// \headerfile <x86intrin.h>
2741///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002742/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002743///
2744/// \param __a
2745/// A 256-bit vector of [4 x double].
2746/// \param __b
2747/// A 256-bit vector of [4 x double].
2748/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002749static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002750_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002751{
David Blaikie3302f2b2013-01-16 23:08:36 +00002752 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002753}
2754
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002755/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2756/// element-by-element comparison of the double-precision elements in the
2757/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002758/// vector.
2759///
2760/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002761/// If there is at least one pair of double-precision elements where the
2762/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002763/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002764/// If there is at least one pair of double-precision elements where the
2765/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002766/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002767/// This intrinsic returns the value of the CF flag.
2768///
2769/// \headerfile <x86intrin.h>
2770///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002771/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002772///
2773/// \param __a
2774/// A 256-bit vector of [4 x double].
2775/// \param __b
2776/// A 256-bit vector of [4 x double].
2777/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002778static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002779_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002780{
David Blaikie3302f2b2013-01-16 23:08:36 +00002781 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002782}
2783
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002784/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
2785/// element-by-element comparison of the double-precision elements in the
2786/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002787/// vector.
2788///
2789/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002790/// If there is at least one pair of double-precision elements where the
2791/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002792/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002793/// If there is at least one pair of double-precision elements where the
2794/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002795/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002796/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2797/// otherwise it returns 0.
2798///
2799/// \headerfile <x86intrin.h>
2800///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002801/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002802///
2803/// \param __a
2804/// A 256-bit vector of [4 x double].
2805/// \param __b
2806/// A 256-bit vector of [4 x double].
2807/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002808static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002809_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002810{
David Blaikie3302f2b2013-01-16 23:08:36 +00002811 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002812}
2813
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002814/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2815/// element-by-element comparison of the single-precision element in the
2816/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002817/// vector.
2818///
2819/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002820/// If there is at least one pair of single-precision elements where the
2821/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002822/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002823/// If there is at least one pair of single-precision elements where the
2824/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002825/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002826/// This intrinsic returns the value of the ZF flag.
2827///
2828/// \headerfile <x86intrin.h>
2829///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002830/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002831///
2832/// \param __a
2833/// A 256-bit vector of [8 x float].
2834/// \param __b
2835/// A 256-bit vector of [8 x float].
2836/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002837static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002838_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002839{
David Blaikie3302f2b2013-01-16 23:08:36 +00002840 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002841}
2842
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002843/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2844/// element-by-element comparison of the single-precision element in the
2845/// first source vector and the corresponding element in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002846/// vector.
2847///
2848/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002849/// If there is at least one pair of single-precision elements where the
2850/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002851/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002852/// If there is at least one pair of single-precision elements where the
2853/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002854/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002855/// This intrinsic returns the value of the CF flag.
2856///
2857/// \headerfile <x86intrin.h>
2858///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002859/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002860///
2861/// \param __a
2862/// A 256-bit vector of [8 x float].
2863/// \param __b
2864/// A 256-bit vector of [8 x float].
2865/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002866static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002867_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002868{
David Blaikie3302f2b2013-01-16 23:08:36 +00002869 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002870}
2871
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002872/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
2873/// element-by-element comparison of the single-precision elements in the
2874/// first source vector and the corresponding elements in the second source
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002875/// vector.
2876///
2877/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002878/// If there is at least one pair of single-precision elements where the
2879/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002880/// ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002881/// If there is at least one pair of single-precision elements where the
2882/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002883/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002884/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2885/// otherwise it returns 0.
2886///
2887/// \headerfile <x86intrin.h>
2888///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002889/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002890///
2891/// \param __a
2892/// A 256-bit vector of [8 x float].
2893/// \param __b
2894/// A 256-bit vector of [8 x float].
2895/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002896static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002897_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002898{
David Blaikie3302f2b2013-01-16 23:08:36 +00002899 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002900}
2901
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002902/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002903/// of the two source vectors.
2904///
2905/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002906/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002907/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002908/// If there is at least one pair of bits where the bit from the first source
2909/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002910/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002911/// This intrinsic returns the value of the ZF flag.
2912///
2913/// \headerfile <x86intrin.h>
2914///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002915/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002916///
2917/// \param __a
2918/// A 256-bit integer vector.
2919/// \param __b
2920/// A 256-bit integer vector.
2921/// \returns the ZF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002922static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002923_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002924{
David Blaikie3302f2b2013-01-16 23:08:36 +00002925 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002926}
2927
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002928/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002929/// of the two source vectors.
2930///
2931/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002932/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002933/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002934/// If there is at least one pair of bits where the bit from the first source
2935/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002936/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002937/// This intrinsic returns the value of the CF flag.
2938///
2939/// \headerfile <x86intrin.h>
2940///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002941/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002942///
2943/// \param __a
2944/// A 256-bit integer vector.
2945/// \param __b
2946/// A 256-bit integer vector.
2947/// \returns the CF flag.
Michael Kupersteine45af542015-06-30 13:36:19 +00002948static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002949_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002950{
David Blaikie3302f2b2013-01-16 23:08:36 +00002951 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002952}
2953
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002954/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00002955/// of the two source vectors.
2956///
2957/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002958/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002959/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002960/// If there is at least one pair of bits where the bit from the first source
2961/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova16166a42016-12-23 23:36:26 +00002962/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002963/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2964/// otherwise it returns 0.
2965///
2966/// \headerfile <x86intrin.h>
2967///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002968/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002969///
2970/// \param __a
2971/// A 256-bit integer vector.
2972/// \param __b
2973/// A 256-bit integer vector.
2974/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kupersteine45af542015-06-30 13:36:19 +00002975static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002976_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002977{
David Blaikie3302f2b2013-01-16 23:08:36 +00002978 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002979}
2980
2981/* Vector extract sign mask */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002982/// \brief Extracts the sign bits of double-precision floating point elements
2983/// in a 256-bit vector of [4 x double] and writes them to the lower order
2984/// bits of the return value.
2985///
2986/// \headerfile <x86intrin.h>
2987///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00002988/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00002989///
2990/// \param __a
2991/// A 256-bit vector of [4 x double] containing the double-precision
2992/// floating point values with sign bits to be extracted.
2993/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00002994static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00002995_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002996{
David Blaikie3302f2b2013-01-16 23:08:36 +00002997 return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00002998}
2999
Douglas Yung7ff91422018-01-08 21:21:17 +00003000/// \brief Extracts the sign bits of single-precision floating point elements
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003001/// in a 256-bit vector of [8 x float] and writes them to the lower order
3002/// bits of the return value.
3003///
3004/// \headerfile <x86intrin.h>
3005///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003006/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003007///
3008/// \param __a
Douglas Yung7ff91422018-01-08 21:21:17 +00003009/// A 256-bit vector of [8 x float] containing the single-precision floating
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003010/// point values with sign bits to be extracted.
3011/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kupersteine45af542015-06-30 13:36:19 +00003012static __inline int __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003013_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003014{
David Blaikie3302f2b2013-01-16 23:08:36 +00003015 return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003016}
3017
David Blaikie3302f2b2013-01-16 23:08:36 +00003018/* Vector __zero */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003019/// \brief Zeroes the contents of all XMM or YMM registers.
3020///
3021/// \headerfile <x86intrin.h>
3022///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003023/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
Michael Kupersteine45af542015-06-30 13:36:19 +00003024static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003025_mm256_zeroall(void)
3026{
3027 __builtin_ia32_vzeroall();
3028}
3029
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003030/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
3031///
3032/// \headerfile <x86intrin.h>
3033///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003034/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
Michael Kupersteine45af542015-06-30 13:36:19 +00003035static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003036_mm256_zeroupper(void)
3037{
3038 __builtin_ia32_vzeroupper();
3039}
3040
3041/* Vector load with broadcast */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003042/// \brief Loads a scalar single-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003043/// specified address pointed to by \a __a and broadcasts it to the elements
3044/// of a [4 x float] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003045///
3046/// \headerfile <x86intrin.h>
3047///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003048/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003049///
3050/// \param __a
3051/// The single-precision floating point value to be broadcast.
3052/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3053/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003054static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003055_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003056{
Adam Nemet286ae082014-05-29 20:47:29 +00003057 float __f = *__a;
3058 return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003059}
3060
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003061/// \brief Loads a scalar double-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003062/// specified address pointed to by \a __a and broadcasts it to the elements
3063/// of a [4 x double] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003064///
3065/// \headerfile <x86intrin.h>
3066///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003067/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003068///
3069/// \param __a
3070/// The double-precision floating point value to be broadcast.
3071/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3072/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003073static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003074_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003075{
Adam Nemet286ae082014-05-29 20:47:29 +00003076 double __d = *__a;
3077 return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003078}
3079
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003080/// \brief Loads a scalar single-precision floating point value from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003081/// specified address pointed to by \a __a and broadcasts it to the elements
3082/// of a [8 x float] vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003083///
3084/// \headerfile <x86intrin.h>
3085///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003086/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003087///
3088/// \param __a
3089/// The single-precision floating point value to be broadcast.
3090/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3091/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003092static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003093_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003094{
Adam Nemet286ae082014-05-29 20:47:29 +00003095 float __f = *__a;
3096 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003097}
3098
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003099/// \brief Loads the data from a 128-bit vector of [2 x double] from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003100/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003101/// elements in a 256-bit vector of [4 x double].
3102///
3103/// \headerfile <x86intrin.h>
3104///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003105/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003106///
3107/// \param __a
3108/// The 128-bit vector of [2 x double] to be broadcast.
3109/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3110/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003111static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003112_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003113{
Craig Topper1aa231e2016-05-16 06:38:42 +00003114 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003115}
3116
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003117/// \brief Loads the data from a 128-bit vector of [4 x float] from the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003118/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003119/// elements in a 256-bit vector of [8 x float].
3120///
3121/// \headerfile <x86intrin.h>
3122///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003123/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003124///
3125/// \param __a
3126/// The 128-bit vector of [4 x float] to be broadcast.
3127/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3128/// equal to the broadcast value.
Michael Kupersteine45af542015-06-30 13:36:19 +00003129static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003130_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003131{
Craig Topper1aa231e2016-05-16 06:38:42 +00003132 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003133}
3134
3135/* SIMD load ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003136/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003137/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003138///
3139/// \headerfile <x86intrin.h>
3140///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003141/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003142///
3143/// \param __p
3144/// A 32-byte aligned pointer to a memory location containing
3145/// double-precision floating point values.
3146/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003147static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003148_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003149{
David Blaikie3302f2b2013-01-16 23:08:36 +00003150 return *(__m256d *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003151}
3152
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003153/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003154/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003155///
3156/// \headerfile <x86intrin.h>
3157///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003158/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003159///
3160/// \param __p
3161/// A 32-byte aligned pointer to a memory location containing float values.
3162/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003163static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003164_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003165{
David Blaikie3302f2b2013-01-16 23:08:36 +00003166 return *(__m256 *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003167}
3168
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003169/// \brief Loads 4 double-precision floating point values from an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003170/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003171///
3172/// \headerfile <x86intrin.h>
3173///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003174/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003175///
3176/// \param __p
3177/// A pointer to a memory location containing double-precision floating
3178/// point values.
3179/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003180static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003181_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003182{
Craig Topper9e9301a2012-01-25 04:26:17 +00003183 struct __loadu_pd {
David Blaikie3302f2b2013-01-16 23:08:36 +00003184 __m256d __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003185 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003186 return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003187}
3188
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003189/// \brief Loads 8 single-precision floating point values from an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003190/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003191///
3192/// \headerfile <x86intrin.h>
3193///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003194/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003195///
3196/// \param __p
3197/// A pointer to a memory location containing single-precision floating
3198/// point values.
3199/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003200static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003201_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003202{
Craig Topper9e9301a2012-01-25 04:26:17 +00003203 struct __loadu_ps {
David Blaikie3302f2b2013-01-16 23:08:36 +00003204 __m256 __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003205 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003206 return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003207}
3208
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003209/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003210/// location pointed to by \a __p into elements of a 256-bit integer vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003211///
3212/// \headerfile <x86intrin.h>
3213///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003214/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003215///
3216/// \param __p
3217/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3218/// values.
3219/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003220static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003221_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003222{
David Blaikie3302f2b2013-01-16 23:08:36 +00003223 return *__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003224}
3225
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003226/// \brief Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003227/// pointed to by \a __p into a 256-bit integer vector.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003228///
3229/// \headerfile <x86intrin.h>
3230///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003231/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003232///
3233/// \param __p
3234/// A pointer to a 256-bit integer vector containing integer values.
3235/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003236static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003237_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003238{
Craig Topper9e9301a2012-01-25 04:26:17 +00003239 struct __loadu_si256 {
David Blaikie3302f2b2013-01-16 23:08:36 +00003240 __m256i __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00003241 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00003242 return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003243}
3244
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003245/// \brief Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003246/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3247/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003248/// line boundary.
3249///
3250/// \headerfile <x86intrin.h>
3251///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003252/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003253///
3254/// \param __p
3255/// A pointer to a 256-bit integer vector containing integer values.
3256/// \returns A 256-bit integer vector containing the moved values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003257static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003258_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003259{
David Blaikie3302f2b2013-01-16 23:08:36 +00003260 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003261}
3262
3263/* SIMD store ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003264/// \brief Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003265/// of [4 x double] to a 32-byte aligned memory location pointed to by
3266/// \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003267///
3268/// \headerfile <x86intrin.h>
3269///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003270/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003271///
3272/// \param __p
3273/// A 32-byte aligned pointer to a memory location that will receive the
///    double-precision floating point values.
3275/// \param __a
3276/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003277static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003278_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003279{
David Blaikie3302f2b2013-01-16 23:08:36 +00003280 *(__m256d *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003281}
3282
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003283/// \brief Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003284/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003285///
3286/// \headerfile <x86intrin.h>
3287///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003288/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003289///
3290/// \param __p
3291/// A 32-byte aligned pointer to a memory location that will receive the
3292/// float values.
3293/// \param __a
3294/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003295static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003296_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003297{
David Blaikie3302f2b2013-01-16 23:08:36 +00003298 *(__m256 *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003299}
3300
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003301/// \brief Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003302/// of [4 x double] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003303///
3304/// \headerfile <x86intrin.h>
3305///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003306/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003307///
3308/// \param __p
3309/// A pointer to a memory location that will receive the double-precision
3310/// floating point values.
3311/// \param __a
3312/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003313static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003314_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003315{
Craig Topper09175da2016-05-30 17:10:30 +00003316 struct __storeu_pd {
3317 __m256d __v;
3318 } __attribute__((__packed__, __may_alias__));
3319 ((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003320}
3321
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003322/// \brief Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003323/// of [8 x float] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003324///
3325/// \headerfile <x86intrin.h>
3326///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003327/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003328///
3329/// \param __p
3330/// A pointer to a memory location that will receive the float values.
3331/// \param __a
3332/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003333static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003334_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003335{
Craig Topper09175da2016-05-30 17:10:30 +00003336 struct __storeu_ps {
3337 __m256 __v;
3338 } __attribute__((__packed__, __may_alias__));
3339 ((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003340}
3341
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003342/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003343/// aligned memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003344///
3345/// \headerfile <x86intrin.h>
3346///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003347/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003348///
3349/// \param __p
3350/// A 32-byte aligned pointer to a memory location that will receive the
3351/// integer values.
3352/// \param __a
3353/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003354static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003355_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003356{
David Blaikie3302f2b2013-01-16 23:08:36 +00003357 *__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003358}
3359
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003360/// \brief Stores integer values from a 256-bit integer vector to an unaligned
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003361/// memory location pointed to by \a __p.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003362///
3363/// \headerfile <x86intrin.h>
3364///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003365/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003366///
3367/// \param __p
3368/// A pointer to a memory location that will receive the integer values.
3369/// \param __a
3370/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003371static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003372_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003373{
Craig Topper09175da2016-05-30 17:10:30 +00003374 struct __storeu_si256 {
3375 __m256i __v;
3376 } __attribute__((__packed__, __may_alias__));
3377 ((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003378}
3379
3380/* Conditional load ops */
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003381/// \brief Conditionally loads double-precision floating point elements from a
3382/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003383/// [2 x double], depending on the mask bits associated with each data
3384/// element.
3385///
3386/// \headerfile <x86intrin.h>
3387///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003388/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003389///
3390/// \param __p
3391/// A pointer to a memory location that contains the double-precision
3392/// floating point values.
3393/// \param __m
3394/// A 128-bit integer vector containing the mask. The most significant bit of
3395/// each data element represents the mask bits. If a mask bit is zero, the
3396/// corresponding value in the memory location is not loaded and the
3397/// corresponding field in the return value is set to zero.
3398/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003399static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003400_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003401{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003402 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003403}
3404
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003405/// \brief Conditionally loads double-precision floating point elements from a
3406/// memory location pointed to by \a __p into a 256-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003407/// [4 x double], depending on the mask bits associated with each data
3408/// element.
3409///
3410/// \headerfile <x86intrin.h>
3411///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003412/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003413///
3414/// \param __p
3415/// A pointer to a memory location that contains the double-precision
3416/// floating point values.
3417/// \param __m
3418/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3419/// significant bit of each quadword element represents the mask bits. If a
3420/// mask bit is zero, the corresponding value in the memory location is not
3421/// loaded and the corresponding field in the return value is set to zero.
3422/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003423static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003424_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003425{
David Blaikie3302f2b2013-01-16 23:08:36 +00003426 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003427 (__v4di)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003428}
3429
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003430/// \brief Conditionally loads single-precision floating point elements from a
3431/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003432/// [4 x float], depending on the mask bits associated with each data
3433/// element.
3434///
3435/// \headerfile <x86intrin.h>
3436///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003437/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003438///
3439/// \param __p
3440/// A pointer to a memory location that contains the single-precision
3441/// floating point values.
3442/// \param __m
3443/// A 128-bit integer vector containing the mask. The most significant bit of
3444/// each data element represents the mask bits. If a mask bit is zero, the
3445/// corresponding value in the memory location is not loaded and the
3446/// corresponding field in the return value is set to zero.
3447/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003448static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003449_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003450{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003451 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003452}
3453
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003454/// \brief Conditionally loads single-precision floating point elements from a
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003455/// memory location pointed to by \a __p into a 256-bit vector of
3456/// [8 x float], depending on the mask bits associated with each data
3457/// element.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003458///
3459/// \headerfile <x86intrin.h>
3460///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003461/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003462///
3463/// \param __p
3464/// A pointer to a memory location that contains the single-precision
3465/// floating point values.
3466/// \param __m
3467/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3468/// significant bit of each dword element represents the mask bits. If a mask
3469/// bit is zero, the corresponding value in the memory location is not loaded
3470/// and the corresponding field in the return value is set to zero.
3471/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kupersteine45af542015-06-30 13:36:19 +00003472static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003473_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003474{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003475 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003476}
3477
3478/* Conditional store ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003479/// \brief Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003480/// of [8 x float] to a memory location pointed to by \a __p, according to
3481/// the specified mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003482///
3483/// \headerfile <x86intrin.h>
3484///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003485/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003486///
3487/// \param __p
3488/// A pointer to a memory location that will receive the float values.
3489/// \param __m
3490/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3491/// significant bit of each dword element in the mask vector represents the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003492/// mask bits. If a mask bit is zero, the corresponding value from vector
3493/// \a __a is not stored and the corresponding field in the memory location
3494/// pointed to by \a __p is not changed.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003495/// \param __a
3496/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003497static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003498_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003499{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003500 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003501}
3502
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003503/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003504/// to a memory location pointed to by \a __p, according to the specified
3505/// mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003506///
3507/// \headerfile <x86intrin.h>
3508///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003509/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003510///
3511/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    values.
3513/// \param __m
3514/// A 128-bit integer vector containing the mask. The most significant bit of
3515/// each field in the mask vector represents the mask bits. If a mask bit is
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003516/// zero, the corresponding value from vector \a __a is not stored and the
3517/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003518/// changed.
3519/// \param __a
3520/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003521static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003522_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003523{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003524 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003525}
3526
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003527/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003528/// to a memory location pointed to by \a __p, according to the specified
3529/// mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003530///
3531/// \headerfile <x86intrin.h>
3532///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003533/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003534///
3535/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    values.
3537/// \param __m
3538/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3539/// significant bit of each quadword element in the mask vector represents
3540/// the mask bits. If a mask bit is zero, the corresponding value from vector
///    \a __a is not stored and the corresponding field in the memory location
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003542/// pointed to by \a __p is not changed.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003543/// \param __a
3544/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003545static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003546_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003547{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003548 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003549}
3550
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003551/// \brief Moves single-precision floating point values from a 128-bit vector
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003552/// of [4 x float] to a memory location pointed to by \a __p, according to
3553/// the specified mask.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003554///
3555/// \headerfile <x86intrin.h>
3556///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003557/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003558///
3559/// \param __p
3560/// A pointer to a memory location that will receive the float values.
3561/// \param __m
3562/// A 128-bit integer vector containing the mask. The most significant bit of
3563/// each field in the mask vector represents the mask bits. If a mask bit is
///    zero, the corresponding value from vector \a __a is not stored and the
Ekaterina Romanovad6042192016-12-08 04:09:17 +00003565/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003566/// changed.
3567/// \param __a
3568/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kupersteine45af542015-06-30 13:36:19 +00003569static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003570_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003571{
Andrea Di Biagio8bb12d02015-10-20 11:19:54 +00003572 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003573}
3574
3575/* Cacheability support ops */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003576/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
3577/// aligned memory location. To minimize caching, the data is flagged as
3578/// non-temporal (unlikely to be used again soon).
3579///
3580/// \headerfile <x86intrin.h>
3581///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003582/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003583///
3584/// \param __a
3585/// A pointer to a 32-byte aligned memory location that will receive the
3586/// integer values.
3587/// \param __b
3588/// A 256-bit integer vector containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003589static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003590_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003591{
Simon Pilgrimc14865c2017-07-29 15:33:34 +00003592 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3593 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003594}
3595
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003596/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
3597/// to a 32-byte aligned memory location. To minimize caching, the data is
3598/// flagged as non-temporal (unlikely to be used again soon).
3599///
3600/// \headerfile <x86intrin.h>
3601///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003602/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003603///
3604/// \param __a
3605/// A pointer to a 32-byte aligned memory location that will receive the
Ekaterina Romanovacb3603a2017-06-06 22:58:01 +00003606/// double-precision floating-point values.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003607/// \param __b
3608/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003609static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003610_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003611{
Simon Pilgrimc14865c2017-07-29 15:33:34 +00003612 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3613 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003614}
3615
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003616/// \brief Moves single-precision floating point values from a 256-bit vector
3617/// of [8 x float] to a 32-byte aligned memory location. To minimize
3618/// caching, the data is flagged as non-temporal (unlikely to be used again
3619/// soon).
3620///
3621/// \headerfile <x86intrin.h>
3622///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00003623/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003624///
3625/// \param __p
3626/// A pointer to a 32-byte aligned memory location that will receive the
3627/// single-precision floating point values.
3628/// \param __a
3629/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kupersteine45af542015-06-30 13:36:19 +00003630static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003631_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003632{
Simon Pilgrimc14865c2017-07-29 15:33:34 +00003633 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3634 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003635}
3636
3637/* Create vectors */
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003638/// \brief Create a 256-bit vector of [4 x double] with undefined values.
3639///
3640/// \headerfile <x86intrin.h>
3641///
3642/// This intrinsic has no corresponding instruction.
3643///
3644/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003645static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003646_mm256_undefined_pd(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003647{
3648 return (__m256d)__builtin_ia32_undef256();
3649}
3650
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003651/// \brief Create a 256-bit vector of [8 x float] with undefined values.
3652///
3653/// \headerfile <x86intrin.h>
3654///
3655/// This intrinsic has no corresponding instruction.
3656///
3657/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003658static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003659_mm256_undefined_ps(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003660{
3661 return (__m256)__builtin_ia32_undef256();
3662}
3663
Ekaterina Romanova0a700762016-11-19 04:59:08 +00003664/// \brief Create a 256-bit integer vector with undefined values.
3665///
3666/// \headerfile <x86intrin.h>
3667///
3668/// This intrinsic has no corresponding instruction.
3669///
3670/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003671static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper3a0c7262016-06-09 05:14:28 +00003672_mm256_undefined_si256(void)
Simon Pilgrim5aba9922015-08-26 21:17:12 +00003673{
3674 return (__m256i)__builtin_ia32_undef256();
3675}
3676
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003677/// \brief Constructs a 256-bit floating-point vector of [4 x double]
3678/// initialized with the specified double-precision floating-point values.
3679///
3680/// \headerfile <x86intrin.h>
3681///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003682/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3683/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003684///
3685/// \param __a
3686/// A double-precision floating-point value used to initialize bits [255:192]
3687/// of the result.
3688/// \param __b
3689/// A double-precision floating-point value used to initialize bits [191:128]
3690/// of the result.
3691/// \param __c
3692/// A double-precision floating-point value used to initialize bits [127:64]
3693/// of the result.
3694/// \param __d
3695/// A double-precision floating-point value used to initialize bits [63:0]
3696/// of the result.
3697/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003698static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003699_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003700{
David Blaikie3302f2b2013-01-16 23:08:36 +00003701 return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003702}
3703
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003704/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
3705/// with the specified single-precision floating-point values.
3706///
3707/// \headerfile <x86intrin.h>
3708///
3709/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003710/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003711///
3712/// \param __a
3713/// A single-precision floating-point value used to initialize bits [255:224]
3714/// of the result.
3715/// \param __b
3716/// A single-precision floating-point value used to initialize bits [223:192]
3717/// of the result.
3718/// \param __c
3719/// A single-precision floating-point value used to initialize bits [191:160]
3720/// of the result.
3721/// \param __d
3722/// A single-precision floating-point value used to initialize bits [159:128]
3723/// of the result.
3724/// \param __e
3725/// A single-precision floating-point value used to initialize bits [127:96]
3726/// of the result.
3727/// \param __f
3728/// A single-precision floating-point value used to initialize bits [95:64]
3729/// of the result.
3730/// \param __g
3731/// A single-precision floating-point value used to initialize bits [63:32]
3732/// of the result.
3733/// \param __h
3734/// A single-precision floating-point value used to initialize bits [31:0]
3735/// of the result.
3736/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00003737static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003738_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003739 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003740{
David Blaikie3302f2b2013-01-16 23:08:36 +00003741 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003742}
3743
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003744/// \brief Constructs a 256-bit integer vector initialized with the specified
3745/// 32-bit integral values.
3746///
3747/// \headerfile <x86intrin.h>
3748///
3749/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003750/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003751///
3752/// \param __i0
3753/// A 32-bit integral value used to initialize bits [255:224] of the result.
3754/// \param __i1
3755/// A 32-bit integral value used to initialize bits [223:192] of the result.
3756/// \param __i2
3757/// A 32-bit integral value used to initialize bits [191:160] of the result.
3758/// \param __i3
3759/// A 32-bit integral value used to initialize bits [159:128] of the result.
3760/// \param __i4
3761/// A 32-bit integral value used to initialize bits [127:96] of the result.
3762/// \param __i5
3763/// A 32-bit integral value used to initialize bits [95:64] of the result.
3764/// \param __i6
3765/// A 32-bit integral value used to initialize bits [63:32] of the result.
3766/// \param __i7
3767/// A 32-bit integral value used to initialize bits [31:0] of the result.
3768/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003769static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003770_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003771 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003772{
David Blaikie3302f2b2013-01-16 23:08:36 +00003773 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003774}
3775
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003776/// \brief Constructs a 256-bit integer vector initialized with the specified
3777/// 16-bit integral values.
3778///
3779/// \headerfile <x86intrin.h>
3780///
3781/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003782/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003783///
3784/// \param __w15
3785/// A 16-bit integral value used to initialize bits [255:240] of the result.
3786/// \param __w14
3787/// A 16-bit integral value used to initialize bits [239:224] of the result.
3788/// \param __w13
3789/// A 16-bit integral value used to initialize bits [223:208] of the result.
3790/// \param __w12
3791/// A 16-bit integral value used to initialize bits [207:192] of the result.
3792/// \param __w11
3793/// A 16-bit integral value used to initialize bits [191:176] of the result.
3794/// \param __w10
3795/// A 16-bit integral value used to initialize bits [175:160] of the result.
3796/// \param __w09
3797/// A 16-bit integral value used to initialize bits [159:144] of the result.
3798/// \param __w08
3799/// A 16-bit integral value used to initialize bits [143:128] of the result.
3800/// \param __w07
3801/// A 16-bit integral value used to initialize bits [127:112] of the result.
3802/// \param __w06
3803/// A 16-bit integral value used to initialize bits [111:96] of the result.
3804/// \param __w05
3805/// A 16-bit integral value used to initialize bits [95:80] of the result.
3806/// \param __w04
3807/// A 16-bit integral value used to initialize bits [79:64] of the result.
3808/// \param __w03
3809/// A 16-bit integral value used to initialize bits [63:48] of the result.
3810/// \param __w02
3811/// A 16-bit integral value used to initialize bits [47:32] of the result.
3812/// \param __w01
3813/// A 16-bit integral value used to initialize bits [31:16] of the result.
3814/// \param __w00
3815/// A 16-bit integral value used to initialize bits [15:0] of the result.
3816/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003817static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003818_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003819 short __w11, short __w10, short __w09, short __w08,
3820 short __w07, short __w06, short __w05, short __w04,
3821 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003822{
David Blaikie3302f2b2013-01-16 23:08:36 +00003823 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3824 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003825}
3826
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003827/// \brief Constructs a 256-bit integer vector initialized with the specified
3828/// 8-bit integral values.
3829///
3830/// \headerfile <x86intrin.h>
3831///
3832/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003833/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003834///
3835/// \param __b31
3836/// An 8-bit integral value used to initialize bits [255:248] of the result.
3837/// \param __b30
3838/// An 8-bit integral value used to initialize bits [247:240] of the result.
3839/// \param __b29
3840/// An 8-bit integral value used to initialize bits [239:232] of the result.
3841/// \param __b28
3842/// An 8-bit integral value used to initialize bits [231:224] of the result.
3843/// \param __b27
3844/// An 8-bit integral value used to initialize bits [223:216] of the result.
3845/// \param __b26
3846/// An 8-bit integral value used to initialize bits [215:208] of the result.
3847/// \param __b25
3848/// An 8-bit integral value used to initialize bits [207:200] of the result.
3849/// \param __b24
3850/// An 8-bit integral value used to initialize bits [199:192] of the result.
3851/// \param __b23
3852/// An 8-bit integral value used to initialize bits [191:184] of the result.
3853/// \param __b22
3854/// An 8-bit integral value used to initialize bits [183:176] of the result.
3855/// \param __b21
3856/// An 8-bit integral value used to initialize bits [175:168] of the result.
3857/// \param __b20
3858/// An 8-bit integral value used to initialize bits [167:160] of the result.
3859/// \param __b19
3860/// An 8-bit integral value used to initialize bits [159:152] of the result.
3861/// \param __b18
3862/// An 8-bit integral value used to initialize bits [151:144] of the result.
3863/// \param __b17
3864/// An 8-bit integral value used to initialize bits [143:136] of the result.
3865/// \param __b16
3866/// An 8-bit integral value used to initialize bits [135:128] of the result.
3867/// \param __b15
3868/// An 8-bit integral value used to initialize bits [127:120] of the result.
3869/// \param __b14
3870/// An 8-bit integral value used to initialize bits [119:112] of the result.
3871/// \param __b13
3872/// An 8-bit integral value used to initialize bits [111:104] of the result.
3873/// \param __b12
3874/// An 8-bit integral value used to initialize bits [103:96] of the result.
3875/// \param __b11
3876/// An 8-bit integral value used to initialize bits [95:88] of the result.
3877/// \param __b10
3878/// An 8-bit integral value used to initialize bits [87:80] of the result.
3879/// \param __b09
3880/// An 8-bit integral value used to initialize bits [79:72] of the result.
3881/// \param __b08
3882/// An 8-bit integral value used to initialize bits [71:64] of the result.
3883/// \param __b07
3884/// An 8-bit integral value used to initialize bits [63:56] of the result.
3885/// \param __b06
3886/// An 8-bit integral value used to initialize bits [55:48] of the result.
3887/// \param __b05
3888/// An 8-bit integral value used to initialize bits [47:40] of the result.
3889/// \param __b04
3890/// An 8-bit integral value used to initialize bits [39:32] of the result.
3891/// \param __b03
3892/// An 8-bit integral value used to initialize bits [31:24] of the result.
3893/// \param __b02
3894/// An 8-bit integral value used to initialize bits [23:16] of the result.
3895/// \param __b01
3896/// An 8-bit integral value used to initialize bits [15:8] of the result.
3897/// \param __b00
3898/// An 8-bit integral value used to initialize bits [7:0] of the result.
3899/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003900static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003901_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00003902 char __b27, char __b26, char __b25, char __b24,
3903 char __b23, char __b22, char __b21, char __b20,
3904 char __b19, char __b18, char __b17, char __b16,
3905 char __b15, char __b14, char __b13, char __b12,
3906 char __b11, char __b10, char __b09, char __b08,
3907 char __b07, char __b06, char __b05, char __b04,
3908 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003909{
3910 return (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00003911 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3912 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3913 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3914 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003915 };
3916}
3917
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003918/// \brief Constructs a 256-bit integer vector initialized with the specified
3919/// 64-bit integral values.
3920///
3921/// \headerfile <x86intrin.h>
3922///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003923/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3924/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003925///
3926/// \param __a
3927/// A 64-bit integral value used to initialize bits [255:192] of the result.
3928/// \param __b
3929/// A 64-bit integral value used to initialize bits [191:128] of the result.
3930/// \param __c
3931/// A 64-bit integral value used to initialize bits [127:64] of the result.
3932/// \param __d
3933/// A 64-bit integral value used to initialize bits [63:0] of the result.
3934/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00003935static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003936_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003937{
David Blaikie3302f2b2013-01-16 23:08:36 +00003938 return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003939}
3940
3941/* Create vectors with elements in reverse order */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003942/// \brief Constructs a 256-bit floating-point vector of [4 x double],
3943/// initialized in reverse order with the specified double-precision
3944/// floating-point values.
3945///
3946/// \headerfile <x86intrin.h>
3947///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003948/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3949/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003950///
3951/// \param __a
3952/// A double-precision floating-point value used to initialize bits [63:0]
3953/// of the result.
3954/// \param __b
3955/// A double-precision floating-point value used to initialize bits [127:64]
3956/// of the result.
3957/// \param __c
3958/// A double-precision floating-point value used to initialize bits [191:128]
3959/// of the result.
3960/// \param __d
3961/// A double-precision floating-point value used to initialize bits [255:192]
3962/// of the result.
3963/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00003964static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00003965_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003966{
David Blaikie3302f2b2013-01-16 23:08:36 +00003967 return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00003968}
3969
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003970/// \brief Constructs a 256-bit floating-point vector of [8 x float],
3971/// initialized in reverse order with the specified single-precision
3972/// float-point values.
3973///
3974/// \headerfile <x86intrin.h>
3975///
3976/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00003977/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00003978///
3979/// \param __a
3980/// A single-precision floating-point value used to initialize bits [31:0]
3981/// of the result.
3982/// \param __b
3983/// A single-precision floating-point value used to initialize bits [63:32]
3984/// of the result.
3985/// \param __c
3986/// A single-precision floating-point value used to initialize bits [95:64]
3987/// of the result.
3988/// \param __d
3989/// A single-precision floating-point value used to initialize bits [127:96]
3990/// of the result.
3991/// \param __e
3992/// A single-precision floating-point value used to initialize bits [159:128]
3993/// of the result.
3994/// \param __f
3995/// A single-precision floating-point value used to initialize bits [191:160]
3996/// of the result.
3997/// \param __g
3998/// A single-precision floating-point value used to initialize bits [223:192]
3999/// of the result.
4000/// \param __h
4001/// A single-precision floating-point value used to initialize bits [255:224]
4002/// of the result.
4003/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004004static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004005_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004006 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004007{
David Blaikie3302f2b2013-01-16 23:08:36 +00004008 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004009}
4010
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004011/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4012/// with the specified 32-bit integral values.
4013///
4014/// \headerfile <x86intrin.h>
4015///
4016/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004017/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004018///
4019/// \param __i0
4020/// A 32-bit integral value used to initialize bits [31:0] of the result.
4021/// \param __i1
4022/// A 32-bit integral value used to initialize bits [63:32] of the result.
4023/// \param __i2
4024/// A 32-bit integral value used to initialize bits [95:64] of the result.
4025/// \param __i3
4026/// A 32-bit integral value used to initialize bits [127:96] of the result.
4027/// \param __i4
4028/// A 32-bit integral value used to initialize bits [159:128] of the result.
4029/// \param __i5
4030/// A 32-bit integral value used to initialize bits [191:160] of the result.
4031/// \param __i6
4032/// A 32-bit integral value used to initialize bits [223:192] of the result.
4033/// \param __i7
4034/// A 32-bit integral value used to initialize bits [255:224] of the result.
4035/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004036static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004037_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004038 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004039{
David Blaikie3302f2b2013-01-16 23:08:36 +00004040 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004041}
4042
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004043/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4044/// with the specified 16-bit integral values.
4045///
4046/// \headerfile <x86intrin.h>
4047///
4048/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004049/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004050///
4051/// \param __w15
4052/// A 16-bit integral value used to initialize bits [15:0] of the result.
4053/// \param __w14
4054/// A 16-bit integral value used to initialize bits [31:16] of the result.
4055/// \param __w13
4056/// A 16-bit integral value used to initialize bits [47:32] of the result.
4057/// \param __w12
4058/// A 16-bit integral value used to initialize bits [63:48] of the result.
4059/// \param __w11
4060/// A 16-bit integral value used to initialize bits [79:64] of the result.
4061/// \param __w10
4062/// A 16-bit integral value used to initialize bits [95:80] of the result.
4063/// \param __w09
4064/// A 16-bit integral value used to initialize bits [111:96] of the result.
4065/// \param __w08
4066/// A 16-bit integral value used to initialize bits [127:112] of the result.
4067/// \param __w07
4068/// A 16-bit integral value used to initialize bits [143:128] of the result.
4069/// \param __w06
4070/// A 16-bit integral value used to initialize bits [159:144] of the result.
4071/// \param __w05
4072/// A 16-bit integral value used to initialize bits [175:160] of the result.
4073/// \param __w04
4074/// A 16-bit integral value used to initialize bits [191:176] of the result.
4075/// \param __w03
4076/// A 16-bit integral value used to initialize bits [207:192] of the result.
4077/// \param __w02
4078/// A 16-bit integral value used to initialize bits [223:208] of the result.
4079/// \param __w01
4080/// A 16-bit integral value used to initialize bits [239:224] of the result.
4081/// \param __w00
4082/// A 16-bit integral value used to initialize bits [255:240] of the result.
4083/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004084static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004085_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004086 short __w11, short __w10, short __w09, short __w08,
4087 short __w07, short __w06, short __w05, short __w04,
4088 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004089{
David Blaikie3302f2b2013-01-16 23:08:36 +00004090 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
4091 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004092}
4093
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004094/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4095/// with the specified 8-bit integral values.
4096///
4097/// \headerfile <x86intrin.h>
4098///
4099/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004100/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004101///
4102/// \param __b31
4103/// An 8-bit integral value used to initialize bits [7:0] of the result.
4104/// \param __b30
4105/// An 8-bit integral value used to initialize bits [15:8] of the result.
4106/// \param __b29
4107/// An 8-bit integral value used to initialize bits [23:16] of the result.
4108/// \param __b28
4109/// An 8-bit integral value used to initialize bits [31:24] of the result.
4110/// \param __b27
4111/// An 8-bit integral value used to initialize bits [39:32] of the result.
4112/// \param __b26
4113/// An 8-bit integral value used to initialize bits [47:40] of the result.
4114/// \param __b25
4115/// An 8-bit integral value used to initialize bits [55:48] of the result.
4116/// \param __b24
4117/// An 8-bit integral value used to initialize bits [63:56] of the result.
4118/// \param __b23
4119/// An 8-bit integral value used to initialize bits [71:64] of the result.
4120/// \param __b22
4121/// An 8-bit integral value used to initialize bits [79:72] of the result.
4122/// \param __b21
4123/// An 8-bit integral value used to initialize bits [87:80] of the result.
4124/// \param __b20
4125/// An 8-bit integral value used to initialize bits [95:88] of the result.
4126/// \param __b19
4127/// An 8-bit integral value used to initialize bits [103:96] of the result.
4128/// \param __b18
4129/// An 8-bit integral value used to initialize bits [111:104] of the result.
4130/// \param __b17
4131/// An 8-bit integral value used to initialize bits [119:112] of the result.
4132/// \param __b16
4133/// An 8-bit integral value used to initialize bits [127:120] of the result.
4134/// \param __b15
4135/// An 8-bit integral value used to initialize bits [135:128] of the result.
4136/// \param __b14
4137/// An 8-bit integral value used to initialize bits [143:136] of the result.
4138/// \param __b13
4139/// An 8-bit integral value used to initialize bits [151:144] of the result.
4140/// \param __b12
4141/// An 8-bit integral value used to initialize bits [159:152] of the result.
4142/// \param __b11
4143/// An 8-bit integral value used to initialize bits [167:160] of the result.
4144/// \param __b10
4145/// An 8-bit integral value used to initialize bits [175:168] of the result.
4146/// \param __b09
4147/// An 8-bit integral value used to initialize bits [183:176] of the result.
4148/// \param __b08
4149/// An 8-bit integral value used to initialize bits [191:184] of the result.
4150/// \param __b07
4151/// An 8-bit integral value used to initialize bits [199:192] of the result.
4152/// \param __b06
4153/// An 8-bit integral value used to initialize bits [207:200] of the result.
4154/// \param __b05
4155/// An 8-bit integral value used to initialize bits [215:208] of the result.
4156/// \param __b04
4157/// An 8-bit integral value used to initialize bits [223:216] of the result.
4158/// \param __b03
4159/// An 8-bit integral value used to initialize bits [231:224] of the result.
4160/// \param __b02
4161/// An 8-bit integral value used to initialize bits [239:232] of the result.
4162/// \param __b01
4163/// An 8-bit integral value used to initialize bits [247:240] of the result.
4164/// \param __b00
4165/// An 8-bit integral value used to initialize bits [255:248] of the result.
4166/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004167static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004168_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004169 char __b27, char __b26, char __b25, char __b24,
4170 char __b23, char __b22, char __b21, char __b20,
4171 char __b19, char __b18, char __b17, char __b16,
4172 char __b15, char __b14, char __b13, char __b12,
4173 char __b11, char __b10, char __b09, char __b08,
4174 char __b07, char __b06, char __b05, char __b04,
4175 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004176{
4177 return (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00004178 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper9fee8ab2015-01-31 06:33:59 +00004179 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
4180 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
4181 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004182}
4183
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004184/// \brief Constructs a 256-bit integer vector, initialized in reverse order
4185/// with the specified 64-bit integral values.
4186///
4187/// \headerfile <x86intrin.h>
4188///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004189/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4190/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004191///
4192/// \param __a
4193/// A 64-bit integral value used to initialize bits [63:0] of the result.
4194/// \param __b
4195/// A 64-bit integral value used to initialize bits [127:64] of the result.
4196/// \param __c
4197/// A 64-bit integral value used to initialize bits [191:128] of the result.
4198/// \param __d
4199/// A 64-bit integral value used to initialize bits [255:192] of the result.
4200/// \returns An initialized 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00004201static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004202_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004203{
David Blaikie3302f2b2013-01-16 23:08:36 +00004204 return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004205}
4206
4207/* Create vectors with repeated elements */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004208/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
4209/// of the four double-precision floating-point vector elements set to the
4210/// specified double-precision floating-point value.
4211///
4212/// \headerfile <x86intrin.h>
4213///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004214/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004215///
4216/// \param __w
4217/// A double-precision floating-point value used to initialize each vector
4218/// element of the result.
4219/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004220static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004221_mm256_set1_pd(double __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004222{
David Blaikie3302f2b2013-01-16 23:08:36 +00004223 return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004224}
4225
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004226/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
4227/// of the eight single-precision floating-point vector elements set to the
4228/// specified single-precision floating-point value.
4229///
4230/// \headerfile <x86intrin.h>
4231///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004232/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4233/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004234///
4235/// \param __w
4236/// A single-precision floating-point value used to initialize each vector
4237/// element of the result.
4238/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004239static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004240_mm256_set1_ps(float __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004241{
David Blaikie3302f2b2013-01-16 23:08:36 +00004242 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004243}
4244
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004245/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
4246/// 32-bit integral vector elements set to the specified 32-bit integral
4247/// value.
4248///
4249/// \headerfile <x86intrin.h>
4250///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004251/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4252/// instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004253///
4254/// \param __i
4255/// A 32-bit integral value used to initialize each vector element of the
4256/// result.
4257/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kupersteine45af542015-06-30 13:36:19 +00004258static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004259_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004260{
David Blaikie3302f2b2013-01-16 23:08:36 +00004261 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004262}
4263
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004264/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
4265/// 16-bit integral vector elements set to the specified 16-bit integral
4266/// value.
4267///
4268/// \headerfile <x86intrin.h>
4269///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004270/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004271///
4272/// \param __w
4273/// A 16-bit integral value used to initialize each vector element of the
4274/// result.
4275/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kupersteine45af542015-06-30 13:36:19 +00004276static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004277_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004278{
David Blaikie3302f2b2013-01-16 23:08:36 +00004279 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
4280 __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004281}
4282
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004283/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
4284/// 8-bit integral vector elements set to the specified 8-bit integral value.
4285///
4286/// \headerfile <x86intrin.h>
4287///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004288/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004289///
4290/// \param __b
4291/// An 8-bit integral value used to initialize each vector element of the
4292/// result.
4293/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kupersteine45af542015-06-30 13:36:19 +00004294static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004295_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004296{
David Blaikie3302f2b2013-01-16 23:08:36 +00004297 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4298 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
4299 __b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004300}
4301
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004302/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
4303/// 64-bit integral vector elements set to the specified 64-bit integral
4304/// value.
4305///
4306/// \headerfile <x86intrin.h>
4307///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004308/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004309///
4310/// \param __q
4311/// A 64-bit integral value used to initialize each vector element of the
4312/// result.
4313/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kupersteine45af542015-06-30 13:36:19 +00004314static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004315_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004316{
David Blaikie3302f2b2013-01-16 23:08:36 +00004317 return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004318}
4319
/* Create zeroed vectors */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004321/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
4322/// vector elements initialized to zero.
4323///
4324/// \headerfile <x86intrin.h>
4325///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004326/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004327///
4328/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004329static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004330_mm256_setzero_pd(void)
4331{
4332 return (__m256d){ 0, 0, 0, 0 };
4333}
4334
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004335/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
4336/// vector elements initialized to zero.
4337///
4338/// \headerfile <x86intrin.h>
4339///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004340/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004341///
4342/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004343static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004344_mm256_setzero_ps(void)
4345{
4346 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4347}
4348
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004349/// \brief Constructs a 256-bit integer vector initialized to zero.
4350///
4351/// \headerfile <x86intrin.h>
4352///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00004353/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004354///
4355/// \returns A 256-bit integer vector initialized to zero.
Michael Kupersteine45af542015-06-30 13:36:19 +00004356static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004357_mm256_setzero_si256(void)
4358{
4359 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
4360}
4361
4362/* Cast between vector types */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004363/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4364/// floating-point vector of [8 x float].
4365///
4366/// \headerfile <x86intrin.h>
4367///
4368/// This intrinsic has no corresponding instruction.
4369///
4370/// \param __a
4371/// A 256-bit floating-point vector of [4 x double].
4372/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4373/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004374static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004375_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004376{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004377 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004378}
4379
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004380/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4381/// integer vector.
4382///
4383/// \headerfile <x86intrin.h>
4384///
4385/// This intrinsic has no corresponding instruction.
4386///
4387/// \param __a
4388/// A 256-bit floating-point vector of [4 x double].
4389/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4390/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004391static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004392_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004393{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004394 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004395}
4396
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004397/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4398/// floating-point vector of [4 x double].
4399///
4400/// \headerfile <x86intrin.h>
4401///
4402/// This intrinsic has no corresponding instruction.
4403///
4404/// \param __a
4405/// A 256-bit floating-point vector of [8 x float].
4406/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4407/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004408static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004409_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004410{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004411 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004412}
4413
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004414/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4415/// integer vector.
4416///
4417/// \headerfile <x86intrin.h>
4418///
4419/// This intrinsic has no corresponding instruction.
4420///
4421/// \param __a
4422/// A 256-bit floating-point vector of [8 x float].
4423/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4424/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004425static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004426_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004427{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004428 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004429}
4430
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004431/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4432/// of [8 x float].
4433///
4434/// \headerfile <x86intrin.h>
4435///
4436/// This intrinsic has no corresponding instruction.
4437///
4438/// \param __a
4439/// A 256-bit integer vector.
4440/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4441/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004442static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004443_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004444{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004445 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004446}
4447
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004448/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
4449/// of [4 x double].
4450///
4451/// \headerfile <x86intrin.h>
4452///
4453/// This intrinsic has no corresponding instruction.
4454///
4455/// \param __a
4456/// A 256-bit integer vector.
4457/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4458/// bitwise pattern as the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004459static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004460_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004461{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004462 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004463}
4464
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004465/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4466/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4467///
4468/// \headerfile <x86intrin.h>
4469///
4470/// This intrinsic has no corresponding instruction.
4471///
4472/// \param __a
4473/// A 256-bit floating-point vector of [4 x double].
4474/// \returns A 128-bit floating-point vector of [2 x double] containing the
4475/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004476static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004477_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004478{
Craig Topper1aa231e2016-05-16 06:38:42 +00004479 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004480}
4481
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004482/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
4483/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4484///
4485/// \headerfile <x86intrin.h>
4486///
4487/// This intrinsic has no corresponding instruction.
4488///
4489/// \param __a
4490/// A 256-bit floating-point vector of [8 x float].
4491/// \returns A 128-bit floating-point vector of [4 x float] containing the
4492/// lower 128 bits of the parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004493static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004494_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004495{
Craig Topper1aa231e2016-05-16 06:38:42 +00004496 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004497}
4498
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004499/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
4500///
4501/// \headerfile <x86intrin.h>
4502///
4503/// This intrinsic has no corresponding instruction.
4504///
4505/// \param __a
4506/// A 256-bit integer vector.
4507/// \returns A 128-bit integer vector containing the lower 128 bits of the
4508/// parameter.
Michael Kupersteine45af542015-06-30 13:36:19 +00004509static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004510_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004511{
Craig Topper1aa231e2016-05-16 06:38:42 +00004512 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004513}
4514
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004515/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004516/// 128-bit floating-point vector of [2 x double].
4517///
4518/// The lower 128 bits contain the value of the source vector. The contents
4519/// of the upper 128 bits are undefined.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004520///
4521/// \headerfile <x86intrin.h>
4522///
4523/// This intrinsic has no corresponding instruction.
4524///
4525/// \param __a
4526/// A 128-bit vector of [2 x double].
4527/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4528/// contain the value of the parameter. The contents of the upper 128 bits
4529/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004530static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004531_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004532{
Craig Topper1aa231e2016-05-16 06:38:42 +00004533 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004534}
4535
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004536/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004537/// 128-bit floating-point vector of [4 x float].
4538///
4539/// The lower 128 bits contain the value of the source vector. The contents
4540/// of the upper 128 bits are undefined.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004541///
4542/// \headerfile <x86intrin.h>
4543///
4544/// This intrinsic has no corresponding instruction.
4545///
4546/// \param __a
4547/// A 128-bit vector of [4 x float].
4548/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4549/// contain the value of the parameter. The contents of the upper 128 bits
4550/// are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004551static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004552_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004553{
Craig Topper1aa231e2016-05-16 06:38:42 +00004554 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004555}
4556
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004557/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
Ekaterina Romanova1d4a0f22017-05-15 03:25:04 +00004558///
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004559/// The lower 128 bits contain the value of the source vector. The contents
4560/// of the upper 128 bits are undefined.
4561///
4562/// \headerfile <x86intrin.h>
4563///
4564/// This intrinsic has no corresponding instruction.
4565///
4566/// \param __a
4567/// A 128-bit integer vector.
4568/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4569/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kupersteine45af542015-06-30 13:36:19 +00004570static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner7ab75b32013-04-19 17:00:14 +00004571_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004572{
Craig Topper1aa231e2016-05-16 06:38:42 +00004573 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00004574}
Chad Rosierf8df4f42012-03-20 16:40:00 +00004575
Simon Pilgrim96d02f52017-04-29 17:17:06 +00004576/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
4577/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4578/// contain the value of the source vector. The upper 128 bits are set
4579/// to zero.
4580///
4581/// \headerfile <x86intrin.h>
4582///
4583/// This intrinsic has no corresponding instruction.
4584///
4585/// \param __a
4586/// A 128-bit vector of [2 x double].
4587/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4588/// contain the value of the parameter. The upper 128 bits are set to zero.
4589static __inline __m256d __DEFAULT_FN_ATTRS
4590_mm256_zextpd128_pd256(__m128d __a)
4591{
4592 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4593}
4594
4595/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
4596/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4597/// the value of the source vector. The upper 128 bits are set to zero.
4598///
4599/// \headerfile <x86intrin.h>
4600///
4601/// This intrinsic has no corresponding instruction.
4602///
4603/// \param __a
4604/// A 128-bit vector of [4 x float].
4605/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4606/// contain the value of the parameter. The upper 128 bits are set to zero.
4607static __inline __m256 __DEFAULT_FN_ATTRS
4608_mm256_zextps128_ps256(__m128 __a)
4609{
4610 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4611}
4612
4613/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
4614/// The lower 128 bits contain the value of the source vector. The upper
4615/// 128 bits are set to zero.
4616///
4617/// \headerfile <x86intrin.h>
4618///
4619/// This intrinsic has no corresponding instruction.
4620///
4621/// \param __a
4622/// A 128-bit integer vector.
4623/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4624/// the parameter. The upper 128 bits are set to zero.
4625static __inline __m256i __DEFAULT_FN_ATTRS
4626_mm256_zextsi128_si256(__m128i __a)
4627{
4628 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4629}
4630
/*
   Vector insert.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    (((M) & 1) ?  0 :  8), \
    (((M) & 1) ?  1 :  9), \
    (((M) & 1) ?  2 : 10), \
    (((M) & 1) ?  3 : 11), \
    (((M) & 1) ?  8 :  4), \
    (((M) & 1) ?  9 :  5), \
    (((M) & 1) ? 10 :  6), \
    (((M) & 1) ? 11 :  7) );})
4682
/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})
4725
/// \brief Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})
4768
/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) ? 4 : 0), \
    (((M) & 1) ? 5 : 1), \
    (((M) & 1) ? 6 : 2), \
    (((M) & 1) ? 7 : 3) );})
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004803
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})
4831
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})
4859
Chad Rosierf8df4f42012-03-20 16:40:00 +00004860/* SIMD load ops (unaligned) */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004861/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
4862/// unaligned memory locations and constructs a 256-bit floating-point vector
4863/// of [8 x float] by concatenating the two 128-bit vectors.
4864///
4865/// \headerfile <x86intrin.h>
4866///
4867/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004868/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004869///
4870/// \param __addr_hi
4871/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004872/// single-precision floating-point values. These values are to be copied to
4873/// bits[255:128] of the result. The address of the memory location does not
4874/// have to be aligned.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004875/// \param __addr_lo
4876/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004877/// single-precision floating-point values. These values are to be copied to
4878/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004879/// have to be aligned.
4880/// \returns A 256-bit floating-point vector of [8 x float] containing the
4881/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004882static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004883_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004884{
Craig Topper74b59482016-05-31 05:49:13 +00004885 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4886 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004887}
4888
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004889/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
4890/// unaligned memory locations and constructs a 256-bit floating-point vector
4891/// of [4 x double] by concatenating the two 128-bit vectors.
4892///
4893/// \headerfile <x86intrin.h>
4894///
4895/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004896/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004897///
4898/// \param __addr_hi
4899/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004900/// double-precision floating-point values. These values are to be copied to
4901/// bits[255:128] of the result. The address of the memory location does not
4902/// have to be aligned.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004903/// \param __addr_lo
4904/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004905/// double-precision floating-point values. These values are to be copied to
4906/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004907/// have to be aligned.
4908/// \returns A 256-bit floating-point vector of [4 x double] containing the
4909/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004910static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004911_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004912{
Craig Topper74b59482016-05-31 05:49:13 +00004913 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4914 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004915}
4916
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004917/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
4918/// constructs a 256-bit integer vector by concatenating the two 128-bit
4919/// vectors.
4920///
4921/// \headerfile <x86intrin.h>
4922///
4923/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004924/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004925///
4926/// \param __addr_hi
4927/// A pointer to a 128-bit memory location containing a 128-bit integer
4928/// vector. This vector is to be copied to bits[255:128] of the result. The
4929/// address of the memory location does not have to be aligned.
4930/// \param __addr_lo
4931/// A pointer to a 128-bit memory location containing a 128-bit integer
4932/// vector. This vector is to be copied to bits[127:0] of the result. The
4933/// address of the memory location does not have to be aligned.
4934/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00004935static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004936_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004937{
Craig Topper74b59482016-05-31 05:49:13 +00004938 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4939 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004940}
4941
4942/* SIMD store ops (unaligned) */
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004943/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4944/// vector of [8 x float] into two different unaligned memory locations.
4945///
4946/// \headerfile <x86intrin.h>
4947///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004948/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4949/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004950///
4951/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004952/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004953/// copied to this memory location. The address of this memory location does
4954/// not have to be aligned.
4955/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004956/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004957/// copied to this memory location. The address of this memory location does
4958/// not have to be aligned.
4959/// \param __a
4960/// A 256-bit floating-point vector of [8 x float].
Michael Kupersteine45af542015-06-30 13:36:19 +00004961static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004962_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004963{
David Blaikie3302f2b2013-01-16 23:08:36 +00004964 __m128 __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004965
David Blaikie3302f2b2013-01-16 23:08:36 +00004966 __v128 = _mm256_castps256_ps128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004967 _mm_storeu_ps(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004968 __v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004969 _mm_storeu_ps(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004970}
4971
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004972/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
4973/// vector of [4 x double] into two different unaligned memory locations.
4974///
4975/// \headerfile <x86intrin.h>
4976///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00004977/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4978/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004979///
4980/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004981/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004982/// copied to this memory location. The address of this memory location does
4983/// not have to be aligned.
4984/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00004985/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00004986/// copied to this memory location. The address of this memory location does
4987/// not have to be aligned.
4988/// \param __a
4989/// A 256-bit floating-point vector of [4 x double].
Michael Kupersteine45af542015-06-30 13:36:19 +00004990static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00004991_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00004992{
David Blaikie3302f2b2013-01-16 23:08:36 +00004993 __m128d __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00004994
David Blaikie3302f2b2013-01-16 23:08:36 +00004995 __v128 = _mm256_castpd256_pd128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00004996 _mm_storeu_pd(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00004997 __v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00004998 _mm_storeu_pd(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00004999}
5000
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005001/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
5002/// two different unaligned memory locations.
5003///
5004/// \headerfile <x86intrin.h>
5005///
Ekaterina Romanova16166a42016-12-23 23:36:26 +00005006/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5007/// store instructions.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005008///
5009/// \param __addr_hi
Ekaterina Romanovad6042192016-12-08 04:09:17 +00005010/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005011/// copied to this memory location. The address of this memory location does
5012/// not have to be aligned.
5013/// \param __addr_lo
Ekaterina Romanovad6042192016-12-08 04:09:17 +00005014/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005015/// copied to this memory location. The address of this memory location does
5016/// not have to be aligned.
5017/// \param __a
5018/// A 256-bit integer vector.
Michael Kupersteine45af542015-06-30 13:36:19 +00005019static __inline void __DEFAULT_FN_ATTRS
David Blaikie3302f2b2013-01-16 23:08:36 +00005020_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00005021{
David Blaikie3302f2b2013-01-16 23:08:36 +00005022 __m128i __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00005023
David Blaikie3302f2b2013-01-16 23:08:36 +00005024 __v128 = _mm256_castsi256_si128(__a);
Craig Topper09175da2016-05-30 17:10:30 +00005025 _mm_storeu_si128(__addr_lo, __v128);
David Blaikie3302f2b2013-01-16 23:08:36 +00005026 __v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper09175da2016-05-30 17:10:30 +00005027 _mm_storeu_si128(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00005028}
Richard Smith49e56442013-07-14 05:41:45 +00005029
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005030/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
5031/// concatenating two 128-bit floating-point vectors of [4 x float].
5032///
5033/// \headerfile <x86intrin.h>
5034///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005035/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005036///
5037/// \param __hi
5038/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
5039/// 128 bits of the result.
5040/// \param __lo
5041/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
5042/// 128 bits of the result.
5043/// \returns A 256-bit floating-point vector of [8 x float] containing the
5044/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005045static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005046_mm256_set_m128 (__m128 __hi, __m128 __lo)
5047{
Craig Topper1aa231e2016-05-16 06:38:42 +00005048 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein76190042015-05-20 07:46:52 +00005049}
5050
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005051/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5052/// concatenating two 128-bit floating-point vectors of [2 x double].
5053///
5054/// \headerfile <x86intrin.h>
5055///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005056/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005057///
5058/// \param __hi
5059/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5060/// 128 bits of the result.
5061/// \param __lo
5062/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5063/// 128 bits of the result.
5064/// \returns A 256-bit floating-point vector of [4 x double] containing the
5065/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005066static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005067_mm256_set_m128d (__m128d __hi, __m128d __lo)
5068{
Michael Kuperstein76190042015-05-20 07:46:52 +00005069 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5070}
5071
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005072/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5073/// integer vectors.
5074///
5075/// \headerfile <x86intrin.h>
5076///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005077/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005078///
5079/// \param __hi
5080/// A 128-bit integer vector to be copied to the upper 128 bits of the
5081/// result.
5082/// \param __lo
5083/// A 128-bit integer vector to be copied to the lower 128 bits of the
5084/// result.
5085/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005086static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005087_mm256_set_m128i (__m128i __hi, __m128i __lo)
5088{
Michael Kuperstein76190042015-05-20 07:46:52 +00005089 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5090}
5091
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005092/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
5093/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
5094/// similar to _mm256_set_m128, but the order of the input parameters is
5095/// swapped.
5096///
5097/// \headerfile <x86intrin.h>
5098///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005099/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005100///
5101/// \param __lo
5102/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
5103/// 128 bits of the result.
5104/// \param __hi
5105/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
5106/// 128 bits of the result.
5107/// \returns A 256-bit floating-point vector of [8 x float] containing the
5108/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005109static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005110_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5111{
Michael Kuperstein76190042015-05-20 07:46:52 +00005112 return _mm256_set_m128(__hi, __lo);
5113}
5114
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005115/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
5116/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
5117/// similar to _mm256_set_m128d, but the order of the input parameters is
5118/// swapped.
5119///
5120/// \headerfile <x86intrin.h>
5121///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005122/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005123///
5124/// \param __lo
5125/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5126/// 128 bits of the result.
5127/// \param __hi
5128/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5129/// 128 bits of the result.
5130/// \returns A 256-bit floating-point vector of [4 x double] containing the
5131/// concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005132static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005133_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5134{
Michael Kuperstein76190042015-05-20 07:46:52 +00005135 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5136}
5137
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005138/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
5139/// integer vectors. This is similar to _mm256_set_m128i, but the order of
5140/// the input parameters is swapped.
5141///
5142/// \headerfile <x86intrin.h>
5143///
Ekaterina Romanova0c1c3bb2016-12-09 18:35:50 +00005144/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova64adc382016-11-09 03:58:30 +00005145///
5146/// \param __lo
5147/// A 128-bit integer vector to be copied to the lower 128 bits of the
5148/// result.
5149/// \param __hi
5150/// A 128-bit integer vector to be copied to the upper 128 bits of the
5151/// result.
5152/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kupersteine45af542015-06-30 13:36:19 +00005153static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova2174b6f2016-11-17 23:02:00 +00005154_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5155{
Michael Kuperstein76190042015-05-20 07:46:52 +00005156 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5157}
5158
Michael Kupersteine45af542015-06-30 13:36:19 +00005159#undef __DEFAULT_FN_ATTRS
Eric Christopher4d1851682015-06-17 07:09:20 +00005160
Richard Smith49e56442013-07-14 05:41:45 +00005161#endif /* __AVXINTRIN_H */