Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: cb15396b3faf329cd8396d662a9f6b41d3a45f21 [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
/* Internal element-typed views of a 32-byte (256-bit) vector. The intrinsic
 * implementations in this file cast their arguments to these types to select
 * the element type an operation works on. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));
Chandler Carruth	cbe6411	2015-10-01 23:40:12 +0000	[diff] [blame]	47
/* 256-bit vector types used in the signatures of the intrinsics in this file:
 * __m256 holds float elements, __m256d holds double elements and __m256i is
 * declared with long long elements. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	51
/* Define the default attributes for the functions in this file.
 *
 * Both variants request always-inline, no debug info and the "avx" target
 * feature; they differ only in the minimum vector width they declare
 * (256 bits for __DEFAULT_FN_ATTRS, 128 bits for __DEFAULT_FN_ATTRS128). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	55
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	56	/* Arithmetic */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	57	/// Adds two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	58	///
				59	/// \headerfile <x86intrin.h>
				60	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	61	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	62	///
				63	/// \param __a
				64	/// A 256-bit vector of [4 x double] containing one of the source operands.
				65	/// \param __b
				66	/// A 256-bit vector of [4 x double] containing one of the source operands.
				67	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				68	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	69	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	70	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	71	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	72	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	73	}
				74
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	75	/// Adds two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	76	///
				77	/// \headerfile <x86intrin.h>
				78	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	79	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	80	///
				81	/// \param __a
				82	/// A 256-bit vector of [8 x float] containing one of the source operands.
				83	/// \param __b
				84	/// A 256-bit vector of [8 x float] containing one of the source operands.
				85	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				86	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	87	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	88	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	89	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	90	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	91	}
				92
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	93	/// Subtracts two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	94	///
				95	/// \headerfile <x86intrin.h>
				96	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	97	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	98	///
				99	/// \param __a
				100	/// A 256-bit vector of [4 x double] containing the minuend.
				101	/// \param __b
				102	/// A 256-bit vector of [4 x double] containing the subtrahend.
				103	/// \returns A 256-bit vector of [4 x double] containing the differences between
				104	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	105	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	106	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	107	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	108	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	109	}
				110
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	111	/// Subtracts two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	112	///
				113	/// \headerfile <x86intrin.h>
				114	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	115	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	116	///
				117	/// \param __a
				118	/// A 256-bit vector of [8 x float] containing the minuend.
				119	/// \param __b
				120	/// A 256-bit vector of [8 x float] containing the subtrahend.
				121	/// \returns A 256-bit vector of [8 x float] containing the differences between
				122	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	123	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	124	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	125	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	126	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	127	}
				128
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	129	/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	130	/// two 256-bit vectors of [4 x double].
				131	///
				132	/// \headerfile <x86intrin.h>
				133	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	134	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	135	///
				136	/// \param __a
				137	/// A 256-bit vector of [4 x double] containing the left source operand.
				138	/// \param __b
				139	/// A 256-bit vector of [4 x double] containing the right source operand.
				140	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				141	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	142	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	143	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	144	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	145	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	146	}
				147
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	148	/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	149	/// two 256-bit vectors of [8 x float].
				150	///
				151	/// \headerfile <x86intrin.h>
				152	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	153	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	154	///
				155	/// \param __a
				156	/// A 256-bit vector of [8 x float] containing the left source operand.
				157	/// \param __b
				158	/// A 256-bit vector of [8 x float] containing the right source operand.
				159	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				160	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	161	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	162	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	163	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	164	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	165	}
				166
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	167	/// Divides two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	168	///
				169	/// \headerfile <x86intrin.h>
				170	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	171	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	172	///
				173	/// \param __a
				174	/// A 256-bit vector of [4 x double] containing the dividend.
				175	/// \param __b
				176	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	177	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				178	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	179	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	180	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	181	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	182	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	183	}
				184
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	185	/// Divides two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	186	///
				187	/// \headerfile <x86intrin.h>
				188	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	189	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	190	///
				191	/// \param __a
				192	/// A 256-bit vector of [8 x float] containing the dividend.
				193	/// \param __b
				194	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	195	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				196	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	197	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	198	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	199	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	200	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	201	}
				202
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	203	/// Compares two 256-bit vectors of [4 x double] and returns the greater
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	204	/// of each pair of values.
				205	///
				206	/// \headerfile <x86intrin.h>
				207	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	208	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	209	///
				210	/// \param __a
				211	/// A 256-bit vector of [4 x double] containing one of the operands.
				212	/// \param __b
				213	/// A 256-bit vector of [4 x double] containing one of the operands.
				214	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				215	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	216	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	217	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	218	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	219	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	220	}
				221
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	222	/// Compares two 256-bit vectors of [8 x float] and returns the greater
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	223	/// of each pair of values.
				224	///
				225	/// \headerfile <x86intrin.h>
				226	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	227	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	228	///
				229	/// \param __a
				230	/// A 256-bit vector of [8 x float] containing one of the operands.
				231	/// \param __b
				232	/// A 256-bit vector of [8 x float] containing one of the operands.
				233	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				234	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	235	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	236	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	237	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	238	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	239	}
				240
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	241	/// Compares two 256-bit vectors of [4 x double] and returns the lesser
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	242	/// of each pair of values.
				243	///
				244	/// \headerfile <x86intrin.h>
				245	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	246	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	247	///
				248	/// \param __a
				249	/// A 256-bit vector of [4 x double] containing one of the operands.
				250	/// \param __b
				251	/// A 256-bit vector of [4 x double] containing one of the operands.
				252	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				253	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	254	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	255	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	256	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	257	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	258	}
				259
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	260	/// Compares two 256-bit vectors of [8 x float] and returns the lesser
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	261	/// of each pair of values.
				262	///
				263	/// \headerfile <x86intrin.h>
				264	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	265	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	266	///
				267	/// \param __a
				268	/// A 256-bit vector of [8 x float] containing one of the operands.
				269	/// \param __b
				270	/// A 256-bit vector of [8 x float] containing one of the operands.
				271	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				272	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	273	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	274	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	275	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	276	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	277	}
				278
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	279	/// Multiplies two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	280	///
				281	/// \headerfile <x86intrin.h>
				282	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	283	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	284	///
				285	/// \param __a
				286	/// A 256-bit vector of [4 x double] containing one of the operands.
				287	/// \param __b
				288	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	289	/// \returns A 256-bit vector of [4 x double] containing the products of both
				290	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	291	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	292	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	293	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	294	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	295	}
				296
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	297	/// Multiplies two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	298	///
				299	/// \headerfile <x86intrin.h>
				300	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	301	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	302	///
				303	/// \param __a
				304	/// A 256-bit vector of [8 x float] containing one of the operands.
				305	/// \param __b
				306	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	307	/// \returns A 256-bit vector of [8 x float] containing the products of both
				308	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	309	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	310	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	311	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	312	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	313	}
				314
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	315	/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	316	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	317	///
				318	/// \headerfile <x86intrin.h>
				319	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	320	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	321	///
				322	/// \param __a
				323	/// A 256-bit vector of [4 x double].
				324	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				325	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	326	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	327	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	328	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	329	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	330	}
				331
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	332	/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	333	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	334	///
				335	/// \headerfile <x86intrin.h>
				336	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	337	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	338	///
				339	/// \param __a
				340	/// A 256-bit vector of [8 x float].
				341	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				342	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	343	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	344	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	345	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	346	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	347	}
				348
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	349	/// Calculates the reciprocal square roots of the values in a 256-bit
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	350	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	351	///
				352	/// \headerfile <x86intrin.h>
				353	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	354	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	355	///
				356	/// \param __a
				357	/// A 256-bit vector of [8 x float].
				358	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				359	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	360	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	361	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	362	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	363	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	364	}
				365
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	366	/// Calculates the reciprocals of the values in a 256-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	367	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	368	///
				369	/// \headerfile <x86intrin.h>
				370	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	371	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	372	///
				373	/// \param __a
				374	/// A 256-bit vector of [8 x float].
				375	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				376	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	377	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	378	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	379	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	380	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	381	}
				382
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro result (for example a subscript) binds to the cast result rather
 * than to the builtin call inside the cast. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	414
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro result (for example a subscript) binds to the cast result rather
 * than to the builtin call inside the cast. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	446
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	447	/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	448	/// source values are rounded up to integer values and returned as 64-bit
				449	/// double-precision floating-point values.
				450	///
				451	/// \headerfile <x86intrin.h>
				452	///
				453	/// \code
				454	/// __m256d _mm256_ceil_pd(__m256d V);
				455	/// \endcode
				456	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	457	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	458	///
				459	/// \param V
				460	/// A 256-bit vector of [4 x double].
				461	/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	462	#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	463
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	464	/// Rounds down the values stored in a 256-bit vector of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	465	/// The source values are rounded down to integer values and returned as
				466	/// 64-bit double-precision floating-point values.
				467	///
				468	/// \headerfile <x86intrin.h>
				469	///
				470	/// \code
				471	/// __m256d _mm256_floor_pd(__m256d V);
				472	/// \endcode
				473	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	474	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	475	///
				476	/// \param V
				477	/// A 256-bit vector of [4 x double].
				478	/// \returns A 256-bit vector of [4 x double] containing the rounded down
				479	/// values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	480	#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	481
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	482	/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	483	/// source values are rounded up to integer values and returned as
				484	/// floating-point values.
				485	///
				486	/// \headerfile <x86intrin.h>
				487	///
				488	/// \code
				489	/// __m256 _mm256_ceil_ps(__m256 V);
				490	/// \endcode
				491	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	492	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	493	///
				494	/// \param V
				495	/// A 256-bit vector of [8 x float].
				496	/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	497	#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	498
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	499	/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	500	/// source values are rounded down to integer values and returned as
				501	/// floating-point values.
				502	///
				503	/// \headerfile <x86intrin.h>
				504	///
				505	/// \code
				506	/// __m256 _mm256_floor_ps(__m256 V);
				507	/// \endcode
				508	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	509	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	510	///
				511	/// \param V
				512	/// A 256-bit vector of [8 x float].
				513	/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	514	#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
				515
				516	/* Logical */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	517	/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	518	///
				519	/// \headerfile <x86intrin.h>
				520	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	521	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	522	///
				523	/// \param __a
				524	/// A 256-bit vector of [4 x double] containing one of the source operands.
				525	/// \param __b
				526	/// A 256-bit vector of [4 x double] containing one of the source operands.
				527	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				528	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	529	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	530	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	531	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	532	return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	533	}
				534
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	535	/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	536	///
				537	/// \headerfile <x86intrin.h>
				538	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	539	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	540	///
				541	/// \param __a
				542	/// A 256-bit vector of [8 x float] containing one of the source operands.
				543	/// \param __b
				544	/// A 256-bit vector of [8 x float] containing one of the source operands.
				545	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				546	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	547	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	548	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	549	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	550	return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	551	}
				552
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	553	/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	554	/// the one's complement of the values contained in the first source operand.
				555	///
				556	/// \headerfile <x86intrin.h>
				557	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	558	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	559	///
				560	/// \param __a
				561	/// A 256-bit vector of [4 x double] containing the left source operand. The
				562	/// one's complement of this value is used in the bitwise AND.
				563	/// \param __b
				564	/// A 256-bit vector of [4 x double] containing the right source operand.
				565	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				566	/// values of the second operand and the one's complement of the first
				567	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	568	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	569	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	570	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	571	return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	572	}
				573
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	574	/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	575	/// the one's complement of the values contained in the first source operand.
				576	///
				577	/// \headerfile <x86intrin.h>
				578	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	579	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	580	///
				581	/// \param __a
				582	/// A 256-bit vector of [8 x float] containing the left source operand. The
				583	/// one's complement of this value is used in the bitwise AND.
				584	/// \param __b
				585	/// A 256-bit vector of [8 x float] containing the right source operand.
				586	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				587	/// values of the second operand and the one's complement of the first
				588	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	589	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	590	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	591	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	592	return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	593	}
				594
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	595	/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	596	///
				597	/// \headerfile <x86intrin.h>
				598	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	599	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	600	///
				601	/// \param __a
				602	/// A 256-bit vector of [4 x double] containing one of the source operands.
				603	/// \param __b
				604	/// A 256-bit vector of [4 x double] containing one of the source operands.
				605	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				606	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	607	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	608	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	609	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	610	return (__m256d)((__v4du)__a | (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	611	}
				612
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	613	/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	614	///
				615	/// \headerfile <x86intrin.h>
				616	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	617	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	618	///
				619	/// \param __a
				620	/// A 256-bit vector of [8 x float] containing one of the source operands.
				621	/// \param __b
				622	/// A 256-bit vector of [8 x float] containing one of the source operands.
				623	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				624	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	625	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	626	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	627	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	628	return (__m256)((__v8su)__a | (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	629	}
				630
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	631	/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	632	///
				633	/// \headerfile <x86intrin.h>
				634	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	635	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	636	///
				637	/// \param __a
				638	/// A 256-bit vector of [4 x double] containing one of the source operands.
				639	/// \param __b
				640	/// A 256-bit vector of [4 x double] containing one of the source operands.
				641	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				642	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	643	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	644	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	645	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	646	return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	647	}
				648
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	649	/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	650	///
				651	/// \headerfile <x86intrin.h>
				652	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	653	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	654	///
				655	/// \param __a
				656	/// A 256-bit vector of [8 x float] containing one of the source operands.
				657	/// \param __b
				658	/// A 256-bit vector of [8 x float] containing one of the source operands.
				659	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				660	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	661	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	662	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	663	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	664	return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	665	}
				666
				667	/* Horizontal arithmetic */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	668	/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	669	/// 256-bit vectors of [4 x double].
				670	///
				671	/// \headerfile <x86intrin.h>
				672	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	673	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	674	///
				675	/// \param __a
				676	/// A 256-bit vector of [4 x double] containing one of the source operands.
				677	/// The horizontal sums of the values are returned in the even-indexed
				678	/// elements of a vector of [4 x double].
				679	/// \param __b
				680	/// A 256-bit vector of [4 x double] containing one of the source operands.
				681	/// The horizontal sums of the values are returned in the odd-indexed
				682	/// elements of a vector of [4 x double].
				683	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				684	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	685	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	686	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	687	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	688	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	689	}
				690
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	691	/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	692	/// 256-bit vectors of [8 x float].
				693	///
				694	/// \headerfile <x86intrin.h>
				695	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	696	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	697	///
				698	/// \param __a
				699	/// A 256-bit vector of [8 x float] containing one of the source operands.
				700	/// The horizontal sums of the values are returned in the elements with
				701	/// index 0, 1, 4, 5 of a vector of [8 x float].
				702	/// \param __b
				703	/// A 256-bit vector of [8 x float] containing one of the source operands.
				704	/// The horizontal sums of the values are returned in the elements with
				705	/// index 2, 3, 6, 7 of a vector of [8 x float].
				706	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				707	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	708	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	709	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	710	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	711	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	712	}
				713
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	714	/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	715	/// 256-bit vectors of [4 x double].
				716	///
				717	/// \headerfile <x86intrin.h>
				718	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	719	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	720	///
				721	/// \param __a
				722	/// A 256-bit vector of [4 x double] containing one of the source operands.
				723	/// The horizontal differences between the values are returned in the
				724	/// even-indexed elements of a vector of [4 x double].
				725	/// \param __b
				726	/// A 256-bit vector of [4 x double] containing one of the source operands.
				727	/// The horizontal differences between the values are returned in the
				728	/// odd-indexed elements of a vector of [4 x double].
				729	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				730	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	731	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	732	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	733	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	734	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	735	}
				736
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	737	/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	738	/// 256-bit vectors of [8 x float].
				739	///
				740	/// \headerfile <x86intrin.h>
				741	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	742	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	743	///
				744	/// \param __a
				745	/// A 256-bit vector of [8 x float] containing one of the source operands.
				746	/// The horizontal differences between the values are returned in the
				747	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				748	/// \param __b
				749	/// A 256-bit vector of [8 x float] containing one of the source operands.
				750	/// The horizontal differences between the values are returned in the
				751	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				752	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				753	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	754	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	755	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	756	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	757	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	758	}
				759
				760	/* Vector permutations */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	761	/// Copies the values in a 128-bit vector of [2 x double] as specified
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	762	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	763	///
				764	/// \headerfile <x86intrin.h>
				765	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	766	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	767	///
				768	/// \param __a
				769	/// A 128-bit vector of [2 x double].
				770	/// \param __c
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	771	/// A 128-bit integer vector operand specifying how the values are to be
				772	/// copied. \n
				773	/// Bit [1]: \n
				774	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				775	/// vector. \n
				776	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				777	/// returned vector. \n
				778	/// Bit [65]: \n
				779	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				780	/// returned vector. \n
				781	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				782	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	783	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	784	static __inline __m128d __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	785	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	786	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	787	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	788	}
				789
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	790	/// Copies the values in a 256-bit vector of [4 x double] as specified
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	791	/// by the 256-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	792	///
				793	/// \headerfile <x86intrin.h>
				794	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	795	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	796	///
				797	/// \param __a
				798	/// A 256-bit vector of [4 x double].
				799	/// \param __c
				800	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	801	/// copied. \n
				802	/// Bit [1]: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	803	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				804	/// vector. \n
				805	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				806	/// returned vector. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	807	/// Bit [65]: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	808	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				809	/// returned vector. \n
				810	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				811	/// returned vector. \n
				812	/// Bit [129]: \n
				813	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				814	/// returned vector. \n
				815	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				816	/// returned vector. \n
				817	/// Bit [193]: \n
				818	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				819	/// returned vector. \n
				820	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	821	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	822	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	823	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	824	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	825	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	826	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	827	}
				828
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	829	/// Copies the values stored in a 128-bit vector of [4 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	830	/// specified by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	831	/// \headerfile <x86intrin.h>
				832	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	833	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	834	///
				835	/// \param __a
				836	/// A 128-bit vector of [4 x float].
				837	/// \param __c
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	838	/// A 128-bit integer vector operand specifying how the values are to be
				839	/// copied. \n
				840	/// Bits [1:0]: \n
				841	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				842	/// returned vector. \n
				843	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				844	/// returned vector. \n
				845	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				846	/// returned vector. \n
				847	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				848	/// returned vector. \n
				849	/// Bits [33:32]: \n
				850	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				851	/// returned vector. \n
				852	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				853	/// returned vector. \n
				854	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				855	/// returned vector. \n
				856	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				857	/// returned vector. \n
				858	/// Bits [65:64]: \n
				859	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				860	/// returned vector. \n
				861	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				862	/// returned vector. \n
				863	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				864	/// returned vector. \n
				865	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				866	/// returned vector. \n
				867	/// Bits [97:96]: \n
				868	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				869	/// returned vector. \n
				870	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				871	/// returned vector. \n
				872	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				873	/// returned vector. \n
				874	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				875	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	876	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	877	static __inline __m128 __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	878	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	879	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	880	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	881	}
				882
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	883	/// Copies the values stored in a 256-bit vector of [8 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	884	/// specified by the 256-bit integer vector operand.
				885	///
				886	/// \headerfile <x86intrin.h>
				887	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	888	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	889	///
				890	/// \param __a
				891	/// A 256-bit vector of [8 x float].
				892	/// \param __c
				893	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	894	/// copied. \n
				895	/// Bits [1:0]: \n
				896	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				897	/// returned vector. \n
				898	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				899	/// returned vector. \n
				900	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				901	/// returned vector. \n
				902	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				903	/// returned vector. \n
				904	/// Bits [33:32]: \n
				905	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				906	/// returned vector. \n
				907	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				908	/// returned vector. \n
				909	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				910	/// returned vector. \n
				911	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				912	/// returned vector. \n
				913	/// Bits [65:64]: \n
				914	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				915	/// returned vector. \n
				916	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				917	/// returned vector. \n
				918	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				919	/// returned vector. \n
				920	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				921	/// returned vector. \n
				922	/// Bits [97:96]: \n
				923	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				924	/// returned vector. \n
				925	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				926	/// returned vector. \n
				927	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				928	/// returned vector. \n
				929	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				930	/// returned vector. \n
				931	/// Bits [129:128]: \n
				932	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				933	/// returned vector. \n
				934	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				935	/// returned vector. \n
				936	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				937	/// returned vector. \n
				938	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				939	/// returned vector. \n
				940	/// Bits [161:160]: \n
				941	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				942	/// returned vector. \n
				943	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				944	/// returned vector. \n
				945	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				946	/// returned vector. \n
				947	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				948	/// returned vector. \n
				949	/// Bits [193:192]: \n
				950	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				951	/// returned vector. \n
				952	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				953	/// returned vector. \n
				954	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				955	/// returned vector. \n
				956	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				957	/// returned vector. \n
				958	/// Bits [225:224]: \n
				959	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				960	/// returned vector. \n
				961	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				962	/// returned vector. \n
				963	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				964	/// returned vector. \n
				965	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				966	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	967	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	968	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	969	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	970	{ /* Forwards to the builtin: __a is passed as __v8sf, the selector __c as __v8si, and the result is cast back to __m256. */
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	971	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	972	}
				973
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	974	/// Copies the values in a 128-bit vector of [2 x double] as specified
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	975	/// by the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	976	///
				977	/// \headerfile <x86intrin.h>
				978	///
				979	/// \code
				980	/// __m128d _mm_permute_pd(__m128d A, const int C);
				981	/// \endcode
				982	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	983	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	984	///
				985	/// \param A
				986	/// A 128-bit vector of [2 x double].
				987	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	988	/// An immediate integer operand specifying how the values are to be
				989	/// copied. \n
				990	/// Bit [0]: \n
				991	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				992	/// vector. \n
				993	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				994	/// returned vector. \n
				995	/// Bit [1]: \n
				996	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				997	/// returned vector. \n
				998	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				999	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1000	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1001	#define _mm_permute_pd(A, C) \
Craig Topper	acf5601	2018-06-08 00:59:27 +0000	[diff] [blame]	1002	((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1003
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1004	/// Copies the values in a 256-bit vector of [4 x double] as specified by
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1005	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1006	///
				1007	/// \headerfile <x86intrin.h>
				1008	///
				1009	/// \code
				1010	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1011	/// \endcode
				1012	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1013	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1014	///
				1015	/// \param A
				1016	/// A 256-bit vector of [4 x double].
				1017	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1018	/// An immediate integer operand specifying how the values are to be
				1019	/// copied. \n
				1020	/// Bit [0]: \n
				1021	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				1022	/// vector. \n
				1023	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				1024	/// returned vector. \n
				1025	/// Bit [1]: \n
				1026	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				1027	/// returned vector. \n
				1028	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				1029	/// returned vector. \n
				1030	/// Bit [2]: \n
				1031	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				1032	/// returned vector. \n
				1033	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				1034	/// returned vector. \n
				1035	/// Bit [3]: \n
				1036	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				1037	/// returned vector. \n
				1038	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				1039	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1040	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1041	#define _mm256_permute_pd(A, C) \
Craig Topper	acf5601	2018-06-08 00:59:27 +0000	[diff] [blame]	1042	((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1043
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1044	/// Copies the values in a 128-bit vector of [4 x float] as specified by
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1045	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1046	///
				1047	/// \headerfile <x86intrin.h>
				1048	///
				1049	/// \code
				1050	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1051	/// \endcode
				1052	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1053	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1054	///
				1055	/// \param A
				1056	/// A 128-bit vector of [4 x float].
				1057	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1058	/// An immediate integer operand specifying how the values are to be
				1059	/// copied. \n
				1060	/// Bits [1:0]: \n
				1061	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1062	/// returned vector. \n
				1063	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1064	/// returned vector. \n
				1065	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1066	/// returned vector. \n
				1067	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1068	/// returned vector. \n
				1069	/// Bits [3:2]: \n
				1070	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1071	/// returned vector. \n
				1072	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1073	/// returned vector. \n
				1074	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1075	/// returned vector. \n
				1076	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1077	/// returned vector. \n
				1078	/// Bits [5:4]: \n
				1079	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1080	/// returned vector. \n
				1081	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1082	/// returned vector. \n
				1083	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1084	/// returned vector. \n
				1085	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1086	/// returned vector. \n
				1087	/// Bits [7:6]: \n
				1088	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1089	/// returned vector. \n
				1090	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1091	/// returned vector. \n
				1092	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1093	/// returned vector. \n
				1094	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1095	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1096	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1097	#define _mm_permute_ps(A, C) \
Craig Topper	acf5601	2018-06-08 00:59:27 +0000	[diff] [blame]	1098	((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1099
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1100	/// Copies the values in a 256-bit vector of [8 x float] as specified by
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1101	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1102	///
				1103	/// \headerfile <x86intrin.h>
				1104	///
				1105	/// \code
				1106	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1107	/// \endcode
				1108	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1109	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1110	///
				1111	/// \param A
				1112	/// A 256-bit vector of [8 x float].
				1113	/// \param C
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1114	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1115	/// copied. \n
				1116	/// Bits [1:0]: \n
				1117	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1118	/// returned vector. \n
				1119	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1120	/// returned vector. \n
				1121	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1122	/// returned vector. \n
				1123	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1124	/// returned vector. \n
				1125	/// Bits [3:2]: \n
				1126	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1127	/// returned vector. \n
				1128	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1129	/// returned vector. \n
				1130	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1131	/// returned vector. \n
				1132	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1133	/// returned vector. \n
				1134	/// Bits [5:4]: \n
				1135	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1136	/// returned vector. \n
				1137	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1138	/// returned vector. \n
				1139	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1140	/// returned vector. \n
				1141	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1142	/// returned vector. \n
				1143	/// Bits [7:6]: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1144	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1145	/// returned vector. \n
				1146	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1147	/// returned vector. \n
				1148	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1149	/// returned vector. \n
				1150	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1151	/// returned vector. \n
				1152	/// Bits [1:0] (reused for the upper 128 bits): \n
				1153	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				1154	/// returned vector. \n
				1155	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				1156	/// returned vector. \n
				1157	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				1158	/// returned vector. \n
				1159	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				1160	/// returned vector. \n
				1161	/// Bits [3:2] (reused for the upper 128 bits): \n
				1162	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				1163	/// returned vector. \n
				1164	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				1165	/// returned vector. \n
				1166	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				1167	/// returned vector. \n
				1168	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				1169	/// returned vector. \n
				1170	/// Bits [5:4] (reused for the upper 128 bits): \n
				1171	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				1172	/// returned vector. \n
				1173	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				1174	/// returned vector. \n
				1175	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				1176	/// returned vector. \n
				1177	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				1178	/// returned vector. \n
				1179	/// Bits [7:6] (reused for the upper 128 bits): \n
				1180	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				1181	/// returned vector. \n
				1182	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				1183	/// returned vector. \n
				1184	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				1185	/// returned vector. \n
				1186	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				1187	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1188	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1189	#define _mm256_permute_ps(A, C) \
Craig Topper	acf5601	2018-06-08 00:59:27 +0000	[diff] [blame]	1190	((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1191
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1192	/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1193	/// [4 x double], as specified by the immediate integer operand.
				1194	///
				1195	/// \headerfile <x86intrin.h>
				1196	///
				1197	/// \code
				1198	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1199	/// \endcode
				1200	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1201	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1202	///
				1203	/// \param V1
				1204	/// A 256-bit vector of [4 x double].
				1205	/// \param V2
				1206	/// A 256-bit vector of [4 x double].
				1207	/// \param M
				1208	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1209	/// permuted. \n
				1210	/// Bits [1:0]: \n
				1211	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1212	/// destination. \n
				1213	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1214	/// destination. \n
				1215	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1216	/// destination. \n
				1217	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1218	/// destination. \n
				1219	/// Bits [5:4]: \n
				1220	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1221	/// destination. \n
				1222	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1223	/// destination. \n
				1224	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1225	/// destination. \n
				1226	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
				1227	/// destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1228	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1229	#define _mm256_permute2f128_pd(V1, V2, M) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1230	((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
Craig Topper	9d3962f	2018-06-08 18:00:22 +0000	[diff] [blame]	1231	(__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1232
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1233	/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1234	/// [8 x float], as specified by the immediate integer operand.
				1235	///
				1236	/// \headerfile <x86intrin.h>
				1237	///
				1238	/// \code
				1239	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1240	/// \endcode
				1241	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1242	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1243	///
				1244	/// \param V1
				1245	/// A 256-bit vector of [8 x float].
				1246	/// \param V2
				1247	/// A 256-bit vector of [8 x float].
				1248	/// \param M
				1249	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1250	/// permuted. \n
				1251	/// Bits [1:0]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1252	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1253	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1254	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1255	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1256	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1257	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1258	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1259	/// destination. \n
				1260	/// Bits [5:4]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1261	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1262	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1263	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1264	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1265	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1266	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1267	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1268	/// destination.
				1269	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1270	#define _mm256_permute2f128_ps(V1, V2, M) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1271	((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
Craig Topper	9d3962f	2018-06-08 18:00:22 +0000	[diff] [blame]	1272	(__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1273
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1274	/// Permutes 128-bit data values stored in two 256-bit integer vectors,
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1275	/// as specified by the immediate integer operand.
				1276	///
				1277	/// \headerfile <x86intrin.h>
				1278	///
				1279	/// \code
				1280	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1281	/// \endcode
				1282	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1283	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1284	///
				1285	/// \param V1
				1286	/// A 256-bit integer vector.
				1287	/// \param V2
				1288	/// A 256-bit integer vector.
				1289	/// \param M
				1290	/// An immediate integer operand specifying how the values are to be copied. \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1291	/// Bits [1:0]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1292	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1293	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1294	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1295	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1296	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1297	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1298	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1299	/// destination. \n
				1300	/// Bits [5:4]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1301	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1302	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1303	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1304	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1305	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1306	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1307	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1308	/// destination.
				1309	/// \returns A 256-bit integer vector containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1310	#define _mm256_permute2f128_si256(V1, V2, M) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1311	((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
Craig Topper	9d3962f	2018-06-08 18:00:22 +0000	[diff] [blame]	1312	(__v8si)(__m256i)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1313
				1314	/* Vector Blend */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1315	/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1316	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1317	/// integer operand.
				1318	///
				1319	/// \headerfile <x86intrin.h>
				1320	///
				1321	/// \code
				1322	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1323	/// \endcode
				1324	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1325	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1326	///
				1327	/// \param V1
				1328	/// A 256-bit vector of [4 x double].
				1329	/// \param V2
				1330	/// A 256-bit vector of [4 x double].
				1331	/// \param M
				1332	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1333	/// values are to be copied. The position of the mask bit corresponds to the
				1334	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1335	/// element in operand \a V1 is copied to the same position in the
				1336	/// destination. When a mask bit is 1, the corresponding 64-bit element in
				1337	/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1338	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1339	#define _mm256_blend_pd(V1, V2, M) \
Craig Topper	7d17d72	2018-06-08 00:00:21 +0000	[diff] [blame]	1340	((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
				1341	(__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1342
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1343	/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1344	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1345	/// integer operand.
				1346	///
				1347	/// \headerfile <x86intrin.h>
				1348	///
				1349	/// \code
				1350	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1351	/// \endcode
				1352	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1353	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1354	///
				1355	/// \param V1
				1356	/// A 256-bit vector of [8 x float].
				1357	/// \param V2
				1358	/// A 256-bit vector of [8 x float].
				1359	/// \param M
				1360	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1361	/// values are to be copied. The position of the mask bit corresponds to the
				1362	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1363	/// element in operand \a V1 is copied to the same position in the
				1364	/// destination. When a mask bit is 1, the corresponding 32-bit element in
				1365	/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1366	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1367	#define _mm256_blend_ps(V1, V2, M) \
Craig Topper	7d17d72	2018-06-08 00:00:21 +0000	[diff] [blame]	1368	((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
				1369	(__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1370
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1371	/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1372	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1373	/// operand.
				1374	///
				1375	/// \headerfile <x86intrin.h>
				1376	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1377	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1378	///
				1379	/// \param __a
				1380	/// A 256-bit vector of [4 x double].
				1381	/// \param __b
				1382	/// A 256-bit vector of [4 x double].
				1383	/// \param __c
				1384	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1385	/// how the values are to be copied. The position of the mask bit corresponds
				1386	/// to the most significant bit of a copied value. When a mask bit is 0, the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1387	/// corresponding 64-bit element in operand \a __a is copied to the same
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1388	/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	1389	/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1390	/// destination.
				1391	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1392	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1393	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1394	{ /* Forwards to the builtin: all three operands are passed as __v4df and the result is cast back to __m256d. */
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1395	return (__m256d)__builtin_ia32_blendvpd256(
				1396	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1397	}
				1398
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1399	/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1400	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1401	/// operand.
				1402	///
				1403	/// \headerfile <x86intrin.h>
				1404	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1405	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1406	///
				1407	/// \param __a
				1408	/// A 256-bit vector of [8 x float].
				1409	/// \param __b
				1410	/// A 256-bit vector of [8 x float].
				1411	/// \param __c
				1412	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1413	/// and 31 specifying how the values are to be copied. The position of the
				1414	/// mask bit corresponds to the most significant bit of a copied value. When
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1415	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1416	/// copied to the same position in the destination. When a mask bit is 1, the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1417	/// corresponding 32-bit element in operand \a __b is copied to the same
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1418	/// position in the destination.
				1419	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1420	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1421	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1422	{
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1423	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1424	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1425	}
				1426
				1427	/* Vector Dot Product */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1428	/// Computes two dot products in parallel, using the lower and upper
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1429	/// halves of two [8 x float] vectors as input to the two computations, and
				1430	/// returning the two dot products in the lower and upper halves of the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1431	/// [8 x float] result.
				1432	///
				1433	/// The immediate integer operand controls which input elements will
				1434	/// contribute to the dot product, and where the final results are returned.
				1435	/// In general, for each dot product, the four corresponding elements of the
				1436	/// input vectors are multiplied; the first two and second two products are
				1437	/// summed, then the two sums are added to form the final result.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1438	///
				1439	/// \headerfile <x86intrin.h>
				1440	///
				1441	/// \code
				1442	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1443	/// \endcode
				1444	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1445	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1446	///
				1447	/// \param V1
				1448	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1449	/// \param V2
				1450	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1451	/// \param M
				1452	/// An immediate integer argument. Bits [7:4] determine which elements of
				1453	/// the input vectors are used, with bit [4] corresponding to the lowest
				1454	/// element and bit [7] corresponding to the highest element of each [4 x
				1455	/// float] subvector. If a bit is set, the corresponding elements from the
				1456	/// two input vectors are used as an input for dot product; otherwise that
				1457	/// input is treated as zero. Bits [3:0] determine which elements of the
				1458	/// result will receive a copy of the final dot product, with bit [0]
				1459	/// corresponding to the lowest element and bit [3] corresponding to the
				1460	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1461	/// product is returned in the corresponding element; otherwise that element
				1462	/// is set to zero. The bitmask is applied in the same way to each of the
				1463	/// two parallel dot product computations.
				1464	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1465	#define _mm256_dp_ps(V1, V2, M) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1466	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1467	(__v8sf)(__m256)(V2), (M))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1468
				1469	/* Vector shuffle */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1470	/// Selects 8 float values from the 256-bit operands of [8 x float], as
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1471	/// specified by the immediate value operand.
				1472	///
				1473	/// The four selected elements in each operand are copied to the destination
				1474	/// according to the bits specified in the immediate operand. The selected
				1475	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1476	/// bits [191:128] of the destination, and the selected elements from the
				1477	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
				1478	/// the destination. For example, if bits [7:0] of the immediate operand
				1479	/// contain a value of 0xFF, the 256-bit destination vector would contain the
				1480	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1481	///
				1482	/// \headerfile <x86intrin.h>
				1483	///
				1484	/// \code
				1485	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1486	/// \endcode
				1487	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1488	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1489	///
				1490	/// \param a
				1491	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1492	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1493	/// according to the bits specified in the immediate operand.
				1494	/// \param b
				1495	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1496	/// operand are copied to bits [127:64] and bits [255:192] in the
				1497	/// destination, according to the bits specified in the immediate operand.
				1498	/// \param mask
				1499	/// An immediate value containing an 8-bit value specifying which elements to
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1500	/// copy from \a a and \a b: \n
				1501	/// Bits [3:0] specify the values copied from operand \a a. \n
				1502	/// Bits [7:4] specify the values copied from operand \a b. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1503	/// The destinations within the 256-bit destination are assigned values as
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1504	/// follows, according to the bit value assignments described below: \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1505	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1506	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1507	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1508	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1509	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1510	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1511	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1512	/// the destination. \n
				1513	/// Bit value assignments: \n
				1514	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
				1515	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
				1516	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1517	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1518	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1519	#define _mm256_shuffle_ps(a, b, mask) \
Craig Topper	422a1bb	2018-06-08 07:18:33 +0000	[diff] [blame]	1520	(__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
				1521	(__v8sf)(__m256)(b), (int)(mask))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1522
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1523	/// Selects four double-precision values from the 256-bit operands of
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1524	/// [4 x double], as specified by the immediate value operand.
				1525	///
				1526	/// The selected elements from the first 256-bit operand are copied to bits
				1527	/// [63:0] and bits [191:128] in the destination, and the selected elements
				1528	/// from the second 256-bit operand are copied to bits [127:64] and bits
				1529	/// [255:192] in the destination. For example, if bits [3:0] of the immediate
				1530	/// operand contain a value of 0xF, the 256-bit destination vector would
				1531	/// contain the following values: b[3], a[3], b[1], a[1].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1532	///
				1533	/// \headerfile <x86intrin.h>
				1534	///
				1535	/// \code
				1536	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1537	/// \endcode
				1538	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1539	/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1540	///
				1541	/// \param a
				1542	/// A 256-bit vector of [4 x double].
				1543	/// \param b
				1544	/// A 256-bit vector of [4 x double].
				1545	/// \param mask
				1546	/// An immediate value whose bits [3:0] specify which elements to
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1547	/// copy from \a a and \a b: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1548	/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1549	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1550	/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1551	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1552	/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1553	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1554	/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1555	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1556	/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1557	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1558	/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1559	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1560	/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1561	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1562	/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1563	/// destination.
				1564	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1565	#define _mm256_shuffle_pd(a, b, mask) \
Craig Topper	422a1bb	2018-06-08 07:18:33 +0000	[diff] [blame]	1566	(__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
				1567	(__v4df)(__m256d)(b), (int)(mask))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1568
				1569	/* Compare */
				1570	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1571	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1572	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1573	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1574	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1575	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1576	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1577	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1578	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1579	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1580	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1581	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1582	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1583	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1584	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1585	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1586	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1587	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1588	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1589	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1590	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1591	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1592	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1593	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1594	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1595	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1596	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1597	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1598	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1599	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1600	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1601	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1602
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1603	/// Compares each of the corresponding double-precision values of two
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1604	/// 128-bit vectors of [2 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1605	/// immediate integer operand.
				1606	///
				1607	/// Returns a [2 x double] vector consisting of two doubles corresponding to
				1608	/// the two comparison results: zero if the comparison is false, and all 1's
				1609	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1610	///
				1611	/// \headerfile <x86intrin.h>
				1612	///
				1613	/// \code
				1614	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1615	/// \endcode
				1616	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1617	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1618	///
				1619	/// \param a
				1620	/// A 128-bit vector of [2 x double].
				1621	/// \param b
				1622	/// A 128-bit vector of [2 x double].
				1623	/// \param c
				1624	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1625	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1626	/// 0x00: Equal (ordered, non-signaling) \n
				1627	/// 0x01: Less-than (ordered, signaling) \n
				1628	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1629	/// 0x03: Unordered (non-signaling) \n
				1630	/// 0x04: Not-equal (unordered, non-signaling) \n
				1631	/// 0x05: Not-less-than (unordered, signaling) \n
				1632	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1633	/// 0x07: Ordered (non-signaling) \n
				1634	/// 0x08: Equal (unordered, non-signaling) \n
				1635	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1636	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1637	/// 0x0B: False (ordered, non-signaling) \n
				1638	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1639	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1640	/// 0x0E: Greater-than (ordered, signaling) \n
				1641	/// 0x0F: True (unordered, non-signaling) \n
				1642	/// 0x10: Equal (ordered, signaling) \n
				1643	/// 0x11: Less-than (ordered, non-signaling) \n
				1644	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1645	/// 0x13: Unordered (signaling) \n
				1646	/// 0x14: Not-equal (unordered, signaling) \n
				1647	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1648	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1649	/// 0x17: Ordered (signaling) \n
				1650	/// 0x18: Equal (unordered, signaling) \n
				1651	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1652	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1653	/// 0x1B: False (ordered, signaling) \n
				1654	/// 0x1C: Not-equal (ordered, signaling) \n
				1655	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1656	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1657	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1658	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1659	#define _mm_cmp_pd(a, b, c) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1660	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1661	(__v2df)(__m128d)(b), (c))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1662
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1663	/// Compares each of the corresponding values of two 128-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1664	/// [4 x float], using the operation specified by the immediate integer
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1665	/// operand.
				1666	///
				1667	/// Returns a [4 x float] vector consisting of four floats corresponding to
				1668	/// the four comparison results: zero if the comparison is false, and all 1's
				1669	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1670	///
				1671	/// \headerfile <x86intrin.h>
				1672	///
				1673	/// \code
				1674	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1675	/// \endcode
				1676	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1677	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1678	///
				1679	/// \param a
				1680	/// A 128-bit vector of [4 x float].
				1681	/// \param b
				1682	/// A 128-bit vector of [4 x float].
				1683	/// \param c
				1684	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1685	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1686	/// 0x00: Equal (ordered, non-signaling) \n
				1687	/// 0x01: Less-than (ordered, signaling) \n
				1688	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1689	/// 0x03: Unordered (non-signaling) \n
				1690	/// 0x04: Not-equal (unordered, non-signaling) \n
				1691	/// 0x05: Not-less-than (unordered, signaling) \n
				1692	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1693	/// 0x07: Ordered (non-signaling) \n
				1694	/// 0x08: Equal (unordered, non-signaling) \n
				1695	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1696	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1697	/// 0x0B: False (ordered, non-signaling) \n
				1698	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1699	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1700	/// 0x0E: Greater-than (ordered, signaling) \n
				1701	/// 0x0F: True (unordered, non-signaling) \n
				1702	/// 0x10: Equal (ordered, signaling) \n
				1703	/// 0x11: Less-than (ordered, non-signaling) \n
				1704	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1705	/// 0x13: Unordered (signaling) \n
				1706	/// 0x14: Not-equal (unordered, signaling) \n
				1707	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1708	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1709	/// 0x17: Ordered (signaling) \n
				1710	/// 0x18: Equal (unordered, signaling) \n
				1711	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1712	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1713	/// 0x1B: False (ordered, signaling) \n
				1714	/// 0x1C: Not-equal (ordered, signaling) \n
				1715	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1716	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1717	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1718	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1719	#define _mm_cmp_ps(a, b, c) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1720	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1721	(__v4sf)(__m128)(b), (c))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1722
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1723	/// Compares each of the corresponding double-precision values of two
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1724	/// 256-bit vectors of [4 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1725	/// immediate integer operand.
				1726	///
				1727	/// Returns a [4 x double] vector consisting of four doubles corresponding to
				1728	/// the four comparison results: zero if the comparison is false, and all 1's
				1729	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1730	///
				1731	/// \headerfile <x86intrin.h>
				1732	///
				1733	/// \code
				1734	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1735	/// \endcode
				1736	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1737	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1738	///
				1739	/// \param a
				1740	/// A 256-bit vector of [4 x double].
				1741	/// \param b
				1742	/// A 256-bit vector of [4 x double].
				1743	/// \param c
				1744	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1745	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1746	/// 0x00: Equal (ordered, non-signaling) \n
				1747	/// 0x01: Less-than (ordered, signaling) \n
				1748	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1749	/// 0x03: Unordered (non-signaling) \n
				1750	/// 0x04: Not-equal (unordered, non-signaling) \n
				1751	/// 0x05: Not-less-than (unordered, signaling) \n
				1752	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1753	/// 0x07: Ordered (non-signaling) \n
				1754	/// 0x08: Equal (unordered, non-signaling) \n
				1755	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1756	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1757	/// 0x0B: False (ordered, non-signaling) \n
				1758	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1759	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1760	/// 0x0E: Greater-than (ordered, signaling) \n
				1761	/// 0x0F: True (unordered, non-signaling) \n
				1762	/// 0x10: Equal (ordered, signaling) \n
				1763	/// 0x11: Less-than (ordered, non-signaling) \n
				1764	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1765	/// 0x13: Unordered (signaling) \n
				1766	/// 0x14: Not-equal (unordered, signaling) \n
				1767	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1768	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1769	/// 0x17: Ordered (signaling) \n
				1770	/// 0x18: Equal (unordered, signaling) \n
				1771	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1772	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1773	/// 0x1B: False (ordered, signaling) \n
				1774	/// 0x1C: Not-equal (ordered, signaling) \n
				1775	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1776	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1777	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1778	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1779	#define _mm256_cmp_pd(a, b, c) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1780	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1781	(__v4df)(__m256d)(b), (c))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1782
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1783	/// Compares each of the corresponding values of two 256-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1784	/// [8 x float], using the operation specified by the immediate integer
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1785	/// operand.
				1786	///
				1787	/// Returns an [8 x float] vector consisting of eight floats corresponding to
				1788	/// the eight comparison results: zero if the comparison is false, and all
				1789	/// 1's if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1790	///
				1791	/// \headerfile <x86intrin.h>
				1792	///
				1793	/// \code
				1794	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1795	/// \endcode
				1796	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1797	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1798	///
				1799	/// \param a
				1800	/// A 256-bit vector of [8 x float].
				1801	/// \param b
				1802	/// A 256-bit vector of [8 x float].
				1803	/// \param c
				1804	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1805	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1806	/// 0x00: Equal (ordered, non-signaling) \n
				1807	/// 0x01: Less-than (ordered, signaling) \n
				1808	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1809	/// 0x03: Unordered (non-signaling) \n
				1810	/// 0x04: Not-equal (unordered, non-signaling) \n
				1811	/// 0x05: Not-less-than (unordered, signaling) \n
				1812	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1813	/// 0x07: Ordered (non-signaling) \n
				1814	/// 0x08: Equal (unordered, non-signaling) \n
				1815	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1816	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1817	/// 0x0B: False (ordered, non-signaling) \n
				1818	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1819	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1820	/// 0x0E: Greater-than (ordered, signaling) \n
				1821	/// 0x0F: True (unordered, non-signaling) \n
				1822	/// 0x10: Equal (ordered, signaling) \n
				1823	/// 0x11: Less-than (ordered, non-signaling) \n
				1824	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1825	/// 0x13: Unordered (signaling) \n
				1826	/// 0x14: Not-equal (unordered, signaling) \n
				1827	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1828	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1829	/// 0x17: Ordered (signaling) \n
				1830	/// 0x18: Equal (unordered, signaling) \n
				1831	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1832	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1833	/// 0x1B: False (ordered, signaling) \n
				1834	/// 0x1C: Not-equal (ordered, signaling) \n
				1835	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1836	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1837	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1838	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1839	#define _mm256_cmp_ps(a, b, c) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1840	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1841	(__v8sf)(__m256)(b), (c))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1842
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1843	/// Compares each of the corresponding scalar double-precision values of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1844	/// two 128-bit vectors of [2 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1845	/// immediate integer operand.
				1846	///
				1847	/// If the result is true, all 64 bits of the destination vector are set;
				1848	/// otherwise they are cleared.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1849	///
				1850	/// \headerfile <x86intrin.h>
				1851	///
				1852	/// \code
				1853	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1854	/// \endcode
				1855	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1856	/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1857	///
				1858	/// \param a
				1859	/// A 128-bit vector of [2 x double].
				1860	/// \param b
				1861	/// A 128-bit vector of [2 x double].
				1862	/// \param c
				1863	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1864	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1865	/// 0x00: Equal (ordered, non-signaling) \n
				1866	/// 0x01: Less-than (ordered, signaling) \n
				1867	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1868	/// 0x03: Unordered (non-signaling) \n
				1869	/// 0x04: Not-equal (unordered, non-signaling) \n
				1870	/// 0x05: Not-less-than (unordered, signaling) \n
				1871	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1872	/// 0x07: Ordered (non-signaling) \n
				1873	/// 0x08: Equal (unordered, non-signaling) \n
				1874	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1875	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1876	/// 0x0B: False (ordered, non-signaling) \n
				1877	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1878	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1879	/// 0x0E: Greater-than (ordered, signaling) \n
				1880	/// 0x0F: True (unordered, non-signaling) \n
				1881	/// 0x10: Equal (ordered, signaling) \n
				1882	/// 0x11: Less-than (ordered, non-signaling) \n
				1883	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1884	/// 0x13: Unordered (signaling) \n
				1885	/// 0x14: Not-equal (unordered, signaling) \n
				1886	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1887	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1888	/// 0x17: Ordered (signaling) \n
				1889	/// 0x18: Equal (unordered, signaling) \n
				1890	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1891	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1892	/// 0x1B: False (ordered, signaling) \n
				1893	/// 0x1C: Not-equal (ordered, signaling) \n
				1894	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1895	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1896	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1897	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1898	#define _mm_cmp_sd(a, b, c) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1899	((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1900	(__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1901
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1902	/// Compares each of the corresponding scalar values of two 128-bit
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1903	/// vectors of [4 x float], using the operation specified by the immediate
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1904	/// integer operand.
				1905	///
				1906	/// If the result is true, all 32 bits of the destination vector are set;
				1907	/// otherwise they are cleared.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1908	///
				1909	/// \headerfile <x86intrin.h>
				1910	///
				1911	/// \code
				1912	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1913	/// \endcode
				1914	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1915	/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1916	///
				1917	/// \param a
				1918	/// A 128-bit vector of [4 x float].
				1919	/// \param b
				1920	/// A 128-bit vector of [4 x float].
				1921	/// \param c
				1922	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1923	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1924	/// 0x00: Equal (ordered, non-signaling) \n
				1925	/// 0x01: Less-than (ordered, signaling) \n
				1926	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1927	/// 0x03: Unordered (non-signaling) \n
				1928	/// 0x04: Not-equal (unordered, non-signaling) \n
				1929	/// 0x05: Not-less-than (unordered, signaling) \n
				1930	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1931	/// 0x07: Ordered (non-signaling) \n
				1932	/// 0x08: Equal (unordered, non-signaling) \n
				1933	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1934	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1935	/// 0x0B: False (ordered, non-signaling) \n
				1936	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1937	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1938	/// 0x0E: Greater-than (ordered, signaling) \n
				1939	/// 0x0F: True (unordered, non-signaling) \n
				1940	/// 0x10: Equal (ordered, signaling) \n
				1941	/// 0x11: Less-than (ordered, non-signaling) \n
				1942	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1943	/// 0x13: Unordered (signaling) \n
				1944	/// 0x14: Not-equal (unordered, signaling) \n
				1945	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1946	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1947	/// 0x17: Ordered (signaling) \n
				1948	/// 0x18: Equal (unordered, signaling) \n
				1949	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1950	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1951	/// 0x1B: False (ordered, signaling) \n
				1952	/// 0x1C: Not-equal (ordered, signaling) \n
				1953	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1954	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1955	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1956	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1957	#define _mm_cmp_ss(a, b, c) \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1958	((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1959	(__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1960
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1961	/// Takes a [8 x i32] vector and returns the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1962	/// indexed by the immediate constant operand.
				1963	///
				1964	/// \headerfile <x86intrin.h>
				1965	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1966	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				1967	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1968	///
				1969	/// \param X
				1970	/// A 256-bit vector of [8 x i32].
				1971	/// \param N
				1972	/// An immediate integer operand with bits [2:0] determining which vector
				1973	/// element is extracted and returned.
				1974	/// \returns A 32-bit integer containing the extracted 32 bits of extended
				1975	/// packed data.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	1976	#define _mm256_extract_epi32(X, N) \
				1977	((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1978
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1979	/// Takes a [16 x i16] vector and returns the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1980	/// indexed by the immediate constant operand.
				1981	///
				1982	/// \headerfile <x86intrin.h>
				1983	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1984	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				1985	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1986	///
				1987	/// \param X
				1988	/// A 256-bit integer vector of [16 x i16].
				1989	/// \param N
				1990	/// An immediate integer operand with bits [3:0] determining which vector
				1991	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1992	/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1993	/// packed data.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	1994	#define _mm256_extract_epi16(X, N) \
				1995	((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
				1996	(int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1997
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1998	/// Takes a [32 x i8] vector and returns the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1999	/// indexed by the immediate constant operand.
				2000	///
				2001	/// \headerfile <x86intrin.h>
				2002	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2003	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2004	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2005	///
				2006	/// \param X
				2007	/// A 256-bit integer vector of [32 x i8].
				2008	/// \param N
				2009	/// An immediate integer operand with bits [4:0] determining which vector
				2010	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	2011	/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
				2012	/// packed data.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2013	#define _mm256_extract_epi8(X, N) \
				2014	((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
				2015	(int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2016
				2017	#ifdef __x86_64__
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2018	/// Takes a [4 x i64] vector and returns the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2019	/// indexed by the immediate constant operand.
				2020	///
				2021	/// \headerfile <x86intrin.h>
				2022	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2023	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2024	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2025	///
				2026	/// \param X
				2027	/// A 256-bit integer vector of [4 x i64].
				2028	/// \param N
				2029	/// An immediate integer operand with bits [1:0] determining which vector
				2030	/// element is extracted and returned.
				2031	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				2032	/// packed data.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2033	#define _mm256_extract_epi64(X, N) \
				2034	((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2035	#endif
				2036
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2037	/// Takes a [8 x i32] vector and replaces the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2038	/// indexed by the immediate constant operand with a new value. Returns the
				2039	/// modified vector.
				2040	///
				2041	/// \headerfile <x86intrin.h>
				2042	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2043	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2044	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2045	///
				2046	/// \param X
				2047	/// A vector of [8 x i32] to be used by the insert operation.
				2048	/// \param I
				2049	/// An integer value. The replacement value for the insert operation.
				2050	/// \param N
				2051	/// An immediate integer specifying the index of the vector element to be
				2052	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2053	/// \returns A copy of vector \a X, after replacing its element indexed by
				2054	/// \a N with \a I.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2055	#define _mm256_insert_epi32(X, I, N) \
				2056	((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
				2057	(int)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2058
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2059
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2060	/// Takes a [16 x i16] vector and replaces the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2061	/// indexed by the immediate constant operand with a new value. Returns the
				2062	/// modified vector.
				2063	///
				2064	/// \headerfile <x86intrin.h>
				2065	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2066	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2067	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2068	///
				2069	/// \param X
				2070	/// A vector of [16 x i16] to be used by the insert operation.
				2071	/// \param I
				2072	/// An i16 integer value. The replacement value for the insert operation.
				2073	/// \param N
				2074	/// An immediate integer specifying the index of the vector element to be
				2075	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2076	/// \returns A copy of vector \a X, after replacing its element indexed by
				2077	/// \a N with \a I.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2078	#define _mm256_insert_epi16(X, I, N) \
				2079	((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
				2080	(int)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2081
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2082	/// Takes a [32 x i8] vector and replaces the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2083	/// indexed by the immediate constant operand with a new value. Returns the
				2084	/// modified vector.
				2085	///
				2086	/// \headerfile <x86intrin.h>
				2087	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2088	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2089	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2090	///
				2091	/// \param X
				2092	/// A vector of [32 x i8] to be used by the insert operation.
				2093	/// \param I
				2094	/// An i8 integer value. The replacement value for the insert operation.
				2095	/// \param N
				2096	/// An immediate integer specifying the index of the vector element to be
				2097	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2098	/// \returns A copy of vector \a X, after replacing its element indexed by
				2099	/// \a N with \a I.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2100	#define _mm256_insert_epi8(X, I, N) \
				2101	((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
				2102	(int)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2103
				2104	#ifdef __x86_64__
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2105	/// Takes a [4 x i64] vector and replaces the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2106	/// indexed by the immediate constant operand with a new value. Returns the
				2107	/// modified vector.
				2108	///
				2109	/// \headerfile <x86intrin.h>
				2110	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2111	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2112	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2113	///
				2114	/// \param X
				2115	/// A vector of [4 x i64] to be used by the insert operation.
				2116	/// \param I
				2117	/// A 64-bit integer value. The replacement value for the insert operation.
				2118	/// \param N
				2119	/// An immediate integer specifying the index of the vector element to be
				2120	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2121	/// \returns A copy of vector \a X, after replacing its element indexed by
				2122	/// \a N with \a I.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2123	#define _mm256_insert_epi64(X, I, N) \
				2124	((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
				2125	(long long)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2126	#endif
				2127
				2128	/* Conversion */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2129	/// Converts a 128-bit vector of [4 x i32] into a 256-bit vector of [4 x double].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2130	///
				2131	/// \headerfile <x86intrin.h>
				2132	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2133	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2134	///
				2135	/// \param __a
				2136	/// A 128-bit integer vector of [4 x i32].
				2137	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2138	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2139	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2140	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2141	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2142	}
				2143
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2144	/// Converts a vector of [8 x i32] into a vector of [8 x float].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2145	///
				2146	/// \headerfile <x86intrin.h>
				2147	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2148	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2149	///
				2150	/// \param __a
				2151	/// A 256-bit integer vector of [8 x i32].
				2152	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2153	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2154	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2155	{
Craig Topper	842171d	2018-05-21 20:19:17 +0000	[diff] [blame]	2156	return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2157	}
				2158
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2159	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2160	/// [4 x float].
				2161	///
				2162	/// \headerfile <x86intrin.h>
				2163	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2164	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2165	///
				2166	/// \param __a
				2167	/// A 256-bit vector of [4 x double].
				2168	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2169	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2170	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2171	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2172	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); /* four doubles in, four floats out */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2173	}
				2174
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2175	/// Converts a vector of [8 x float] into a vector of [8 x i32].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2176	///
				2177	/// \headerfile <x86intrin.h>
				2178	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2179	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2180	///
				2181	/// \param __a
				2182	/// A 256-bit vector of [8 x float].
				2183	/// \returns A 256-bit integer vector of [8 x i32] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2184	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2185	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2186	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2187	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2188	}
				2189
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2190	/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2191	/// [4 x double].
				2192	///
				2193	/// \headerfile <x86intrin.h>
				2194	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2195	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2196	///
				2197	/// \param __a
				2198	/// A 128-bit vector of [4 x float].
				2199	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2200	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2201	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2202	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2203	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2204	}
				2205
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2206	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2207	/// [4 x i32], truncating the result by rounding towards zero when it is
				2208	/// inexact.
				2209	///
				2210	/// \headerfile <x86intrin.h>
				2211	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2212	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2213	///
				2214	/// \param __a
				2215	/// A 256-bit vector of [4 x double].
				2216	/// \returns A 128-bit integer vector of [4 x i32] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2217	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2218	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2219	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2220	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2221	}
				2222
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2223	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2224	/// [4 x i32]. When a conversion is inexact, the value returned is rounded
				2225	/// according to the rounding control bits in the MXCSR register.
				2226	///
				2227	/// \headerfile <x86intrin.h>
				2228	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2229	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2230	///
				2231	/// \param __a
				2232	/// A 256-bit vector of [4 x double].
				2233	/// \returns A 128-bit integer vector of [4 x i32] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2234	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2235	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2236	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2237	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2238	}
				2239
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2240	/// Converts a vector of [8 x float] into a vector of [8 x i32],
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2241	/// truncating the result by rounding towards zero when it is inexact.
				2242	///
				2243	/// \headerfile <x86intrin.h>
				2244	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2245	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2246	///
				2247	/// \param __a
				2248	/// A 256-bit vector of [8 x float].
				2249	/// \returns A 256-bit integer vector of [8 x i32] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2250	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2251	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2252	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2253	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2254	}
				2255
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2256	/// Returns the first element of the input vector of [4 x double].
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2257	///
				2258	/// \headerfile <x86intrin.h>
				2259	///
				2260	/// This intrinsic is a utility function and does not correspond to a specific
				2261	/// instruction.
				2262	///
				2263	/// \param __a
				2264	/// A 256-bit vector of [4 x double].
				2265	/// \returns A 64-bit double containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2266	static __inline double __DEFAULT_FN_ATTRS
				2267	_mm256_cvtsd_f64(__m256d __a)
				2268	{
				2269	return __a[0];
				2270	}
				2271
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2272	/// Returns the first element of the input vector of [8 x i32].
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2273	///
				2274	/// \headerfile <x86intrin.h>
				2275	///
				2276	/// This intrinsic is a utility function and does not correspond to a specific
				2277	/// instruction.
				2278	///
				2279	/// \param __a
				2280	/// A 256-bit vector of [8 x i32].
				2281	/// \returns A 32-bit integer containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2282	static __inline int __DEFAULT_FN_ATTRS
				2283	_mm256_cvtsi256_si32(__m256i __a)
				2284	{
				2285	__v8si __b = (__v8si)__a; /* view the generic integer vector as eight 32-bit lanes */
				2286	return __b[0];
				2287	}
				2288
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2289	/// Returns the first element of the input vector of [8 x float].
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2290	///
				2291	/// \headerfile <x86intrin.h>
				2292	///
				2293	/// This intrinsic is a utility function and does not correspond to a specific
				2294	/// instruction.
				2295	///
				2296	/// \param __a
				2297	/// A 256-bit vector of [8 x float].
				2298	/// \returns A 32-bit float containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2299	static __inline float __DEFAULT_FN_ATTRS
				2300	_mm256_cvtss_f32(__m256 __a)
				2301	{
				2302	return __a[0];
				2303	}
				2304
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2305	/* Vector replicate */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2306	/// Moves and duplicates odd-indexed values from a 256-bit vector of
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	2307	/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2308	///
				2309	/// \headerfile <x86intrin.h>
				2310	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2311	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2312	///
				2313	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2314	/// A 256-bit vector of [8 x float]. \n
				2315	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
				2316	/// the return value. \n
				2317	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
				2318	/// the return value. \n
				2319	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
				2320	/// return value. \n
				2321	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
				2322	/// return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2323	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2324	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2325	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2326	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2327	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2328	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); /* result lanes 2k and 2k+1 both take source lane 2k+1 */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2329	}
				2330
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2331	/// Moves and duplicates even-indexed values from a 256-bit vector of
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	2332	/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2333	///
				2334	/// \headerfile <x86intrin.h>
				2335	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2336	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2337	///
				2338	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2339	/// A 256-bit vector of [8 x float]. \n
				2340	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
				2341	/// the return value. \n
				2342	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
				2343	/// the return value. \n
				2344	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
				2345	/// return value. \n
				2346	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
				2347	/// return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2348	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2349	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2350	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2351	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2352	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2353	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2354	}
				2355
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2356	/// Moves and duplicates double-precision floating point values from a
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2357	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
				2358	/// vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2359	///
				2360	/// \headerfile <x86intrin.h>
				2361	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2362	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2363	///
				2364	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2365	/// A 256-bit vector of [4 x double]. \n
				2366	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
				2367	/// return value. \n
				2368	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
				2369	/// the return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2370	/// \returns A 256-bit vector of [4 x double] containing the moved and
				2371	/// duplicated values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2372	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2373	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2374	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2375	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2376	}
				2377
				2378	/* Unpack and Interleave */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2379	/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2380	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2381	///
				2382	/// \headerfile <x86intrin.h>
				2383	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2384	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2385	///
				2386	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2387	/// A 256-bit floating-point vector of [4 x double]. \n
				2388	/// Bits [127:64] are written to bits [63:0] of the return value. \n
				2389	/// Bits [255:192] are written to bits [191:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2390	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2391	/// A 256-bit floating-point vector of [4 x double]. \n
				2392	/// Bits [127:64] are written to bits [127:64] of the return value. \n
				2393	/// Bits [255:192] are written to bits [255:192] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2394	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2395	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2396	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2397	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2398	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2399	}
				2400
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2401	/// Unpacks the even-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2402	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2403	///
				2404	/// \headerfile <x86intrin.h>
				2405	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2406	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2407	///
				2408	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2409	/// A 256-bit floating-point vector of [4 x double]. \n
				2410	/// Bits [63:0] are written to bits [63:0] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2411	/// Bits [191:128] are written to bits [191:128] of the return value.
				2412	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2413	/// A 256-bit floating-point vector of [4 x double]. \n
				2414	/// Bits [63:0] are written to bits [127:64] of the return value. \n
				2415	/// Bits [191:128] are written to bits [255:192] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2416	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2417	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2418	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2419	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2420	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2421	}
				2422
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2423	/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2424	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2425	/// vector of [8 x float].
				2426	///
				2427	/// \headerfile <x86intrin.h>
				2428	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2429	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2430	///
				2431	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2432	/// A 256-bit vector of [8 x float]. \n
				2433	/// Bits [95:64] are written to bits [31:0] of the return value. \n
				2434	/// Bits [127:96] are written to bits [95:64] of the return value. \n
				2435	/// Bits [223:192] are written to bits [159:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2436	/// Bits [255:224] are written to bits [223:192] of the return value.
				2437	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2438	/// A 256-bit vector of [8 x float]. \n
				2439	/// Bits [95:64] are written to bits [63:32] of the return value. \n
				2440	/// Bits [127:96] are written to bits [127:96] of the return value. \n
				2441	/// Bits [223:192] are written to bits [191:160] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2442	/// Bits [255:224] are written to bits [255:224] of the return value.
				2443	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2444	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2445	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2446	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2447	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2448	}
				2449
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2450	/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2451	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2452	/// vector of [8 x float].
				2453	///
				2454	/// \headerfile <x86intrin.h>
				2455	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2456	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2457	///
				2458	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2459	/// A 256-bit vector of [8 x float]. \n
				2460	/// Bits [31:0] are written to bits [31:0] of the return value. \n
				2461	/// Bits [63:32] are written to bits [95:64] of the return value. \n
				2462	/// Bits [159:128] are written to bits [159:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2463	/// Bits [191:160] are written to bits [223:192] of the return value.
				2464	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2465	/// A 256-bit vector of [8 x float]. \n
				2466	/// Bits [31:0] are written to bits [63:32] of the return value. \n
				2467	/// Bits [63:32] are written to bits [127:96] of the return value. \n
				2468	/// Bits [159:128] are written to bits [191:160] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2469	/// Bits [191:160] are written to bits [255:224] of the return value.
				2470	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2471	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2472	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2473	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2474	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2475	}
				2476
				2477	/* Bit Test */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2478	/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2479	/// element-by-element comparison of the double-precision element in the
				2480	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2481	/// vector.
				2482	///
				2483	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2484	/// If there is at least one pair of double-precision elements where the
				2485	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2486	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2487	/// If there is at least one pair of double-precision elements where the
				2488	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2489	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2490	/// This intrinsic returns the value of the ZF flag.
				2491	///
				2492	/// \headerfile <x86intrin.h>
				2493	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2494	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2495	///
				2496	/// \param __a
				2497	/// A 128-bit vector of [2 x double].
				2498	/// \param __b
				2499	/// A 128-bit vector of [2 x double].
				2500	/// \returns the ZF flag in the EFLAGS register.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2501	static __inline int __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2502	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2503	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2504	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2505	}
				2506
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2507	/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2508	/// element-by-element comparison of the double-precision element in the
				2509	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2510	/// vector.
				2511	///
				2512	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2513	/// If there is at least one pair of double-precision elements where the
				2514	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2515	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2516	/// If there is at least one pair of double-precision elements where the
				2517	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2518	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2519	/// This intrinsic returns the value of the CF flag.
				2520	///
				2521	/// \headerfile <x86intrin.h>
				2522	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2523	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2524	///
				2525	/// \param __a
				2526	/// A 128-bit vector of [2 x double].
				2527	/// \param __b
				2528	/// A 128-bit vector of [2 x double].
				2529	/// \returns the CF flag in the EFLAGS register.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2530	static __inline int __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2531	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2532	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2533	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2534	}
				2535
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2536	/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2537	/// element-by-element comparison of the double-precision element in the
				2538	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2539	/// vector.
				2540	///
				2541	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2542	/// If there is at least one pair of double-precision elements where the
				2543	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2544	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2545	/// If there is at least one pair of double-precision elements where the
				2546	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2547	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2548	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2549	/// otherwise it returns 0.
				2550	///
				2551	/// \headerfile <x86intrin.h>
				2552	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2553	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2554	///
				2555	/// \param __a
				2556	/// A 128-bit vector of [2 x double].
				2557	/// \param __b
				2558	/// A 128-bit vector of [2 x double].
				2559	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2560	static __inline int __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2561	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2562	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2563	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2564	}
				2565
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2566	/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2567	/// element-by-element comparison of the single-precision element in the
				2568	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2569	/// vector.
				2570	///
				2571	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2572	/// If there is at least one pair of single-precision elements where the
				2573	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2574	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2575	/// If there is at least one pair of single-precision elements where the
				2576	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2577	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2578	/// This intrinsic returns the value of the ZF flag.
				2579	///
				2580	/// \headerfile <x86intrin.h>
				2581	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2582	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2583	///
				2584	/// \param __a
				2585	/// A 128-bit vector of [4 x float].
				2586	/// \param __b
				2587	/// A 128-bit vector of [4 x float].
				2588	/// \returns the ZF flag.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2589	static __inline int __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2590	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2591	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2592	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2593	}
				2594
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2595	/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2596	/// element-by-element comparison of the single-precision element in the
				2597	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2598	/// vector.
				2599	///
				2600	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2601	/// If there is at least one pair of single-precision elements where the
				2602	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2603	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2604	/// If there is at least one pair of single-precision elements where the
				2605	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2606	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2607	/// This intrinsic returns the value of the CF flag.
				2608	///
				2609	/// \headerfile <x86intrin.h>
				2610	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2611	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2612	///
				2613	/// \param __a
				2614	/// A 128-bit vector of [4 x float].
				2615	/// \param __b
				2616	/// A 128-bit vector of [4 x float].
				2617	/// \returns the CF flag.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2618	static __inline int __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2619	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2620	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2621	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2622	}
				2623
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2624	/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2625	/// element-by-element comparison of the single-precision element in the
				2626	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2627	/// vector.
				2628	///
				2629	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2630	/// If there is at least one pair of single-precision elements where the
				2631	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2632	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2633	/// If there is at least one pair of single-precision elements where the
				2634	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2635	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2636	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2637	/// otherwise it returns 0.
				2638	///
				2639	/// \headerfile <x86intrin.h>
				2640	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2641	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2642	///
				2643	/// \param __a
				2644	/// A 128-bit vector of [4 x float].
				2645	/// \param __b
				2646	/// A 128-bit vector of [4 x float].
				2647	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2648	static __inline int __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2649	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2650	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2651	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2652	}
				2653
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2654	/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2655	/// element-by-element comparison of the double-precision elements in the
				2656	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2657	/// vector.
				2658	///
				2659	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2660	/// If there is at least one pair of double-precision elements where the
				2661	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2662	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2663	/// If there is at least one pair of double-precision elements where the
				2664	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2665	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2666	/// This intrinsic returns the value of the ZF flag.
				2667	///
				2668	/// \headerfile <x86intrin.h>
				2669	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2670	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2671	///
				2672	/// \param __a
				2673	/// A 256-bit vector of [4 x double].
				2674	/// \param __b
				2675	/// A 256-bit vector of [4 x double].
				2676	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2677	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2678	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2679	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2680	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2681	}
				2682
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2683	/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2684	/// element-by-element comparison of the double-precision elements in the
				2685	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2686	/// vector.
				2687	///
				2688	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2689	/// If there is at least one pair of double-precision elements where the
				2690	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2691	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2692	/// If there is at least one pair of double-precision elements where the
				2693	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2694	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2695	/// This intrinsic returns the value of the CF flag.
				2696	///
				2697	/// \headerfile <x86intrin.h>
				2698	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2699	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2700	///
				2701	/// \param __a
				2702	/// A 256-bit vector of [4 x double].
				2703	/// \param __b
				2704	/// A 256-bit vector of [4 x double].
				2705	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2706	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2707	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2708	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2709	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2710	}
				2711
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2712	/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2713	/// element-by-element comparison of the double-precision elements in the
				2714	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2715	/// vector.
				2716	///
				2717	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2718	/// If there is at least one pair of double-precision elements where the
				2719	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2720	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2721	/// If there is at least one pair of double-precision elements where the
				2722	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2723	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2724	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2725	/// otherwise it returns 0.
				2726	///
				2727	/// \headerfile <x86intrin.h>
				2728	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2729	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2730	///
				2731	/// \param __a
				2732	/// A 256-bit vector of [4 x double].
				2733	/// \param __b
				2734	/// A 256-bit vector of [4 x double].
				2735	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2736	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2737	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2738	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2739	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2740	}
				2741
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2742	/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2743	/// element-by-element comparison of the single-precision element in the
				2744	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2745	/// vector.
				2746	///
				2747	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2748	/// If there is at least one pair of single-precision elements where the
				2749	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2750	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2751	/// If there is at least one pair of single-precision elements where the
				2752	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2753	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2754	/// This intrinsic returns the value of the ZF flag.
				2755	///
				2756	/// \headerfile <x86intrin.h>
				2757	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2758	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2759	///
				2760	/// \param __a
				2761	/// A 256-bit vector of [8 x float].
				2762	/// \param __b
				2763	/// A 256-bit vector of [8 x float].
				2764	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2765	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2766	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2767	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2768	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2769	}
				2770
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2771	/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2772	/// element-by-element comparison of the single-precision element in the
				2773	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2774	/// vector.
				2775	///
				2776	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2777	/// If there is at least one pair of single-precision elements where the
				2778	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2779	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2780	/// If there is at least one pair of single-precision elements where the
				2781	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2782	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2783	/// This intrinsic returns the value of the CF flag.
				2784	///
				2785	/// \headerfile <x86intrin.h>
				2786	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2787	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2788	///
				2789	/// \param __a
				2790	/// A 256-bit vector of [8 x float].
				2791	/// \param __b
				2792	/// A 256-bit vector of [8 x float].
				2793	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2794	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2795	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2796	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2797	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2798	}
				2799
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2800	/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2801	/// element-by-element comparison of the single-precision elements in the
				2802	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2803	/// vector.
				2804	///
				2805	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2806	/// If there is at least one pair of single-precision elements where the
				2807	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2808	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2809	/// If there is at least one pair of single-precision elements where the
				2810	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2811	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2812	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2813	/// otherwise it returns 0.
				2814	///
				2815	/// \headerfile <x86intrin.h>
				2816	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2817	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2818	///
				2819	/// \param __a
				2820	/// A 256-bit vector of [8 x float].
				2821	/// \param __b
				2822	/// A 256-bit vector of [8 x float].
				2823	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2824	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2825	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2826	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2827	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2828	}
				2829
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2830	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2831	/// of the two source vectors.
				2832	///
				2833	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2834	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2835	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2836	/// If there is at least one pair of bits where the bit from the first source
				2837	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2838	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2839	/// This intrinsic returns the value of the ZF flag.
				2840	///
				2841	/// \headerfile <x86intrin.h>
				2842	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2843	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2844	///
				2845	/// \param __a
				2846	/// A 256-bit integer vector.
				2847	/// \param __b
				2848	/// A 256-bit integer vector.
				2849	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2850	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2851	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2852	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2853	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2854	}
				2855
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2856	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2857	/// of the two source vectors.
				2858	///
				2859	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2860	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2861	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2862	/// If there is at least one pair of bits where the bit from the first source
				2863	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2864	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2865	/// This intrinsic returns the value of the CF flag.
				2866	///
				2867	/// \headerfile <x86intrin.h>
				2868	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2869	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2870	///
				2871	/// \param __a
				2872	/// A 256-bit integer vector.
				2873	/// \param __b
				2874	/// A 256-bit integer vector.
				2875	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2876	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2877	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2878	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2879	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2880	}
				2881
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2882	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2883	/// of the two source vectors.
				2884	///
				2885	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2886	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2887	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2888	/// If there is at least one pair of bits where the bit from the first source
				2889	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2890	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2891	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2892	/// otherwise it returns 0.
				2893	///
				2894	/// \headerfile <x86intrin.h>
				2895	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2896	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2897	///
				2898	/// \param __a
				2899	/// A 256-bit integer vector.
				2900	/// \param __b
				2901	/// A 256-bit integer vector.
				2902	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2903	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2904	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2905	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2906	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2907	}
				2908
				2909	/* Vector extract sign mask */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2910	/// Extracts the sign bits of double-precision floating point elements
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2911	/// in a 256-bit vector of [4 x double] and writes them to the lower order
				2912	/// bits of the return value.
				2913	///
				2914	/// \headerfile <x86intrin.h>
				2915	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2916	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2917	///
				2918	/// \param __a
				2919	/// A 256-bit vector of [4 x double] containing the double-precision
				2920	/// floating point values with sign bits to be extracted.
				2921	/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2922	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2923	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2924	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2925	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2926	}
				2927
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2928	/// Extracts the sign bits of single-precision floating point elements
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2929	/// in a 256-bit vector of [8 x float] and writes them to the lower order
				2930	/// bits of the return value.
				2931	///
				2932	/// \headerfile <x86intrin.h>
				2933	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2934	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2935	///
				2936	/// \param __a
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	2937	/// A 256-bit vector of [8 x float] containing the single-precision floating
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2938	/// point values with sign bits to be extracted.
				2939	/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2940	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2941	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2942	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2943	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2944	}
				2945
/* Vector zero */
/// Clears the contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
				2957
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
				2968
				2969	/* Vector load with broadcast */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2970	/// Loads a scalar single-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2971	/// specified address pointed to by \a __a and broadcasts it to the elements
				2972	/// of a [4 x float] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2973	///
				2974	/// \headerfile <x86intrin.h>
				2975	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2976	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2977	///
				2978	/// \param __a
				2979	/// The single-precision floating point value to be broadcast.
				2980	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
				2981	/// equal to the broadcast value.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	2982	static __inline __m128 __DEFAULT_FN_ATTRS128
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2983	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2984	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2985	float __f = *__a;
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	2986	return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2987	}
				2988
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2989	/// Loads a scalar double-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2990	/// specified address pointed to by \a __a and broadcasts it to the elements
				2991	/// of a [4 x double] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2992	///
				2993	/// \headerfile <x86intrin.h>
				2994	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2995	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2996	///
				2997	/// \param __a
				2998	/// The double-precision floating point value to be broadcast.
				2999	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
				3000	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3001	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3002	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3003	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3004	double __d = *__a;
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3005	return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3006	}
				3007
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3008	/// Loads a scalar single-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3009	/// specified address pointed to by \a __a and broadcasts it to the elements
				3010	/// of a [8 x float] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3011	///
				3012	/// \headerfile <x86intrin.h>
				3013	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3014	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3015	///
				3016	/// \param __a
				3017	/// The single-precision floating point value to be broadcast.
				3018	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
				3019	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3020	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3021	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3022	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3023	float __f = *__a;
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3024	return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3025	}
				3026
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3027	/// Loads the data from a 128-bit vector of [2 x double] from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3028	/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3029	/// elements in a 256-bit vector of [4 x double].
				3030	///
				3031	/// \headerfile <x86intrin.h>
				3032	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3033	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3034	///
				3035	/// \param __a
				3036	/// The 128-bit vector of [2 x double] to be broadcast.
				3037	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
				3038	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3039	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3040	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3041	{
Craig Topper	6fb26f9	2018-06-03 19:42:59 +0000	[diff] [blame]	3042	__m128d __b = _mm_loadu_pd((const double *)__a);
				3043	return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
				3044	0, 1, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3045	}
				3046
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3047	/// Loads the data from a 128-bit vector of [4 x float] from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3048	/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3049	/// elements in a 256-bit vector of [8 x float].
				3050	///
				3051	/// \headerfile <x86intrin.h>
				3052	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3053	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3054	///
				3055	/// \param __a
				3056	/// The 128-bit vector of [4 x float] to be broadcast.
				3057	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
				3058	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3059	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3060	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3061	{
Craig Topper	6fb26f9	2018-06-03 19:42:59 +0000	[diff] [blame]	3062	__m128 __b = _mm_loadu_ps((const float *)__a);
				3063	return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
				3064	0, 1, 2, 3, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3065	}
				3066
				3067	/* SIMD load ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3068	/// Loads 4 double-precision floating point values from a 32-byte aligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3069	/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3070	///
				3071	/// \headerfile <x86intrin.h>
				3072	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3073	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3074	///
				3075	/// \param __p
				3076	/// A 32-byte aligned pointer to a memory location containing
				3077	/// double-precision floating point values.
				3078	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3079	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3080	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3081	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3082	return (__m256d )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3083	}
				3084
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3085	/// Loads 8 single-precision floating point values from a 32-byte aligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3086	/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3087	///
				3088	/// \headerfile <x86intrin.h>
				3089	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3090	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3091	///
				3092	/// \param __p
				3093	/// A 32-byte aligned pointer to a memory location containing float values.
				3094	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3095	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3096	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3097	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3098	return (__m256 )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3099	}
				3100
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3101	/// Loads 4 double-precision floating point values from an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3102	/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3103	///
				3104	/// \headerfile <x86intrin.h>
				3105	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3106	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3107	///
				3108	/// \param __p
				3109	/// A pointer to a memory location containing double-precision floating
				3110	/// point values.
				3111	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3112	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3113	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3114	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3115	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3116	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3117	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3118	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3119	}
				3120
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3121	/// Loads 8 single-precision floating point values from an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3122	/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3123	///
				3124	/// \headerfile <x86intrin.h>
				3125	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3126	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3127	///
				3128	/// \param __p
				3129	/// A pointer to a memory location containing single-precision floating
				3130	/// point values.
				3131	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3132	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3133	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3134	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3135	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3136	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3137	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3138	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3139	}
				3140
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3141	/// Loads 256 bits of integer data from a 32-byte aligned memory
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3142	/// location pointed to by \a __p into elements of a 256-bit integer vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3143	///
				3144	/// \headerfile <x86intrin.h>
				3145	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3146	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3147	///
				3148	/// \param __p
				3149	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
				3150	/// values.
				3151	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3152	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3153	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3154	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3155	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3156	}
				3157
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3158	/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3159	/// pointed to by \a __p into a 256-bit integer vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3160	///
				3161	/// \headerfile <x86intrin.h>
				3162	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3163	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3164	///
				3165	/// \param __p
				3166	/// A pointer to a 256-bit integer vector containing integer values.
				3167	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3168	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3169	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3170	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3171	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3172	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3173	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3174	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3175	}
				3176
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3177	/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3178	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
				3179	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3180	/// line boundary.
				3181	///
				3182	/// \headerfile <x86intrin.h>
				3183	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3184	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3185	///
				3186	/// \param __p
				3187	/// A pointer to a 256-bit integer vector containing integer values.
				3188	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3189	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3190	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3191	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3192	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3193	}
				3194
				3195	/* SIMD store ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3196	/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3197	/// of [4 x double] to a 32-byte aligned memory location pointed to by
				3198	/// \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3199	///
				3200	/// \headerfile <x86intrin.h>
				3201	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3202	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3203	///
				3204	/// \param __p
				3205	/// A 32-byte aligned pointer to a memory location that will receive the
				3206	/// double-precision floaing point values.
				3207	/// \param __a
				3208	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3209	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3210	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3211	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3212	(__m256d )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3213	}
				3214
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3215	/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3216	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3217	///
				3218	/// \headerfile <x86intrin.h>
				3219	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3220	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3221	///
				3222	/// \param __p
				3223	/// A 32-byte aligned pointer to a memory location that will receive the
				3224	/// float values.
				3225	/// \param __a
				3226	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3227	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3228	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3229	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3230	(__m256 )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3231	}
				3232
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3233	/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3234	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3235	///
				3236	/// \headerfile <x86intrin.h>
				3237	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3238	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3239	///
				3240	/// \param __p
				3241	/// A pointer to a memory location that will receive the double-precision
				3242	/// floating point values.
				3243	/// \param __a
				3244	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3245	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3246	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3247	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3248	struct __storeu_pd {
				3249	__m256d __v;
				3250	} __attribute__((__packed__, __may_alias__));
				3251	((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3252	}
				3253
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3254	/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3255	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3256	///
				3257	/// \headerfile <x86intrin.h>
				3258	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3259	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3260	///
				3261	/// \param __p
				3262	/// A pointer to a memory location that will receive the float values.
				3263	/// \param __a
				3264	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3265	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3266	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3267	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3268	struct __storeu_ps {
				3269	__m256 __v;
				3270	} __attribute__((__packed__, __may_alias__));
				3271	((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3272	}
				3273
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3274	/// Stores integer values from a 256-bit integer vector to a 32-byte
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3275	/// aligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3276	///
				3277	/// \headerfile <x86intrin.h>
				3278	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3279	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3280	///
				3281	/// \param __p
				3282	/// A 32-byte aligned pointer to a memory location that will receive the
				3283	/// integer values.
				3284	/// \param __a
				3285	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3286	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3287	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3288	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3289	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3290	}
				3291
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3292	/// Stores integer values from a 256-bit integer vector to an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3293	/// memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3294	///
				3295	/// \headerfile <x86intrin.h>
				3296	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3297	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3298	///
				3299	/// \param __p
				3300	/// A pointer to a memory location that will receive the integer values.
				3301	/// \param __a
				3302	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3303	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3304	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3305	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3306	struct __storeu_si256 {
				3307	__m256i __v;
				3308	} __attribute__((__packed__, __may_alias__));
				3309	((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3310	}
				3311
				3312	/* Conditional load ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3313	/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3314	/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3315	/// [2 x double], depending on the mask bits associated with each data
				3316	/// element.
				3317	///
				3318	/// \headerfile <x86intrin.h>
				3319	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3320	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3321	///
				3322	/// \param __p
				3323	/// A pointer to a memory location that contains the double-precision
				3324	/// floating point values.
				3325	/// \param __m
				3326	/// A 128-bit integer vector containing the mask. The most significant bit of
				3327	/// each data element represents the mask bits. If a mask bit is zero, the
				3328	/// corresponding value in the memory location is not loaded and the
				3329	/// corresponding field in the return value is set to zero.
				3330	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	3331	static __inline __m128d __DEFAULT_FN_ATTRS128
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3332	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3333	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3334	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3335	}
				3336
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3337	/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3338	/// memory location pointed to by \a __p into a 256-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3339	/// [4 x double], depending on the mask bits associated with each data
				3340	/// element.
				3341	///
				3342	/// \headerfile <x86intrin.h>
				3343	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3344	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3345	///
				3346	/// \param __p
				3347	/// A pointer to a memory location that contains the double-precision
				3348	/// floating point values.
				3349	/// \param __m
				3350	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3351	/// significant bit of each quadword element represents the mask bits. If a
				3352	/// mask bit is zero, the corresponding value in the memory location is not
				3353	/// loaded and the corresponding field in the return value is set to zero.
				3354	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3355	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3356	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3357	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3358	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3359	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3360	}
				3361
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3362	/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3363	/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3364	/// [4 x float], depending on the mask bits associated with each data
				3365	/// element.
				3366	///
				3367	/// \headerfile <x86intrin.h>
				3368	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3369	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3370	///
				3371	/// \param __p
				3372	/// A pointer to a memory location that contains the single-precision
				3373	/// floating point values.
				3374	/// \param __m
				3375	/// A 128-bit integer vector containing the mask. The most significant bit of
				3376	/// each data element represents the mask bits. If a mask bit is zero, the
				3377	/// corresponding value in the memory location is not loaded and the
				3378	/// corresponding field in the return value is set to zero.
				3379	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	3380	static __inline __m128 __DEFAULT_FN_ATTRS128
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3381	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3382	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3383	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3384	}
				3385
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3386	/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3387	/// memory location pointed to by \a __p into a 256-bit vector of
				3388	/// [8 x float], depending on the mask bits associated with each data
				3389	/// element.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3390	///
				3391	/// \headerfile <x86intrin.h>
				3392	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3393	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3394	///
				3395	/// \param __p
				3396	/// A pointer to a memory location that contains the single-precision
				3397	/// floating point values.
				3398	/// \param __m
				3399	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3400	/// significant bit of each dword element represents the mask bits. If a mask
				3401	/// bit is zero, the corresponding value in the memory location is not loaded
				3402	/// and the corresponding field in the return value is set to zero.
				3403	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3404	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3405	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3406	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3407	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3408	}
				3409
				3410	/* Conditional store ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3411	/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3412	/// of [8 x float] to a memory location pointed to by \a __p, according to
				3413	/// the specified mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3414	///
				3415	/// \headerfile <x86intrin.h>
				3416	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3417	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3418	///
				3419	/// \param __p
				3420	/// A pointer to a memory location that will receive the float values.
				3421	/// \param __m
				3422	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3423	/// significant bit of each dword element in the mask vector represents the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3424	/// mask bits. If a mask bit is zero, the corresponding value from vector
				3425	/// \a __a is not stored and the corresponding field in the memory location
				3426	/// pointed to by \a __p is not changed.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3427	/// \param __a
				3428	/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3429	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3430	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3431	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3432	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3433	}
				3434
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3435	/// Moves double-precision values from a 128-bit vector of [2 x double]
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3436	/// to a memory location pointed to by \a __p, according to the specified
				3437	/// mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3438	///
				3439	/// \headerfile <x86intrin.h>
				3440	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3441	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3442	///
				3443	/// \param __p
				3444	/// A pointer to a memory location that will receive the float values.
				3445	/// \param __m
				3446	/// A 128-bit integer vector containing the mask. The most significant bit of
				3447	/// each field in the mask vector represents the mask bits. If a mask bit is
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3448	/// zero, the corresponding value from vector \a __a is not stored and the
				3449	/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3450	/// changed.
				3451	/// \param __a
				3452	/// A 128-bit vector of [2 x double] containing the values to be stored.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	3453	static __inline void __DEFAULT_FN_ATTRS128
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3454	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3455	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3456	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3457	}
				3458
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3459	/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3460	/// to a memory location pointed to by \a __p, according to the specified
				3461	/// mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3462	///
				3463	/// \headerfile <x86intrin.h>
				3464	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3465	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3466	///
				3467	/// \param __p
				3468	/// A pointer to a memory location that will receive the float values.
				3469	/// \param __m
				3470	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3471	/// significant bit of each quadword element in the mask vector represents
				3472	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3473	/// __a is not stored and the corresponding field in the memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3474	/// pointed to by \a __p is not changed.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3475	/// \param __a
				3476	/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3477	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3478	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3479	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3480	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3481	}
				3482
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3483	/// Moves single-precision floating point values from a 128-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3484	/// of [4 x float] to a memory location pointed to by \a __p, according to
				3485	/// the specified mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3486	///
				3487	/// \headerfile <x86intrin.h>
				3488	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3489	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3490	///
				3491	/// \param __p
				3492	/// A pointer to a memory location that will receive the float values.
				3493	/// \param __m
				3494	/// A 128-bit integer vector containing the mask. The most significant bit of
				3495	/// each field in the mask vector represents the mask bits. If a mask bit is
				3496	/// zero, the corresponding value from vector __a is not stored and the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3497	/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3498	/// changed.
				3499	/// \param __a
				3500	/// A 128-bit vector of [4 x float] containing the values to be stored.
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	3501	static __inline void __DEFAULT_FN_ATTRS128
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3502	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3503	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3504	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3505	}
				3506
				3507	/* Cacheability support ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3508	/// Moves integer data from a 256-bit integer vector to a 32-byte
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3509	/// aligned memory location. To minimize caching, the data is flagged as
				3510	/// non-temporal (unlikely to be used again soon).
				3511	///
				3512	/// \headerfile <x86intrin.h>
				3513	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3514	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3515	///
				3516	/// \param __a
				3517	/// A pointer to a 32-byte aligned memory location that will receive the
				3518	/// integer values.
				3519	/// \param __b
				3520	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3521	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3522	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3523	{
Reid Kleckner	89fbd55	2018-06-04 21:39:20 +0000	[diff] [blame]	3524	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
				3525	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3526	}
				3527
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3528	/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3529	/// to a 32-byte aligned memory location. To minimize caching, the data is
				3530	/// flagged as non-temporal (unlikely to be used again soon).
				3531	///
				3532	/// \headerfile <x86intrin.h>
				3533	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3534	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3535	///
				3536	/// \param __a
				3537	/// A pointer to a 32-byte aligned memory location that will receive the
Ekaterina Romanova	cb3603a	2017-06-06 22:58:01 +0000	[diff] [blame]	3538	/// double-precision floating-point values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3539	/// \param __b
				3540	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3541	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3542	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3543	{
Reid Kleckner	89fbd55	2018-06-04 21:39:20 +0000	[diff] [blame]	3544	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
				3545	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3546	}
				3547
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3548	/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3549	/// of [8 x float] to a 32-byte aligned memory location. To minimize
				3550	/// caching, the data is flagged as non-temporal (unlikely to be used again
				3551	/// soon).
				3552	///
				3553	/// \headerfile <x86intrin.h>
				3554	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3555	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3556	///
				3557	/// \param __p
				3558	/// A pointer to a 32-byte aligned memory location that will receive the
				3559	/// single-precision floating point values.
				3560	/// \param __a
				3561	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3562	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3563	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3564	{
Reid Kleckner	89fbd55	2018-06-04 21:39:20 +0000	[diff] [blame]	3565	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
				3566	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3567	}
				3568
				3569	/* Create vectors */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3570	/// Create a 256-bit vector of [4 x double] with undefined values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3571	///
				3572	/// \headerfile <x86intrin.h>
				3573	///
				3574	/// This intrinsic has no corresponding instruction.
				3575	///
				3576	/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3577	static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3578	_mm256_undefined_pd(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3579	{
				3580	return (__m256d)__builtin_ia32_undef256();
				3581	}
				3582
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3583	/// Create a 256-bit vector of [8 x float] with undefined values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3584	///
				3585	/// \headerfile <x86intrin.h>
				3586	///
				3587	/// This intrinsic has no corresponding instruction.
				3588	///
				3589	/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3590	static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3591	_mm256_undefined_ps(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3592	{
				3593	return (__m256)__builtin_ia32_undef256();
				3594	}
				3595
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3596	/// Create a 256-bit integer vector with undefined values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3597	///
				3598	/// \headerfile <x86intrin.h>
				3599	///
				3600	/// This intrinsic has no corresponding instruction.
				3601	///
				3602	/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3603	static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3604	_mm256_undefined_si256(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3605	{
				3606	return (__m256i)__builtin_ia32_undef256();
				3607	}
				3608
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3609	/// Constructs a 256-bit floating-point vector of [4 x double]
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3610	/// initialized with the specified double-precision floating-point values.
				3611	///
				3612	/// \headerfile <x86intrin.h>
				3613	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3614	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3615	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3616	///
				3617	/// \param __a
				3618	/// A double-precision floating-point value used to initialize bits [255:192]
				3619	/// of the result.
				3620	/// \param __b
				3621	/// A double-precision floating-point value used to initialize bits [191:128]
				3622	/// of the result.
				3623	/// \param __c
				3624	/// A double-precision floating-point value used to initialize bits [127:64]
				3625	/// of the result.
				3626	/// \param __d
				3627	/// A double-precision floating-point value used to initialize bits [63:0]
				3628	/// of the result.
				3629	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3630	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3631	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3632	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3633	return __extension__ (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3634	}
				3635
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3636	/// Constructs a 256-bit floating-point vector of [8 x float] initialized
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3637	/// with the specified single-precision floating-point values.
				3638	///
				3639	/// \headerfile <x86intrin.h>
				3640	///
				3641	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3642	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3643	///
				3644	/// \param __a
				3645	/// A single-precision floating-point value used to initialize bits [255:224]
				3646	/// of the result.
				3647	/// \param __b
				3648	/// A single-precision floating-point value used to initialize bits [223:192]
				3649	/// of the result.
				3650	/// \param __c
				3651	/// A single-precision floating-point value used to initialize bits [191:160]
				3652	/// of the result.
				3653	/// \param __d
				3654	/// A single-precision floating-point value used to initialize bits [159:128]
				3655	/// of the result.
				3656	/// \param __e
				3657	/// A single-precision floating-point value used to initialize bits [127:96]
				3658	/// of the result.
				3659	/// \param __f
				3660	/// A single-precision floating-point value used to initialize bits [95:64]
				3661	/// of the result.
				3662	/// \param __g
				3663	/// A single-precision floating-point value used to initialize bits [63:32]
				3664	/// of the result.
				3665	/// \param __h
				3666	/// A single-precision floating-point value used to initialize bits [31:0]
				3667	/// of the result.
				3668	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3669	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3670	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3671	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3672	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3673	return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3674	}
				3675
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3676	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3677	/// 32-bit integral values.
				3678	///
				3679	/// \headerfile <x86intrin.h>
				3680	///
				3681	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3682	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3683	///
				3684	/// \param __i0
				3685	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3686	/// \param __i1
				3687	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3688	/// \param __i2
				3689	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3690	/// \param __i3
				3691	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3692	/// \param __i4
				3693	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3694	/// \param __i5
				3695	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3696	/// \param __i6
				3697	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3698	/// \param __i7
				3699	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3700	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3701	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3702	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3703	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3704	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3705	return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3706	}
				3707
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3708	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3709	/// 16-bit integral values.
				3710	///
				3711	/// \headerfile <x86intrin.h>
				3712	///
				3713	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3714	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3715	///
				3716	/// \param __w15
				3717	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3718	/// \param __w14
				3719	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3720	/// \param __w13
				3721	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3722	/// \param __w12
				3723	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3724	/// \param __w11
				3725	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3726	/// \param __w10
				3727	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3728	/// \param __w09
				3729	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3730	/// \param __w08
				3731	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3732	/// \param __w07
				3733	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3734	/// \param __w06
				3735	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3736	/// \param __w05
				3737	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3738	/// \param __w04
				3739	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3740	/// \param __w03
				3741	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3742	/// \param __w02
				3743	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3744	/// \param __w01
				3745	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3746	/// \param __w00
				3747	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3748	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3749	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3750	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3751	short __w11, short __w10, short __w09, short __w08,
				3752	short __w07, short __w06, short __w05, short __w04,
				3753	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3754	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3755	return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3756	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3757	}
				3758
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3759	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3760	/// 8-bit integral values.
				3761	///
				3762	/// \headerfile <x86intrin.h>
				3763	///
				3764	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3765	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3766	///
				3767	/// \param __b31
				3768	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3769	/// \param __b30
				3770	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3771	/// \param __b29
				3772	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3773	/// \param __b28
				3774	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3775	/// \param __b27
				3776	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3777	/// \param __b26
				3778	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3779	/// \param __b25
				3780	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3781	/// \param __b24
				3782	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3783	/// \param __b23
				3784	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3785	/// \param __b22
				3786	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3787	/// \param __b21
				3788	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3789	/// \param __b20
				3790	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3791	/// \param __b19
				3792	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3793	/// \param __b18
				3794	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3795	/// \param __b17
				3796	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3797	/// \param __b16
				3798	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3799	/// \param __b15
				3800	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3801	/// \param __b14
				3802	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3803	/// \param __b13
				3804	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3805	/// \param __b12
				3806	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3807	/// \param __b11
				3808	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3809	/// \param __b10
				3810	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3811	/// \param __b09
				3812	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3813	/// \param __b08
				3814	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3815	/// \param __b07
				3816	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3817	/// \param __b06
				3818	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3819	/// \param __b05
				3820	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3821	/// \param __b04
				3822	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3823	/// \param __b03
				3824	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3825	/// \param __b02
				3826	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3827	/// \param __b01
				3828	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3829	/// \param __b00
				3830	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3831	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3832	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3833	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3834	char __b27, char __b26, char __b25, char __b24,
				3835	char __b23, char __b22, char __b21, char __b20,
				3836	char __b19, char __b18, char __b17, char __b16,
				3837	char __b15, char __b14, char __b13, char __b12,
				3838	char __b11, char __b10, char __b09, char __b08,
				3839	char __b07, char __b06, char __b05, char __b04,
				3840	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3841	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3842	return __extension__ (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3843	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				3844	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				3845	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				3846	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3847	};
				3848	}
				3849
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3850	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3851	/// 64-bit integral values.
				3852	///
				3853	/// \headerfile <x86intrin.h>
				3854	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3855	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				3856	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3857	///
				3858	/// \param __a
				3859	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3860	/// \param __b
				3861	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3862	/// \param __c
				3863	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3864	/// \param __d
				3865	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3866	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3867	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3868	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3869	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3870	return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3871	}
				3872
				3873	/* Create vectors with elements in reverse order */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3874	/// Constructs a 256-bit floating-point vector of [4 x double],
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3875	/// initialized in reverse order with the specified double-precision
				3876	/// floating-point values.
				3877	///
				3878	/// \headerfile <x86intrin.h>
				3879	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3880	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3881	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3882	///
				3883	/// \param __a
				3884	/// A double-precision floating-point value used to initialize bits [63:0]
				3885	/// of the result.
				3886	/// \param __b
				3887	/// A double-precision floating-point value used to initialize bits [127:64]
				3888	/// of the result.
				3889	/// \param __c
				3890	/// A double-precision floating-point value used to initialize bits [191:128]
				3891	/// of the result.
				3892	/// \param __d
				3893	/// A double-precision floating-point value used to initialize bits [255:192]
				3894	/// of the result.
				3895	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3896	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3897	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3898	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	3899	return _mm256_set_pd(__d, __c, __b, __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3900	}
				3901
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3902	/// Constructs a 256-bit floating-point vector of [8 x float],
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3903	/// initialized in reverse order with the specified single-precision
				3904	/// float-point values.
				3905	///
				3906	/// \headerfile <x86intrin.h>
				3907	///
				3908	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3909	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3910	///
				3911	/// \param __a
				3912	/// A single-precision floating-point value used to initialize bits [31:0]
				3913	/// of the result.
				3914	/// \param __b
				3915	/// A single-precision floating-point value used to initialize bits [63:32]
				3916	/// of the result.
				3917	/// \param __c
				3918	/// A single-precision floating-point value used to initialize bits [95:64]
				3919	/// of the result.
				3920	/// \param __d
				3921	/// A single-precision floating-point value used to initialize bits [127:96]
				3922	/// of the result.
				3923	/// \param __e
				3924	/// A single-precision floating-point value used to initialize bits [159:128]
				3925	/// of the result.
				3926	/// \param __f
				3927	/// A single-precision floating-point value used to initialize bits [191:160]
				3928	/// of the result.
				3929	/// \param __g
				3930	/// A single-precision floating-point value used to initialize bits [223:192]
				3931	/// of the result.
				3932	/// \param __h
				3933	/// A single-precision floating-point value used to initialize bits [255:224]
				3934	/// of the result.
				3935	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3936	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3937	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3938	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3939	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	3940	return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3941	}
				3942
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3943	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3944	/// with the specified 32-bit integral values.
				3945	///
				3946	/// \headerfile <x86intrin.h>
				3947	///
				3948	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3949	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3950	///
				3951	/// \param __i0
				3952	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3953	/// \param __i1
				3954	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3955	/// \param __i2
				3956	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3957	/// \param __i3
				3958	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3959	/// \param __i4
				3960	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3961	/// \param __i5
				3962	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3963	/// \param __i6
				3964	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3965	/// \param __i7
				3966	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3967	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3968	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3969	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3970	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3971	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	3972	return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3973	}
				3974
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3975	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3976	/// with the specified 16-bit integral values.
				3977	///
				3978	/// \headerfile <x86intrin.h>
				3979	///
				3980	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3981	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3982	///
				3983	/// \param __w15
				3984	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3985	/// \param __w14
				3986	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3987	/// \param __w13
				3988	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3989	/// \param __w12
				3990	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3991	/// \param __w11
				3992	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3993	/// \param __w10
				3994	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3995	/// \param __w09
				3996	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3997	/// \param __w08
				3998	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3999	/// \param __w07
				4000	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				4001	/// \param __w06
				4002	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				4003	/// \param __w05
				4004	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				4005	/// \param __w04
				4006	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				4007	/// \param __w03
				4008	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				4009	/// \param __w02
				4010	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				4011	/// \param __w01
				4012	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				4013	/// \param __w00
				4014	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				4015	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4016	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4017	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4018	short __w11, short __w10, short __w09, short __w08,
				4019	short __w07, short __w06, short __w05, short __w04,
				4020	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4021	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	4022	return _mm256_set_epi16(__w00, __w01, __w02, __w03,
				4023	__w04, __w05, __w06, __w07,
				4024	__w08, __w09, __w10, __w11,
				4025	__w12, __w13, __w14, __w15);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4026	}
				4027
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4028	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4029	/// with the specified 8-bit integral values.
				4030	///
				4031	/// \headerfile <x86intrin.h>
				4032	///
				4033	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4034	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4035	///
				4036	/// \param __b31
				4037	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				4038	/// \param __b30
				4039	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				4040	/// \param __b29
				4041	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				4042	/// \param __b28
				4043	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				4044	/// \param __b27
				4045	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				4046	/// \param __b26
				4047	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				4048	/// \param __b25
				4049	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				4050	/// \param __b24
				4051	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				4052	/// \param __b23
				4053	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				4054	/// \param __b22
				4055	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				4056	/// \param __b21
				4057	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				4058	/// \param __b20
				4059	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				4060	/// \param __b19
				4061	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				4062	/// \param __b18
				4063	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				4064	/// \param __b17
				4065	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				4066	/// \param __b16
				4067	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				4068	/// \param __b15
				4069	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				4070	/// \param __b14
				4071	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				4072	/// \param __b13
				4073	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				4074	/// \param __b12
				4075	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				4076	/// \param __b11
				4077	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				4078	/// \param __b10
				4079	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				4080	/// \param __b09
				4081	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				4082	/// \param __b08
				4083	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				4084	/// \param __b07
				4085	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				4086	/// \param __b06
				4087	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				4088	/// \param __b05
				4089	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				4090	/// \param __b04
				4091	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				4092	/// \param __b03
				4093	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				4094	/// \param __b02
				4095	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				4096	/// \param __b01
				4097	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				4098	/// \param __b00
				4099	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				4100	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4101	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4102	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4103	char __b27, char __b26, char __b25, char __b24,
				4104	char __b23, char __b22, char __b21, char __b20,
				4105	char __b19, char __b18, char __b17, char __b16,
				4106	char __b15, char __b14, char __b13, char __b12,
				4107	char __b11, char __b10, char __b09, char __b08,
				4108	char __b07, char __b06, char __b05, char __b04,
				4109	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4110	{ /* Forward to _mm256_set_epi8 with the 32 arguments in the opposite order. */
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	4111	return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				4112	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				4113	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				4114	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4115	}
				4116
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4117	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4118	/// with the specified 64-bit integral values.
				4119	///
				4120	/// \headerfile <x86intrin.h>
				4121	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4122	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				4123	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4124	///
				4125	/// \param __a
				4126	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				4127	/// \param __b
				4128	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				4129	/// \param __c
				4130	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				4131	/// \param __d
				4132	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				4133	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4134	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4135	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4136	{ /* Forward to _mm256_set_epi64x with the four arguments in the opposite order. */
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	4137	return _mm256_set_epi64x(__d, __c, __b, __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4138	}
				4139
				4140	/* Create vectors with repeated elements */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4141	/// Constructs a 256-bit floating-point vector of [4 x double], with each
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4142	/// of the four double-precision floating-point vector elements set to the
				4143	/// specified double-precision floating-point value.
				4144	///
				4145	/// \headerfile <x86intrin.h>
				4146	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4147	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4148	///
				4149	/// \param __w
				4150	/// A double-precision floating-point value used to initialize each vector
				4151	/// element of the result.
				4152	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4153	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4154	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4155	{ /* Pass __w for each of the four arguments of _mm256_set_pd. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4156	return _mm256_set_pd(__w, __w, __w, __w);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4157	}
				4158
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4159	/// Constructs a 256-bit floating-point vector of [8 x float], with each
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4160	/// of the eight single-precision floating-point vector elements set to the
				4161	/// specified single-precision floating-point value.
				4162	///
				4163	/// \headerfile <x86intrin.h>
				4164	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4165	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4166	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4167	///
				4168	/// \param __w
				4169	/// A single-precision floating-point value used to initialize each vector
				4170	/// element of the result.
				4171	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4172	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4173	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4174	{ /* Pass __w for each of the eight arguments of _mm256_set_ps. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4175	return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4176	}
				4177
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4178	/// Constructs a 256-bit integer vector of [8 x i32], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4179	/// 32-bit integral vector elements set to the specified 32-bit integral
				4180	/// value.
				4181	///
				4182	/// \headerfile <x86intrin.h>
				4183	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4184	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4185	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4186	///
				4187	/// \param __i
				4188	/// A 32-bit integral value used to initialize each vector element of the
				4189	/// result.
				4190	/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4191	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4192	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4193	{ /* Pass __i for each of the eight arguments of _mm256_set_epi32. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4194	return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4195	}
				4196
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4197	/// Constructs a 256-bit integer vector of [16 x i16], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4198	/// 16-bit integral vector elements set to the specified 16-bit integral
				4199	/// value.
				4200	///
				4201	/// \headerfile <x86intrin.h>
				4202	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4203	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4204	///
				4205	/// \param __w
				4206	/// A 16-bit integral value used to initialize each vector element of the
				4207	/// result.
				4208	/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4209	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4210	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4211	{ /* Pass __w for each of the sixteen arguments of _mm256_set_epi16. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4212	return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
				4213	__w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4214	}
				4215
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4216	/// Constructs a 256-bit integer vector of [32 x i8], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4217	/// 8-bit integral vector elements set to the specified 8-bit integral value.
				4218	///
				4219	/// \headerfile <x86intrin.h>
				4220	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4221	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4222	///
				4223	/// \param __b
				4224	/// An 8-bit integral value used to initialize each vector element of the
				4225	/// result.
				4226	/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4227	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4228	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4229	{ /* Pass __b for each of the thirty-two arguments of _mm256_set_epi8. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4230	return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
				4231	__b, __b, __b, __b, __b, __b, __b, __b,
				4232	__b, __b, __b, __b, __b, __b, __b, __b,
				4233	__b, __b, __b, __b, __b, __b, __b, __b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4234	}
				4235
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4236	/// Constructs a 256-bit integer vector of [4 x i64], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4237	/// 64-bit integral vector elements set to the specified 64-bit integral
				4238	/// value.
				4239	///
				4240	/// \headerfile <x86intrin.h>
				4241	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4242	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4243	///
				4244	/// \param __q
				4245	/// A 64-bit integral value used to initialize each vector element of the
				4246	/// result.
				4247	/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4248	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4249	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4250	{ /* Pass __q for each of the four arguments of _mm256_set_epi64x. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4251	return _mm256_set_epi64x(__q, __q, __q, __q);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4252	}
				4253
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4254	/* Create zeroed vectors */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4255	/// Constructs a 256-bit floating-point vector of [4 x double] with all
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4256	/// vector elements initialized to zero.
				4257	///
				4258	/// \headerfile <x86intrin.h>
				4259	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4260	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4261	///
				4262	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4263	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4264	_mm256_setzero_pd(void)
				4265	{ /* Return a __m256d compound literal with all four elements set to 0. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4266	return __extension__ (__m256d){ 0, 0, 0, 0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4267	}
				4268
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4269	/// Constructs a 256-bit floating-point vector of [8 x float] with all
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4270	/// vector elements initialized to zero.
				4271	///
				4272	/// \headerfile <x86intrin.h>
				4273	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4274	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4275	///
				4276	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4277	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4278	_mm256_setzero_ps(void)
				4279	{ /* Return a __m256 compound literal with all eight elements set to 0. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4280	return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4281	}
				4282
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4283	/// Constructs a 256-bit integer vector initialized to zero.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4284	///
				4285	/// \headerfile <x86intrin.h>
				4286	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4287	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4288	///
				4289	/// \returns A 256-bit integer vector initialized to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4290	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4291	_mm256_setzero_si256(void)
				4292	{ /* Build the all-zero literal as __v4di (four elements), then cast it to __m256i. */
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4293	return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4294	}
				4295
				4296	/* Cast between vector types */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4297	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4298	/// floating-point vector of [8 x float].
				4299	///
				4300	/// \headerfile <x86intrin.h>
				4301	///
				4302	/// This intrinsic has no corresponding instruction.
				4303	///
				4304	/// \param __a
				4305	/// A 256-bit floating-point vector of [4 x double].
				4306	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4307	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4308	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4309	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4310	{ /* Cast to __m256; the result is documented to keep the same bit pattern as __a. */
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4311	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4312	}
				4313
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4314	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4315	/// integer vector.
				4316	///
				4317	/// \headerfile <x86intrin.h>
				4318	///
				4319	/// This intrinsic has no corresponding instruction.
				4320	///
				4321	/// \param __a
				4322	/// A 256-bit floating-point vector of [4 x double].
				4323	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4324	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4325	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4326	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4327	{ /* Cast to __m256i; the result is documented to keep the same bit pattern as __a. */
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4328	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4329	}
				4330
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4331	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4332	/// floating-point vector of [4 x double].
				4333	///
				4334	/// \headerfile <x86intrin.h>
				4335	///
				4336	/// This intrinsic has no corresponding instruction.
				4337	///
				4338	/// \param __a
				4339	/// A 256-bit floating-point vector of [8 x float].
				4340	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4341	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4342	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4343	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4344	{ /* Cast to __m256d; the result is documented to keep the same bit pattern as __a. */
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4345	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4346	}
				4347
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4348	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4349	/// integer vector.
				4350	///
				4351	/// \headerfile <x86intrin.h>
				4352	///
				4353	/// This intrinsic has no corresponding instruction.
				4354	///
				4355	/// \param __a
				4356	/// A 256-bit floating-point vector of [8 x float].
				4357	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4358	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4359	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4360	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4361	{ /* Cast to __m256i; the result is documented to keep the same bit pattern as __a. */
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4362	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4363	}
				4364
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4365	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4366	/// of [8 x float].
				4367	///
				4368	/// \headerfile <x86intrin.h>
				4369	///
				4370	/// This intrinsic has no corresponding instruction.
				4371	///
				4372	/// \param __a
				4373	/// A 256-bit integer vector.
				4374	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4375	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4376	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4377	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4378	{ /* Cast to __m256; the result is documented to keep the same bit pattern as __a. */
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4379	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4380	}
				4381
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4382	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4383	/// of [4 x double].
				4384	///
				4385	/// \headerfile <x86intrin.h>
				4386	///
				4387	/// This intrinsic has no corresponding instruction.
				4388	///
				4389	/// \param __a
				4390	/// A 256-bit integer vector.
				4391	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4392	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4393	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4394	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4395	{ /* Cast to __m256d; the result is documented to keep the same bit pattern as __a. */
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4396	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4397	}
				4398
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4399	/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4400	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
				4401	///
				4402	/// \headerfile <x86intrin.h>
				4403	///
				4404	/// This intrinsic has no corresponding instruction.
				4405	///
				4406	/// \param __a
				4407	/// A 256-bit floating-point vector of [4 x double].
				4408	/// \returns A 128-bit floating-point vector of [2 x double] containing the
				4409	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4410	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4411	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4412	{ /* Keep only elements 0 and 1 of __a, i.e. its lower 128 bits. */
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4413	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4414	}
				4415
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4416	/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4417	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
				4418	///
				4419	/// \headerfile <x86intrin.h>
				4420	///
				4421	/// This intrinsic has no corresponding instruction.
				4422	///
				4423	/// \param __a
				4424	/// A 256-bit floating-point vector of [8 x float].
				4425	/// \returns A 128-bit floating-point vector of [4 x float] containing the
				4426	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4427	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4428	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4429	{ /* Keep only elements 0 through 3 of __a, i.e. its lower 128 bits. */
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4430	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4431	}
				4432
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4433	/// Truncates a 256-bit integer vector into a 128-bit integer vector.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4434	///
				4435	/// \headerfile <x86intrin.h>
				4436	///
				4437	/// This intrinsic has no corresponding instruction.
				4438	///
				4439	/// \param __a
				4440	/// A 256-bit integer vector.
				4441	/// \returns A 128-bit integer vector containing the lower 128 bits of the
				4442	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4443	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4444	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4445	{ /* View __a as __v4di and keep only elements 0 and 1, i.e. its lower 128 bits. */
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4446	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4447	}
				4448
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4449	/// Constructs a 256-bit floating-point vector of [4 x double] from a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4450	/// 128-bit floating-point vector of [2 x double].
				4451	///
				4452	/// The lower 128 bits contain the value of the source vector. The contents
				4453	/// of the upper 128 bits are undefined.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4454	///
				4455	/// \headerfile <x86intrin.h>
				4456	///
				4457	/// This intrinsic has no corresponding instruction.
				4458	///
				4459	/// \param __a
				4460	/// A 128-bit vector of [2 x double].
				4461	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4462	/// contain the value of the parameter. The contents of the upper 128 bits
				4463	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4464	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4465	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4466	{ /* Elements 0, 1 come from __a; the -1 indices leave the upper two elements unspecified. */
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4467	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4468	}
				4469
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4470	/// Constructs a 256-bit floating-point vector of [8 x float] from a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4471	/// 128-bit floating-point vector of [4 x float].
				4472	///
				4473	/// The lower 128 bits contain the value of the source vector. The contents
				4474	/// of the upper 128 bits are undefined.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4475	///
				4476	/// \headerfile <x86intrin.h>
				4477	///
				4478	/// This intrinsic has no corresponding instruction.
				4479	///
				4480	/// \param __a
				4481	/// A 128-bit vector of [4 x float].
				4482	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4483	/// contain the value of the parameter. The contents of the upper 128 bits
				4484	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4485	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4486	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4487	{ /* Elements 0-3 come from __a; the -1 indices leave the upper four elements unspecified. */
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4488	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4489	}
				4490
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4491	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4492	///
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4493	/// The lower 128 bits contain the value of the source vector. The contents
				4494	/// of the upper 128 bits are undefined.
				4495	///
				4496	/// \headerfile <x86intrin.h>
				4497	///
				4498	/// This intrinsic has no corresponding instruction.
				4499	///
				4500	/// \param __a
				4501	/// A 128-bit integer vector.
				4502	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4503	/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4504	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4505	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4506	{ /* Elements 0, 1 come from __a; the -1 indices leave the upper two elements unspecified. */
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4507	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4508	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4509
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4510	/// Constructs a 256-bit floating-point vector of [4 x double] from a
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4511	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
				4512	/// contain the value of the source vector. The upper 128 bits are set
				4513	/// to zero.
				4514	///
				4515	/// \headerfile <x86intrin.h>
				4516	///
				4517	/// This intrinsic has no corresponding instruction.
				4518	///
				4519	/// \param __a
				4520	/// A 128-bit vector of [2 x double].
				4521	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4522	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4523	static __inline __m256d __DEFAULT_FN_ATTRS
				4524	_mm256_zextpd128_pd256(__m128d __a)
				4525	{ /* Indices 0, 1 select __a; indices 2, 3 select the zero vector for the upper half. */
				4526	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
				4527	}
				4528
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4529	/// Constructs a 256-bit floating-point vector of [8 x float] from a
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4530	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
				4531	/// the value of the source vector. The upper 128 bits are set to zero.
				4532	///
				4533	/// \headerfile <x86intrin.h>
				4534	///
				4535	/// This intrinsic has no corresponding instruction.
				4536	///
				4537	/// \param __a
				4538	/// A 128-bit vector of [4 x float].
				4539	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4540	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4541	static __inline __m256 __DEFAULT_FN_ATTRS
				4542	_mm256_zextps128_ps256(__m128 __a)
				4543	{ /* Indices 0-3 select __a; indices 4-7 select the zero vector for the upper half. */
				4544	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
				4545	}
				4546
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4547	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4548	/// The lower 128 bits contain the value of the source vector. The upper
				4549	/// 128 bits are set to zero.
				4550	///
				4551	/// \headerfile <x86intrin.h>
				4552	///
				4553	/// This intrinsic has no corresponding instruction.
				4554	///
				4555	/// \param __a
				4556	/// A 128-bit integer vector.
				4557	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4558	/// the parameter. The upper 128 bits are set to zero.
				4559	static __inline __m256i __DEFAULT_FN_ATTRS
				4560	_mm256_zextsi128_si256(__m128i __a)
				4561	{
				4562	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
				4563	}
				4564
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4565	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4566	Vector insert.
				4567	We use macros rather than inlines because we only want to accept
				4568	invocations where the immediate M is a constant expression.
				4569	*/
/// Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are combined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
///
/// \note The expansion is wrapped in an outer pair of parentheses so that a
///    postfix operator written after the macro invocation applies to the
///    whole cast result rather than to the builtin call alone.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4607
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are combined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
///
/// \note The expansion is wrapped in an outer pair of parentheses so that a
///    postfix operator written after the macro invocation applies to the
///    whole cast result rather than to the builtin call alone.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4645
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are combined: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
///
/// \note The expansion is wrapped in an outer pair of parentheses so that a
///    postfix operator written after the macro invocation applies to the
///    whole cast result rather than to the builtin call alone.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4683
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4684	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4685	Vector extract.
				4686	We use macros rather than inlines because we only want to accept
				4687	invocations where the immediate M is a constant expression.
				4688	*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
///
/// \note The expansion is wrapped in an outer pair of parentheses so that a
///    postfix operator written after the macro invocation applies to the
///    whole cast result rather than to the builtin call alone.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4712
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
///
/// \note The expansion is wrapped in an outer pair of parentheses so that a
///    postfix operator written after the macro invocation applies to the
///    whole cast result rather than to the builtin call alone.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4736
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
///
/// \note The expansion is wrapped in an outer pair of parentheses so that a
///    postfix operator written after the macro invocation applies to the
///    whole cast result rather than to the builtin call alone.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4760
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4761	/* SIMD load ops (unaligned) */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4762	/// Loads two 128-bit floating-point vectors of [4 x float] from
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4763	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4764	/// of [8 x float] by concatenating the two 128-bit vectors.
				4765	///
				4766	/// \headerfile <x86intrin.h>
				4767	///
				4768	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4769	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4770	///
				4771	/// \param __addr_hi
				4772	/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4773	/// single-precision floating-point values. These values are to be copied to
				4774	/// bits[255:128] of the result. The address of the memory location does not
				4775	/// have to be aligned.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4776	/// \param __addr_lo
				4777	/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4778	/// single-precision floating-point values. These values are to be copied to
				4779	/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4780	/// have to be aligned.
				4781	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4782	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4783	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4784	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4785	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4786	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				4787	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4788	}
				4789
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4790	/// Loads two 128-bit floating-point vectors of [2 x double] from
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4791	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4792	/// of [4 x double] by concatenating the two 128-bit vectors.
				4793	///
				4794	/// \headerfile <x86intrin.h>
				4795	///
				4796	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4797	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4798	///
				4799	/// \param __addr_hi
				4800	/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4801	/// double-precision floating-point values. These values are to be copied to
				4802	/// bits[255:128] of the result. The address of the memory location does not
				4803	/// have to be aligned.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4804	/// \param __addr_lo
				4805	/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4806	/// double-precision floating-point values. These values are to be copied to
				4807	/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4808	/// have to be aligned.
				4809	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4810	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4811	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4812	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4813	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4814	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				4815	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4816	}
				4817
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4818	/// Loads two 128-bit integer vectors from unaligned memory locations and
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4819	/// constructs a 256-bit integer vector by concatenating the two 128-bit
				4820	/// vectors.
				4821	///
				4822	/// \headerfile <x86intrin.h>
				4823	///
				4824	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4825	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4826	///
				4827	/// \param __addr_hi
				4828	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4829	/// vector. This vector is to be copied to bits[255:128] of the result. The
				4830	/// address of the memory location does not have to be aligned.
				4831	/// \param __addr_lo
				4832	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4833	/// vector. This vector is to be copied to bits[127:0] of the result. The
				4834	/// address of the memory location does not have to be aligned.
				4835	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4836	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4837	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4838	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4839	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				4840	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4841	}
				4842
				4843	/* SIMD store ops (unaligned) */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4844	/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4845	/// vector of [8 x float] into two different unaligned memory locations.
				4846	///
				4847	/// \headerfile <x86intrin.h>
				4848	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4849	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4850	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4851	///
				4852	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4853	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4854	/// copied to this memory location. The address of this memory location does
				4855	/// not have to be aligned.
				4856	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4857	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4858	/// copied to this memory location. The address of this memory location does
				4859	/// not have to be aligned.
				4860	/// \param __a
				4861	/// A 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4862	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4863	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4864	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4865	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4866
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4867	__v128 = _mm256_castps256_ps128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4868	_mm_storeu_ps(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4869	__v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4870	_mm_storeu_ps(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4871	}
				4872
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4873	/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4874	/// vector of [4 x double] into two different unaligned memory locations.
				4875	///
				4876	/// \headerfile <x86intrin.h>
				4877	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4878	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4879	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4880	///
				4881	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4882	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4883	/// copied to this memory location. The address of this memory location does
				4884	/// not have to be aligned.
				4885	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4886	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4887	/// copied to this memory location. The address of this memory location does
				4888	/// not have to be aligned.
				4889	/// \param __a
				4890	/// A 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4891	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4892	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4893	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4894	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4895
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4896	__v128 = _mm256_castpd256_pd128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4897	_mm_storeu_pd(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4898	__v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4899	_mm_storeu_pd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4900	}
				4901
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4902	/// Stores the upper and lower 128 bits of a 256-bit integer vector into
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4903	/// two different unaligned memory locations.
				4904	///
				4905	/// \headerfile <x86intrin.h>
				4906	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4907	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4908	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4909	///
				4910	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4911	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4912	/// copied to this memory location. The address of this memory location does
				4913	/// not have to be aligned.
				4914	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4915	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4916	/// copied to this memory location. The address of this memory location does
				4917	/// not have to be aligned.
				4918	/// \param __a
				4919	/// A 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4920	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4921	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4922	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4923	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4924
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4925	__v128 = _mm256_castsi256_si128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4926	_mm_storeu_si128(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4927	__v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4928	_mm_storeu_si128(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4929	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	4930
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4931	/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4932	/// concatenating two 128-bit floating-point vectors of [4 x float].
				4933	///
				4934	/// \headerfile <x86intrin.h>
				4935	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4936	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4937	///
				4938	/// \param __hi
				4939	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4940	/// 128 bits of the result.
				4941	/// \param __lo
				4942	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4943	/// 128 bits of the result.
				4944	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4945	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4946	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4947	_mm256_set_m128 (__m128 __hi, __m128 __lo)
				4948	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4949	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4950	}
				4951
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4952	/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4953	/// concatenating two 128-bit floating-point vectors of [2 x double].
				4954	///
				4955	/// \headerfile <x86intrin.h>
				4956	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4957	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4958	///
				4959	/// \param __hi
				4960	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4961	/// 128 bits of the result.
				4962	/// \param __lo
				4963	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4964	/// 128 bits of the result.
				4965	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4966	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4967	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4968	_mm256_set_m128d (__m128d __hi, __m128d __lo)
				4969	{
Craig Topper	5cbeeed	2018-07-07 17:03:32 +0000	[diff] [blame]	4970	return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4971	}
				4972
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4973	/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4974	/// integer vectors.
				4975	///
				4976	/// \headerfile <x86intrin.h>
				4977	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4978	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4979	///
				4980	/// \param __hi
				4981	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4982	/// result.
				4983	/// \param __lo
				4984	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4985	/// result.
				4986	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4987	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4988	_mm256_set_m128i (__m128i __hi, __m128i __lo)
				4989	{
Craig Topper	5cbeeed	2018-07-07 17:03:32 +0000	[diff] [blame]	4990	return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4991	}
				4992
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4993	/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4994	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
				4995	/// similar to _mm256_set_m128, but the order of the input parameters is
				4996	/// swapped.
				4997	///
				4998	/// \headerfile <x86intrin.h>
				4999	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5000	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5001	///
				5002	/// \param __lo
				5003	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				5004	/// 128 bits of the result.
				5005	/// \param __hi
				5006	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				5007	/// 128 bits of the result.
				5008	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				5009	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5010	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5011	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
				5012	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5013	return _mm256_set_m128(__hi, __lo);
				5014	}
				5015
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	5016	/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5017	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
				5018	/// similar to _mm256_set_m128d, but the order of the input parameters is
				5019	/// swapped.
				5020	///
				5021	/// \headerfile <x86intrin.h>
				5022	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5023	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5024	///
				5025	/// \param __lo
				5026	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				5027	/// 128 bits of the result.
				5028	/// \param __hi
				5029	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				5030	/// 128 bits of the result.
				5031	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				5032	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5033	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5034	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
				5035	{
Craig Topper	5cbeeed	2018-07-07 17:03:32 +0000	[diff] [blame]	5036	return (__m256d)_mm256_set_m128d(__hi, __lo);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5037	}
				5038
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	5039	/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5040	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
				5041	/// the input parameters is swapped.
				5042	///
				5043	/// \headerfile <x86intrin.h>
				5044	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5045	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5046	///
				5047	/// \param __lo
				5048	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				5049	/// result.
				5050	/// \param __hi
				5051	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				5052	/// result.
				5053	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5054	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5055	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
				5056	{
Craig Topper	5cbeeed	2018-07-07 17:03:32 +0000	[diff] [blame]	5057	return (__m256i)_mm256_set_m128i(__hi, __lo);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5058	}
				5059
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5060	#undef __DEFAULT_FN_ATTRS
Craig Topper	74c10e3	2018-07-09 19:00:16 +0000	[diff] [blame]	5061	#undef __DEFAULT_FN_ATTRS128
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	5062
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	5063	#endif /* __AVXINTRIN_H */