Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: 811a2e07498f1364cdf0faffc9017a0e57e34464 [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
/* Element-typed 32-byte (256-bit) vector types. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Unsigned element variants of the 32-byte vector types. */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));

/* Explicitly signed char variant (plain char may be signed or unsigned).
 * This type is not meant to appear in the interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Vector types used in the intrinsic signatures. */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Attributes applied to every inline function defined in this file:
 * always inlined, no debug info, compiled for the "avx" target feature. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	54
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	55	/* Arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	56	/// \brief Adds two 256-bit vectors of [4 x double].
				57	///
				58	/// \headerfile <x86intrin.h>
				59	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	60	/// This intrinsic corresponds to the \c VADDPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	61	///
				62	/// \param __a
				63	/// A 256-bit vector of [4 x double] containing one of the source operands.
				64	/// \param __b
				65	/// A 256-bit vector of [4 x double] containing one of the source operands.
				66	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				67	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	68	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	69	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	70	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	71	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	72	}
				73
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	74	/// \brief Adds two 256-bit vectors of [8 x float].
				75	///
				76	/// \headerfile <x86intrin.h>
				77	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	78	/// This intrinsic corresponds to the \c VADDPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	79	///
				80	/// \param __a
				81	/// A 256-bit vector of [8 x float] containing one of the source operands.
				82	/// \param __b
				83	/// A 256-bit vector of [8 x float] containing one of the source operands.
				84	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				85	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	86	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	87	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	88	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	89	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	90	}
				91
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	92	/// \brief Subtracts two 256-bit vectors of [4 x double].
				93	///
				94	/// \headerfile <x86intrin.h>
				95	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	96	/// This intrinsic corresponds to the \c VSUBPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	97	///
				98	/// \param __a
				99	/// A 256-bit vector of [4 x double] containing the minuend.
				100	/// \param __b
				101	/// A 256-bit vector of [4 x double] containing the subtrahend.
				102	/// \returns A 256-bit vector of [4 x double] containing the differences between
				103	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	104	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	105	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	106	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	107	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	108	}
				109
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	110	/// \brief Subtracts two 256-bit vectors of [8 x float].
				111	///
				112	/// \headerfile <x86intrin.h>
				113	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	114	/// This intrinsic corresponds to the \c VSUBPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	115	///
				116	/// \param __a
				117	/// A 256-bit vector of [8 x float] containing the minuend.
				118	/// \param __b
				119	/// A 256-bit vector of [8 x float] containing the subtrahend.
				120	/// \returns A 256-bit vector of [8 x float] containing the differences between
				121	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	122	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	123	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	124	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	125	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	126	}
				127
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	128	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				129	/// two 256-bit vectors of [4 x double].
				130	///
				131	/// \headerfile <x86intrin.h>
				132	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	133	/// This intrinsic corresponds to the \c VADDSUBPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	134	///
				135	/// \param __a
				136	/// A 256-bit vector of [4 x double] containing the left source operand.
				137	/// \param __b
				138	/// A 256-bit vector of [4 x double] containing the right source operand.
				139	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				140	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	141	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	142	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	143	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	144	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	145	}
				146
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	147	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				148	/// two 256-bit vectors of [8 x float].
				149	///
				150	/// \headerfile <x86intrin.h>
				151	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	152	/// This intrinsic corresponds to the \c VADDSUBPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	153	///
				154	/// \param __a
				155	/// A 256-bit vector of [8 x float] containing the left source operand.
				156	/// \param __b
				157	/// A 256-bit vector of [8 x float] containing the right source operand.
				158	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				159	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	160	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	161	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	162	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	163	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	164	}
				165
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	166	/// \brief Divides two 256-bit vectors of [4 x double].
				167	///
				168	/// \headerfile <x86intrin.h>
				169	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	170	/// This intrinsic corresponds to the \c VDIVPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	171	///
				172	/// \param __a
				173	/// A 256-bit vector of [4 x double] containing the dividend.
				174	/// \param __b
				175	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	176	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				177	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	178	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	179	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	180	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	181	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	182	}
				183
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	184	/// \brief Divides two 256-bit vectors of [8 x float].
				185	///
				186	/// \headerfile <x86intrin.h>
				187	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	188	/// This intrinsic corresponds to the \c VDIVPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	189	///
				190	/// \param __a
				191	/// A 256-bit vector of [8 x float] containing the dividend.
				192	/// \param __b
				193	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	194	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				195	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	196	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	197	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	198	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	199	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	200	}
				201
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	202	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				203	/// of each pair of values.
				204	///
				205	/// \headerfile <x86intrin.h>
				206	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	207	/// This intrinsic corresponds to the \c VMAXPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	208	///
				209	/// \param __a
				210	/// A 256-bit vector of [4 x double] containing one of the operands.
				211	/// \param __b
				212	/// A 256-bit vector of [4 x double] containing one of the operands.
				213	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				214	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	215	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	216	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	217	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	218	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	219	}
				220
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	221	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				222	/// of each pair of values.
				223	///
				224	/// \headerfile <x86intrin.h>
				225	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	226	/// This intrinsic corresponds to the \c VMAXPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	227	///
				228	/// \param __a
				229	/// A 256-bit vector of [8 x float] containing one of the operands.
				230	/// \param __b
				231	/// A 256-bit vector of [8 x float] containing one of the operands.
				232	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				233	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	234	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	235	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	236	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	237	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	238	}
				239
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	240	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				241	/// of each pair of values.
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	245	/// This intrinsic corresponds to the \c VMINPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	246	///
				247	/// \param __a
				248	/// A 256-bit vector of [4 x double] containing one of the operands.
				249	/// \param __b
				250	/// A 256-bit vector of [4 x double] containing one of the operands.
				251	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				252	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	253	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	254	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	255	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	256	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	257	}
				258
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	259	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				260	/// of each pair of values.
				261	///
				262	/// \headerfile <x86intrin.h>
				263	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	264	/// This intrinsic corresponds to the \c VMINPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	265	///
				266	/// \param __a
				267	/// A 256-bit vector of [8 x float] containing one of the operands.
				268	/// \param __b
				269	/// A 256-bit vector of [8 x float] containing one of the operands.
				270	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				271	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	272	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	273	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	274	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	275	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	276	}
				277
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	278	/// \brief Multiplies two 256-bit vectors of [4 x double].
				279	///
				280	/// \headerfile <x86intrin.h>
				281	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	282	/// This intrinsic corresponds to the \c VMULPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	283	///
				284	/// \param __a
				285	/// A 256-bit vector of [4 x double] containing one of the operands.
				286	/// \param __b
				287	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	288	/// \returns A 256-bit vector of [4 x double] containing the products of both
				289	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	290	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	291	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	292	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	293	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	294	}
				295
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	296	/// \brief Multiplies two 256-bit vectors of [8 x float].
				297	///
				298	/// \headerfile <x86intrin.h>
				299	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	300	/// This intrinsic corresponds to the \c VMULPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	301	///
				302	/// \param __a
				303	/// A 256-bit vector of [8 x float] containing one of the operands.
				304	/// \param __b
				305	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	306	/// \returns A 256-bit vector of [8 x float] containing the products of both
				307	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	308	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	309	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	311	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	312	}
				313
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	314	/// \brief Calculates the square roots of the values in a 256-bit vector of
				315	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	316	///
				317	/// \headerfile <x86intrin.h>
				318	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	319	/// This intrinsic corresponds to the \c VSQRTPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	320	///
				321	/// \param __a
				322	/// A 256-bit vector of [4 x double].
				323	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				324	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	325	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	326	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	327	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	328	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	329	}
				330
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	331	/// \brief Calculates the square roots of the values in a 256-bit vector of
				332	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	333	///
				334	/// \headerfile <x86intrin.h>
				335	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	336	/// This intrinsic corresponds to the \c VSQRTPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	337	///
				338	/// \param __a
				339	/// A 256-bit vector of [8 x float].
				340	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				341	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	342	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	343	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	344	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	345	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	346	}
				347
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	348	/// \brief Calculates the reciprocal square roots of the values in a 256-bit
				349	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	350	///
				351	/// \headerfile <x86intrin.h>
				352	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	353	/// This intrinsic corresponds to the \c VRSQRTPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	354	///
				355	/// \param __a
				356	/// A 256-bit vector of [8 x float].
				357	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				358	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	359	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	360	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	361	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	362	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	363	}
				364
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	365	/// \brief Calculates the reciprocals of the values in a 256-bit vector of
				366	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	367	///
				368	/// \headerfile <x86intrin.h>
				369	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	370	/// This intrinsic corresponds to the \c VRCPPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	371	///
				372	/// \param __a
				373	/// A 256-bit vector of [8 x float].
				374	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				375	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	376	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	377	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	378	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	379	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	380	}
				381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand \a M. The source values are rounded to integer
///    values and returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
///
/// \note Expands to a plain parenthesized expression rather than a GNU
///    statement expression, so the macro can also be used where statement
///    expressions are rejected (for example, C++ namespace-scope
///    initializers).
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	413
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand \a M. The source values are rounded to
///    integer values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
///
/// \note Expands to a plain parenthesized expression rather than a GNU
///    statement expression, so the macro can also be used where statement
///    expressions are rejected (for example, C++ namespace-scope
///    initializers).
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	445
/// \brief Rounds every element of a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD instruction.
///
/// Expands to \c _mm256_round_pd with the \c _MM_FROUND_CEIL rounding mode.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	462
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	463	/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	464	/// The source values are rounded down to integer values and returned as
				465	/// 64-bit double-precision floating-point values.
				466	///
				467	/// \headerfile <x86intrin.h>
				468	///
				469	/// \code
				470	/// __m256d _mm256_floor_pd(__m256d V);
				471	/// \endcode
				472	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	473	/// This intrinsic corresponds to the \c VROUNDPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	474	///
				475	/// \param V
				476	/// A 256-bit vector of [4 x double].
				477	/// \returns A 256-bit vector of [4 x double] containing the rounded down
				478	/// values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	479	#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) /* forwards V to _mm256_round_pd with _MM_FROUND_FLOOR */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	480
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	481	/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	482	/// source values are rounded up to integer values and returned as
				483	/// floating-point values.
				484	///
				485	/// \headerfile <x86intrin.h>
				486	///
				487	/// \code
				488	/// __m256 _mm256_ceil_ps(__m256 V);
				489	/// \endcode
				490	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	491	/// This intrinsic corresponds to the \c VROUNDPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	492	///
				493	/// \param V
				494	/// A 256-bit vector of [8 x float].
				495	/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	496	#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) /* forwards V to _mm256_round_ps with _MM_FROUND_CEIL */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	497
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	498	/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	499	/// source values are rounded down to integer values and returned as
				500	/// floating-point values.
				501	///
				502	/// \headerfile <x86intrin.h>
				503	///
				504	/// \code
				505	/// __m256 _mm256_floor_ps(__m256 V);
				506	/// \endcode
				507	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	508	/// This intrinsic corresponds to the \c VROUNDPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	509	///
				510	/// \param V
				511	/// A 256-bit vector of [8 x float].
				512	/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	513	#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) /* forwards V to _mm256_round_ps with _MM_FROUND_FLOOR */
				514
				515	/* Logical */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	516	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				517	///
				518	/// \headerfile <x86intrin.h>
				519	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	520	/// This intrinsic corresponds to the \c VANDPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	521	///
				522	/// \param __a
				523	/// A 256-bit vector of [4 x double] containing one of the source operands.
				524	/// \param __b
				525	/// A 256-bit vector of [4 x double] containing one of the source operands.
				526	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				527	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	528	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	529	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	530	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	531	return (__m256d)((__v4du)__a & (__v4du)__b); /* '&' is applied to the __v4du casts of both operands */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	532	}
				533
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	534	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				535	///
				536	/// \headerfile <x86intrin.h>
				537	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	538	/// This intrinsic corresponds to the \c VANDPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	539	///
				540	/// \param __a
				541	/// A 256-bit vector of [8 x float] containing one of the source operands.
				542	/// \param __b
				543	/// A 256-bit vector of [8 x float] containing one of the source operands.
				544	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				545	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	546	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	547	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	548	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	549	return (__m256)((__v8su)__a & (__v8su)__b); /* '&' is applied to the __v8su casts of both operands */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	550	}
				551
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	552	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				553	/// the one's complement of the values contained in the first source operand.
				554	///
				555	/// \headerfile <x86intrin.h>
				556	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	557	/// This intrinsic corresponds to the \c VANDNPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	558	///
				559	/// \param __a
				560	/// A 256-bit vector of [4 x double] containing the left source operand. The
				561	/// one's complement of this value is used in the bitwise AND.
				562	/// \param __b
				563	/// A 256-bit vector of [4 x double] containing the right source operand.
				564	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				565	/// values of the second operand and the one's complement of the first
				566	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	567	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	568	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	569	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	570	return (__m256d)(~(__v4du)__a & (__v4du)__b); /* only __a is complemented: '~' binds tighter than '&' */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	571	}
				572
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	573	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				574	/// the one's complement of the values contained in the first source operand.
				575	///
				576	/// \headerfile <x86intrin.h>
				577	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	578	/// This intrinsic corresponds to the \c VANDNPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	579	///
				580	/// \param __a
				581	/// A 256-bit vector of [8 x float] containing the left source operand. The
				582	/// one's complement of this value is used in the bitwise AND.
				583	/// \param __b
				584	/// A 256-bit vector of [8 x float] containing the right source operand.
				585	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				586	/// values of the second operand and the one's complement of the first
				587	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	588	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	589	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	590	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	591	return (__m256)(~(__v8su)__a & (__v8su)__b); /* only __a is complemented: '~' binds tighter than '&' */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	592	}
				593
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	594	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				595	///
				596	/// \headerfile <x86intrin.h>
				597	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	598	/// This intrinsic corresponds to the \c VORPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	599	///
				600	/// \param __a
				601	/// A 256-bit vector of [4 x double] containing one of the source operands.
				602	/// \param __b
				603	/// A 256-bit vector of [4 x double] containing one of the source operands.
				604	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				605	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	606	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	607	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	608	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	609	return (__m256d)((__v4du)__a | (__v4du)__b); /* '|' is applied to the __v4du casts of both operands */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	610	}
				611
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	612	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				613	///
				614	/// \headerfile <x86intrin.h>
				615	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	616	/// This intrinsic corresponds to the \c VORPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	617	///
				618	/// \param __a
				619	/// A 256-bit vector of [8 x float] containing one of the source operands.
				620	/// \param __b
				621	/// A 256-bit vector of [8 x float] containing one of the source operands.
				622	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				623	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	624	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	625	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	626	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	627	return (__m256)((__v8su)__a | (__v8su)__b); /* '|' is applied to the __v8su casts of both operands */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	628	}
				629
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	630	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				631	///
				632	/// \headerfile <x86intrin.h>
				633	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	634	/// This intrinsic corresponds to the \c VXORPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	635	///
				636	/// \param __a
				637	/// A 256-bit vector of [4 x double] containing one of the source operands.
				638	/// \param __b
				639	/// A 256-bit vector of [4 x double] containing one of the source operands.
				640	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				641	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	642	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	643	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	644	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	645	return (__m256d)((__v4du)__a ^ (__v4du)__b); /* '^' is applied to the __v4du casts of both operands */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	646	}
				647
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	648	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				649	///
				650	/// \headerfile <x86intrin.h>
				651	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	652	/// This intrinsic corresponds to the \c VXORPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	653	///
				654	/// \param __a
				655	/// A 256-bit vector of [8 x float] containing one of the source operands.
				656	/// \param __b
				657	/// A 256-bit vector of [8 x float] containing one of the source operands.
				658	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				659	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	660	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	661	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	662	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	663	return (__m256)((__v8su)__a ^ (__v8su)__b); /* '^' is applied to the __v8su casts of both operands */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	664	}
				665
				666	/* Horizontal arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	667	/// \brief Horizontally adds the adjacent pairs of values contained in two
				668	/// 256-bit vectors of [4 x double].
				669	///
				670	/// \headerfile <x86intrin.h>
				671	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	672	/// This intrinsic corresponds to the \c VHADDPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	673	///
				674	/// \param __a
				675	/// A 256-bit vector of [4 x double] containing one of the source operands.
				676	/// The horizontal sums of the values are returned in the even-indexed
				677	/// elements of a vector of [4 x double].
				678	/// \param __b
				679	/// A 256-bit vector of [4 x double] containing one of the source operands.
				680	/// The horizontal sums of the values are returned in the odd-indexed
				681	/// elements of a vector of [4 x double].
				682	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				683	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	684	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	685	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	686	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	687	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); /* operands are handed to the builtin as __v4df */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	688	}
				689
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	690	/// \brief Horizontally adds the adjacent pairs of values contained in two
				691	/// 256-bit vectors of [8 x float].
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	695	/// This intrinsic corresponds to the \c VHADDPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	696	///
				697	/// \param __a
				698	/// A 256-bit vector of [8 x float] containing one of the source operands.
				699	/// The horizontal sums of the values are returned in the elements with
				700	/// index 0, 1, 4, 5 of a vector of [8 x float].
				701	/// \param __b
				702	/// A 256-bit vector of [8 x float] containing one of the source operands.
				703	/// The horizontal sums of the values are returned in the elements with
				704	/// index 2, 3, 6, 7 of a vector of [8 x float].
				705	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				706	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	707	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	708	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	709	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	710	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); /* operands are handed to the builtin as __v8sf */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	711	}
				712
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	713	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				714	/// 256-bit vectors of [4 x double].
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	718	/// This intrinsic corresponds to the \c VHSUBPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	719	///
				720	/// \param __a
				721	/// A 256-bit vector of [4 x double] containing one of the source operands.
				722	/// The horizontal differences between the values are returned in the
				723	/// even-indexed elements of a vector of [4 x double].
				724	/// \param __b
				725	/// A 256-bit vector of [4 x double] containing one of the source operands.
				726	/// The horizontal differences between the values are returned in the
				727	/// odd-indexed elements of a vector of [4 x double].
				728	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				729	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	730	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	731	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	732	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	733	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); /* operands are handed to the builtin as __v4df */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	734	}
				735
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	736	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				737	/// 256-bit vectors of [8 x float].
				738	///
				739	/// \headerfile <x86intrin.h>
				740	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	741	/// This intrinsic corresponds to the \c VHSUBPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	742	///
				743	/// \param __a
				744	/// A 256-bit vector of [8 x float] containing one of the source operands.
				745	/// The horizontal differences between the values are returned in the
				746	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				747	/// \param __b
				748	/// A 256-bit vector of [8 x float] containing one of the source operands.
				749	/// The horizontal differences between the values are returned in the
				750	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				751	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				752	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	753	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	754	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	755	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	756	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); /* operands are handed to the builtin as __v8sf */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	757	}
				758
				759	/* Vector permutations */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	760	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				761	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	762	///
				763	/// \headerfile <x86intrin.h>
				764	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	765	/// This intrinsic corresponds to the \c VPERMILPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	766	///
				767	/// \param __a
				768	/// A 128-bit vector of [2 x double].
				769	/// \param __c
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	770	/// A 128-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	771	/// copied. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	772	/// Bit [1]: \n
				773	/// \li 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	774	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	775	/// \li 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	776	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	777	/// Bit [65]: \n
				778	/// \li 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	779	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	780	/// \li 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	781	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	782	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	783	static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	784	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	785	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	786	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); /* data goes in as __v2df, the control vector as __v2di */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	787	}
				788
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	789	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	790	/// specified by the 256-bit integer vector operand.
				791	///
				792	/// \headerfile <x86intrin.h>
				793	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	794	/// This intrinsic corresponds to the \c VPERMILPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	795	///
				796	/// \param __a
				797	/// A 256-bit vector of [4 x double].
				798	/// \param __c
				799	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	800	/// copied. \n
				801	/// Bit [1]: \n
				802	/// \li 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	803	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	804	/// \li 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	805	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	806	/// Bit [65]: \n
				807	/// \li 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	808	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	809	/// \li 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	810	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	811	/// Bit [129]: \n
				812	/// \li 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	813	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	814	/// \li 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	815	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	816	/// Bit [193]: \n
				817	/// \li 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	818	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	819	/// \li 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	820	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	821	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	822	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	823	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	824	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	825	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); /* data goes in as __v4df, the control vector as __v4di */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	826	}
				827
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	828	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				829	/// specified by the 128-bit integer vector operand.
				830	///
				831	/// \headerfile <x86intrin.h>
				832	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	833	/// This intrinsic corresponds to the \c VPERMILPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	834	///
				835	/// \param __a
				836	/// A 128-bit vector of [4 x float].
				837	/// \param __c
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	838	/// <ul>
				839	/// <li> A 128-bit integer vector operand specifying how the values are to be
				840	/// copied.
				841	/// </li>
				842	/// <li> Bits [1:0]:
				843	/// <ul>
				844	/// <li> 00: Bits [31:0] of the source are copied to bits [31:0] of the
				845	/// returned vector. </li>
				846	/// <li> 01: Bits [63:32] of the source are copied to bits [31:0] of the
				847	/// returned vector. </li>
				848	/// <li> 10: Bits [95:64] of the source are copied to bits [31:0] of the
				849	/// returned vector. </li>
				850	/// <li> 11: Bits [127:96] of the source are copied to bits [31:0] of the
				851	/// returned vector. </li>
				852	/// </ul>
				853	/// </li>
				854	/// <li> Bits [33:32]:
				855	/// <ul>
				856	/// <li> 00: Bits [31:0] of the source are copied to bits [63:32] of the
				857	/// returned vector. </li>
				858	/// <li> 01: Bits [63:32] of the source are copied to bits [63:32] of the
				859	/// returned vector. </li>
				860	/// <li> 10: Bits [95:64] of the source are copied to bits [63:32] of the
				861	/// returned vector. </li>
				862	/// <li> 11: Bits [127:96] of the source are copied to bits [63:32] of the
				863	/// returned vector. </li>
				864	/// </ul>
				865	/// </li>
				866	/// <li> Bits [65:64]:
				867	/// <ul>
				868	/// <li> 00: Bits [31:0] of the source are copied to bits [95:64] of the
				869	/// returned vector. </li>
				870	/// <li> 01: Bits [63:32] of the source are copied to bits [95:64] of the
				871	/// returned vector. </li>
				872	/// <li> 10: Bits [95:64] of the source are copied to bits [95:64] of the
				873	/// returned vector. </li>
				874	/// <li> 11: Bits [127:96] of the source are copied to bits [95:64] of the
				875	/// returned vector. </li>
				876	/// </ul>
				877	/// </li>
				878	/// <li> Bits [97:96]:
				879	/// <ul>
				880	/// <li> 00: Bits [31:0] of the source are copied to bits [127:96] of the
				881	/// returned vector. </li>
				882	/// <li> 01: Bits [63:32] of the source are copied to bits [127:96] of the
				883	/// returned vector. </li>
				884	/// <li> 10: Bits [95:64] of the source are copied to bits [127:96] of the
				885	/// returned vector. </li>
				886	/// <li> 11: Bits [127:96] of the source are copied to bits [127:96] of the
				887	/// returned vector. </li>
				890	/// </ul>
				891	/// </li>
				892	/// </ul>
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	893	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	894	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	895	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	896	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	897	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); /* data goes in as __v4sf, the control vector as __v4si */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	898	}
				899
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	900	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				901	/// specified by the 256-bit integer vector operand.
				902	///
				903	/// \headerfile <x86intrin.h>
				904	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	905	/// This intrinsic corresponds to the \c VPERMILPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	906	///
				907	/// \param __a
				908	/// A 256-bit vector of [8 x float].
				909	/// \param __c
				910	/// A 256-bit integer vector operand specifying how the values are to be
				911	/// copied.
				912	/// Bits [1:0]:
				913	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	914	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	915	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	916	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	917	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	918	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	919	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	920	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	921	/// Bits [33:32]:
				922	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	923	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	924	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	925	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	926	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	927	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	928	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	929	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	930	/// Bits [65:64]:
				931	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	932	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	933	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	934	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	935	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	936	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	937	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	938	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	939	/// Bits [97:96]:
				940	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	941	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	942	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	943	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	944	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	945	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	946	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	947	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	948	/// Bits [129:128]:
				949	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	950	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	951	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	952	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	953	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	954	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	955	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	956	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	957	/// Bits [161:160]:
				958	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	959	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	960	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	961	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	962	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	963	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	964	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	965	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	966	/// Bits [193:192]:
				967	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	968	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	969	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	970	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	971	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	972	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	973	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	974	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	975	/// Bits [225:224]:
				976	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	977	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	978	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	979	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	980	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	981	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	982	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	983	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	984	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	985	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	986	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	987	{
/* Thin wrapper: __a is cast to __v8sf and the control vector __c to __v8si,
   both are passed to the vpermilvarps256 builtin, and the builtin's result
   is cast back to __m256. No other work is done here. */
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	988	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	989	}
				990
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	991	/// \brief Copies the values in a 128-bit vector of [2 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	992	/// specified by the immediate integer operand.
				993	///
				994	/// \headerfile <x86intrin.h>
				995	///
				996	/// \code
				997	/// __m128d _mm_permute_pd(__m128d A, const int C);
				998	/// \endcode
				999	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1000	/// This intrinsic corresponds to the \c VPERMILPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1001	///
				1002	/// \param A
				1003	/// A 128-bit vector of [2 x double].
				1004	/// \param C
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1005	/// \parblock
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1006	/// An immediate integer operand specifying how the values are to be copied.
				1007	/// Bit [0]:
				1008	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1009	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1010	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1011	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1012	/// Bit [1]:
				1013	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1014	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1015	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1016	/// returned vector.
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1017	/// \endparblock
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1018	/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Implementation note: bit 0 of C becomes the shuffle index for result
   element 0 and bit 1 the index for result element 1. Both indices are
   masked to 0..1, so every result element is read from A; the
   _mm_undefined_pd() operand only fills the builtin's second vector slot
   and is never selected. C is expanded once per index expression. */
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1019	#define _mm_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1020	(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1021	(__v2df)_mm_undefined_pd(), \
				1022	((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1023
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1024	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1025	/// specified by the immediate integer operand.
				1026	///
				1027	/// \headerfile <x86intrin.h>
				1028	///
				1029	/// \code
				1030	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1031	/// \endcode
				1032	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1033	/// This intrinsic corresponds to the \c VPERMILPD instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1034	///
				1035	/// \param A
				1036	/// A 256-bit vector of [4 x double].
				1037	/// \param C
				1038	/// An immediate integer operand specifying how the values are to be copied.
				1039	/// Bit [0]:
				1040	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1041	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1042	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1043	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1044	/// Bit [1]:
				1045	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1046	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1047	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1048	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1049	/// Bit [2]:
				1050	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1051	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1052	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1053	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1054	/// Bit [3]:
				1055	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1056	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1057	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1058	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1059	/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: one bit of C selects each result element. Bits 0
   and 1 yield index 0 or 1 (the "0 +" rows) for result elements 0 and 1;
   bits 2 and 3 yield index 2 or 3 (the "2 +" rows) for result elements 2
   and 3, so a selection never crosses between the lower and upper 128-bit
   halves. All indices stay below 4, hence only A is read and the
   _mm256_undefined_pd() operand is a placeholder that is never selected. */
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1060	#define _mm256_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1061	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1062	(__v4df)_mm256_undefined_pd(), \
				1063	0 + (((C) >> 0) & 0x1), \
				1064	0 + (((C) >> 1) & 0x1), \
				1065	2 + (((C) >> 2) & 0x1), \
				1066	2 + (((C) >> 3) & 0x1)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1067
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1068	/// \brief Copies the values in a 128-bit vector of [4 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1069	/// specified by the immediate integer operand.
				1070	///
				1071	/// \headerfile <x86intrin.h>
				1072	///
				1073	/// \code
				1074	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1075	/// \endcode
				1076	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1077	/// This intrinsic corresponds to the \c VPERMILPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1078	///
				1079	/// \param A
				1080	/// A 128-bit vector of [4 x float].
				1081	/// \param C
				1082	/// An immediate integer operand specifying how the values are to be copied.
				1083	/// Bits [1:0]:
				1084	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1085	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1086	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1087	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1088	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1089	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1090	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1091	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1092	/// Bits [3:2]:
				1093	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1094	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1095	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1096	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1097	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1098	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1099	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1100	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1101	/// Bits [5:4]:
				1102	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1103	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1104	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1105	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1106	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1107	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1108	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1109	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1110	/// Bits [7:6]:
				1111	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1112	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1113	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1114	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1115	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1116	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1117	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1118	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1119	/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Implementation note: C is split into four 2-bit fields (bits [1:0],
   [3:2], [5:4], [7:6]); field i is used directly as the shuffle index for
   result element i. Each index is masked to 0..3, so every result element
   is read from A and the _mm_undefined_ps() operand is never selected. */
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1120	#define _mm_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1121	(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1122	(__v4sf)_mm_undefined_ps(), \
				1123	((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
				1124	((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1125
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1126	/// \brief Copies the values in a 256-bit vector of [8 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1127	/// specified by the immediate integer operand.
				1128	///
				1129	/// \headerfile <x86intrin.h>
				1130	///
				1131	/// \code
				1132	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1133	/// \endcode
				1134	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1135	/// This intrinsic corresponds to the \c VPERMILPS instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1136	///
				1137	/// \param A
				1138	/// A 256-bit vector of [8 x float].
				1139	/// \param C
				1140	/// An immediate integer operand specifying how the values are to be copied.
				1141	/// Bits [1:0]:
				1142	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1143	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1144	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1145	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1146	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1147	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1148	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1149	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1150	/// Bits [3:2]:
				1151	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1152	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1153	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1154	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1155	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1156	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1157	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1158	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1159	/// Bits [5:4]:
				1160	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1161	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1162	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1163	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1164	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1165	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1166	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1167	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1168	/// Bits [7:6]:
				1169	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1170	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1171	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1172	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1173	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1174	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1175	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1176	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1177	/// Bits [1:0]:
				1178	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1179	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1180	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1181	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1182	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1183	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1184	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1185	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1186	/// Bits [3:2]:
				1187	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1188	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1189	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1190	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1191	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1192	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1193	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1194	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1195	/// Bits [5:4]:
				1196	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1197	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1198	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1199	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1200	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1201	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1202	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1203	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1204	/// Bits [7:6]:
				1205	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1206	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1207	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1208	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1209	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1210	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1211	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1212	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1213	/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: the four 2-bit fields of C (bits [1:0], [3:2],
   [5:4], [7:6]) are each used twice. The "0 +" rows turn them into indices
   0..3 for result elements 0-3; the "4 +" rows reuse the same fields as
   indices 4..7 for result elements 4-7. The same selector therefore applies
   within the lower and the upper 128-bit half, and a selection never crosses
   halves. All indices stay below 8, so only A is read and the
   _mm256_undefined_ps() operand is never selected. */
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1214	#define _mm256_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1215	(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1216	(__v8sf)_mm256_undefined_ps(), \
				1217	0 + (((C) >> 0) & 0x3), \
				1218	0 + (((C) >> 2) & 0x3), \
				1219	0 + (((C) >> 4) & 0x3), \
				1220	0 + (((C) >> 6) & 0x3), \
				1221	4 + (((C) >> 0) & 0x3), \
				1222	4 + (((C) >> 2) & 0x3), \
				1223	4 + (((C) >> 4) & 0x3), \
				1224	4 + (((C) >> 6) & 0x3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1225
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1226	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1227	/// [4 x double], as specified by the immediate integer operand.
				1228	///
				1229	/// \headerfile <x86intrin.h>
				1230	///
				1231	/// \code
				1232	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1233	/// \endcode
				1234	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1235	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1236	///
				1237	/// \param V1
				1238	/// A 256-bit vector of [4 x double].
				1239	/// \param V2
				1240	/// A 256-bit vector of [4 x double].
				1241	/// \param M
				1242	/// An immediate integer operand specifying how the values are to be
				1243	/// permuted.
				1244	/// Bits [1:0]:
				1245	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1246	/// destination.
				1247	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1248	/// destination.
				1249	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1250	/// destination.
				1251	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1252	/// destination.
				1253	/// Bits [5:4]:
				1254	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1255	/// destination.
				1256	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1257	/// destination.
				1258	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1259	/// destination.
				1260	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1261	/// destination.
				1262	/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Implementation note: V1 and V2 are cast to __v4df and handed, together
   with M unchanged, to the vperm2f128_pd256 builtin; the builtin's result
   is cast back to __m256d. The macro does no masking or decoding of M. */
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1263	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1264	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
				1265	(__v4df)(__m256d)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1266
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1267	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1268	/// [8 x float], as specified by the immediate integer operand.
				1269	///
				1270	/// \headerfile <x86intrin.h>
				1271	///
				1272	/// \code
				1273	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1274	/// \endcode
				1275	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1276	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1277	///
				1278	/// \param V1
				1279	/// A 256-bit vector of [8 x float].
				1280	/// \param V2
				1281	/// A 256-bit vector of [8 x float].
				1282	/// \param M
				1283	/// An immediate integer operand specifying how the values are to be
				1284	/// permuted.
				1285	/// Bits [1:0]:
				1286	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1287	/// destination.
				1288	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1289	/// destination.
				1290	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1291	/// destination.
				1292	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1293	/// destination.
				1294	/// Bits [5:4]:
				1295	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1296	/// destination.
				1297	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1298	/// destination.
				1299	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1300	/// destination.
				1301	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1302	/// destination.
				1303	/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Implementation note: V1 and V2 are cast to __v8sf and handed, together
   with M unchanged, to the vperm2f128_ps256 builtin; the builtin's result
   is cast back to __m256. The macro does no masking or decoding of M. */
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1304	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1305	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
				1306	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1307
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1308	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
				1309	/// as specified by the immediate integer operand.
				1310	///
				1311	/// \headerfile <x86intrin.h>
				1312	///
				1313	/// \code
				1314	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1315	/// \endcode
				1316	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1317	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1318	///
				1319	/// \param V1
				1320	/// A 256-bit integer vector.
				1321	/// \param V2
				1322	/// A 256-bit integer vector.
				1323	/// \param M
				1324	/// An immediate integer operand specifying how the values are to be copied.
				1325	/// Bits [1:0]:
				1326	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1327	/// destination.
				1328	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1329	/// destination.
				1330	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1331	/// destination.
				1332	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1333	/// destination.
				1334	/// Bits [5:4]:
				1335	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1336	/// destination.
				1337	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1338	/// destination.
				1339	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1340	/// destination.
				1341	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
				1342	/// destination.
				1343	/// \returns A 256-bit integer vector containing the copied values.
					/* Implementation note: a macro rather than a function; V1 and V2 are cast
					 * to __v8si and M is forwarded unchanged to the builtin.
					 * NOTE(review): bits [3] and [7] of M are not described above; check the
					 * VPERM2F128 instruction reference for their effect and document them. */
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1344	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1345	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
				1346	(__v8si)(__m256i)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1347
				1348	/* Vector Blend */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1349	/// \brief Merges 64-bit double-precision data values stored in either of the
				1350	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1351	/// integer operand.
				1352	///
				1353	/// \headerfile <x86intrin.h>
				1354	///
				1355	/// \code
				1356	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1357	/// \endcode
				1358	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1359	/// This intrinsic corresponds to the \c VBLENDPD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1360	///
				1361	/// \param V1
				1362	/// A 256-bit vector of [4 x double].
				1363	/// \param V2
				1364	/// A 256-bit vector of [4 x double].
				1365	/// \param M
				1366	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1367	/// values are to be copied. The position of the mask bit corresponds to the
				1368	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				1369	/// element in operand \a V1 is copied to the same position in the destination.
				1370	/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 is
				1371	/// copied to the same position in the destination.
				1372	/// \returns A 256-bit vector of [4 x double] containing the copied values.
					/* Implementation note: expands to a single __builtin_shufflevector call
					 * whose i-th index is 4+i when bit i of M is set and i otherwise.
					 * M is substituted four times in the expansion. */
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1373	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1374	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				1375	(__v4df)(__m256d)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1376	(((M) & 0x01) ? 4 : 0), \
				1377	(((M) & 0x02) ? 5 : 1), \
				1378	(((M) & 0x04) ? 6 : 2), \
				1379	(((M) & 0x08) ? 7 : 3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1380
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1381	/// \brief Merges 32-bit single-precision data values stored in either of the
				1382	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1383	/// integer operand.
				1384	///
				1385	/// \headerfile <x86intrin.h>
				1386	///
				1387	/// \code
				1388	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1389	/// \endcode
				1390	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1391	/// This intrinsic corresponds to the \c VBLENDPS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1392	///
				1393	/// \param V1
				1394	/// A 256-bit vector of [8 x float].
				1395	/// \param V2
				1396	/// A 256-bit vector of [8 x float].
				1397	/// \param M
				1398	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1399	/// values are to be copied. The position of the mask bit corresponds to the
				1400	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				1401	/// element in operand \a V1 is copied to the same position in the destination.
				1402	/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 is
				1403	/// copied to the same position in the destination.
				1404	/// \returns A 256-bit vector of [8 x float] containing the copied values.
					/* Implementation note: expands to a single __builtin_shufflevector call
					 * whose i-th index is 8+i when bit i of M is set and i otherwise.
					 * M is substituted eight times in the expansion. */
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1405	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1406	(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				1407	(__v8sf)(__m256)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1408	(((M) & 0x01) ? 8 : 0), \
				1409	(((M) & 0x02) ? 9 : 1), \
				1410	(((M) & 0x04) ? 10 : 2), \
				1411	(((M) & 0x08) ? 11 : 3), \
				1412	(((M) & 0x10) ? 12 : 4), \
				1413	(((M) & 0x20) ? 13 : 5), \
				1414	(((M) & 0x40) ? 14 : 6), \
				1415	(((M) & 0x80) ? 15 : 7)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1416
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1417	/// \brief Merges 64-bit double-precision data values stored in either of the
				1418	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1419	/// operand.
				1420	///
				1421	/// \headerfile <x86intrin.h>
				1422	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1423	/// This intrinsic corresponds to the \c VBLENDVPD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1424	///
				1425	/// \param __a
				1426	/// A 256-bit vector of [4 x double].
				1427	/// \param __b
				1428	/// A 256-bit vector of [4 x double].
				1429	/// \param __c
				1430	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1431	/// how the values are to be copied. The position of the mask bit corresponds
				1432	/// to the most significant bit of a copied value. When a mask bit is 0, the
				1433	/// corresponding 64-bit element in operand \a __a is copied to the same
				1434	/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1435	/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1436	/// destination.
				1437	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1438	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1439	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1440	{
					/* All three operands, including the mask, are passed as __v4df. */
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1441	return (__m256d)__builtin_ia32_blendvpd256(
				1442	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1443	}
				1444
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1445	/// \brief Merges 32-bit single-precision data values stored in either of the
				1446	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1447	/// operand.
				1448	///
				1449	/// \headerfile <x86intrin.h>
				1450	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1451	/// This intrinsic corresponds to the \c VBLENDVPS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1452	///
				1453	/// \param __a
				1454	/// A 256-bit vector of [8 x float].
				1455	/// \param __b
				1456	/// A 256-bit vector of [8 x float].
				1457	/// \param __c
				1458	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1459	/// and 31 specifying how the values are to be copied. The position of the
				1460	/// mask bit corresponds to the most significant bit of a copied value. When
				1461	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
				1462	/// copied to the same position in the destination. When a mask bit is 1, the
				1463	/// corresponding 32-bit element in operand \a __b is copied to the same
				1464	/// position in the destination.
				1465	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1466	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1467	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1468	{
					/* All three operands, including the mask, are passed as __v8sf. */
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1469	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1470	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1471	}
				1472
				1473	/* Vector Dot Product */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1474	/// \brief Computes two dot products in parallel, using the lower and upper
				1475	/// halves of two [8 x float] vectors as input to the two computations, and
				1476	/// returning the two dot products in the lower and upper halves of the
				1477	/// [8 x float] result. The immediate integer operand controls which
				1478	/// input elements will contribute to the dot product, and where the final
				1479	/// results are returned. In general, for each dot product, the four
				1480	/// corresponding elements of the input vectors are multiplied; the first
				1481	/// two and second two products are summed, then the two sums are added to
				1482	/// form the final result.
				1483	///
				1484	/// \headerfile <x86intrin.h>
				1485	///
				1486	/// \code
				1487	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1488	/// \endcode
				1489	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1490	/// This intrinsic corresponds to the \c VDPPS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1491	///
				1492	/// \param V1
				1493	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1494	/// \param V2
				1495	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1496	/// \param M
				1497	/// An immediate integer argument. Bits [7:4] determine which elements of
				1498	/// the input vectors are used, with bit [4] corresponding to the lowest
				1499	/// element and bit [7] corresponding to the highest element of each [4 x
				1500	/// float] subvector. If a bit is set, the corresponding elements from the
				1501	/// two input vectors are used as an input for dot product; otherwise that
				1502	/// input is treated as zero. Bits [3:0] determine which elements of the
				1503	/// result will receive a copy of the final dot product, with bit [0]
				1504	/// corresponding to the lowest element and bit [3] corresponding to the
				1505	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1506	/// product is returned in the corresponding element; otherwise that element
				1507	/// is set to zero. The bitmask is applied in the same way to each of the
				1508	/// two parallel dot product computations.
				1509	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
					/* Implementation note: a macro rather than a function; V1 and V2 are cast
					 * to __v8sf and M is forwarded unchanged to __builtin_ia32_dpps256. */
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1510	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1511	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
				1512	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1513
				1514	/* Vector shuffle */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1515	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
				1516	/// specified by the immediate value operand. The four selected elements in
				1517	/// each operand are copied to the destination according to the bits
				1518	/// specified in the immediate operand. The selected elements from the first
				1519	/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
				1520	/// destination, and the selected elements from the second 256-bit operand
				1521	/// are copied to bits [127:64] and bits [255:192] of the destination. For
				1522	/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
				1523	/// the 256-bit destination vector would contain the following values: b[7],
				1524	/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
				1525	///
				1526	/// \headerfile <x86intrin.h>
				1527	///
				1528	/// \code
				1529	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1530	/// \endcode
				1531	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1532	/// This intrinsic corresponds to the \c VSHUFPS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1533	///
				1534	/// \param a
				1535	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1536	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1537	/// according to the bits specified in the immediate operand.
				1538	/// \param b
				1539	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1540	/// operand are copied to bits [127:64] and bits [255:192] in the
				1541	/// destination, according to the bits specified in the immediate operand.
				1542	/// \param mask
				1543	/// An immediate value containing an 8-bit value specifying which elements to
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1544	/// copy from \a a and \a b.
				1545	/// Bits [3:0] specify the values copied from operand \a a.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1546	/// Bits [7:4] specify the values copied from operand \a b.
				1547	/// The destinations within the 256-bit destination are assigned values as
				1548	/// follows, according to the bit value assignments described below:
				1549	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
				1550	/// destination.
				1551	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
				1552	/// destination.
				1553	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
				1554	/// destination.
				1555	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
				1556	/// the destination.
				1557	/// Bit value assignments:
				1558	/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
				1559	/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
				1560	/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
				1561	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1562	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
					/* Implementation note: expands to a single __builtin_shufflevector call.
					 * The 2-bit fields [1:0], [3:2], [5:4] and [7:6] of mask are added to the
					 * base indices 0, 0, 8, 8 for the first four result elements and to
					 * 4, 4, 12, 12 for the last four. mask is substituted eight times in the
					 * expansion. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1563	#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1564	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1565	(__v8sf)(__m256)(b), \
				1566	0 + (((mask) >> 0) & 0x3), \
				1567	0 + (((mask) >> 2) & 0x3), \
				1568	8 + (((mask) >> 4) & 0x3), \
				1569	8 + (((mask) >> 6) & 0x3), \
				1570	4 + (((mask) >> 0) & 0x3), \
				1571	4 + (((mask) >> 2) & 0x3), \
				1572	12 + (((mask) >> 4) & 0x3), \
				1573	12 + (((mask) >> 6) & 0x3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1574
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1575	/// \brief Selects four double-precision values from the 256-bit operands of
				1576	/// [4 x double], as specified by the immediate value operand. The selected
				1577	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1578	/// bits [191:128] in the destination, and the selected elements from the
				1579	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
				1580	/// the destination. For example, if bits [3:0] of the immediate operand
				1581	/// contain a value of 0xF, the 256-bit destination vector would contain the
				1582	/// following values: b[3], a[3], b[1], a[1].
				1583	///
				1584	/// \headerfile <x86intrin.h>
				1585	///
				1586	/// \code
				1587	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1588	/// \endcode
				1589	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1590	/// This intrinsic corresponds to the \c VSHUFPD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1591	///
				1592	/// \param a
				1593	/// A 256-bit vector of [4 x double].
				1594	/// \param b
				1595	/// A 256-bit vector of [4 x double].
				1596	/// \param mask
				1597	/// An immediate value whose bits [3:0] specify which elements to
				1598	/// copy from \a a and \a b:
				1599	/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
				1600	/// destination.
				1601	/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
				1602	/// destination.
				1603	/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
				1604	/// destination.
				1605	/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
				1606	/// destination.
				1607	/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
				1608	/// destination.
				1609	/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
				1610	/// destination.
				1611	/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
				1612	/// destination.
				1613	/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
				1614	/// destination.
				1615	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
					/* Implementation note: expands to a single __builtin_shufflevector call.
					 * Bits [0], [1], [2] and [3] of mask are added to the base indices
					 * 0, 4, 2 and 6 respectively; higher bits of mask are not read.
					 * mask is substituted four times in the expansion. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1616	#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1617	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1618	(__v4df)(__m256d)(b), \
				1619	0 + (((mask) >> 0) & 0x1), \
				1620	4 + (((mask) >> 1) & 0x1), \
				1621	2 + (((mask) >> 2) & 0x1), \
				1622	6 + (((mask) >> 3) & 0x1)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1623
				1624	/* Compare
					 *
					 * Predicate codes for the immediate operand of the _mm_cmp_* and
					 * _mm256_cmp_* macros below (bits [4:0]). As spelled out in the
					 * per-constant comments, the suffix letters mean:
					 *   O / U - ordered / unordered
					 *   S / Q - signaling / non-signaling
					 * Each value in 0x10-0x1f names the same predicate as the value 0x10
					 * below it, with the signaling behavior inverted.
					 */
				1625	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1626	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1627	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1628	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1629	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1630	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1631	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
				1632	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
				1633	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
				1634	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
				1635	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1636	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1637	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1638	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1639	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1640	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1641	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1642	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1643	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1644	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1645	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1646	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
				1647	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
				1648	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1649	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
				1650	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
				1651	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1652	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1653	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1654	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1655	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1656	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1657
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1658	/// \brief Compares each of the corresponding double-precision values of two
				1659	/// 128-bit vectors of [2 x double], using the operation specified by the
				1660	/// immediate integer operand. Returns a [2 x double] vector consisting of
				1661	/// two doubles corresponding to the two comparison results: zero if the
				1662	/// comparison is false, and all 1's if the comparison is true.
				1663	///
				1664	/// \headerfile <x86intrin.h>
				1665	///
				1666	/// \code
				1667	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1668	/// \endcode
				1669	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1670	/// This intrinsic corresponds to the \c VCMPPD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1671	///
				1672	/// \param a
				1673	/// A 128-bit vector of [2 x double].
				1674	/// \param b
				1675	/// A 128-bit vector of [2 x double].
				1676	/// \param c
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1677	/// \parblock
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1678	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1679	/// operation to use (see the _CMP_* constants). The two values on each line
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1680	/// differ only in whether the comparison is signaling:
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1681	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1682	/// 00h, 10h: Equal (ordered) \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1683	/// 01h, 11h: Less-than (ordered) \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1684	/// 02h, 12h: Less-than-or-equal (ordered) \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1685	/// 03h, 13h: Unordered \n
				1686	/// 04h, 14h: Not-equal (unordered) \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1687	/// 05h, 15h: Not-less-than (unordered) \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1688	/// 06h, 16h: Not-less-than-or-equal (unordered) \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1689	/// 07h, 17h: Ordered \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1690	/// 08h, 18h: Equal (unordered) \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1691	/// 09h, 19h: Not-greater-than-or-equal (unordered) \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1692	/// 0Ah, 1Ah: Not-greater-than (unordered) \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1693	/// 0Bh, 1Bh: False (ordered) \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1694	/// 0Ch, 1Ch: Not-equal (ordered) \n
				1695	/// 0Dh, 1Dh: Greater-than-or-equal (ordered) \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1696	/// 0Eh, 1Eh: Greater-than (ordered) \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1697	/// 0Fh, 1Fh: True (unordered)
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1698	/// \endparblock
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1699	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
					/* Implementation note: a macro rather than a function; a and b are cast
					 * to __v2df and c is forwarded unchanged to __builtin_ia32_cmppd. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1700	#define _mm_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1701	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
				1702	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1703
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1704	/// \brief Compares each of the corresponding values of two 128-bit vectors of
				1705	/// [4 x float], using the operation specified by the immediate integer
				1706	/// operand. Returns a [4 x float] vector consisting of four floats
				1707	/// corresponding to the four comparison results: zero if the comparison is
				1708	/// false, and all 1's if the comparison is true.
				1709	///
				1710	/// \headerfile <x86intrin.h>
				1711	///
				1712	/// \code
				1713	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1714	/// \endcode
				1715	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1716	/// This intrinsic corresponds to the \c VCMPPS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1717	///
				1718	/// \param a
				1719	/// A 128-bit vector of [4 x float].
				1720	/// \param b
				1721	/// A 128-bit vector of [4 x float].
				1722	/// \param c
				1723	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1724	/// operation to use (see the _CMP_* constants). The two values on each line
				1725	/// differ only in whether the comparison is signaling: \n
				1726	/// 00h, 10h: Equal (ordered) \n
				1727	/// 01h, 11h: Less-than (ordered) \n
				1728	/// 02h, 12h: Less-than-or-equal (ordered) \n
				1729	/// 03h, 13h: Unordered \n
				1730	/// 04h, 14h: Not-equal (unordered) \n
				1731	/// 05h, 15h: Not-less-than (unordered) \n
				1732	/// 06h, 16h: Not-less-than-or-equal (unordered) \n
				1733	/// 07h, 17h: Ordered \n
				1734	/// 08h, 18h: Equal (unordered) \n
					/// 09h, 19h: Not-greater-than-or-equal (unordered) \n
					/// 0Ah, 1Ah: Not-greater-than (unordered) \n
					/// 0Bh, 1Bh: False (ordered) \n
					/// 0Ch, 1Ch: Not-equal (ordered) \n
					/// 0Dh, 1Dh: Greater-than-or-equal (ordered) \n
					/// 0Eh, 1Eh: Greater-than (ordered) \n
					/// 0Fh, 1Fh: True (unordered)
				1735	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
					/* Implementation note: a macro rather than a function; a and b are cast
					 * to __v4sf and c is forwarded unchanged to __builtin_ia32_cmpps. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1736	#define _mm_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1737	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
				1738	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1739
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1740	/// \brief Compares each of the corresponding double-precision values of two
				1741	/// 256-bit vectors of [4 x double], using the operation specified by the
				1742	/// immediate integer operand. Returns a [4 x double] vector consisting of
				1743	/// four doubles corresponding to the four comparison results: zero if the
				1744	/// comparison is false, and all 1's if the comparison is true.
				1745	///
				1746	/// \headerfile <x86intrin.h>
				1747	///
				1748	/// \code
				1749	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1750	/// \endcode
				1751	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1752	/// This intrinsic corresponds to the \c VCMPPD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1753	///
				1754	/// \param a
				1755	/// A 256-bit vector of [4 x double].
				1756	/// \param b
				1757	/// A 256-bit vector of [4 x double].
				1758	/// \param c
				1759	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1760	/// operation to use (see the _CMP_* constants). The two values on each line
				1761	/// differ only in whether the comparison is signaling: \n
				1762	/// 00h, 10h: Equal (ordered) \n
				1763	/// 01h, 11h: Less-than (ordered) \n
				1764	/// 02h, 12h: Less-than-or-equal (ordered) \n
				1765	/// 03h, 13h: Unordered \n
				1766	/// 04h, 14h: Not-equal (unordered) \n
				1767	/// 05h, 15h: Not-less-than (unordered) \n
				1768	/// 06h, 16h: Not-less-than-or-equal (unordered) \n
				1769	/// 07h, 17h: Ordered \n
				1770	/// 08h, 18h: Equal (unordered) \n
					/// 09h, 19h: Not-greater-than-or-equal (unordered) \n
					/// 0Ah, 1Ah: Not-greater-than (unordered) \n
					/// 0Bh, 1Bh: False (ordered) \n
					/// 0Ch, 1Ch: Not-equal (ordered) \n
					/// 0Dh, 1Dh: Greater-than-or-equal (ordered) \n
					/// 0Eh, 1Eh: Greater-than (ordered) \n
					/// 0Fh, 1Fh: True (unordered)
				1771	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
					/* Implementation note: a macro rather than a function; a and b are cast
					 * to __v4df and c is forwarded unchanged to __builtin_ia32_cmppd256. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1772	#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1773	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
				1774	(__v4df)(__m256d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1775
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1776	/// \brief Compares each of the corresponding values of two 256-bit vectors of
				1777	/// [8 x float], using the operation specified by the immediate integer
				1778	/// operand. Returns a [8 x float] vector consisting of eight floats
				1779	/// corresponding to the eight comparison results: zero if the comparison is
				1780	/// false, and all 1's if the comparison is true.
				1781	///
				1782	/// \headerfile <x86intrin.h>
				1783	///
				1784	/// \code
				1785	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1786	/// \endcode
				1787	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1788	/// This intrinsic corresponds to the \c VCMPPS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1789	///
				1790	/// \param a
				1791	/// A 256-bit vector of [8 x float].
				1792	/// \param b
				1793	/// A 256-bit vector of [8 x float].
				1794	/// \param c
				1795	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1796	/// operation to use (see the _CMP_* constants). The two values on each line
				1797	/// differ only in whether the comparison is signaling: \n
				1798	/// 00h, 10h: Equal (ordered) \n
				1799	/// 01h, 11h: Less-than (ordered) \n
				1800	/// 02h, 12h: Less-than-or-equal (ordered) \n
				1801	/// 03h, 13h: Unordered \n
				1802	/// 04h, 14h: Not-equal (unordered) \n
				1803	/// 05h, 15h: Not-less-than (unordered) \n
				1804	/// 06h, 16h: Not-less-than-or-equal (unordered) \n
				1805	/// 07h, 17h: Ordered \n
				1806	/// 08h, 18h: Equal (unordered) \n
					/// 09h, 19h: Not-greater-than-or-equal (unordered) \n
					/// 0Ah, 1Ah: Not-greater-than (unordered) \n
					/// 0Bh, 1Bh: False (ordered) \n
					/// 0Ch, 1Ch: Not-equal (ordered) \n
					/// 0Dh, 1Dh: Greater-than-or-equal (ordered) \n
					/// 0Eh, 1Eh: Greater-than (ordered) \n
					/// 0Fh, 1Fh: True (unordered)
				1807	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
					/* Implementation note: a macro rather than a function; a and b are cast
					 * to __v8sf and c is forwarded unchanged to __builtin_ia32_cmpps256. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1808	#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1809	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
				1810	(__v8sf)(__m256)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1811
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1812	/// \brief Compares each of the corresponding scalar double-precision values of
				1813	/// two 128-bit vectors of [2 x double], using the operation specified by the
				1814	/// immediate integer operand. If the result is true, all 64 bits of the
				1815	/// destination vector are set; otherwise they are cleared.
				1816	///
				1817	/// \headerfile <x86intrin.h>
				1818	///
				1819	/// \code
				1820	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1821	/// \endcode
				1822	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1823	/// This intrinsic corresponds to the \c VCMPSD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1824	///
				1825	/// \param a
				1826	/// A 128-bit vector of [2 x double].
				1827	/// \param b
				1828	/// A 128-bit vector of [2 x double].
				1829	/// \param c
				1830	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1831	/// operation to use (see the _CMP_* constants). The two values on each line
				1832	/// differ only in whether the comparison is signaling: \n
				1833	/// 00h, 10h: Equal (ordered) \n
				1834	/// 01h, 11h: Less-than (ordered) \n
				1835	/// 02h, 12h: Less-than-or-equal (ordered) \n
				1836	/// 03h, 13h: Unordered \n
				1837	/// 04h, 14h: Not-equal (unordered) \n
				1838	/// 05h, 15h: Not-less-than (unordered) \n
				1839	/// 06h, 16h: Not-less-than-or-equal (unordered) \n
				1840	/// 07h, 17h: Ordered \n
				1841	/// 08h, 18h: Equal (unordered) \n
					/// 09h, 19h: Not-greater-than-or-equal (unordered) \n
					/// 0Ah, 1Ah: Not-greater-than (unordered) \n
					/// 0Bh, 1Bh: False (ordered) \n
					/// 0Ch, 1Ch: Not-equal (ordered) \n
					/// 0Dh, 1Dh: Greater-than-or-equal (ordered) \n
					/// 0Eh, 1Eh: Greater-than (ordered) \n
					/// 0Fh, 1Fh: True (unordered)
				1842	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
					/* Implementation note: a macro rather than a function; a and b are cast
					 * to __v2df and c is forwarded unchanged to __builtin_ia32_cmpsd. */
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1843	#define _mm_cmp_sd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1844	(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
				1845	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1846
/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand. If the result is true, all 32 bits of the destination
///    vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSS instruction.
///
/// The macro expands to a single parenthesized expression, so it needs no
/// statement-expression support at the point of use.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1881
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1882	/// \brief Takes a [8 x i32] vector and returns the vector element value
				1883	/// indexed by the immediate constant operand.
				1884	///
				1885	/// \headerfile <x86intrin.h>
				1886	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1887	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1888	///
				1889	/// \param __a
				1890	/// A 256-bit vector of [8 x i32].
				1891	/// \param __imm
				1892	/// An immediate integer operand with bits [2:0] determining which vector
				1893	/// element is extracted and returned.
				1894	/// \returns A 32-bit integer containing the extracted 32 bits of extended
				1895	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1896	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1897	_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1898	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1899	__v8si __b = (__v8si)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1900	return __b[__imm & 7];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1901	}
				1902
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1903	/// \brief Takes a [16 x i16] vector and returns the vector element value
				1904	/// indexed by the immediate constant operand.
				1905	///
				1906	/// \headerfile <x86intrin.h>
				1907	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1908	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1909	///
				1910	/// \param __a
				1911	/// A 256-bit integer vector of [16 x i16].
				1912	/// \param __imm
				1913	/// An immediate integer operand with bits [3:0] determining which vector
				1914	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1915	/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1916	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1917	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1918	_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1919	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1920	__v16hi __b = (__v16hi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1921	return (unsigned short)__b[__imm & 15];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1922	}
				1923
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1924	/// \brief Takes a [32 x i8] vector and returns the vector element value
				1925	/// indexed by the immediate constant operand.
				1926	///
				1927	/// \headerfile <x86intrin.h>
				1928	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1929	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1930	///
				1931	/// \param __a
				1932	/// A 256-bit integer vector of [32 x i8].
				1933	/// \param __imm
				1934	/// An immediate integer operand with bits [4:0] determining which vector
				1935	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1936	/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
				1937	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1938	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1939	_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1940	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1941	__v32qi __b = (__v32qi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1942	return (unsigned char)__b[__imm & 31];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1943	}
				1944
				1945	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1946	/// \brief Takes a [4 x i64] vector and returns the vector element value
				1947	/// indexed by the immediate constant operand.
				1948	///
				1949	/// \headerfile <x86intrin.h>
				1950	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1951	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1952	///
				1953	/// \param __a
				1954	/// A 256-bit integer vector of [4 x i64].
				1955	/// \param __imm
				1956	/// An immediate integer operand with bits [1:0] determining which vector
				1957	/// element is extracted and returned.
				1958	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				1959	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1960	static __inline long long __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1961	_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1962	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1963	__v4di __b = (__v4di)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1964	return __b[__imm & 3];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1965	}
				1966	#endif
				1967
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1968	/// \brief Takes a [8 x i32] vector and replaces the vector element value
				1969	/// indexed by the immediate constant operand by a new value. Returns the
				1970	/// modified vector.
				1971	///
				1972	/// \headerfile <x86intrin.h>
				1973	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	1974	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1975	///
				1976	/// \param __a
				1977	/// A vector of [8 x i32] to be used by the insert operation.
				1978	/// \param __b
				1979	/// An integer value. The replacement value for the insert operation.
				1980	/// \param __imm
				1981	/// An immediate integer specifying the index of the vector element to be
				1982	/// replaced.
				1983	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1984	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1985	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1986	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1987	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1988	__v8si __c = (__v8si)__a;
				1989	__c[__imm & 7] = __b;
				1990	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1991	}
				1992
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1993
				1994	/// \brief Takes a [16 x i16] vector and replaces the vector element value
				1995	/// indexed by the immediate constant operand with a new value. Returns the
				1996	/// modified vector.
				1997	///
				1998	/// \headerfile <x86intrin.h>
				1999	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2000	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2001	///
				2002	/// \param __a
				2003	/// A vector of [16 x i16] to be used by the insert operation.
				2004	/// \param __b
				2005	/// An i16 integer value. The replacement value for the insert operation.
				2006	/// \param __imm
				2007	/// An immediate integer specifying the index of the vector element to be
				2008	/// replaced.
				2009	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2010	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2011	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2012	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2013	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2014	__v16hi __c = (__v16hi)__a;
				2015	__c[__imm & 15] = __b;
				2016	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2017	}
				2018
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2019	/// \brief Takes a [32 x i8] vector and replaces the vector element value
				2020	/// indexed by the immediate constant operand with a new value. Returns the
				2021	/// modified vector.
				2022	///
				2023	/// \headerfile <x86intrin.h>
				2024	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2025	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2026	///
				2027	/// \param __a
				2028	/// A vector of [32 x i8] to be used by the insert operation.
				2029	/// \param __b
				2030	/// An i8 integer value. The replacement value for the insert operation.
				2031	/// \param __imm
				2032	/// An immediate integer specifying the index of the vector element to be
				2033	/// replaced.
				2034	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2035	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2036	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2037	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2038	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2039	__v32qi __c = (__v32qi)__a;
				2040	__c[__imm & 31] = __b;
				2041	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2042	}
				2043
				2044	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2045	/// \brief Takes a [4 x i64] vector and replaces the vector element value
				2046	/// indexed by the immediate constant operand with a new value. Returns the
				2047	/// modified vector.
				2048	///
				2049	/// \headerfile <x86intrin.h>
				2050	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2051	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2052	///
				2053	/// \param __a
				2054	/// A vector of [4 x i64] to be used by the insert operation.
				2055	/// \param __b
				2056	/// A 64-bit integer value. The replacement value for the insert operation.
				2057	/// \param __imm
				2058	/// An immediate integer specifying the index of the vector element to be
				2059	/// replaced.
				2060	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2061	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2062	static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhas	d740029	2015-02-19 19:00:33 +0000	[diff] [blame]	2063	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2064	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2065	__v4di __c = (__v4di)__a;
				2066	__c[__imm & 3] = __b;
				2067	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2068	}
				2069	#endif
				2070
				2071	/* Conversion */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2072	/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
				2073	///
				2074	/// \headerfile <x86intrin.h>
				2075	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2076	/// This intrinsic corresponds to the \c VCVTDQ2PD instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2077	///
				2078	/// \param __a
				2079	/// A 128-bit integer vector of [4 x i32].
				2080	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2081	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2082	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2083	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2084	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2085	}
				2086
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2087	/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
				2088	///
				2089	/// \headerfile <x86intrin.h>
				2090	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2091	/// This intrinsic corresponds to the \c VCVTDQ2PS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2092	///
				2093	/// \param __a
				2094	/// A 256-bit integer vector.
				2095	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2096	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2097	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2098	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2099	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2100	}
				2101
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2102	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2103	/// [4 x float].
				2104	///
				2105	/// \headerfile <x86intrin.h>
				2106	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2107	/// This intrinsic corresponds to the \c VCVTPD2PS instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2108	///
				2109	/// \param __a
				2110	/// A 256-bit vector of [4 x double].
				2111	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2112	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2113	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2114	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2115	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2116	}
				2117
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2118	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
				2119	///
				2120	/// \headerfile <x86intrin.h>
				2121	///
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame^]	2122	/// This intrinsic corresponds to the \c VCVTPS2DQ instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2123	///
				2124	/// \param __a
				2125	/// A 256-bit vector of [8 x float].
				2126	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2127	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2128	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2129	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2130	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2131	}
				2132
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2133	/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
				2134	/// x double].
				2135	///
				2136	/// \headerfile <x86intrin.h>
				2137	///
				2138	/// This intrinsic corresponds to the \c VCVTPS2PD instruction.
				2139	///
				2140	/// \param __a
				2141	/// A 128-bit vector of [4 x float].
				2142	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2143	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2144	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2145	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2146	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2147	}
				2148
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2149	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
				2150	/// x i32], truncating the result by rounding towards zero when it is
				2151	/// inexact.
				2152	///
				2153	/// \headerfile <x86intrin.h>
				2154	///
				2155	/// This intrinsic corresponds to the \c VCVTTPD2DQ instruction.
				2156	///
				2157	/// \param __a
				2158	/// A 256-bit vector of [4 x double].
				2159	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2160	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2161	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2162	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2163	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2164	}
				2165
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2166	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
				2167	/// x i32]. When a conversion is inexact, the value returned is rounded
				2168	/// according to the rounding control bits in the MXCSR register.
				2169	///
				2170	/// \headerfile <x86intrin.h>
				2171	///
				2172	/// This intrinsic corresponds to the \c VCVTPD2DQ instruction.
				2173	///
				2174	/// \param __a
				2175	/// A 256-bit vector of [4 x double].
				2176	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2177	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2178	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2179	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2180	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2181	}
				2182
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2183	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
				2184	/// truncating the result by rounding towards zero when it is inexact.
				2185	///
				2186	/// \headerfile <x86intrin.h>
				2187	///
				2188	/// This intrinsic corresponds to the \c VCVTTPS2DQ instruction.
				2189	///
				2190	/// \param __a
				2191	/// A 256-bit vector of [8 x float].
				2192	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2193	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2194	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2195	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2196	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2197	}
				2198
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2199	static __inline double __DEFAULT_FN_ATTRS
				2200	_mm256_cvtsd_f64(__m256d __a)
				2201	{
				2202	return __a[0];
				2203	}
				2204
				2205	static __inline int __DEFAULT_FN_ATTRS
				2206	_mm256_cvtsi256_si32(__m256i __a)
				2207	{
				2208	__v8si __b = (__v8si)__a;
				2209	return __b[0];
				2210	}
				2211
				2212	static __inline float __DEFAULT_FN_ATTRS
				2213	_mm256_cvtss_f32(__m256 __a)
				2214	{
				2215	return __a[0];
				2216	}
				2217
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2218	/* Vector replicate */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2219	/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
				2220	/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
				2221	/// Bits [255:224] of __a are written to bits [255:224] and [223:192]
				2222	/// of the return value.
				2223	/// Bits [191:160] of __a are written to bits [191:160] and [159:128]
				2224	/// of the return value.
				2225	/// Bits [127:96] of __a are written to bits [127:96] and [95:64] of
				2226	/// the return value.
				2227	/// Bits [63:32] of __a are written to bits [63:32] and [31:0] of the
				2228	/// return value.
				2229	///
				2230	/// \headerfile <x86intrin.h>
				2231	///
				2232	/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
				2233	///
				2234	/// \param __a
				2235	/// A 256-bit vector of [8 x float].
				2236	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2237	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2238	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2239	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2240	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2241	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2242	}
				2243
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2244	/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
				2245	/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
				2246	/// Bits [223:192] of __a are written to bits [255:224] and [223:192]
				2247	/// of the return value.
				2248	/// Bits [159:128] of __a are written to bits [191:160] and [159:128]
				2249	/// of the return value.
				2250	/// Bits [95:64] of __a are written to bits [127:96] and [95:64] of
				2251	/// the return value.
				2252	/// Bits [31:0] of __a are written to bits [63:32] and [31:0] of the
				2253	/// return value.
				2254	///
				2255	/// \headerfile <x86intrin.h>
				2256	///
				2257	/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
				2258	///
				2259	/// \param __a
				2260	/// A 256-bit vector of [8 x float].
				2261	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2262	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2263	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2264	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2265	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2266	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2267	}
				2268
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2269	/// \brief Moves and duplicates double-precision floating point values from a
				2270	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
				2271	/// vector of [4 x double].
				2272	/// Bits [63:0] of __a are written to bits [127:64] and [63:0] of the
				2273	/// return value.
				2274	/// Bits [191:128] of __a are written to bits [255:192] and [191:128]
				2275	/// of the return value.
				2276	///
				2277	/// \headerfile <x86intrin.h>
				2278	///
				2279	/// This intrinsic corresponds to the \c VMOVDDUP instruction.
				2280	///
				2281	/// \param __a
				2282	/// A 256-bit vector of [4 x double].
				2283	/// \returns A 256-bit vector of [4 x double] containing the moved and
				2284	/// duplicated values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2285	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2286	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2287	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2288	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2289	}
				2290
				2291	/* Unpack and Interleave */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2292	/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
				2293	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2294	///
				2295	/// \headerfile <x86intrin.h>
				2296	///
				2297	/// This intrinsic corresponds to the \c VUNPCKHPD instruction.
				2298	///
				2299	/// \param __a
				2300	/// A 256-bit floating-point vector of [4 x double].
				2301	/// Bits [127:64] are written to bits [63:0] of the return value.
				2302	/// Bits [255:192] are written to bits [191:128] of the return value.
				2303	/// \param __b
				2304	/// A 256-bit floating-point vector of [4 x double].
				2305	/// Bits [127:64] are written to bits [127:64] of the return value.
				2306	/// Bits [255:192] are written to bits [255:192] of the return value.
				2307	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2308	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2309	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2311	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2312	}
				2313
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2314	/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
				2315	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2316	///
				2317	/// \headerfile <x86intrin.h>
				2318	///
				2319	/// This intrinsic corresponds to the \c VUNPCKLPD instruction.
				2320	///
				2321	/// \param __a
				2322	/// A 256-bit floating-point vector of [4 x double].
				2323	/// Bits [63:0] are written to bits [63:0] of the return value.
				2324	/// Bits [191:128] are written to bits [191:128] of the return value.
				2325	/// \param __b
				2326	/// A 256-bit floating-point vector of [4 x double].
				2327	/// Bits [63:0] are written to bits [127:64] of the return value.
				2328	/// Bits [191:128] are written to bits [255:192] of the return value.
				2329	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2330	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2331	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2332	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2333	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); /* indices 0-3 select from __a, 4-7 from __b */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2334	}
				2335
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2336	/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
				2337	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2338	/// vector of [8 x float].
				2339	///
				2340	/// \headerfile <x86intrin.h>
				2341	///
				2342	/// This intrinsic corresponds to the \c VUNPCKHPS instruction.
				2343	///
				2344	/// \param __a
				2345	/// A 256-bit vector of [8 x float].
				2346	/// Bits [95:64] are written to bits [31:0] of the return value.
				2347	/// Bits [127:96] are written to bits [95:64] of the return value.
				2348	/// Bits [223:192] are written to bits [159:128] of the return value.
				2349	/// Bits [255:224] are written to bits [223:192] of the return value.
				2350	/// \param __b
				2351	/// A 256-bit vector of [8 x float].
				2352	/// Bits [95:64] are written to bits [63:32] of the return value.
				2353	/// Bits [127:96] are written to bits [127:96] of the return value.
				2354	/// Bits [223:192] are written to bits [191:160] of the return value.
				2355	/// Bits [255:224] are written to bits [255:224] of the return value.
				2356	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2357	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2358	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2359	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2360	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); /* indices 0-7 select from __a, 8-15 from __b */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2361	}
				2362
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2363	/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
				2364	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2365	/// vector of [8 x float].
				2366	///
				2367	/// \headerfile <x86intrin.h>
				2368	///
				2369	/// This intrinsic corresponds to the \c VUNPCKLPS instruction.
				2370	///
				2371	/// \param __a
				2372	/// A 256-bit vector of [8 x float].
				2373	/// Bits [31:0] are written to bits [31:0] of the return value.
				2374	/// Bits [63:32] are written to bits [95:64] of the return value.
				2375	/// Bits [159:128] are written to bits [159:128] of the return value.
				2376	/// Bits [191:160] are written to bits [223:192] of the return value.
				2377	/// \param __b
				2378	/// A 256-bit vector of [8 x float].
				2379	/// Bits [31:0] are written to bits [63:32] of the return value.
				2380	/// Bits [63:32] are written to bits [127:96] of the return value.
				2381	/// Bits [159:128] are written to bits [191:160] of the return value.
				2382	/// Bits [191:160] are written to bits [255:224] of the return value.
				2383	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2384	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2385	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2386	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2387	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); /* indices 0-7 select from __a, 8-15 from __b */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2388	}
				2389
				2390	/* Bit Test */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2391	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2392	/// element-by-element comparison of the double-precision elements in the
				2393	/// first source vector and the corresponding elements in the second source
				2394	/// vector. The EFLAGS register is updated as follows: \n
				2395	/// If there is at least one pair of double-precision elements where the
				2396	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2397	/// ZF flag is set to 1. \n
				2398	/// If there is at least one pair of double-precision elements where the
				2399	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2400	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2401	/// This intrinsic returns the value of the ZF flag.
				2402	///
				2403	/// \headerfile <x86intrin.h>
				2404	///
				2405	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2406	///
				2407	/// \param __a
				2408	/// A 128-bit vector of [2 x double].
				2409	/// \param __b
				2410	/// A 128-bit vector of [2 x double].
				2411	/// \returns the ZF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2412	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2413	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2414	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2415	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2416	}
				2417
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2418	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2419	/// element-by-element comparison of the double-precision elements in the
				2420	/// first source vector and the corresponding elements in the second source
				2421	/// vector. The EFLAGS register is updated as follows: \n
				2422	/// If there is at least one pair of double-precision elements where the
				2423	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2424	/// ZF flag is set to 1. \n
				2425	/// If there is at least one pair of double-precision elements where the
				2426	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2427	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2428	/// This intrinsic returns the value of the CF flag.
				2429	///
				2430	/// \headerfile <x86intrin.h>
				2431	///
				2432	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2433	///
				2434	/// \param __a
				2435	/// A 128-bit vector of [2 x double].
				2436	/// \param __b
				2437	/// A 128-bit vector of [2 x double].
				2438	/// \returns the CF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2439	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2440	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2441	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2442	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2443	}
				2444
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2445	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2446	/// element-by-element comparison of the double-precision elements in the
				2447	/// first source vector and the corresponding elements in the second source
				2448	/// vector. The EFLAGS register is updated as follows: \n
				2449	/// If there is at least one pair of double-precision elements where the
				2450	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2451	/// ZF flag is set to 1. \n
				2452	/// If there is at least one pair of double-precision elements where the
				2453	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2454	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2455	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2456	/// otherwise it returns 0.
				2457	///
				2458	/// \headerfile <x86intrin.h>
				2459	///
				2460	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2461	///
				2462	/// \param __a
				2463	/// A 128-bit vector of [2 x double].
				2464	/// \param __b
				2465	/// A 128-bit vector of [2 x double].
				2466	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2467	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2468	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2469	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2470	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2471	}
				2472
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2473	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2474	/// element-by-element comparison of the single-precision elements in the
				2475	/// first source vector and the corresponding elements in the second source
				2476	/// vector. The EFLAGS register is updated as follows: \n
				2477	/// If there is at least one pair of single-precision elements where the
				2478	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2479	/// ZF flag is set to 1. \n
				2480	/// If there is at least one pair of single-precision elements where the
				2481	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2482	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2483	/// This intrinsic returns the value of the ZF flag.
				2484	///
				2485	/// \headerfile <x86intrin.h>
				2486	///
				2487	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2488	///
				2489	/// \param __a
				2490	/// A 128-bit vector of [4 x float].
				2491	/// \param __b
				2492	/// A 128-bit vector of [4 x float].
				2493	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2494	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2495	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2496	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2497	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2498	}
				2499
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2500	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2501	/// element-by-element comparison of the single-precision elements in the
				2502	/// first source vector and the corresponding elements in the second source
				2503	/// vector. The EFLAGS register is updated as follows: \n
				2504	/// If there is at least one pair of single-precision elements where the
				2505	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2506	/// ZF flag is set to 1. \n
				2507	/// If there is at least one pair of single-precision elements where the
				2508	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2509	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2510	/// This intrinsic returns the value of the CF flag.
				2511	///
				2512	/// \headerfile <x86intrin.h>
				2513	///
				2514	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2515	///
				2516	/// \param __a
				2517	/// A 128-bit vector of [4 x float].
				2518	/// \param __b
				2519	/// A 128-bit vector of [4 x float].
				2520	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2521	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2522	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2523	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2524	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2525	}
				2526
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2527	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2528	/// element-by-element comparison of the single-precision elements in the
				2529	/// first source vector and the corresponding elements in the second source
				2530	/// vector. The EFLAGS register is updated as follows: \n
				2531	/// If there is at least one pair of single-precision elements where the
				2532	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2533	/// ZF flag is set to 1. \n
				2534	/// If there is at least one pair of single-precision elements where the
				2535	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2536	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2537	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2538	/// otherwise it returns 0.
				2539	///
				2540	/// \headerfile <x86intrin.h>
				2541	///
				2542	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2543	///
				2544	/// \param __a
				2545	/// A 128-bit vector of [4 x float].
				2546	/// \param __b
				2547	/// A 128-bit vector of [4 x float].
				2548	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2549	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2550	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2551	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2552	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2553	}
				2554
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2555	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2556	/// element-by-element comparison of the double-precision elements in the
				2557	/// first source vector and the corresponding elements in the second source
				2558	/// vector. The EFLAGS register is updated as follows: \n
				2559	/// If there is at least one pair of double-precision elements where the
				2560	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2561	/// ZF flag is set to 1. \n
				2562	/// If there is at least one pair of double-precision elements where the
				2563	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2564	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2565	/// This intrinsic returns the value of the ZF flag.
				2566	///
				2567	/// \headerfile <x86intrin.h>
				2568	///
				2569	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2570	///
				2571	/// \param __a
				2572	/// A 256-bit vector of [4 x double].
				2573	/// \param __b
				2574	/// A 256-bit vector of [4 x double].
				2575	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2576	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2577	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2578	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2579	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2580	}
				2581
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2582	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2583	/// element-by-element comparison of the double-precision elements in the
				2584	/// first source vector and the corresponding elements in the second source
				2585	/// vector. The EFLAGS register is updated as follows: \n
				2586	/// If there is at least one pair of double-precision elements where the
				2587	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2588	/// ZF flag is set to 1. \n
				2589	/// If there is at least one pair of double-precision elements where the
				2590	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2591	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2592	/// This intrinsic returns the value of the CF flag.
				2593	///
				2594	/// \headerfile <x86intrin.h>
				2595	///
				2596	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2597	///
				2598	/// \param __a
				2599	/// A 256-bit vector of [4 x double].
				2600	/// \param __b
				2601	/// A 256-bit vector of [4 x double].
				2602	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2603	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2604	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2605	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2606	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2607	}
				2608
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2609	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2610	/// element-by-element comparison of the double-precision elements in the
				2611	/// first source vector and the corresponding elements in the second source
				2612	/// vector. The EFLAGS register is updated as follows: \n
				2613	/// If there is at least one pair of double-precision elements where the
				2614	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2615	/// ZF flag is set to 1. \n
				2616	/// If there is at least one pair of double-precision elements where the
				2617	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2618	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2619	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2620	/// otherwise it returns 0.
				2621	///
				2622	/// \headerfile <x86intrin.h>
				2623	///
				2624	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2625	///
				2626	/// \param __a
				2627	/// A 256-bit vector of [4 x double].
				2628	/// \param __b
				2629	/// A 256-bit vector of [4 x double].
				2630	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2631	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2632	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2633	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2634	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2635	}
				2636
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2637	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2638	/// element-by-element comparison of the single-precision elements in the
				2639	/// first source vector and the corresponding elements in the second source
				2640	/// vector. The EFLAGS register is updated as follows: \n
				2641	/// If there is at least one pair of single-precision elements where the
				2642	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2643	/// ZF flag is set to 1. \n
				2644	/// If there is at least one pair of single-precision elements where the
				2645	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2646	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2647	/// This intrinsic returns the value of the ZF flag.
				2648	///
				2649	/// \headerfile <x86intrin.h>
				2650	///
				2651	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2652	///
				2653	/// \param __a
				2654	/// A 256-bit vector of [8 x float].
				2655	/// \param __b
				2656	/// A 256-bit vector of [8 x float].
				2657	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2658	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2659	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2660	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2661	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2662	}
				2663
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2664	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2665	/// element-by-element comparison of the single-precision elements in the
				2666	/// first source vector and the corresponding elements in the second source
				2667	/// vector. The EFLAGS register is updated as follows: \n
				2668	/// If there is at least one pair of single-precision elements where the
				2669	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2670	/// ZF flag is set to 1. \n
				2671	/// If there is at least one pair of single-precision elements where the
				2672	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2673	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2674	/// This intrinsic returns the value of the CF flag.
				2675	///
				2676	/// \headerfile <x86intrin.h>
				2677	///
				2678	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2679	///
				2680	/// \param __a
				2681	/// A 256-bit vector of [8 x float].
				2682	/// \param __b
				2683	/// A 256-bit vector of [8 x float].
				2684	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2685	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2686	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2687	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2688	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2689	}
				2690
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2691	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2692	/// element-by-element comparison of the single-precision elements in the
				2693	/// first source vector and the corresponding elements in the second source
				2694	/// vector. The EFLAGS register is updated as follows: \n
				2695	/// If there is at least one pair of single-precision elements where the
				2696	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2697	/// ZF flag is set to 1. \n
				2698	/// If there is at least one pair of single-precision elements where the
				2699	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2700	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2701	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2702	/// otherwise it returns 0.
				2703	///
				2704	/// \headerfile <x86intrin.h>
				2705	///
				2706	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2707	///
				2708	/// \param __a
				2709	/// A 256-bit vector of [8 x float].
				2710	/// \param __b
				2711	/// A 256-bit vector of [8 x float].
				2712	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2713	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2714	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2715	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2716	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2717	}
				2718
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2719	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2720	/// of the two source vectors and update the EFLAGS register as follows: \n
				2721	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2722	/// is set to 0. Otherwise the ZF flag is set to 1. \n
				2723	/// If there is at least one pair of bits where the bit from the first source
				2724	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2725	/// is set to 0. Otherwise the CF flag is set to 1. \n
				2726	/// This intrinsic returns the value of the ZF flag.
				2727	///
				2728	/// \headerfile <x86intrin.h>
				2729	///
				2730	/// This intrinsic corresponds to the \c VPTEST instruction.
				2731	///
				2732	/// \param __a
				2733	/// A 256-bit integer vector.
				2734	/// \param __b
				2735	/// A 256-bit integer vector.
				2736	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2737	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2738	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2739	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2740	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2741	}
				2742
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2743	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2744	/// of the two source vectors and update the EFLAGS register as follows: \n
				2745	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2746	/// is set to 0. Otherwise the ZF flag is set to 1. \n
				2747	/// If there is at least one pair of bits where the bit from the first source
				2748	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2749	/// is set to 0. Otherwise the CF flag is set to 1. \n
				2750	/// This intrinsic returns the value of the CF flag.
				2751	///
				2752	/// \headerfile <x86intrin.h>
				2753	///
				2754	/// This intrinsic corresponds to the \c VPTEST instruction.
				2755	///
				2756	/// \param __a
				2757	/// A 256-bit integer vector.
				2758	/// \param __b
				2759	/// A 256-bit integer vector.
				2760	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2761	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2762	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2763	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2764	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2765	}
				2766
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2767	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2768	/// of the two source vectors and update the EFLAGS register as follows: \n
				2769	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2770	/// is set to 0. Otherwise the ZF flag is set to 1. \n
				2771	/// If there is at least one pair of bits where the bit from the first source
				2772	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2773	/// is set to 0. Otherwise the CF flag is set to 1. \n
				2774	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2775	/// otherwise it returns 0.
				2776	///
				2777	/// \headerfile <x86intrin.h>
				2778	///
				2779	/// This intrinsic corresponds to the \c VPTEST instruction.
				2780	///
				2781	/// \param __a
				2782	/// A 256-bit integer vector.
				2783	/// \param __b
				2784	/// A 256-bit integer vector.
				2785	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2786	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2787	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2788	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2789	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2790	}
				2791
				2792	/* Vector extract sign mask */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2793	/// \brief Extracts the sign bits of double-precision floating-point elements
				2794	/// in a 256-bit vector of [4 x double] and writes them to the lower-order
				2795	/// bits of the return value.
				2796	///
				2797	/// \headerfile <x86intrin.h>
				2798	///
				2799	/// This intrinsic corresponds to the \c VMOVMSKPD instruction.
				2800	///
				2801	/// \param __a
				2802	/// A 256-bit vector of [4 x double] containing the double-precision
				2803	/// floating-point values with sign bits to be extracted.
				2804	/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2805	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2806	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2807	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2808	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2809	}
				2810
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2811	/// \brief Extracts the sign bits of single-precision floating-point elements
				2812	/// in a 256-bit vector of [8 x float] and writes them to the lower-order
				2813	/// bits of the return value.
				2814	///
				2815	/// \headerfile <x86intrin.h>
				2816	///
				2817	/// This intrinsic corresponds to the \c VMOVMSKPS instruction.
				2818	///
				2819	/// \param __a
				2820	/// A 256-bit vector of [8 x float] containing the single-precision floating
				2821	/// point values with sign bits to be extracted.
				2822	/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2823	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2824	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2825	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2826	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2827	}
				2828
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2829	/* Vector __zero */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2830	/// \brief Zeroes the contents of all XMM or YMM registers.
				2831	///
				2832	/// \headerfile <x86intrin.h>
				2833	///
				2834	/// This intrinsic corresponds to the \c VZEROALL instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2835	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2836	_mm256_zeroall(void)
				2837	{
				2838	__builtin_ia32_vzeroall();
				2839	}
				2840
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2841	/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
				2842	///
				2843	/// \headerfile <x86intrin.h>
				2844	///
				2845	/// This intrinsic corresponds to the \c VZEROUPPER instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2846	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2847	_mm256_zeroupper(void)
				2848	{
				2849	__builtin_ia32_vzeroupper();
				2850	}
				2851
				2852	/* Vector load with broadcast */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2853	/// \brief Loads a scalar single-precision floating point value from the
				2854	/// specified address pointed to by __a and broadcasts it to the elements of
				2855	/// a [4 x float] vector.
				2856	///
				2857	/// \headerfile <x86intrin.h>
				2858	///
				2859	/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
				2860	///
				2861	/// \param __a
				2862	/// The single-precision floating point value to be broadcast.
				2863	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
				2864	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2865	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2866	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2867	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2868	float __f = *__a;
				2869	return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2870	}
				2871
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2872	/// \brief Loads a scalar double-precision floating point value from the
				2873	/// specified address pointed to by __a and broadcasts it to the elements of
				2874	/// a [4 x double] vector.
				2875	///
				2876	/// \headerfile <x86intrin.h>
				2877	///
				2878	/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
				2879	///
				2880	/// \param __a
				2881	/// The double-precision floating point value to be broadcast.
				2882	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
				2883	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2884	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2885	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2886	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2887	double __d = *__a;
				2888	return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2889	}
				2890
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2891	/// \brief Loads a scalar single-precision floating point value from the
				2892	/// specified address pointed to by __a and broadcasts it to the elements of
				2893	/// a [8 x float] vector.
				2894	///
				2895	/// \headerfile <x86intrin.h>
				2896	///
				2897	/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
				2898	///
				2899	/// \param __a
				2900	/// The single-precision floating point value to be broadcast.
				2901	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
				2902	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2903	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2904	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2905	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2906	float __f = *__a;
				2907	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2908	}
				2909
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2910	/// \brief Loads the data from a 128-bit vector of [2 x double] from the
				2911	/// specified address pointed to by __a and broadcasts it to 128-bit
				2912	/// elements in a 256-bit vector of [4 x double].
				2913	///
				2914	/// \headerfile <x86intrin.h>
				2915	///
				2916	/// This intrinsic corresponds to the \c VBROADCASTF128 instruction.
				2917	///
				2918	/// \param __a
				2919	/// The 128-bit vector of [2 x double] to be broadcast.
				2920	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
				2921	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2922	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2923	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2924	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2925	return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2926	}
				2927
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2928	/// \brief Loads the data from a 128-bit vector of [4 x float] from the
				2929	/// specified address pointed to by __a and broadcasts it to 128-bit
				2930	/// elements in a 256-bit vector of [8 x float].
				2931	///
				2932	/// \headerfile <x86intrin.h>
				2933	///
				2934	/// This intrinsic corresponds to the \c VBROADCASTF128 instruction.
				2935	///
				2936	/// \param __a
				2937	/// The 128-bit vector of [4 x float] to be broadcast.
				2938	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
				2939	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2940	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2941	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2942	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2943	return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2944	}
				2945
				2946	/* SIMD load ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2947	/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
				2948	/// memory location pointed to by __p into a vector of [4 x double].
				2949	///
				2950	/// \headerfile <x86intrin.h>
				2951	///
				2952	/// This intrinsic corresponds to the \c VMOVAPD instruction.
				2953	///
				2954	/// \param __p
				2955	/// A 32-byte aligned pointer to a memory location containing
				2956	/// double-precision floating point values.
				2957	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2958	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2959	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2960	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2961	return (__m256d )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2962	}
				2963
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2964	/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
				2965	/// memory location pointed to by __p into a vector of [8 x float].
				2966	///
				2967	/// \headerfile <x86intrin.h>
				2968	///
				2969	/// This intrinsic corresponds to the \c VMOVAPS instruction.
				2970	///
				2971	/// \param __p
				2972	/// A 32-byte aligned pointer to a memory location containing float values.
				2973	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2974	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2975	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2976	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2977	return (__m256 )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2978	}
				2979
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2980	/// \brief Loads 4 double-precision floating point values from an unaligned
				2981	/// memory location pointed to by __p into a vector of [4 x double].
				2982	///
				2983	/// \headerfile <x86intrin.h>
				2984	///
				2985	/// This intrinsic corresponds to the \c VMOVUPD instruction.
				2986	///
				2987	/// \param __p
				2988	/// A pointer to a memory location containing double-precision floating
				2989	/// point values.
				2990	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2991	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2992	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2993	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2994	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2995	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2996	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2997	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2998	}
				2999
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3000	/// \brief Loads 8 single-precision floating point values from an unaligned
				3001	/// memory location pointed to by __p into a vector of [8 x float].
				3002	///
				3003	/// \headerfile <x86intrin.h>
				3004	///
				3005	/// This intrinsic corresponds to the \c VMOVUPS instruction.
				3006	///
				3007	/// \param __p
				3008	/// A pointer to a memory location containing single-precision floating
				3009	/// point values.
				3010	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3011	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3012	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3013	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3014	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3015	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3016	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3017	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3018	}
				3019
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3020	/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
				3021	/// location pointed to by __p into elements of a 256-bit integer vector.
				3022	///
				3023	/// \headerfile <x86intrin.h>
				3024	///
				3025	/// This intrinsic corresponds to the \c VMOVDQA instruction.
				3026	///
				3027	/// \param __p
				3028	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
				3029	/// values.
				3030	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3031	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3032	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3033	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3034	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3035	}
				3036
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3037	/// \brief Loads 256 bits of integer data from an unaligned memory location
				3038	/// pointed to by __p into a 256-bit integer vector.
				3039	///
				3040	/// \headerfile <x86intrin.h>
				3041	///
				3042	/// This intrinsic corresponds to the \c VMOVDQU instruction.
				3043	///
				3044	/// \param __p
				3045	/// A pointer to a 256-bit integer vector containing integer values.
				3046	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3047	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3048	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3049	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3050	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3051	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3052	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3053	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3054	}
				3055
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3056	/// \brief Loads 256 bits of integer data from an unaligned memory location
				3057	/// pointed to by __p into a 256-bit integer vector. This intrinsic may
				3058	/// perform better than _mm256_loadu_si256 when the data crosses a cache
				3059	/// line boundary.
				3060	///
				3061	/// \headerfile <x86intrin.h>
				3062	///
				3063	/// This intrinsic corresponds to the \c VLDDQU instruction.
				3064	///
				3065	/// \param __p
				3066	/// A pointer to a 256-bit integer vector containing integer values.
				3067	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3068	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3069	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3070	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3071	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3072	}
				3073
				3074	/* SIMD store ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3075	/// \brief Stores double-precision floating point values from a 256-bit vector
				3076	/// of [4 x double] to a 32-byte aligned memory location pointed to by __p.
				3077	///
				3078	/// \headerfile <x86intrin.h>
				3079	///
				3080	/// This intrinsic corresponds to the \c VMOVAPD instruction.
				3081	///
				3082	/// \param __p
				3083	/// A 32-byte aligned pointer to a memory location that will receive the
				3084	/// double-precision floaing point values.
				3085	/// \param __a
				3086	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3087	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3088	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3089	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3090	(__m256d )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3091	}
				3092
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3093	/// \brief Stores single-precision floating point values from a 256-bit vector
				3094	/// of [8 x float] to a 32-byte aligned memory location pointed to by __p.
				3095	///
				3096	/// \headerfile <x86intrin.h>
				3097	///
				3098	/// This intrinsic corresponds to the \c VMOVAPS instruction.
				3099	///
				3100	/// \param __p
				3101	/// A 32-byte aligned pointer to a memory location that will receive the
				3102	/// float values.
				3103	/// \param __a
				3104	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3105	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3106	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3107	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3108	(__m256 )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3109	}
				3110
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3111	/// \brief Stores double-precision floating point values from a 256-bit vector
				3112	/// of [4 x double] to an unaligned memory location pointed to by __p.
				3113	///
				3114	/// \headerfile <x86intrin.h>
				3115	///
				3116	/// This intrinsic corresponds to the \c VMOVUPD instruction.
				3117	///
				3118	/// \param __p
				3119	/// A pointer to a memory location that will receive the double-precision
				3120	/// floating point values.
				3121	/// \param __a
				3122	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3123	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3124	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3125	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3126	struct __storeu_pd {
				3127	__m256d __v;
				3128	} __attribute__((__packed__, __may_alias__));
				3129	((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3130	}
				3131
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3132	/// \brief Stores single-precision floating point values from a 256-bit vector
				3133	/// of [8 x float] to an unaligned memory location pointed to by __p.
				3134	///
				3135	/// \headerfile <x86intrin.h>
				3136	///
				3137	/// This intrinsic corresponds to the \c VMOVUPS instruction.
				3138	///
				3139	/// \param __p
				3140	/// A pointer to a memory location that will receive the float values.
				3141	/// \param __a
				3142	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3143	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3144	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3145	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3146	struct __storeu_ps {
				3147	__m256 __v;
				3148	} __attribute__((__packed__, __may_alias__));
				3149	((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3150	}
				3151
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3152	/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
				3153	/// aligned memory location pointed to by __p.
				3154	///
				3155	/// \headerfile <x86intrin.h>
				3156	///
				3157	/// This intrinsic corresponds to the \c VMOVDQA instruction.
				3158	///
				3159	/// \param __p
				3160	/// A 32-byte aligned pointer to a memory location that will receive the
				3161	/// integer values.
				3162	/// \param __a
				3163	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3164	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3165	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3166	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3167	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3168	}
				3169
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3170	/// \brief Stores integer values from a 256-bit integer vector to an unaligned
				3171	/// memory location pointed to by __p.
				3172	///
				3173	/// \headerfile <x86intrin.h>
				3174	///
				3175	/// This intrinsic corresponds to the \c VMOVDQU instruction.
				3176	///
				3177	/// \param __p
				3178	/// A pointer to a memory location that will receive the integer values.
				3179	/// \param __a
				3180	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3181	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3182	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3183	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3184	struct __storeu_si256 {
				3185	__m256i __v;
				3186	} __attribute__((__packed__, __may_alias__));
				3187	((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3188	}
				3189
				3190	/* Conditional load ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3191	/// \brief Conditionally loads double-precision floating point elements
				3192	/// from a memory location pointed to by __p into a 128-bit vector of
				3193	/// [2 x double], depending on the mask bits associated with each data
				3194	/// element.
				3195	///
				3196	/// \headerfile <x86intrin.h>
				3197	///
				3198	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3199	///
				3200	/// \param __p
				3201	/// A pointer to a memory location that contains the double-precision
				3202	/// floating point values.
				3203	/// \param __m
				3204	/// A 128-bit integer vector containing the mask. The most significant bit of
				3205	/// each data element represents the mask bits. If a mask bit is zero, the
				3206	/// corresponding value in the memory location is not loaded and the
				3207	/// corresponding field in the return value is set to zero.
				3208	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3209	static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3210	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3211	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3212	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3213	}
				3214
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3215	/// \brief Conditionally loads double-precision floating point elements
				3216	/// from a memory location pointed to by __p into a 256-bit vector of
				3217	/// [4 x double], depending on the mask bits associated with each data
				3218	/// element.
				3219	///
				3220	/// \headerfile <x86intrin.h>
				3221	///
				3222	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3223	///
				3224	/// \param __p
				3225	/// A pointer to a memory location that contains the double-precision
				3226	/// floating point values.
				3227	/// \param __m
				3228	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3229	/// significant bit of each quadword element represents the mask bits. If a
				3230	/// mask bit is zero, the corresponding value in the memory location is not
				3231	/// loaded and the corresponding field in the return value is set to zero.
				3232	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3233	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3234	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3235	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3236	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3237	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3238	}
				3239
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3240	/// \brief Conditionally loads single-precision floating point elements
				3241	/// from a memory location pointed to by __p into a 128-bit vector of
				3242	/// [4 x float], depending on the mask bits associated with each data
				3243	/// element.
				3244	///
				3245	/// \headerfile <x86intrin.h>
				3246	///
				3247	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3248	///
				3249	/// \param __p
				3250	/// A pointer to a memory location that contains the single-precision
				3251	/// floating point values.
				3252	/// \param __m
				3253	/// A 128-bit integer vector containing the mask. The most significant bit of
				3254	/// each data element represents the mask bits. If a mask bit is zero, the
				3255	/// corresponding value in the memory location is not loaded and the
				3256	/// corresponding field in the return value is set to zero.
				3257	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3258	static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3259	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3260	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3261	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3262	}
				3263
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3264	/// \brief Conditionally loads single-precision floating point elements from a
				3265	/// memory location pointed to by __p into a 256-bit vector of [8 x float],
				3266	/// depending on the mask bits associated with each data element.
				3267	///
				3268	/// \headerfile <x86intrin.h>
				3269	///
				3270	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3271	///
				3272	/// \param __p
				3273	/// A pointer to a memory location that contains the single-precision
				3274	/// floating point values.
				3275	/// \param __m
				3276	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3277	/// significant bit of each dword element represents the mask bits. If a mask
				3278	/// bit is zero, the corresponding value in the memory location is not loaded
				3279	/// and the corresponding field in the return value is set to zero.
				3280	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3281	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3282	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3283	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3284	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3285	}
				3286
				3287	/* Conditional store ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3288	/// \brief Moves single-precision floating point values from a 256-bit vector
				3289	/// of [8 x float] to a memory location pointed to by __p, according to the
				3290	/// specified mask.
				3291	///
				3292	/// \headerfile <x86intrin.h>
				3293	///
				3294	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3295	///
				3296	/// \param __p
				3297	/// A pointer to a memory location that will receive the float values.
				3298	/// \param __m
				3299	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3300	/// significant bit of each dword element in the mask vector represents the
				3301	/// mask bits. If a mask bit is zero, the corresponding value from vector __a
				3302	/// is not stored and the corresponding field in the memory location pointed
				3303	/// to by __p is not changed.
				3304	/// \param __a
				3305	/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3306	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3307	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3308	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3309	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3310	}
				3311
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3312	/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
				3313	/// to a memory location pointed to by __p, according to the specified mask.
				3314	///
				3315	/// \headerfile <x86intrin.h>
				3316	///
				3317	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3318	///
				3319	/// \param __p
				3320	/// A pointer to a memory location that will receive the float values.
				3321	/// \param __m
				3322	/// A 128-bit integer vector containing the mask. The most significant bit of
				3323	/// each field in the mask vector represents the mask bits. If a mask bit is
				3324	/// zero, the corresponding value from vector __a is not stored and the
				3325	/// corresponding field in the memory location pointed to by __p is not
				3326	/// changed.
				3327	/// \param __a
				3328	/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3329	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3330	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3331	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3332	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3333	}
				3334
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3335	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
				3336	/// to a memory location pointed to by __p, according to the specified mask.
				3337	///
				3338	/// \headerfile <x86intrin.h>
				3339	///
				3340	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3341	///
				3342	/// \param __p
				3343	/// A pointer to a memory location that will receive the double-precision values.
				3344	/// \param __m
				3345	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3346	/// significant bit of each quadword element in the mask vector represents
				3347	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3348	/// __a is not stored and the corresponding field in the memory location
				3349	/// pointed to by __p is not changed.
				3350	/// \param __a
				3351	/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3352	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3353	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3354	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3355	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3356	}
				3357
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3358	/// \brief Moves single-precision floating point values from a 128-bit vector
				3359	/// of [4 x float] to a memory location pointed to by __p, according to the
				3360	/// specified mask.
				3361	///
				3362	/// \headerfile <x86intrin.h>
				3363	///
				3364	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3365	///
				3366	/// \param __p
				3367	/// A pointer to a memory location that will receive the float values.
				3368	/// \param __m
				3369	/// A 128-bit integer vector containing the mask. The most significant bit of
				3370	/// each field in the mask vector represents the mask bits. If a mask bit is
				3371	/// zero, the corresponding value from vector __a is not stored and the
				3372	/// corresponding field in the memory location pointed to by __p is not
				3373	/// changed.
				3374	/// \param __a
				3375	/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3376	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3377	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3378	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3379	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3380	}
				3381
				3382	/* Cacheability support ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3383	/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
				3384	/// aligned memory location. To minimize caching, the data is flagged as
				3385	/// non-temporal (unlikely to be used again soon).
				3386	///
				3387	/// \headerfile <x86intrin.h>
				3388	///
				3389	/// This intrinsic corresponds to the \c VMOVNTDQ instruction.
				3390	///
				3391	/// \param __a
				3392	/// A pointer to a 32-byte aligned memory location that will receive the
				3393	/// integer values.
				3394	/// \param __b
				3395	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3396	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3397	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3398	{
Simon Pilgrim	beca5f2	2016-06-13 09:57:52 +0000	[diff] [blame]	3399	__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3400	}
				3401
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3402	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
				3403	/// to a 32-byte aligned memory location. To minimize caching, the data is
				3404	/// flagged as non-temporal (unlikely to be used again soon).
				3405	///
				3406	/// \headerfile <x86intrin.h>
				3407	///
				3408	/// This intrinsic corresponds to the \c VMOVNTPD instruction.
				3409	///
				3410	/// \param __a
				3411	/// A pointer to a 32-byte aligned memory location that will receive the
				3412	/// double-precision floating-point values.
				3413	/// \param __b
				3414	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3415	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3416	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3417	{
Simon Pilgrim	beca5f2	2016-06-13 09:57:52 +0000	[diff] [blame]	3418	__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3419	}
				3420
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3421	/// \brief Moves single-precision floating point values from a 256-bit vector
				3422	/// of [8 x float] to a 32-byte aligned memory location. To minimize
				3423	/// caching, the data is flagged as non-temporal (unlikely to be used again
				3424	/// soon).
				3425	///
				3426	/// \headerfile <x86intrin.h>
				3427	///
				3428	/// This intrinsic corresponds to the \c VMOVNTPS instruction.
				3429	///
				3430	/// \param __p
				3431	/// A pointer to a 32-byte aligned memory location that will receive the
				3432	/// single-precision floating point values.
				3433	/// \param __a
				3434	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3435	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3436	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3437	{
Simon Pilgrim	beca5f2	2016-06-13 09:57:52 +0000	[diff] [blame]	3438	__builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3439	}
				3440
				3441	/* Create vectors */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3442	/// \brief Create a 256-bit vector of [4 x double] with undefined values.
				3443	///
				3444	/// \headerfile <x86intrin.h>
				3445	///
				3446	/// This intrinsic has no corresponding instruction.
				3447	///
				3448	/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3449	static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3450	_mm256_undefined_pd(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3451	{
				3452	return (__m256d)__builtin_ia32_undef256();
				3453	}
				3454
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3455	/// \brief Create a 256-bit vector of [8 x float] with undefined values.
				3456	///
				3457	/// \headerfile <x86intrin.h>
				3458	///
				3459	/// This intrinsic has no corresponding instruction.
				3460	///
				3461	/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3462	static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3463	_mm256_undefined_ps(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3464	{
				3465	return (__m256)__builtin_ia32_undef256();
				3466	}
				3467
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3468	/// \brief Create a 256-bit integer vector with undefined values.
				3469	///
				3470	/// \headerfile <x86intrin.h>
				3471	///
				3472	/// This intrinsic has no corresponding instruction.
				3473	///
				3474	/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3475	static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3476	_mm256_undefined_si256(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3477	{
				3478	return (__m256i)__builtin_ia32_undef256();
				3479	}
				3480
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3481	/// \brief Constructs a 256-bit floating-point vector of [4 x double]
				3482	/// initialized with the specified double-precision floating-point values.
				3483	///
				3484	/// \headerfile <x86intrin.h>
				3485	///
				3486	/// This intrinsic corresponds to the \c VUNPCKLPD+VINSERTF128 instruction.
				3487	///
				3488	/// \param __a
				3489	/// A double-precision floating-point value used to initialize bits [255:192]
				3490	/// of the result.
				3491	/// \param __b
				3492	/// A double-precision floating-point value used to initialize bits [191:128]
				3493	/// of the result.
				3494	/// \param __c
				3495	/// A double-precision floating-point value used to initialize bits [127:64]
				3496	/// of the result.
				3497	/// \param __d
				3498	/// A double-precision floating-point value used to initialize bits [63:0]
				3499	/// of the result.
				3500	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3501	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3502	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3503	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3504	return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3505	}
				3506
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3507	/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
				3508	/// with the specified single-precision floating-point values.
				3509	///
				3510	/// \headerfile <x86intrin.h>
				3511	///
				3512	/// This intrinsic is a utility function and does not correspond to a specific
				3513	/// instruction.
				3514	///
				3515	/// \param __a
				3516	/// A single-precision floating-point value used to initialize bits [255:224]
				3517	/// of the result.
				3518	/// \param __b
				3519	/// A single-precision floating-point value used to initialize bits [223:192]
				3520	/// of the result.
				3521	/// \param __c
				3522	/// A single-precision floating-point value used to initialize bits [191:160]
				3523	/// of the result.
				3524	/// \param __d
				3525	/// A single-precision floating-point value used to initialize bits [159:128]
				3526	/// of the result.
				3527	/// \param __e
				3528	/// A single-precision floating-point value used to initialize bits [127:96]
				3529	/// of the result.
				3530	/// \param __f
				3531	/// A single-precision floating-point value used to initialize bits [95:64]
				3532	/// of the result.
				3533	/// \param __g
				3534	/// A single-precision floating-point value used to initialize bits [63:32]
				3535	/// of the result.
				3536	/// \param __h
				3537	/// A single-precision floating-point value used to initialize bits [31:0]
				3538	/// of the result.
				3539	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3540	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3541	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3542	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3543	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3544	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3545	}
				3546
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3547	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3548	/// 32-bit integral values.
				3549	///
				3550	/// \headerfile <x86intrin.h>
				3551	///
				3552	/// This intrinsic is a utility function and does not correspond to a specific
				3553	/// instruction.
				3554	///
				3555	/// \param __i0
				3556	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3557	/// \param __i1
				3558	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3559	/// \param __i2
				3560	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3561	/// \param __i3
				3562	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3563	/// \param __i4
				3564	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3565	/// \param __i5
				3566	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3567	/// \param __i6
				3568	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3569	/// \param __i7
				3570	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3571	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3572	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3573	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3574	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3575	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3576	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3577	}
				3578
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3579	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3580	/// 16-bit integral values.
				3581	///
				3582	/// \headerfile <x86intrin.h>
				3583	///
				3584	/// This intrinsic is a utility function and does not correspond to a specific
				3585	/// instruction.
				3586	///
				3587	/// \param __w15
				3588	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3589	/// \param __w14
				3590	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3591	/// \param __w13
				3592	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3593	/// \param __w12
				3594	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3595	/// \param __w11
				3596	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3597	/// \param __w10
				3598	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3599	/// \param __w09
				3600	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3601	/// \param __w08
				3602	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3603	/// \param __w07
				3604	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3605	/// \param __w06
				3606	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3607	/// \param __w05
				3608	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3609	/// \param __w04
				3610	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3611	/// \param __w03
				3612	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3613	/// \param __w02
				3614	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3615	/// \param __w01
				3616	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3617	/// \param __w00
				3618	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3619	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3620	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3621	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3622	short __w11, short __w10, short __w09, short __w08,
				3623	short __w07, short __w06, short __w05, short __w04,
				3624	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3625	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3626	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
				3627	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3628	}
				3629
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3630	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3631	/// 8-bit integral values.
				3632	///
				3633	/// \headerfile <x86intrin.h>
				3634	///
				3635	/// This intrinsic is a utility function and does not correspond to a specific
				3636	/// instruction.
				3637	///
				3638	/// \param __b31
				3639	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3640	/// \param __b30
				3641	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3642	/// \param __b29
				3643	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3644	/// \param __b28
				3645	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3646	/// \param __b27
				3647	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3648	/// \param __b26
				3649	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3650	/// \param __b25
				3651	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3652	/// \param __b24
				3653	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3654	/// \param __b23
				3655	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3656	/// \param __b22
				3657	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3658	/// \param __b21
				3659	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3660	/// \param __b20
				3661	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3662	/// \param __b19
				3663	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3664	/// \param __b18
				3665	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3666	/// \param __b17
				3667	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3668	/// \param __b16
				3669	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3670	/// \param __b15
				3671	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3672	/// \param __b14
				3673	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3674	/// \param __b13
				3675	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3676	/// \param __b12
				3677	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3678	/// \param __b11
				3679	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3680	/// \param __b10
				3681	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3682	/// \param __b09
				3683	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3684	/// \param __b08
				3685	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3686	/// \param __b07
				3687	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3688	/// \param __b06
				3689	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3690	/// \param __b05
				3691	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3692	/// \param __b04
				3693	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3694	/// \param __b03
				3695	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3696	/// \param __b02
				3697	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3698	/// \param __b01
				3699	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3700	/// \param __b00
				3701	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3702	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3703	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3704	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3705	char __b27, char __b26, char __b25, char __b24,
				3706	char __b23, char __b22, char __b21, char __b20,
				3707	char __b19, char __b18, char __b17, char __b16,
				3708	char __b15, char __b14, char __b13, char __b12,
				3709	char __b11, char __b10, char __b09, char __b08,
				3710	char __b07, char __b06, char __b05, char __b04,
				3711	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3712	{
				3713	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3714	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				3715	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				3716	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				3717	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3718	};
				3719	}
				3720
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3721	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3722	/// 64-bit integral values.
				3723	///
				3724	/// \headerfile <x86intrin.h>
				3725	///
				3726	/// This intrinsic corresponds to the \c VPUNPCKLQDQ+VINSERTF128 instruction.
				3727	///
				3728	/// \param __a
				3729	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3730	/// \param __b
				3731	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3732	/// \param __c
				3733	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3734	/// \param __d
				3735	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3736	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3737	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3738	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3739	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3740	return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3741	}
				3742
				3743	/* Create vectors with elements in reverse order */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3744	/// \brief Constructs a 256-bit floating-point vector of [4 x double],
				3745	/// initialized in reverse order with the specified double-precision
				3746	/// floating-point values.
				3747	///
				3748	/// \headerfile <x86intrin.h>
				3749	///
				3750	/// This intrinsic corresponds to the \c VUNPCKLPD+VINSERTF128 instruction.
				3751	///
				3752	/// \param __a
				3753	/// A double-precision floating-point value used to initialize bits [63:0]
				3754	/// of the result.
				3755	/// \param __b
				3756	/// A double-precision floating-point value used to initialize bits [127:64]
				3757	/// of the result.
				3758	/// \param __c
				3759	/// A double-precision floating-point value used to initialize bits [191:128]
				3760	/// of the result.
				3761	/// \param __d
				3762	/// A double-precision floating-point value used to initialize bits [255:192]
				3763	/// of the result.
				3764	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3765	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3766	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3767	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3768	return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3769	}
				3770
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3771	/// \brief Constructs a 256-bit floating-point vector of [8 x float],
				3772	/// initialized in reverse order with the specified single-precision
				3773	/// floating-point values.
				3774	///
				3775	/// \headerfile <x86intrin.h>
				3776	///
				3777	/// This intrinsic is a utility function and does not correspond to a specific
				3778	/// instruction.
				3779	///
				3780	/// \param __a
				3781	/// A single-precision floating-point value used to initialize bits [31:0]
				3782	/// of the result.
				3783	/// \param __b
				3784	/// A single-precision floating-point value used to initialize bits [63:32]
				3785	/// of the result.
				3786	/// \param __c
				3787	/// A single-precision floating-point value used to initialize bits [95:64]
				3788	/// of the result.
				3789	/// \param __d
				3790	/// A single-precision floating-point value used to initialize bits [127:96]
				3791	/// of the result.
				3792	/// \param __e
				3793	/// A single-precision floating-point value used to initialize bits [159:128]
				3794	/// of the result.
				3795	/// \param __f
				3796	/// A single-precision floating-point value used to initialize bits [191:160]
				3797	/// of the result.
				3798	/// \param __g
				3799	/// A single-precision floating-point value used to initialize bits [223:192]
				3800	/// of the result.
				3801	/// \param __h
				3802	/// A single-precision floating-point value used to initialize bits [255:224]
				3803	/// of the result.
				3804	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3805	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3806	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3807	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3808	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3809	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3810	}
				3811
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3812	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3813	/// with the specified 32-bit integral values.
				3814	///
				3815	/// \headerfile <x86intrin.h>
				3816	///
				3817	/// This intrinsic is a utility function and does not correspond to a specific
				3818	/// instruction.
				3819	///
				3820	/// \param __i0
				3821	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3822	/// \param __i1
				3823	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3824	/// \param __i2
				3825	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3826	/// \param __i3
				3827	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3828	/// \param __i4
				3829	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3830	/// \param __i5
				3831	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3832	/// \param __i6
				3833	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3834	/// \param __i7
				3835	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3836	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3837	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3838	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3839	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3840	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3841	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3842	}
				3843
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3844	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3845	/// with the specified 16-bit integral values.
				3846	///
				3847	/// \headerfile <x86intrin.h>
				3848	///
				3849	/// This intrinsic is a utility function and does not correspond to a specific
				3850	/// instruction.
				3851	///
				3852	/// \param __w15
				3853	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3854	/// \param __w14
				3855	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3856	/// \param __w13
				3857	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3858	/// \param __w12
				3859	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3860	/// \param __w11
				3861	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3862	/// \param __w10
				3863	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3864	/// \param __w09
				3865	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3866	/// \param __w08
				3867	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3868	/// \param __w07
				3869	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3870	/// \param __w06
				3871	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3872	/// \param __w05
				3873	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3874	/// \param __w04
				3875	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3876	/// \param __w03
				3877	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3878	/// \param __w02
				3879	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3880	/// \param __w01
				3881	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3882	/// \param __w00
				3883	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3884	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3885	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3886	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3887	short __w11, short __w10, short __w09, short __w08,
				3888	short __w07, short __w06, short __w05, short __w04,
				3889	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3890	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3891	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
				3892	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3893	}
				3894
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3895	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3896	/// with the specified 8-bit integral values.
				3897	///
				3898	/// \headerfile <x86intrin.h>
				3899	///
				3900	/// This intrinsic is a utility function and does not correspond to a specific
				3901	/// instruction.
				3902	///
				3903	/// \param __b31
				3904	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3905	/// \param __b30
				3906	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3907	/// \param __b29
				3908	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3909	/// \param __b28
				3910	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3911	/// \param __b27
				3912	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3913	/// \param __b26
				3914	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3915	/// \param __b25
				3916	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3917	/// \param __b24
				3918	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3919	/// \param __b23
				3920	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3921	/// \param __b22
				3922	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3923	/// \param __b21
				3924	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3925	/// \param __b20
				3926	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3927	/// \param __b19
				3928	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3929	/// \param __b18
				3930	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3931	/// \param __b17
				3932	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3933	/// \param __b16
				3934	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3935	/// \param __b15
				3936	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3937	/// \param __b14
				3938	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3939	/// \param __b13
				3940	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3941	/// \param __b12
				3942	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3943	/// \param __b11
				3944	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3945	/// \param __b10
				3946	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3947	/// \param __b09
				3948	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3949	/// \param __b08
				3950	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3951	/// \param __b07
				3952	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3953	/// \param __b06
				3954	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3955	/// \param __b05
				3956	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3957	/// \param __b04
				3958	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3959	/// \param __b03
				3960	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3961	/// \param __b02
				3962	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3963	/// \param __b01
				3964	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3965	/// \param __b00
				3966	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3967	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3968	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3969	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3970	char __b27, char __b26, char __b25, char __b24,
				3971	char __b23, char __b22, char __b21, char __b20,
				3972	char __b19, char __b18, char __b17, char __b16,
				3973	char __b15, char __b14, char __b13, char __b12,
				3974	char __b11, char __b10, char __b09, char __b08,
				3975	char __b07, char __b06, char __b05, char __b04,
				3976	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3977	{
				3978	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3979	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3980	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				3981	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				3982	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3983	}
				3984
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3985	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3986	/// with the specified 64-bit integral values.
				3987	///
				3988	/// \headerfile <x86intrin.h>
				3989	///
				3990	/// This intrinsic corresponds to the \c VPUNPCKLQDQ+VINSERTF128 instruction.
				3991	///
				3992	/// \param __a
				3993	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3994	/// \param __b
				3995	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3996	/// \param __c
				3997	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3998	/// \param __d
				3999	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				4000	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4001	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4002	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4003	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4004	return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4005	}
				4006
				4007	/* Create vectors with repeated elements */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4008	/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
				4009	/// of the four double-precision floating-point vector elements set to the
				4010	/// specified double-precision floating-point value.
				4011	///
				4012	/// \headerfile <x86intrin.h>
				4013	///
				4014	/// This intrinsic corresponds to the \c VMOVDDUP+VINSERTF128 instruction.
				4015	///
				4016	/// \param __w
				4017	/// A double-precision floating-point value used to initialize each vector
				4018	/// element of the result.
				4019	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4020	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4021	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4022	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4023	return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4024	}
				4025
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4026	/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
				4027	/// of the eight single-precision floating-point vector elements set to the
				4028	/// specified single-precision floating-point value.
				4029	///
				4030	/// \headerfile <x86intrin.h>
				4031	///
				4032	/// This intrinsic corresponds to the \c VPERMILPS+VINSERTF128 instruction.
				4033	///
				4034	/// \param __w
				4035	/// A single-precision floating-point value used to initialize each vector
				4036	/// element of the result.
				4037	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4038	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4039	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4040	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4041	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4042	}
				4043
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4044	/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
				4045	/// 32-bit integral vector elements set to the specified 32-bit integral
				4046	/// value.
				4047	///
				4048	/// \headerfile <x86intrin.h>
				4049	///
				4050	/// This intrinsic corresponds to the \c VPERMILPS+VINSERTF128 instruction.
				4051	///
				4052	/// \param __i
				4053	/// A 32-bit integral value used to initialize each vector element of the
				4054	/// result.
				4055	/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4056	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4057	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4058	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4059	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4060	}
				4061
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4062	/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
				4063	/// 16-bit integral vector elements set to the specified 16-bit integral
				4064	/// value.
				4065	///
				4066	/// \headerfile <x86intrin.h>
				4067	///
				4068	/// This intrinsic corresponds to the \c VPSHUFB+VINSERTF128 instruction.
				4069	///
				4070	/// \param __w
				4071	/// A 16-bit integral value used to initialize each vector element of the
				4072	/// result.
				4073	/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4074	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4075	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4076	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4077	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				4078	__w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4079	}
				4080
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4081	/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
				4082	/// 8-bit integral vector elements set to the specified 8-bit integral value.
				4083	///
				4084	/// \headerfile <x86intrin.h>
				4085	///
				4086	/// This intrinsic corresponds to the \c VPSHUFB+VINSERTF128 instruction.
				4087	///
				4088	/// \param __b
				4089	/// An 8-bit integral value used to initialize each vector element of the
				4090	/// result.
				4091	/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4092	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4093	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4094	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4095	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				4096	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				4097	__b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4098	}
				4099
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4100	/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
				4101	/// 64-bit integral vector elements set to the specified 64-bit integral
				4102	/// value.
				4103	///
				4104	/// \headerfile <x86intrin.h>
				4105	///
				4106	/// This intrinsic corresponds to the \c VMOVDDUP+VINSERTF128 instruction.
				4107	///
				4108	/// \param __q
				4109	/// A 64-bit integral value used to initialize each vector element of the
				4110	/// result.
				4111	/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4112	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4113	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4114	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4115	return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4116	}
				4117
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4118	/* Create zeroed vectors */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4119	/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
				4120	/// vector elements initialized to zero.
				4121	///
				4122	/// \headerfile <x86intrin.h>
				4123	///
				4124	/// This intrinsic corresponds to the \c VXORPS instruction.
				4125	///
				4126	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4127	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4128	_mm256_setzero_pd(void)
				4129	{
				4130	return (__m256d){ 0, 0, 0, 0 };
				4131	}
				4132
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4133	/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
				4134	/// vector elements initialized to zero.
				4135	///
				4136	/// \headerfile <x86intrin.h>
				4137	///
				4138	/// This intrinsic corresponds to the \c VXORPS instruction.
				4139	///
				4140	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4141	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4142	_mm256_setzero_ps(void)
				4143	{
				4144	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				4145	}
				4146
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4147	/// \brief Constructs a 256-bit integer vector initialized to zero.
				4148	///
				4149	/// \headerfile <x86intrin.h>
				4150	///
				4151	/// This intrinsic corresponds to the \c VXORPS instruction.
				4152	///
				4153	/// \returns A 256-bit integer vector initialized to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4154	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4155	_mm256_setzero_si256(void)
				4156	{
				4157	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				4158	}
				4159
				4160	/* Cast between vector types */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4161	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4162	/// floating-point vector of [8 x float].
				4163	///
				4164	/// \headerfile <x86intrin.h>
				4165	///
				4166	/// This intrinsic has no corresponding instruction.
				4167	///
				4168	/// \param __a
				4169	/// A 256-bit floating-point vector of [4 x double].
				4170	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4171	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4172	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4173	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4174	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4175	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4176	}
				4177
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4178	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4179	/// integer vector.
				4180	///
				4181	/// \headerfile <x86intrin.h>
				4182	///
				4183	/// This intrinsic has no corresponding instruction.
				4184	///
				4185	/// \param __a
				4186	/// A 256-bit floating-point vector of [4 x double].
				4187	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4188	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4189	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4190	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4191	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4192	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4193	}
				4194
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4195	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4196	/// floating-point vector of [4 x double].
				4197	///
				4198	/// \headerfile <x86intrin.h>
				4199	///
				4200	/// This intrinsic has no corresponding instruction.
				4201	///
				4202	/// \param __a
				4203	/// A 256-bit floating-point vector of [8 x float].
				4204	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4205	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4206	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4207	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4208	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4209	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4210	}
				4211
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4212	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4213	/// integer vector.
				4214	///
				4215	/// \headerfile <x86intrin.h>
				4216	///
				4217	/// This intrinsic has no corresponding instruction.
				4218	///
				4219	/// \param __a
				4220	/// A 256-bit floating-point vector of [8 x float].
				4221	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4222	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4223	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4224	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4225	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4226	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4227	}
				4228
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4229	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
				4230	/// of [8 x float].
				4231	///
				4232	/// \headerfile <x86intrin.h>
				4233	///
				4234	/// This intrinsic has no corresponding instruction.
				4235	///
				4236	/// \param __a
				4237	/// A 256-bit integer vector.
				4238	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4239	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4240	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4241	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4242	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4243	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4244	}
				4245
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4246	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
				4247	/// of [4 x double].
				4248	///
				4249	/// \headerfile <x86intrin.h>
				4250	///
				4251	/// This intrinsic has no corresponding instruction.
				4252	///
				4253	/// \param __a
				4254	/// A 256-bit integer vector.
				4255	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4256	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4257	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4258	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4259	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4260	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4261	}
				4262
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4263	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
				4264	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
				4265	///
				4266	/// \headerfile <x86intrin.h>
				4267	///
				4268	/// This intrinsic has no corresponding instruction.
				4269	///
				4270	/// \param __a
				4271	/// A 256-bit floating-point vector of [4 x double].
				4272	/// \returns A 128-bit floating-point vector of [2 x double] containing the
				4273	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4274	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4275	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4276	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4277	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4278	}
				4279
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4280	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
				4281	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
				4282	///
				4283	/// \headerfile <x86intrin.h>
				4284	///
				4285	/// This intrinsic has no corresponding instruction.
				4286	///
				4287	/// \param __a
				4288	/// A 256-bit floating-point vector of [8 x float].
				4289	/// \returns A 128-bit floating-point vector of [4 x float] containing the
				4290	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4291	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4292	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4293	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4294	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4295	}
				4296
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4297	/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
				4298	///
				4299	/// \headerfile <x86intrin.h>
				4300	///
				4301	/// This intrinsic has no corresponding instruction.
				4302	///
				4303	/// \param __a
				4304	/// A 256-bit integer vector.
				4305	/// \returns A 128-bit integer vector containing the lower 128 bits of the
				4306	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4307	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4308	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4309	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4310	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4311	}
				4312
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4313	/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
				4314	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
				4315	/// contain the value of the source vector. The contents of the upper 128
				4316	/// bits are undefined.
				4317	///
				4318	/// \headerfile <x86intrin.h>
				4319	///
				4320	/// This intrinsic has no corresponding instruction.
				4321	///
				4322	/// \param __a
				4323	/// A 128-bit vector of [2 x double].
				4324	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4325	/// contain the value of the parameter. The contents of the upper 128 bits
				4326	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4327	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4328	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4329	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4330	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4331	}
				4332
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4333	/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
				4334	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
				4335	/// the value of the source vector. The contents of the upper 128 bits are
				4336	/// undefined.
				4337	///
				4338	/// \headerfile <x86intrin.h>
				4339	///
				4340	/// This intrinsic has no corresponding instruction.
				4341	///
				4342	/// \param __a
				4343	/// A 128-bit vector of [4 x float].
				4344	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4345	/// contain the value of the parameter. The contents of the upper 128 bits
				4346	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4347	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4348	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4349	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4350	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4351	}
				4352
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4353	/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
				4354	/// The lower 128 bits contain the value of the source vector. The contents
				4355	/// of the upper 128 bits are undefined.
				4356	///
				4357	/// \headerfile <x86intrin.h>
				4358	///
				4359	/// This intrinsic has no corresponding instruction.
				4360	///
				4361	/// \param __a
				4362	/// A 128-bit integer vector.
				4363	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4364	/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4365	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4366	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4367	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4368	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4369	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4370
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4371	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4372	Vector insert.
				4373	We use macros rather than inlines because we only want to accept
				4374	invocations where the immediate M is a constant expression.
				4375	*/
/// \brief Produces a 256-bit vector of [8 x float] that is a copy of the
///    256-bit vector of [8 x float] given as the first parameter, with
///    either its upper or its lower 128 bits replaced by the 128-bit vector
///    of [4 x float] given as the second parameter. The immediate integer
///    parameter selects which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the half of the result
///    that is not replaced by V2.
/// \param V2
///    A 128-bit vector of [4 x float]. It is written to either the upper or
///    the lower 128 bits of the result, depending on the value of M.
/// \param M
///    An immediate integer. Only the least significant bit is used:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), \
    ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), \
    ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), \
    ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), \
    ((((M) & 1) == 0) ?  7 : 11) );})
				4418
/// \brief Builds a 256-bit vector of [4 x double] that is a copy of V1 with
///    one of its 128-bit halves overwritten by the 128-bit vector of
///    [2 x double] V2. Bit [0] of the immediate M chooses which half is
///    overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the half of the result
///    that is not overwritten.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to the half of the
///    result selected by M.
/// \param M
///    An immediate integer; only its least significant bit is examined:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
/* Shuffle indices 0-3 select elements of V1; indices 4-5 select the two low
   elements of the value _mm256_castpd128_pd256 produces from V2. */
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
				4457
/// \brief Builds a 256-bit integer vector that is a copy of V1 with one of its
///    128-bit halves overwritten by the 128-bit integer vector V2. Bit [0] of
///    the immediate M chooses which half is overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the half of the result that is
///    not overwritten.
/// \param V2
///    A 128-bit integer vector. It is written to the half of the result
///    selected by M.
/// \param M
///    An immediate integer; only its least significant bit is examined:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit integer vector containing the combined values.
/* Shuffle indices 0-3 select 64-bit elements of V1; indices 4-5 select the
   two low elements of the value _mm256_castsi128_si256 produces from V2. */
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
				4496
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4497	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4498	Vector extract.
				4499	We use macros rather than inlines because we only want to accept
				4500	invocations where the immediate M is a constant expression.
				4501	*/
/// \brief Returns one 128-bit half of a 256-bit vector of [8 x float] as a
///    128-bit vector of [4 x float]. Bit [0] of the immediate M chooses
///    between the lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer; only its least significant bit is examined:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
/* Every shuffle index is below 8, so only elements of V are selected; the
   second shuffle operand is a placeholder that is never read. */
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    ((((M) & 1) == 0) ? 0 : 4), \
    ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), \
    ((((M) & 1) == 0) ? 3 : 7) );})
				4530
/// \brief Returns one 128-bit half of a 256-bit vector of [4 x double] as a
///    128-bit vector of [2 x double]. Bit [0] of the immediate M chooses
///    between the lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer; only its least significant bit is examined:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
/* Every shuffle index is below 4, so only elements of V are selected; the
   second shuffle operand is a placeholder that is never read. */
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
				4557
/// \brief Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. Bit [0] of the immediate M chooses between the lower
///    and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer; only its least significant bit is examined:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
/* Every shuffle index is below 4, so only 64-bit elements of V are selected;
   the second shuffle operand is a placeholder that is never read. */
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
				4584
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4585	/* SIMD load ops (unaligned) */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4586	/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
				4587	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4588	/// of [8 x float] by concatenating the two 128-bit vectors.
				4589	///
				4590	/// \headerfile <x86intrin.h>
				4591	///
				4592	/// This intrinsic corresponds to load instructions followed by the
				4593	/// \c VINSERTF128 instruction.
				4594	///
				4595	/// \param __addr_hi
				4596	/// A pointer to a 128-bit memory location containing 4 consecutive
				4597	/// single-precision floating-point values. These values are to be copied
				4598	/// to bits[255:128] of the result. The address of the memory location does
				4599	/// not have to be aligned.
				4600	/// \param __addr_lo
				4601	/// A pointer to a 128-bit memory location containing 4 consecutive
				4602	/// single-precision floating-point values. These values are to be copied
				4603	/// to bits[127:0] of the result. The address of the memory location does not
				4604	/// have to be aligned.
				4605	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4606	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4607	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4608	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4609	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4610	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				4611	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4612	}
				4613
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4614	/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
				4615	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4616	/// of [4 x double] by concatenating the two 128-bit vectors.
				4617	///
				4618	/// \headerfile <x86intrin.h>
				4619	///
				4620	/// This intrinsic corresponds to load instructions followed by the
				4621	/// \c VINSERTF128 instruction.
				4622	///
				4623	/// \param __addr_hi
				4624	/// A pointer to a 128-bit memory location containing two consecutive
				4625	/// double-precision floating-point values. These values are to be copied
				4626	/// to bits[255:128] of the result. The address of the memory location does
				4627	/// not have to be aligned.
				4628	/// \param __addr_lo
				4629	/// A pointer to a 128-bit memory location containing two consecutive
				4630	/// double-precision floating-point values. These values are to be copied
				4631	/// to bits[127:0] of the result. The address of the memory location does not
				4632	/// have to be aligned.
				4633	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4634	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4635	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4636	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4637	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4638	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				4639	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4640	}
				4641
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4642	/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
				4643	/// constructs a 256-bit integer vector by concatenating the two 128-bit
				4644	/// vectors.
				4645	///
				4646	/// \headerfile <x86intrin.h>
				4647	///
				4648	/// This intrinsic corresponds to load instructions followed by the
				4649	/// \c VINSERTF128 instruction.
				4650	///
				4651	/// \param __addr_hi
				4652	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4653	/// vector. This vector is to be copied to bits[255:128] of the result. The
				4654	/// address of the memory location does not have to be aligned.
				4655	/// \param __addr_lo
				4656	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4657	/// vector. This vector is to be copied to bits[127:0] of the result. The
				4658	/// address of the memory location does not have to be aligned.
				4659	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4660	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4661	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4662	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4663	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				4664	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4665	}
				4666
				4667	/* SIMD store ops (unaligned) */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4668	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
				4669	/// vector of [8 x float] into two different unaligned memory locations.
				4670	///
				4671	/// \headerfile <x86intrin.h>
				4672	///
				4673	/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
				4674	/// instructions.
				4675	///
				4676	/// \param __addr_hi
				4677	/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
				4678	/// copied to this memory location. The address of this memory location does
				4679	/// not have to be aligned.
				4680	/// \param __addr_lo
				4681	/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
				4682	/// copied to this memory location. The address of this memory location does
				4683	/// not have to be aligned.
				4684	/// \param __a
				4685	/// A 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4686	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4687	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4688	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4689	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4690
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4691	__v128 = _mm256_castps256_ps128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4692	_mm_storeu_ps(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4693	__v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4694	_mm_storeu_ps(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4695	}
				4696
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4697	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
				4698	/// vector of [4 x double] into two different unaligned memory locations.
				4699	///
				4700	/// \headerfile <x86intrin.h>
				4701	///
				4702	/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
				4703	/// instructions.
				4704	///
				4705	/// \param __addr_hi
				4706	/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
				4707	/// copied to this memory location. The address of this memory location does
				4708	/// not have to be aligned.
				4709	/// \param __addr_lo
				4710	/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
				4711	/// copied to this memory location. The address of this memory location does
				4712	/// not have to be aligned.
				4713	/// \param __a
				4714	/// A 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4715	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4716	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4717	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4718	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4719
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4720	__v128 = _mm256_castpd256_pd128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4721	_mm_storeu_pd(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4722	__v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4723	_mm_storeu_pd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4724	}
				4725
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4726	/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
				4727	/// two different unaligned memory locations.
				4728	///
				4729	/// \headerfile <x86intrin.h>
				4730	///
				4731	/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
				4732	/// instructions.
				4733	///
				4734	/// \param __addr_hi
				4735	/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
				4736	/// copied to this memory location. The address of this memory location does
				4737	/// not have to be aligned.
				4738	/// \param __addr_lo
				4739	/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
				4740	/// copied to this memory location. The address of this memory location does
				4741	/// not have to be aligned.
				4742	/// \param __a
				4743	/// A 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4744	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4745	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4746	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4747	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4748
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4749	__v128 = _mm256_castsi256_si128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4750	_mm_storeu_si128(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4751	__v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4752	_mm_storeu_si128(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4753	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	4754
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4755	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
				4756	/// concatenating two 128-bit floating-point vectors of [4 x float].
				4757	///
				4758	/// \headerfile <x86intrin.h>
				4759	///
				4760	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4761	///
				4762	/// \param __hi
				4763	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4764	/// 128 bits of the result.
				4765	/// \param __lo
				4766	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4767	/// 128 bits of the result.
				4768	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4769	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4770	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4771	_mm256_set_m128 (__m128 __hi, __m128 __lo)
				4772	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4773	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4774	}
				4775
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4776	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
				4777	/// concatenating two 128-bit floating-point vectors of [2 x double].
				4778	///
				4779	/// \headerfile <x86intrin.h>
				4780	///
				4781	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4782	///
				4783	/// \param __hi
				4784	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4785	/// 128 bits of the result.
				4786	/// \param __lo
				4787	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4788	/// 128 bits of the result.
				4789	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4790	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4791	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4792	_mm256_set_m128d (__m128d __hi, __m128d __lo)
				4793	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4794	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4795	}
				4796
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4797	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
				4798	/// integer vectors.
				4799	///
				4800	/// \headerfile <x86intrin.h>
				4801	///
				4802	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4803	///
				4804	/// \param __hi
				4805	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4806	/// result.
				4807	/// \param __lo
				4808	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4809	/// result.
				4810	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4811	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4812	_mm256_set_m128i (__m128i __hi, __m128i __lo)
				4813	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4814	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4815	}
				4816
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4817	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
				4818	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
				4819	/// similar to _mm256_set_m128, but the order of the input parameters is
				4820	/// swapped.
				4821	///
				4822	/// \headerfile <x86intrin.h>
				4823	///
				4824	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4825	///
				4826	/// \param __lo
				4827	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4828	/// 128 bits of the result.
				4829	/// \param __hi
				4830	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4831	/// 128 bits of the result.
				4832	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4833	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4834	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4835	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
				4836	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4837	return _mm256_set_m128(__hi, __lo);
				4838	}
				4839
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4840	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
				4841	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
				4842	/// similar to _mm256_set_m128d, but the order of the input parameters is
				4843	/// swapped.
				4844	///
				4845	/// \headerfile <x86intrin.h>
				4846	///
				4847	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4848	///
				4849	/// \param __lo
				4850	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4851	/// 128 bits of the result.
				4852	/// \param __hi
				4853	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4854	/// 128 bits of the result.
				4855	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4856	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4857	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4858	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
				4859	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4860	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4861	}
				4862
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4863	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
				4864	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
				4865	/// the input parameters is swapped.
				4866	///
				4867	/// \headerfile <x86intrin.h>
				4868	///
				4869	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4870	///
				4871	/// \param __lo
				4872	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4873	/// result.
				4874	/// \param __hi
				4875	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4876	/// result.
				4877	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4878	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4879	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
				4880	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4881	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4882	}
				4883
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4884	#undef __DEFAULT_FN_ATTRS
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	4885
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	4886	#endif /* __AVXINTRIN_H */