Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: 0fda7e4fe53707e4476a33f9df741196a1a7fb3f [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
/* Internal 256-bit (32-byte) vector types, one per element type. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned counterparts of the integer vector types above. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* Plain char has implementation-defined signedness, so an explicitly signed
 * byte vector is provided as well. It is an implementation detail and should
 * not appear in the public interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types used in the intrinsic signatures. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
				51
/* Attributes applied to every intrinsic function defined in this file:
 * always inlined, no debug info, and compiled for the "avx" target feature. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	54
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	55	/* Arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	56	/// \brief Adds two 256-bit vectors of [4 x double].
				57	///
				58	/// \headerfile <x86intrin.h>
				59	///
				60	/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
				61	///
				62	/// \param __a
				63	/// A 256-bit vector of [4 x double] containing one of the source operands.
				64	/// \param __b
				65	/// A 256-bit vector of [4 x double] containing one of the source operands.
				66	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				67	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	68	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	69	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	70	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	71	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	72	}
				73
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	74	/// \brief Adds two 256-bit vectors of [8 x float].
				75	///
				76	/// \headerfile <x86intrin.h>
				77	///
				78	/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
				79	///
				80	/// \param __a
				81	/// A 256-bit vector of [8 x float] containing one of the source operands.
				82	/// \param __b
				83	/// A 256-bit vector of [8 x float] containing one of the source operands.
				84	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				85	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	86	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	87	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	88	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	89	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	90	}
				91
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	92	/// \brief Subtracts two 256-bit vectors of [4 x double].
				93	///
				94	/// \headerfile <x86intrin.h>
				95	///
				96	/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
				97	///
				98	/// \param __a
				99	/// A 256-bit vector of [4 x double] containing the minuend.
				100	/// \param __b
				101	/// A 256-bit vector of [4 x double] containing the subtrahend.
				102	/// \returns A 256-bit vector of [4 x double] containing the differences between
				103	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	104	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	105	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	106	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	107	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	108	}
				109
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	110	/// \brief Subtracts two 256-bit vectors of [8 x float].
				111	///
				112	/// \headerfile <x86intrin.h>
				113	///
				114	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
				115	///
				116	/// \param __a
				117	/// A 256-bit vector of [8 x float] containing the minuend.
				118	/// \param __b
				119	/// A 256-bit vector of [8 x float] containing the subtrahend.
				120	/// \returns A 256-bit vector of [8 x float] containing the differences between
				121	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	122	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	123	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	124	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	125	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	126	}
				127
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	128	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				129	/// two 256-bit vectors of [4 x double].
				130	///
				131	/// \headerfile <x86intrin.h>
				132	///
				133	/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
				134	///
				135	/// \param __a
				136	/// A 256-bit vector of [4 x double] containing the left source operand.
				137	/// \param __b
				138	/// A 256-bit vector of [4 x double] containing the right source operand.
				139	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				140	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	141	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	142	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	143	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	144	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	145	}
				146
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	147	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				148	/// two 256-bit vectors of [8 x float].
				149	///
				150	/// \headerfile <x86intrin.h>
				151	///
				152	/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
				153	///
				154	/// \param __a
				155	/// A 256-bit vector of [8 x float] containing the left source operand.
				156	/// \param __b
				157	/// A 256-bit vector of [8 x float] containing the right source operand.
				158	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				159	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	160	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	161	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	162	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	163	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	164	}
				165
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	166	/// \brief Divides two 256-bit vectors of [4 x double].
				167	///
				168	/// \headerfile <x86intrin.h>
				169	///
				170	/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
				171	///
				172	/// \param __a
				173	/// A 256-bit vector of [4 x double] containing the dividend.
				174	/// \param __b
				175	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	176	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				177	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	178	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	179	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	180	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	181	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	182	}
				183
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	184	/// \brief Divides two 256-bit vectors of [8 x float].
				185	///
				186	/// \headerfile <x86intrin.h>
				187	///
				188	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
				189	///
				190	/// \param __a
				191	/// A 256-bit vector of [8 x float] containing the dividend.
				192	/// \param __b
				193	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	194	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				195	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	196	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	197	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	198	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	199	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	200	}
				201
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	202	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				203	/// of each pair of values.
				204	///
				205	/// \headerfile <x86intrin.h>
				206	///
				207	/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
				208	///
				209	/// \param __a
				210	/// A 256-bit vector of [4 x double] containing one of the operands.
				211	/// \param __b
				212	/// A 256-bit vector of [4 x double] containing one of the operands.
				213	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				214	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	215	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	216	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	217	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	218	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	219	}
				220
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	221	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				222	/// of each pair of values.
				223	///
				224	/// \headerfile <x86intrin.h>
				225	///
				226	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
				227	///
				228	/// \param __a
				229	/// A 256-bit vector of [8 x float] containing one of the operands.
				230	/// \param __b
				231	/// A 256-bit vector of [8 x float] containing one of the operands.
				232	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				233	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	234	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	235	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	236	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	237	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	238	}
				239
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	240	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				241	/// of each pair of values.
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
				245	/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
				246	///
				247	/// \param __a
				248	/// A 256-bit vector of [4 x double] containing one of the operands.
				249	/// \param __b
				250	/// A 256-bit vector of [4 x double] containing one of the operands.
				251	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				252	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	253	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	254	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	255	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	256	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	257	}
				258
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	259	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				260	/// of each pair of values.
				261	///
				262	/// \headerfile <x86intrin.h>
				263	///
				264	/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
				265	///
				266	/// \param __a
				267	/// A 256-bit vector of [8 x float] containing one of the operands.
				268	/// \param __b
				269	/// A 256-bit vector of [8 x float] containing one of the operands.
				270	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				271	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	272	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	273	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	274	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	275	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	276	}
				277
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	278	/// \brief Multiplies two 256-bit vectors of [4 x double].
				279	///
				280	/// \headerfile <x86intrin.h>
				281	///
				282	/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
				283	///
				284	/// \param __a
				285	/// A 256-bit vector of [4 x double] containing one of the operands.
				286	/// \param __b
				287	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	288	/// \returns A 256-bit vector of [4 x double] containing the products of both
				289	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	290	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	291	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	292	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	293	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	294	}
				295
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	296	/// \brief Multiplies two 256-bit vectors of [8 x float].
				297	///
				298	/// \headerfile <x86intrin.h>
				299	///
				300	/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
				301	///
				302	/// \param __a
				303	/// A 256-bit vector of [8 x float] containing one of the operands.
				304	/// \param __b
				305	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	306	/// \returns A 256-bit vector of [8 x float] containing the products of both
				307	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	308	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	309	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	311	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	312	}
				313
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	314	/// \brief Calculates the square roots of the values in a 256-bit vector of
				315	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	316	///
				317	/// \headerfile <x86intrin.h>
				318	///
				319	/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
				320	///
				321	/// \param __a
				322	/// A 256-bit vector of [4 x double].
				323	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				324	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	325	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	326	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	327	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	328	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	329	}
				330
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	331	/// \brief Calculates the square roots of the values in a 256-bit vector of
				332	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	333	///
				334	/// \headerfile <x86intrin.h>
				335	///
				336	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
				337	///
				338	/// \param __a
				339	/// A 256-bit vector of [8 x float].
				340	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				341	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	342	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	343	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	344	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	345	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	346	}
				347
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	348	/// \brief Calculates the reciprocal square roots of the values in a 256-bit
				349	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	350	///
				351	/// \headerfile <x86intrin.h>
				352	///
				353	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
				354	///
				355	/// \param __a
				356	/// A 256-bit vector of [8 x float].
				357	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				358	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	359	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	360	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	361	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	362	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	363	}
				364
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	365	/// \brief Calculates the reciprocals of the values in a 256-bit vector of
				366	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	367	///
				368	/// \headerfile <x86intrin.h>
				369	///
				370	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
				371	///
				372	/// \param __a
				373	/// A 256-bit vector of [8 x float].
				374	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				375	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	376	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	377	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	378	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	379	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	380	}
				381
/// \brief Rounds each value in a 256-bit vector of [4 x double] to an integer
///    value, using the rounding behavior selected by the byte operand. The
///    results are returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	413
/// \brief Rounds each value in a 256-bit vector of [8 x float] to an integer
///    value, using the rounding behavior selected by the byte operand. The
///    results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	445
/// \brief Rounds each value in a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd(V, _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	462
/// \brief Rounds each value in a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd(V, _MM_FROUND_FLOOR)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	480
/// \brief Rounds each value in a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps(V, _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	497
/// \brief Rounds each value in a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps(V, _MM_FROUND_FLOOR)
				514
				515	/* Logical */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	516	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				517	///
				518	/// \headerfile <x86intrin.h>
				519	///
				520	/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
				521	///
				522	/// \param __a
				523	/// A 256-bit vector of [4 x double] containing one of the source operands.
				524	/// \param __b
				525	/// A 256-bit vector of [4 x double] containing one of the source operands.
				526	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				527	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	528	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	529	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	530	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	531	return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	532	}
				533
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	534	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				535	///
				536	/// \headerfile <x86intrin.h>
				537	///
				538	/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
				539	///
				540	/// \param __a
				541	/// A 256-bit vector of [8 x float] containing one of the source operands.
				542	/// \param __b
				543	/// A 256-bit vector of [8 x float] containing one of the source operands.
				544	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				545	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	546	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	547	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	548	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	549	return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	550	}
				551
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	552	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				553	/// the one's complement of the values contained in the first source operand.
				554	///
				555	/// \headerfile <x86intrin.h>
				556	///
				557	/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
				558	///
				559	/// \param __a
				560	/// A 256-bit vector of [4 x double] containing the left source operand. The
				561	/// one's complement of this value is used in the bitwise AND.
				562	/// \param __b
				563	/// A 256-bit vector of [4 x double] containing the right source operand.
				564	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				565	/// values of the second operand and the one's complement of the first
				566	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	567	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	568	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	569	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	570	return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	571	}
				572
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	573	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				574	/// the one's complement of the values contained in the first source operand.
				575	///
				576	/// \headerfile <x86intrin.h>
				577	///
				578	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
				579	///
				580	/// \param __a
				581	/// A 256-bit vector of [8 x float] containing the left source operand. The
				582	/// one's complement of this value is used in the bitwise AND.
				583	/// \param __b
				584	/// A 256-bit vector of [8 x float] containing the right source operand.
				585	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				586	/// values of the second operand and the one's complement of the first
				587	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	588	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	589	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	590	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	591	return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	592	}
				593
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	594	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				595	///
				596	/// \headerfile <x86intrin.h>
				597	///
				598	/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
				599	///
				600	/// \param __a
				601	/// A 256-bit vector of [4 x double] containing one of the source operands.
				602	/// \param __b
				603	/// A 256-bit vector of [4 x double] containing one of the source operands.
				604	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				605	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	606	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	607	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	608	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	609	return (__m256d)((__v4du)__a \| (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	610	}
				611
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	612	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				613	///
				614	/// \headerfile <x86intrin.h>
				615	///
				616	/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
				617	///
				618	/// \param __a
				619	/// A 256-bit vector of [8 x float] containing one of the source operands.
				620	/// \param __b
				621	/// A 256-bit vector of [8 x float] containing one of the source operands.
				622	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				623	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	624	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	625	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	626	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	627	return (__m256)((__v8su)__a \| (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	628	}
				629
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	630	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				631	///
				632	/// \headerfile <x86intrin.h>
				633	///
				634	/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
				635	///
				636	/// \param __a
				637	/// A 256-bit vector of [4 x double] containing one of the source operands.
				638	/// \param __b
				639	/// A 256-bit vector of [4 x double] containing one of the source operands.
				640	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				641	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	642	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	643	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	644	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	645	return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	646	}
				647
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	648	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				649	///
				650	/// \headerfile <x86intrin.h>
				651	///
				652	/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
				653	///
				654	/// \param __a
				655	/// A 256-bit vector of [8 x float] containing one of the source operands.
				656	/// \param __b
				657	/// A 256-bit vector of [8 x float] containing one of the source operands.
				658	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				659	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	660	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	661	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	662	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	663	return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	664	}
				665
				666	/* Horizontal arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	667	/// \brief Horizontally adds the adjacent pairs of values contained in two
				668	/// 256-bit vectors of [4 x double].
				669	///
				670	/// \headerfile <x86intrin.h>
				671	///
				672	/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
				673	///
				674	/// \param __a
				675	/// A 256-bit vector of [4 x double] containing one of the source operands.
				676	/// The horizontal sums of the values are returned in the even-indexed
				677	/// elements of a vector of [4 x double].
				678	/// \param __b
				679	/// A 256-bit vector of [4 x double] containing one of the source operands.
				680	/// The horizontal sums of the values are returned in the odd-indexed
				681	/// elements of a vector of [4 x double].
				682	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				683	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	684	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	685	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	686	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	687	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	688	}
				689
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	690	/// \brief Horizontally adds the adjacent pairs of values contained in two
				691	/// 256-bit vectors of [8 x float].
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
				695	/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
				696	///
				697	/// \param __a
				698	/// A 256-bit vector of [8 x float] containing one of the source operands.
				699	/// The horizontal sums of the values are returned in the elements with
				700	/// index 0, 1, 4, 5 of a vector of [8 x float].
				701	/// \param __b
				702	/// A 256-bit vector of [8 x float] containing one of the source operands.
				703	/// The horizontal sums of the values are returned in the elements with
				704	/// index 2, 3, 6, 7 of a vector of [8 x float].
				705	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				706	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	707	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	708	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	709	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	710	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	711	}
				712
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	713	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				714	/// 256-bit vectors of [4 x double].
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
				718	/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
				719	///
				720	/// \param __a
				721	/// A 256-bit vector of [4 x double] containing one of the source operands.
				722	/// The horizontal differences between the values are returned in the
				723	/// even-indexed elements of a vector of [4 x double].
				724	/// \param __b
				725	/// A 256-bit vector of [4 x double] containing one of the source operands.
				726	/// The horizontal differences between the values are returned in the
				727	/// odd-indexed elements of a vector of [4 x double].
				728	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				729	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	730	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	731	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	732	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	733	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	734	}
				735
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	736	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				737	/// 256-bit vectors of [8 x float].
				738	///
				739	/// \headerfile <x86intrin.h>
				740	///
				741	/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
				742	///
				743	/// \param __a
				744	/// A 256-bit vector of [8 x float] containing one of the source operands.
				745	/// The horizontal differences between the values are returned in the
				746	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				747	/// \param __b
				748	/// A 256-bit vector of [8 x float] containing one of the source operands.
				749	/// The horizontal differences between the values are returned in the
				750	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				751	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				752	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	753	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	754	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	755	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	756	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	757	}
				758
				759	/* Vector permutations */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	760	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				761	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	762	///
				763	/// \headerfile <x86intrin.h>
				764	///
				765	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				766	///
				767	/// \param __a
				768	/// A 128-bit vector of [2 x double].
				769	/// \param __c
				770	/// A 128-bit integer vector operand specifying how the values are to be
				771	/// copied.
				772	/// Bit [1]:
				773	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	774	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	775	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	776	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	777	/// Bit [65]:
				778	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	779	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	780	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	781	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	782	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	783	static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	784	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	785	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	786	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	787	}
				788
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	789	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	790	/// specified by the 256-bit integer vector operand.
				791	///
				792	/// \headerfile <x86intrin.h>
				793	///
				794	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				795	///
				796	/// \param __a
				797	/// A 256-bit vector of [4 x double].
				798	/// \param __c
				799	/// A 256-bit integer vector operand specifying how the values are to be
				800	/// copied.
				801	/// Bit [1]:
				802	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	803	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	804	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	805	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	806	/// Bit [65]:
				807	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	808	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	809	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	810	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	811	/// Bit [129]:
				812	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	813	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	814	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	815	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	816	/// Bit [193]:
				817	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	818	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	819	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	820	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	821	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	822	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	823	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	824	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	825	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	826	}
				827
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	828	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				829	/// specified by the 128-bit integer vector operand.
				830	///
				831	/// \headerfile <x86intrin.h>
				832	///
				833	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				834	///
				835	/// \param __a
				836	/// A 128-bit vector of [4 x float].
				837	/// \param __c
				838	/// A 128-bit integer vector operand specifying how the values are to be
				839	/// copied.
				840	/// Bits [1:0]:
				841	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	842	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	843	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	844	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	845	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	846	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	847	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	848	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	849	/// Bits [33:32]:
				850	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	851	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	852	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	853	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	854	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	855	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	856	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	857	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	858	/// Bits [65:64]:
				859	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	860	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	861	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	862	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	863	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	864	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	865	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	866	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	867	/// Bits [97:96]:
				868	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	869	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	870	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	871	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	872	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	873	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	874	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	875	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	876	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	877	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	878	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	879	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	880	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	881	}
				882
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	883	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				884	/// specified by the 256-bit integer vector operand.
				885	///
				886	/// \headerfile <x86intrin.h>
				887	///
				888	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				889	///
				890	/// \param __a
				891	/// A 256-bit vector of [8 x float].
				892	/// \param __c
				893	/// A 256-bit integer vector operand specifying how the values are to be
				894	/// copied.
				895	/// Bits [1:0]:
				896	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	897	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	898	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	899	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	900	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	901	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	902	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	903	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	904	/// Bits [33:32]:
				905	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	906	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	907	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	908	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	909	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	910	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	911	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	912	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	913	/// Bits [65:64]:
				914	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	915	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	916	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	917	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	918	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	919	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	920	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	921	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	922	/// Bits [97:96]:
				923	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	924	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	925	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	926	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	927	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	928	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	929	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	930	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	931	/// Bits [129:128]:
				932	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	933	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	934	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	935	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	936	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	937	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	938	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	939	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	940	/// Bits [161:160]:
				941	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	942	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	943	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	944	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	945	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	946	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	947	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	948	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	949	/// Bits [193:192]:
				950	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	951	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	952	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	953	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	954	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	955	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	956	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	957	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	958	/// Bits [225:224]:
				959	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	960	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	961	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	962	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	963	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	964	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	965	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	966	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	967	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	968	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	969	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	970	{
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	971	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	972	}
				973
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	974	/// \brief Copies the values in a 128-bit vector of [2 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	975	/// specified by the immediate integer operand.
				976	///
				977	/// \headerfile <x86intrin.h>
				978	///
				979	/// \code
				980	/// __m128d _mm_permute_pd(__m128d A, const int C);
				981	/// \endcode
				982	///
				983	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				984	///
				985	/// \param A
				986	/// A 128-bit vector of [2 x double].
				987	/// \param C
				988	/// An immediate integer operand specifying how the values are to be copied.
				989	/// Bit [0]:
				990	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	991	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	992	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	993	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	994	/// Bit [1]:
				995	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	996	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	997	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	998	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	999	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1000	#define _mm_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1001	(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1002	(__v2df)_mm_undefined_pd(), \
				1003	((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1004
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1005	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1006	/// specified by the immediate integer operand.
				1007	///
				1008	/// \headerfile <x86intrin.h>
				1009	///
				1010	/// \code
				1011	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1012	/// \endcode
				1013	///
				1014	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				1015	///
				1016	/// \param A
				1017	/// A 256-bit vector of [4 x double].
				1018	/// \param C
				1019	/// An immediate integer operand specifying how the values are to be copied.
				1020	/// Bit [0]:
				1021	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1022	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1023	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1024	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1025	/// Bit [1]:
				1026	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1027	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1028	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1029	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1030	/// Bit [2]:
				1031	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1032	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1033	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1034	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1035	/// Bit [3]:
				1036	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1037	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1038	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1039	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1040	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1041	#define _mm256_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1042	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1043	(__v4df)_mm256_undefined_pd(), \
				1044	0 + (((C) >> 0) & 0x1), \
				1045	0 + (((C) >> 1) & 0x1), \
				1046	2 + (((C) >> 2) & 0x1), \
				1047	2 + (((C) >> 3) & 0x1)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1048
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1049	/// \brief Copies the values in a 128-bit vector of [4 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1050	/// specified by the immediate integer operand.
				1051	///
				1052	/// \headerfile <x86intrin.h>
				1053	///
				1054	/// \code
				1055	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1056	/// \endcode
				1057	///
				1058	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1059	///
				1060	/// \param A
				1061	/// A 128-bit vector of [4 x float].
				1062	/// \param C
				1063	/// An immediate integer operand specifying how the values are to be copied.
				1064	/// Bits [1:0]:
				1065	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1066	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1067	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1068	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1069	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1070	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1071	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1072	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1073	/// Bits [3:2]:
				1074	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1075	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1076	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1077	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1078	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1079	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1080	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1081	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1082	/// Bits [5:4]:
				1083	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1084	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1085	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1086	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1087	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1088	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1089	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1090	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1091	/// Bits [7:6]:
				1092	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1093	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1094	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1095	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1096	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1097	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1098	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1099	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1100	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1101	#define _mm_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1102	(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1103	(__v4sf)_mm_undefined_ps(), \
				1104	((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
				1105	((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1106
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1107	/// \brief Copies the values in a 256-bit vector of [8 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1108	/// specified by the immediate integer operand.
				1109	///
				1110	/// \headerfile <x86intrin.h>
				1111	///
				1112	/// \code
				1113	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1114	/// \endcode
				1115	///
				1116	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1117	///
				1118	/// \param A
				1119	/// A 256-bit vector of [8 x float].
				1120	/// \param C
				1121	/// An immediate integer operand specifying how the values are to be copied.
				1122	/// Bits [1:0]:
				1123	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1124	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1125	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1126	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1127	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1128	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1129	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1130	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1131	/// Bits [3:2]:
				1132	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1133	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1134	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1135	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1136	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1137	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1138	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1139	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1140	/// Bits [5:4]:
				1141	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1142	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1143	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1144	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1145	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1146	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1147	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1148	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1149	/// Bits [7:6]:
				1150	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1151	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1152	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1153	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1154	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1155	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1156	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1157	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1158	/// Bits [1:0]:
				1159	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1160	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1161	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1162	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1163	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1164	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1165	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1166	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1167	/// Bits [3:2]:
				1168	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1169	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1170	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1171	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1172	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1173	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1174	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1175	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1176	/// Bits [5:4]:
				1177	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1178	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1179	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1180	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1181	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1182	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1183	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1184	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1185	/// Bits [7:6]:
				1186	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1187	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1188	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1189	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1190	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1191	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1192	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1193	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1194	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1195	#define _mm256_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1196	(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1197	(__v8sf)_mm256_undefined_ps(), \
				1198	0 + (((C) >> 0) & 0x3), \
				1199	0 + (((C) >> 2) & 0x3), \
				1200	0 + (((C) >> 4) & 0x3), \
				1201	0 + (((C) >> 6) & 0x3), \
				1202	4 + (((C) >> 0) & 0x3), \
				1203	4 + (((C) >> 2) & 0x3), \
				1204	4 + (((C) >> 4) & 0x3), \
				1205	4 + (((C) >> 6) & 0x3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1206
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1207	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1208	/// [4 x double], as specified by the immediate integer operand.
				1209	///
				1210	/// \headerfile <x86intrin.h>
				1211	///
				1212	/// \code
				1213	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1214	/// \endcode
				1215	///
				1216	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1217	///
				1218	/// \param V1
				1219	/// A 256-bit vector of [4 x double].
				1220	/// \param V2
				1221	/// A 256-bit vector of [4 x double].
				1222	/// \param M
				1223	/// An immediate integer operand specifying how the values are to be
				1224	/// permuted.
				1225	/// Bits [1:0]:
				1226	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1227	/// destination.
				1228	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1229	/// destination.
				1230	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1231	/// destination.
				1232	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1233	/// destination.
				1234	/// Bits [5:4]:
				1235	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1236	/// destination.
				1237	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1238	/// destination.
				1239	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1240	/// destination.
				1241	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1242	/// destination.
				1243	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1244	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1245	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
				1246	(__v4df)(__m256d)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1247
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1248	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1249	/// [8 x float], as specified by the immediate integer operand.
				1250	///
				1251	/// \headerfile <x86intrin.h>
				1252	///
				1253	/// \code
				1254	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1255	/// \endcode
				1256	///
				1257	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1258	///
				1259	/// \param V1
				1260	/// A 256-bit vector of [8 x float].
				1261	/// \param V2
				1262	/// A 256-bit vector of [8 x float].
				1263	/// \param M
				1264	/// An immediate integer operand specifying how the values are to be
				1265	/// permuted.
				1266	/// Bits [1:0]:
				1267	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1268	/// destination.
				1269	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1270	/// destination.
				1271	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1272	/// destination.
				1273	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1274	/// destination.
				1275	/// Bits [5:4]:
				1276	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1277	/// destination.
				1278	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1279	/// destination.
				1280	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1281	/// destination.
				1282	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1283	/// destination.
				1284	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1285	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1286	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
				1287	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1288
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1289	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
				1290	/// as specified by the immediate integer operand.
				1291	///
				1292	/// \headerfile <x86intrin.h>
				1293	///
				1294	/// \code
				1295	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1296	/// \endcode
				1297	///
				1298	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1299	///
				1300	/// \param V1
				1301	/// A 256-bit integer vector.
				1302	/// \param V2
				1303	/// A 256-bit integer vector.
				1304	/// \param M
				1305	/// An immediate integer operand specifying how the values are to be copied.
				1306	/// Bits [1:0]:
				1307	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1308	/// destination.
				1309	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1310	/// destination.
				1311	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1312	/// destination.
				1313	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1314	/// destination.
				1315	/// Bits [5:4]:
				1316	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1317	/// destination.
				1318	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1319	/// destination.
				1320	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1321	/// destination.
				1322	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1323	/// destination.
				1324	/// \returns A 256-bit integer vector containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1325	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1326	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
				1327	(__v8si)(__m256i)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1328
				1329	/* Vector Blend */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1330	/// \brief Merges 64-bit double-precision data values stored in either of the
				1331	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1332	/// integer operand.
				1333	///
				1334	/// \headerfile <x86intrin.h>
				1335	///
				1336	/// \code
				1337	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1338	/// \endcode
				1339	///
				1340	/// This intrinsic corresponds to the \c VBLENDPD instruction.
				1341	///
				1342	/// \param V1
				1343	/// A 256-bit vector of [4 x double].
				1344	/// \param V2
				1345	/// A 256-bit vector of [4 x double].
				1346	/// \param M
				1347	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1348	/// values are to be copied. The position of the mask bit corresponds to the
				1349	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				1350	/// element in operand V1 is copied to the same position in the destination.
				1351	/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is
				1352	/// copied to the same position in the destination.
				1353	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1354	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1355	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				1356	(__v4df)(__m256d)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1357	(((M) & 0x01) ? 4 : 0), \
				1358	(((M) & 0x02) ? 5 : 1), \
				1359	(((M) & 0x04) ? 6 : 2), \
				1360	(((M) & 0x08) ? 7 : 3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1361
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1362	/// \brief Merges 32-bit single-precision data values stored in either of the
				1363	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1364	/// integer operand.
				1365	///
				1366	/// \headerfile <x86intrin.h>
				1367	///
				1368	/// \code
				1369	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1370	/// \endcode
				1371	///
				1372	/// This intrinsic corresponds to the \c VBLENDPS instruction.
				1373	///
				1374	/// \param V1
				1375	/// A 256-bit vector of [8 x float].
				1376	/// \param V2
				1377	/// A 256-bit vector of [8 x float].
				1378	/// \param M
				1379	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1380	/// values are to be copied. The position of the mask bit corresponds to the
				1381	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				1382	/// element in operand V1 is copied to the same position in the destination.
				1383	/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is
				1384	/// copied to the same position in the destination.
				1385	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1386	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1387	(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				1388	(__v8sf)(__m256)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1389	(((M) & 0x01) ? 8 : 0), \
				1390	(((M) & 0x02) ? 9 : 1), \
				1391	(((M) & 0x04) ? 10 : 2), \
				1392	(((M) & 0x08) ? 11 : 3), \
				1393	(((M) & 0x10) ? 12 : 4), \
				1394	(((M) & 0x20) ? 13 : 5), \
				1395	(((M) & 0x40) ? 14 : 6), \
				1396	(((M) & 0x80) ? 15 : 7)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1397
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1398	/// \brief Merges 64-bit double-precision data values stored in either of the
				1399	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1400	/// operand.
				1401	///
				1402	/// \headerfile <x86intrin.h>
				1403	///
				1404	/// This intrinsic corresponds to the \c VBLENDVPD instruction.
				1405	///
				1406	/// \param __a
				1407	/// A 256-bit vector of [4 x double].
				1408	/// \param __b
				1409	/// A 256-bit vector of [4 x double].
				1410	/// \param __c
				1411	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1412	/// how the values are to be copied. The position of the mask bit corresponds
				1413	/// to the most significant bit of a copied value. When a mask bit is 0, the
				1414	/// corresponding 64-bit element in operand __a is copied to the same
				1415	/// position in the destination. When a mask bit is 1, the corresponding
				1416	/// 64-bit element in operand __b is copied to the same position in the
				1417	/// destination.
				1418	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1419	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1420	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1421	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1422	return (__m256d)__builtin_ia32_blendvpd256(
				1423	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1424	}
				1425
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1426	/// \brief Merges 32-bit single-precision data values stored in either of the
				1427	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1428	/// operand.
				1429	///
				1430	/// \headerfile <x86intrin.h>
				1431	///
				1432	/// This intrinsic corresponds to the \c VBLENDVPS instruction.
				1433	///
				1434	/// \param __a
				1435	/// A 256-bit vector of [8 x float].
				1436	/// \param __b
				1437	/// A 256-bit vector of [8 x float].
				1438	/// \param __c
				1439	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1440	/// and 31 specifying how the values are to be copied. The position of the
				1441	/// mask bit corresponds to the most significant bit of a copied value. When
				1442	/// a mask bit is 0, the corresponding 32-bit element in operand __a is
				1443	/// copied to the same position in the destination. When a mask bit is 1, the
				1444	/// corresponding 32-bit element in operand __b is copied to the same
				1445	/// position in the destination.
				1446	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1447	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1448	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1449	{
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1450	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1451	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1452	}
				1453
				1454	/* Vector Dot Product */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1455	/// \brief Computes two dot products in parallel, using the lower and upper
				1456	/// halves of two [8 x float] vectors as input to the two computations, and
				1457	/// returning the two dot products in the lower and upper halves of the
				1458	/// [8 x float] result. The immediate integer operand controls which
				1459	/// input elements will contribute to the dot product, and where the final
				1460	/// results are returned. In general, for each dot product, the four
				1461	/// corresponding elements of the input vectors are multiplied; the first
				1462	/// two and second two products are summed, then the two sums are added to
				1463	/// form the final result.
				1464	///
				1465	/// \headerfile <x86intrin.h>
				1466	///
				1467	/// \code
				1468	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1469	/// \endcode
				1470	///
				1471	/// This intrinsic corresponds to the \c VDPPS instruction.
				1472	///
				1473	/// \param V1
				1474	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1475	/// \param V2
				1476	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1477	/// \param M
				1478	/// An immediate integer argument. Bits [7:4] determine which elements of
				1479	/// the input vectors are used, with bit [4] corresponding to the lowest
				1480	/// element and bit [7] corresponding to the highest element of each [4 x
				1481	/// float] subvector. If a bit is set, the corresponding elements from the
				1482	/// two input vectors are used as an input for dot product; otherwise that
				1483	/// input is treated as zero. Bits [3:0] determine which elements of the
				1484	/// result will receive a copy of the final dot product, with bit [0]
				1485	/// corresponding to the lowest element and bit [3] corresponding to the
				1486	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1487	/// product is returned in the corresponding element; otherwise that element
				1488	/// is set to zero. The bitmask is applied in the same way to each of the
				1489	/// two parallel dot product computations.
				1490	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1491	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1492	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
				1493	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1494
				1495	/* Vector shuffle */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1496	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
				1497	/// specified by the immediate value operand. The four selected elements in
				1498	/// each operand are copied to the destination according to the bits
				1499	/// specified in the immediate operand. The selected elements from the first
				1500	/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
				1501	/// destination, and the selected elements from the second 256-bit operand
				1502	/// are copied to bits [127:64] and bits [255:192] of the destination. For
				1503	/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
				1504	/// the 256-bit destination vector would contain the following values: b[7],
				1505	/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
				1506	///
				1507	/// \headerfile <x86intrin.h>
				1508	///
				1509	/// \code
				1510	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1511	/// \endcode
				1512	///
				1513	/// This intrinsic corresponds to the \c VSHUFPS instruction.
				1514	///
				1515	/// \param a
				1516	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1517	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1518	/// according to the bits specified in the immediate operand.
				1519	/// \param b
				1520	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1521	/// operand are copied to bits [127:64] and bits [255:192] in the
				1522	/// destination, according to the bits specified in the immediate operand.
				1523	/// \param mask
				1524	/// An immediate value containing an 8-bit value specifying which elements to
				1525	/// copy from a and b. Bits [3:0] specify the values copied from operand a.
				1526	/// Bits [7:4] specify the values copied from operand b.
				1527	/// The destinations within the 256-bit destination are assigned values as
				1528	/// follows, according to the bit value assignments described below:
				1529	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
				1530	/// destination.
				1531	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
				1532	/// destination.
				1533	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
				1534	/// destination.
				1535	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
				1536	/// the destination.
				1537	/// Bit value assignments:
				1538	/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
				1539	/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
				1540	/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
				1541	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1542	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1543	#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1544	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1545	(__v8sf)(__m256)(b), \
				1546	0 + (((mask) >> 0) & 0x3), \
				1547	0 + (((mask) >> 2) & 0x3), \
				1548	8 + (((mask) >> 4) & 0x3), \
				1549	8 + (((mask) >> 6) & 0x3), \
				1550	4 + (((mask) >> 0) & 0x3), \
				1551	4 + (((mask) >> 2) & 0x3), \
				1552	12 + (((mask) >> 4) & 0x3), \
				1553	12 + (((mask) >> 6) & 0x3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1554
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1555	/// \brief Selects four double-precision values from the 256-bit operands of
				1556	/// [4 x double], as specified by the immediate value operand. The selected
				1557	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1558	/// bits [191:128] in the destination, and the selected elements from the
				1559	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
				1560	/// the destination. For example, if bits [3:0] of the immediate operand
				1561	/// contain a value of 0xF, the 256-bit destination vector would contain the
				1562	/// following values: b[3], a[3], b[1], a[1].
				1563	///
				1564	/// \headerfile <x86intrin.h>
				1565	///
				1566	/// \code
				1567	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1568	/// \endcode
				1569	///
				1570	/// This intrinsic corresponds to the \c VSHUFPD instruction.
				1571	///
				1572	/// \param a
				1573	/// A 256-bit vector of [4 x double].
				1574	/// \param b
				1575	/// A 256-bit vector of [4 x double].
				1576	/// \param mask
				1577	/// An immediate value, of which bits [3:0] specify which elements to
				1578	/// copy from a and b:
				1579	/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
				1580	/// destination.
				1581	/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
				1582	/// destination.
				1583	/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
				1584	/// destination.
				1585	/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
				1586	/// destination.
				1587	/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
				1588	/// destination.
				1589	/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
				1590	/// destination.
				1591	/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
				1592	/// destination.
				1593	/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
				1594	/// destination.
				1595	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1596	#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1597	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1598	(__v4df)(__m256d)(b), \
				1599	0 + (((mask) >> 0) & 0x1), \
				1600	4 + (((mask) >> 1) & 0x1), \
				1601	2 + (((mask) >> 2) & 0x1), \
				1602	6 + (((mask) >> 3) & 0x1)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1603
				1604	/* Compare */
				1605	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1606	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1607	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1608	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1609	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1610	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1611	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
				1612	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
				1613	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
				1614	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
				1615	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1616	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1617	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1618	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1619	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1620	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1621	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1622	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1623	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1624	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1625	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1626	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
				1627	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
				1628	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1629	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
				1630	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
				1631	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1632	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1633	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1634	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1635	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1636	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1637
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1638	/// \brief Compares each of the corresponding double-precision values of two
				1639	/// 128-bit vectors of [2 x double], using the operation specified by the
				1640	/// immediate integer operand. Returns a [2 x double] vector consisting of
				1641	/// two doubles corresponding to the two comparison results: zero if the
				1642	/// comparison is false, and all 1's if the comparison is true.
				1643	///
				1644	/// \headerfile <x86intrin.h>
				1645	///
				1646	/// \code
				1647	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1648	/// \endcode
				1649	///
				1650	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1651	///
				1652	/// \param a
				1653	/// A 128-bit vector of [2 x double].
				1654	/// \param b
				1655	/// A 128-bit vector of [2 x double].
				1656	/// \param c
				1657	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1658	/// operation to use:
				1659	/// 00h, 08h, 10h, 18h: Equal
				1660	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1661	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1662	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1663	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1664	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1665	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1666	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1667	/// Each value also selects ordered or unordered and signaling or
				1668	/// non-signaling behavior; the _CMP_* macros name each value exactly.
				1669	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1670	#define _mm_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1671	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
				1672	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1673
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1674	/// \brief Compares each of the corresponding values of two 128-bit vectors of
				1675	/// [4 x float], using the operation specified by the immediate integer
				1676	/// operand. Returns a [4 x float] vector consisting of four floats
				1677	/// corresponding to the four comparison results: zero if the comparison is
				1678	/// false, and all 1's if the comparison is true.
				1679	///
				1680	/// \headerfile <x86intrin.h>
				1681	///
				1682	/// \code
				1683	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1684	/// \endcode
				1685	///
				1686	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1687	///
				1688	/// \param a
				1689	/// A 128-bit vector of [4 x float].
				1690	/// \param b
				1691	/// A 128-bit vector of [4 x float].
				1692	/// \param c
				1693	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1694	/// operation to use:
				1695	/// 00h, 08h, 10h, 18h: Equal
				1696	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1697	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1698	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1699	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1700	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1701	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1702	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1703	/// Each value also selects ordered or unordered and signaling or
				1704	/// non-signaling behavior; the _CMP_* macros name each value exactly.
				1705	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1706	#define _mm_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1707	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
				1708	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1709
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1710	/// \brief Compares each of the corresponding double-precision values of two
				1711	/// 256-bit vectors of [4 x double], using the operation specified by the
				1712	/// immediate integer operand. Returns a [4 x double] vector consisting of
				1713	/// four doubles corresponding to the four comparison results: zero if the
				1714	/// comparison is false, and all 1's if the comparison is true.
				1715	///
				1716	/// \headerfile <x86intrin.h>
				1717	///
				1718	/// \code
				1719	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1720	/// \endcode
				1721	///
				1722	/// This intrinsic corresponds to the \c VCMPPD instruction.
				1723	///
				1724	/// \param a
				1725	/// A 256-bit vector of [4 x double].
				1726	/// \param b
				1727	/// A 256-bit vector of [4 x double].
				1728	/// \param c
				1729	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1730	/// operation to use:
				1731	/// 00h, 08h, 10h, 18h: Equal
				1732	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1733	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1734	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1735	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1736	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1737	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1738	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1739	/// Each value also selects ordered or unordered and signaling or
				1740	/// non-signaling behavior; the _CMP_* macros name each value exactly.
				1741	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1742	#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1743	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
				1744	(__v4df)(__m256d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1745
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1746	/// \brief Compares each of the corresponding values of two 256-bit vectors of
				1747	/// [8 x float], using the operation specified by the immediate integer
				1748	/// operand. Returns a [8 x float] vector consisting of eight floats
				1749	/// corresponding to the eight comparison results: zero if the comparison is
				1750	/// false, and all 1's if the comparison is true.
				1751	///
				1752	/// \headerfile <x86intrin.h>
				1753	///
				1754	/// \code
				1755	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1756	/// \endcode
				1757	///
				1758	/// This intrinsic corresponds to the \c VCMPPS instruction.
				1759	///
				1760	/// \param a
				1761	/// A 256-bit vector of [8 x float].
				1762	/// \param b
				1763	/// A 256-bit vector of [8 x float].
				1764	/// \param c
				1765	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1766	/// operation to use:
				1767	/// 00h, 08h, 10h, 18h: Equal
				1768	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1769	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1770	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1771	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1772	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1773	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1774	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1775	/// Each value also selects ordered or unordered and signaling or
				1776	/// non-signaling behavior; the _CMP_* macros name each value exactly.
				1777	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1778	#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1779	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
				1780	(__v8sf)(__m256)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1781
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1782	/// \brief Compares each of the corresponding scalar double-precision values of
				1783	/// two 128-bit vectors of [2 x double], using the operation specified by the
				1784	/// immediate integer operand. If the result is true, all 64 bits of the
				1785	/// destination vector are set; otherwise they are cleared.
				1786	///
				1787	/// \headerfile <x86intrin.h>
				1788	///
				1789	/// \code
				1790	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1791	/// \endcode
				1792	///
				1793	/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
				1794	///
				1795	/// \param a
				1796	/// A 128-bit vector of [2 x double].
				1797	/// \param b
				1798	/// A 128-bit vector of [2 x double].
				1799	/// \param c
				1800	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1801	/// operation to use:
				1802	/// 00h, 08h, 10h, 18h: Equal
				1803	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1804	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1805	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1806	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1807	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1808	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1809	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1810	/// Each value also selects ordered or unordered and signaling or
				1811	/// non-signaling behavior; the _CMP_* macros name each value exactly.
				1812	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1813	#define _mm_cmp_sd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1814	(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
				1815	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1816
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1817	/// \brief Compares each of the corresponding scalar values of two 128-bit
				1818	/// vectors of [4 x float], using the operation specified by the immediate
				1819	/// integer operand. If the result is true, all 32 bits of the destination
				1820	/// vector are set; otherwise they are cleared.
				1821	///
				1822	/// \headerfile <x86intrin.h>
				1823	///
				1824	/// \code
				1825	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1826	/// \endcode
				1827	///
				1828	/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
				1829	///
				1830	/// \param a
				1831	/// A 128-bit vector of [4 x float].
				1832	/// \param b
				1833	/// A 128-bit vector of [4 x float].
				1834	/// \param c
				1835	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1836	/// operation to use:
				1837	/// 00h, 08h, 10h, 18h: Equal
				1838	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1839	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1840	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1841	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1842	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1843	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1844	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1845	/// Each value also selects ordered or unordered and signaling or
				1846	/// non-signaling behavior; the _CMP_* macros name each value exactly.
				1847	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1848	#define _mm_cmp_ss(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1849	(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
				1850	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1851
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1852	/// \brief Takes a [8 x i32] vector and returns the vector element value
				1853	/// indexed by the immediate constant operand.
				1854	///
				1855	/// \headerfile <x86intrin.h>
				1856	///
				1857	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE
				1858	/// instruction.
				1859	///
				1860	/// \param __a
				1861	/// A 256-bit vector of [8 x i32].
				1862	/// \param __imm
				1863	/// An immediate integer operand with bits [2:0] determining which vector
				1864	/// element is extracted and returned.
				1865	/// \returns A 32-bit integer containing the extracted 32 bits of extended
				1866	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1867	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1868	_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1869	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1870	__v8si __b = (__v8si)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1871	return __b[__imm & 7];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1872	}
				1873
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1874	/// \brief Takes a [16 x i16] vector and returns the vector element value
				1875	/// indexed by the immediate constant operand.
				1876	///
				1877	/// \headerfile <x86intrin.h>
				1878	///
				1879	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE
				1880	/// instruction.
				1881	///
				1882	/// \param __a
				1883	/// A 256-bit integer vector of [16 x i16].
				1884	/// \param __imm
				1885	/// An immediate integer operand with bits [3:0] determining which vector
				1886	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1887	/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1888	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1889	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1890	_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1891	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1892	__v16hi __b = (__v16hi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1893	return (unsigned short)__b[__imm & 15];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1894	}
				1895
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1896	/// \brief Takes a [32 x i8] vector and returns the vector element value
				1897	/// indexed by the immediate constant operand.
				1898	///
				1899	/// \headerfile <x86intrin.h>
				1900	///
				1901	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE
				1902	/// instruction.
				1903	///
				1904	/// \param __a
				1905	/// A 256-bit integer vector of [32 x i8].
				1906	/// \param __imm
				1907	/// An immediate integer operand with bits [4:0] determining which vector
				1908	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1909	/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
				1910	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1911	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1912	_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1913	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1914	__v32qi __b = (__v32qi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1915	return (unsigned char)__b[__imm & 31];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1916	}
				1917
				1918	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1919	/// \brief Takes a [4 x i64] vector and returns the vector element value
				1920	/// indexed by the immediate constant operand.
				1921	///
				1922	/// \headerfile <x86intrin.h>
				1923	///
				1924	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE
				1925	/// instruction.
				1926	///
				1927	/// \param __a
				1928	/// A 256-bit integer vector of [4 x i64].
				1929	/// \param __imm
				1930	/// An immediate integer operand with bits [1:0] determining which vector
				1931	/// element is extracted and returned.
				1932	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				1933	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1934	static __inline long long __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1935	_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1936	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1937	__v4di __b = (__v4di)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1938	return __b[__imm & 3];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1939	}
				1940	#endif
				1941
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1942	/// \brief Takes a [8 x i32] vector and replaces the vector element value
				1943	/// indexed by the immediate constant operand by a new value. Returns the
				1944	/// modified vector.
				1945	///
				1946	/// \headerfile <x86intrin.h>
				1947	///
				1948	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1949	/// INSERTF128+COMPOSITE instruction.
				1950	///
				1951	/// \param __a
				1952	/// A vector of [8 x i32] to be used by the insert operation.
				1953	/// \param __b
				1954	/// An integer value. The replacement value for the insert operation.
				1955	/// \param __imm
				1956	/// An immediate integer specifying the index of the vector element to be
				1957	/// replaced.
				1958	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1959	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1960	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1961	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1962	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1963	__v8si __c = (__v8si)__a;
				1964	__c[__imm & 7] = __b;
				1965	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1966	}
				1967
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1968
				1969	/// \brief Takes a [16 x i16] vector and replaces the vector element value
				1970	/// indexed by the immediate constant operand with a new value. Returns the
				1971	/// modified vector.
				1972	///
				1973	/// \headerfile <x86intrin.h>
				1974	///
				1975	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1976	/// INSERTF128+COMPOSITE instruction.
				1977	///
				1978	/// \param __a
				1979	/// A vector of [16 x i16] to be used by the insert operation.
				1980	/// \param __b
				1981	/// An i16 integer value. The replacement value for the insert operation.
				1982	/// \param __imm
				1983	/// An immediate integer specifying the index of the vector element to be
				1984	/// replaced.
				1985	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1986	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1987	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1988	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1989	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1990	__v16hi __c = (__v16hi)__a;
				1991	__c[__imm & 15] = __b;
				1992	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1993	}
				1994
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1995	/// \brief Takes a [32 x i8] vector and replaces the vector element value
				1996	/// indexed by the immediate constant operand with a new value. Returns the
				1997	/// modified vector.
				1998	///
				1999	/// \headerfile <x86intrin.h>
				2000	///
				2001	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				2002	/// INSERTF128+COMPOSITE instruction.
				2003	///
				2004	/// \param __a
				2005	/// A vector of [32 x i8] to be used by the insert operation.
				2006	/// \param __b
				2007	/// An i8 integer value. The replacement value for the insert operation.
				2008	/// \param __imm
				2009	/// An immediate integer specifying the index of the vector element to be
				2010	/// replaced.
				2011	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2012	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2013	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2014	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2015	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2016	__v32qi __c = (__v32qi)__a;
				2017	__c[__imm & 31] = __b;
				2018	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2019	}
				2020
				2021	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2022	/// \brief Takes a [4 x i64] vector and replaces the vector element value
				2023	/// indexed by the immediate constant operand with a new value. Returns the
				2024	/// modified vector.
				2025	///
				2026	/// \headerfile <x86intrin.h>
				2027	///
				2028	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				2029	/// INSERTF128+COMPOSITE instruction.
				2030	///
				2031	/// \param __a
				2032	/// A vector of [4 x i64] to be used by the insert operation.
				2033	/// \param __b
				2034	/// A 64-bit integer value. The replacement value for the insert operation.
				2035	/// \param __imm
				2036	/// An immediate integer specifying the index of the vector element to be
				2037	/// replaced.
				2038	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2039	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2040	static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhas	d740029	2015-02-19 19:00:33 +0000	[diff] [blame]	2041	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2042	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2043	__v4di __c = (__v4di)__a;
				2044	__c[__imm & 3] = __b;
				2045	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2046	}
				2047	#endif
				2048
				2049	/* Conversion */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2050	/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
				2051	///
				2052	/// \headerfile <x86intrin.h>
				2053	///
				2054	/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
				2055	///
				2056	/// \param __a
				2057	/// A 128-bit integer vector of [4 x i32].
				2058	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2059	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2060	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2061	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2062	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2063	}
				2064
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2065	/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
				2066	///
				2067	/// \headerfile <x86intrin.h>
				2068	///
				2069	/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
				2070	///
				2071	/// \param __a
				2072	/// A 256-bit integer vector.
				2073	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2074	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2075	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2076	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2077	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2078	}
				2079
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2080	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2081	/// [4 x float].
				2082	///
				2083	/// \headerfile <x86intrin.h>
				2084	///
				2085	/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
				2086	///
				2087	/// \param __a
				2088	/// A 256-bit vector of [4 x double].
				2089	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2090	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2091	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2092	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2093	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2094	}
				2095
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2096	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
				2097	///
				2098	/// \headerfile <x86intrin.h>
				2099	///
				2100	/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
				2101	///
				2102	/// \param __a
				2103	/// A 256-bit vector of [8 x float].
				2104	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2105	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2106	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2107	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2108	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2109	}
				2110
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2111	/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
				2112	/// x double].
				2113	///
				2114	/// \headerfile <x86intrin.h>
				2115	///
				2116	/// This intrinsic corresponds to the \c VCVTPS2PD instruction.
				2117	///
				2118	/// \param __a
				2119	/// A 128-bit vector of [4 x float].
				2120	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2121	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2122	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2123	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2124	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2125	}
				2126
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2127	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
				2128	/// x i32], truncating the result by rounding towards zero when it is
				2129	/// inexact.
				2130	///
				2131	/// \headerfile <x86intrin.h>
				2132	///
				2133	/// This intrinsic corresponds to the \c VCVTTPD2DQ instruction.
				2134	///
				2135	/// \param __a
				2136	/// A 256-bit vector of [4 x double].
				2137	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2138	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2139	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2140	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2141	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2142	}
				2143
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2144	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
				2145	/// x i32]. When a conversion is inexact, the value returned is rounded
				2146	/// according to the rounding control bits in the MXCSR register.
				2147	///
				2148	/// \headerfile <x86intrin.h>
				2149	///
				2150	/// This intrinsic corresponds to the \c VCVTPD2DQ instruction.
				2151	///
				2152	/// \param __a
				2153	/// A 256-bit vector of [4 x double].
				2154	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2155	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2156	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2157	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2158	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2159	}
				2160
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2161	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
				2162	/// truncating the result by rounding towards zero when it is inexact.
				2163	///
				2164	/// \headerfile <x86intrin.h>
				2165	///
				2166	/// This intrinsic corresponds to the \c VCVTTPS2DQ instruction.
				2167	///
				2168	/// \param __a
				2169	/// A 256-bit vector of [8 x float].
				2170	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2171	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2172	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2173	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2174	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2175	}
				2176
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2177	static __inline double __DEFAULT_FN_ATTRS
				2178	_mm256_cvtsd_f64(__m256d __a)
				2179	{
				2180	return __a[0];
				2181	}
				2182
				2183	static __inline int __DEFAULT_FN_ATTRS
				2184	_mm256_cvtsi256_si32(__m256i __a)
				2185	{
				2186	__v8si __b = (__v8si)__a;
				2187	return __b[0];
				2188	}
				2189
				2190	static __inline float __DEFAULT_FN_ATTRS
				2191	_mm256_cvtss_f32(__m256 __a)
				2192	{
				2193	return __a[0];
				2194	}
				2195
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2196	/* Vector replicate */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2197	/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
				2198	/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
				2199	/// Bits [255:224] of __a are written to bits [255:224] and [223:192]
				2200	/// of the return value.
				2201	/// Bits [191:160] of __a are written to bits [191:160] and [159:128]
				2202	/// of the return value.
				2203	/// Bits [127:96] of __a are written to bits [127:96] and [95:64] of
				2204	/// the return value.
				2205	/// Bits [63:32] of __a are written to bits [63:32] and [31:0] of the
				2206	/// return value.
				2207	///
				2208	/// \headerfile <x86intrin.h>
				2209	///
				2210	/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
				2211	///
				2212	/// \param __a
				2213	/// A 256-bit vector of [8 x float].
				2214	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2215	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2216	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2217	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2218	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2219	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2220	}
				2221
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2222	/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
				2223	/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
				2224	/// Bits [223:192] of __a are written to bits [255:224] and [223:192]
				2225	/// of the return value.
				2226	/// Bits [159:128] of __a are written to bits [191:160] and [159:128]
				2227	/// of the return value.
				2228	/// Bits [95:64] of __a are written to bits [127:96] and [95:64] of
				2229	/// the return value.
				2230	/// Bits [31:0] of __a are written to bits [63:32] and [31:0] of the
				2231	/// return value.
				2232	///
				2233	/// \headerfile <x86intrin.h>
				2234	///
				2235	/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
				2236	///
				2237	/// \param __a
				2238	/// A 256-bit vector of [8 x float].
				2239	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2240	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2241	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2242	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2243	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2244	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2245	}
				2246
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2247	/// \brief Moves and duplicates double-precision floating point values from a
				2248	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
				2249	/// vector of [4 x double].
				2250	/// Bits [63:0] of __a are written to bits [127:64] and [63:0] of the
				2251	/// return value.
				2252	/// Bits [191:128] of __a are written to bits [255:192] and [191:128]
				2253	/// of the return value.
				2254	///
				2255	/// \headerfile <x86intrin.h>
				2256	///
				2257	/// This intrinsic corresponds to the \c VMOVDDUP instruction.
				2258	///
				2259	/// \param __a
				2260	/// A 256-bit vector of [4 x double].
				2261	/// \returns A 256-bit vector of [4 x double] containing the moved and
				2262	/// duplicated values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2263	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2264	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2265	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2266	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2267	}
				2268
				2269	/* Unpack and Interleave */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2270	/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
				2271	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2272	///
				2273	/// \headerfile <x86intrin.h>
				2274	///
				2275	/// This intrinsic corresponds to the \c VUNPCKHPD instruction.
				2276	///
				2277	/// \param __a
				2278	/// A 256-bit floating-point vector of [4 x double].
				2279	/// Bits [127:64] are written to bits [63:0] of the return value.
				2280	/// Bits [255:192] are written to bits [191:128] of the return value.
				2281	/// \param __b
				2282	/// A 256-bit floating-point vector of [4 x double].
				2283	/// Bits [127:64] are written to bits [127:64] of the return value.
				2284	/// Bits [255:192] are written to bits [255:192] of the return value.
				2285	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2286	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2287	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2288	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2289	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2290	}
				2291
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2292	/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
				2293	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2294	///
				2295	/// \headerfile <x86intrin.h>
				2296	///
				2297	/// This intrinsic corresponds to the \c VUNPCKLPD instruction.
				2298	///
				2299	/// \param __a
				2300	/// A 256-bit floating-point vector of [4 x double].
				2301	/// Bits [63:0] are written to bits [63:0] of the return value.
				2302	/// Bits [191:128] are written to bits [191:128] of the return value.
				2303	/// \param __b
				2304	/// A 256-bit floating-point vector of [4 x double].
				2305	/// Bits [63:0] are written to bits [127:64] of the return value.
				2306	/// Bits [191:128] are written to bits [255:192] of the return value.
				2307	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2308	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2309	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2311	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2312	}
				2313
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2314	/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
				2315	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2316	/// vector of [8 x float].
				2317	///
				2318	/// \headerfile <x86intrin.h>
				2319	///
				2320	/// This intrinsic corresponds to the \c VUNPCKHPS instruction.
				2321	///
				2322	/// \param __a
				2323	/// A 256-bit vector of [8 x float].
				2324	/// Bits [95:64] are written to bits [31:0] of the return value.
				2325	/// Bits [127:96] are written to bits [95:64] of the return value.
				2326	/// Bits [223:192] are written to bits [159:128] of the return value.
				2327	/// Bits [255:224] are written to bits [223:192] of the return value.
				2328	/// \param __b
				2329	/// A 256-bit vector of [8 x float].
				2330	/// Bits [95:64] are written to bits [63:32] of the return value.
				2331	/// Bits [127:96] are written to bits [127:96] of the return value.
				2332	/// Bits [223:192] are written to bits [191:160] of the return value.
				2333	/// Bits [255:224] are written to bits [255:224] of the return value.
				2334	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2335	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2336	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2337	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2338	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2339	}
				2340
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2341	/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
				2342	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2343	/// vector of [8 x float].
				2344	///
				2345	/// \headerfile <x86intrin.h>
				2346	///
				2347	/// This intrinsic corresponds to the \c VUNPCKLPS instruction.
				2348	///
				2349	/// \param __a
				2350	/// A 256-bit vector of [8 x float].
				2351	/// Bits [31:0] are written to bits [31:0] of the return value.
				2352	/// Bits [63:32] are written to bits [95:64] of the return value.
				2353	/// Bits [159:128] are written to bits [159:128] of the return value.
				2354	/// Bits [191:160] are written to bits [223:192] of the return value.
				2355	/// \param __b
				2356	/// A 256-bit vector of [8 x float].
				2357	/// Bits [31:0] are written to bits [63:32] of the return value.
				2358	/// Bits [63:32] are written to bits [127:96] of the return value.
				2359	/// Bits [159:128] are written to bits [191:160] of the return value.
				2360	/// Bits [191:160] are written to bits [255:224] of the return value.
				2361	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2362	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2363	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2364	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2365	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2366	}
				2367
				2368	/* Bit Test */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2369	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2370	/// element-by-element comparison of the double-precision element in the
				2371	/// first source vector and the corresponding element in the second source
				2372	/// vector. The EFLAGS register is updated as follows:
				2373	/// If there is at least one pair of double-precision elements where the
				2374	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2375	/// ZF flag is set to 1.
				2376	/// If there is at least one pair of double-precision elements where the
				2377	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2378	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2379	/// This intrinsic returns the value of the ZF flag.
				2380	///
				2381	/// \headerfile <x86intrin.h>
				2382	///
				2383	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2384	///
				2385	/// \param __a
				2386	/// A 128-bit vector of [2 x double].
				2387	/// \param __b
				2388	/// A 128-bit vector of [2 x double].
				2389	/// \returns the ZF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2390	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2391	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2392	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2393	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2394	}
				2395
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2396	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2397	/// element-by-element comparison of the double-precision element in the
				2398	/// first source vector and the corresponding element in the second source
				2399	/// vector. The EFLAGS register is updated as follows:
				2400	/// If there is at least one pair of double-precision elements where the
				2401	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2402	/// ZF flag is set to 1.
				2403	/// If there is at least one pair of double-precision elements where the
				2404	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2405	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2406	/// This intrinsic returns the value of the CF flag.
				2407	///
				2408	/// \headerfile <x86intrin.h>
				2409	///
				2410	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2411	///
				2412	/// \param __a
				2413	/// A 128-bit vector of [2 x double].
				2414	/// \param __b
				2415	/// A 128-bit vector of [2 x double].
				2416	/// \returns the CF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2417	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2418	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2419	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2420	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2421	}
				2422
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2423	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2424	/// element-by-element comparison of the double-precision elements in the
				2425	/// first source vector and the corresponding elements in the second source
				2426	/// vector. The EFLAGS register is updated as follows:
				2427	/// If there is at least one pair of double-precision elements where the
				2428	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2429	/// ZF flag is set to 1.
				2430	/// If there is at least one pair of double-precision elements where the
				2431	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2432	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2433	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2434	/// otherwise it returns 0.
				2435	///
				2436	/// \headerfile <x86intrin.h>
				2437	///
				2438	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2439	///
				2440	/// \param __a
				2441	/// A 128-bit vector of [2 x double]; the first source vector.
				2442	/// \param __b
				2443	/// A 128-bit vector of [2 x double]; the second source vector.
				2444	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2445	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2446	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2447	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2448	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2449	}
				2450
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2451	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2452	/// element-by-element comparison of the single-precision elements in the
				2453	/// first source vector and the corresponding elements in the second source
				2454	/// vector. The EFLAGS register is updated as follows:
				2455	/// If there is at least one pair of single-precision elements where the
				2456	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2457	/// ZF flag is set to 1.
				2458	/// If there is at least one pair of single-precision elements where the
				2459	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2460	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2461	/// This intrinsic returns the value of the ZF flag.
				2462	///
				2463	/// \headerfile <x86intrin.h>
				2464	///
				2465	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2466	///
				2467	/// \param __a
				2468	/// A 128-bit vector of [4 x float]; the first source vector.
				2469	/// \param __b
				2470	/// A 128-bit vector of [4 x float]; the second source vector.
				2471	/// \returns The ZF flag: 1 if no pair of elements has both sign bits set, else 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2472	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2473	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2474	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2475	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2476	}
				2477
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2478	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2479	/// element-by-element comparison of the single-precision elements in the
				2480	/// first source vector and the corresponding elements in the second source
				2481	/// vector. The EFLAGS register is updated as follows:
				2482	/// If there is at least one pair of single-precision elements where the
				2483	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2484	/// ZF flag is set to 1.
				2485	/// If there is at least one pair of single-precision elements where the
				2486	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2487	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2488	/// This intrinsic returns the value of the CF flag.
				2489	///
				2490	/// \headerfile <x86intrin.h>
				2491	///
				2492	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2493	///
				2494	/// \param __a
				2495	/// A 128-bit vector of [4 x float]; the first source vector.
				2496	/// \param __b
				2497	/// A 128-bit vector of [4 x float]; the second source vector.
				2498	/// \returns The CF flag: 0 if some pair has sign bit 0 in __a and 1 in __b, else 1.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2499	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2500	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2501	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2502	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2503	}
				2504
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2505	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2506	/// element-by-element comparison of the single-precision elements in the
				2507	/// first source vector and the corresponding elements in the second source
				2508	/// vector. The EFLAGS register is updated as follows:
				2509	/// If there is at least one pair of single-precision elements where the
				2510	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2511	/// ZF flag is set to 1.
				2512	/// If there is at least one pair of single-precision elements where the
				2513	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2514	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2515	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2516	/// otherwise it returns 0.
				2517	///
				2518	/// \headerfile <x86intrin.h>
				2519	///
				2520	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2521	///
				2522	/// \param __a
				2523	/// A 128-bit vector of [4 x float]; the first source vector.
				2524	/// \param __b
				2525	/// A 128-bit vector of [4 x float]; the second source vector.
				2526	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2527	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2528	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2529	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2530	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2531	}
				2532
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2533	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2534	/// element-by-element comparison of the double-precision elements in the
				2535	/// first source vector and the corresponding elements in the second source
				2536	/// vector. The EFLAGS register is updated as follows:
				2537	/// If there is at least one pair of double-precision elements where the
				2538	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2539	/// ZF flag is set to 1.
				2540	/// If there is at least one pair of double-precision elements where the
				2541	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2542	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2543	/// This intrinsic returns the value of the ZF flag.
				2544	///
				2545	/// \headerfile <x86intrin.h>
				2546	///
				2547	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2548	///
				2549	/// \param __a
				2550	/// A 256-bit vector of [4 x double]; the first source vector.
				2551	/// \param __b
				2552	/// A 256-bit vector of [4 x double]; the second source vector.
				2553	/// \returns The ZF flag: 1 if no pair of elements has both sign bits set, else 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2554	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2555	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2556	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2557	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2558	}
				2559
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2560	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2561	/// element-by-element comparison of the double-precision elements in the
				2562	/// first source vector and the corresponding elements in the second source
				2563	/// vector. The EFLAGS register is updated as follows:
				2564	/// If there is at least one pair of double-precision elements where the
				2565	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2566	/// ZF flag is set to 1.
				2567	/// If there is at least one pair of double-precision elements where the
				2568	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2569	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2570	/// This intrinsic returns the value of the CF flag.
				2571	///
				2572	/// \headerfile <x86intrin.h>
				2573	///
				2574	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2575	///
				2576	/// \param __a
				2577	/// A 256-bit vector of [4 x double]; the first source vector.
				2578	/// \param __b
				2579	/// A 256-bit vector of [4 x double]; the second source vector.
				2580	/// \returns The CF flag: 0 if some pair has sign bit 0 in __a and 1 in __b, else 1.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2581	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2582	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2583	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2584	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2585	}
				2586
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2587	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2588	/// element-by-element comparison of the double-precision elements in the
				2589	/// first source vector and the corresponding elements in the second source
				2590	/// vector. The EFLAGS register is updated as follows:
				2591	/// If there is at least one pair of double-precision elements where the
				2592	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2593	/// ZF flag is set to 1.
				2594	/// If there is at least one pair of double-precision elements where the
				2595	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2596	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2597	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2598	/// otherwise it returns 0.
				2599	///
				2600	/// \headerfile <x86intrin.h>
				2601	///
				2602	/// This intrinsic corresponds to the \c VTESTPD instruction.
				2603	///
				2604	/// \param __a
				2605	/// A 256-bit vector of [4 x double]; the first source vector.
				2606	/// \param __b
				2607	/// A 256-bit vector of [4 x double]; the second source vector.
				2608	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2609	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2610	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2611	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2612	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2613	}
				2614
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2615	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2616	/// element-by-element comparison of the single-precision elements in the
				2617	/// first source vector and the corresponding elements in the second source
				2618	/// vector. The EFLAGS register is updated as follows:
				2619	/// If there is at least one pair of single-precision elements where the
				2620	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2621	/// ZF flag is set to 1.
				2622	/// If there is at least one pair of single-precision elements where the
				2623	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2624	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2625	/// This intrinsic returns the value of the ZF flag.
				2626	///
				2627	/// \headerfile <x86intrin.h>
				2628	///
				2629	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2630	///
				2631	/// \param __a
				2632	/// A 256-bit vector of [8 x float]; the first source vector.
				2633	/// \param __b
				2634	/// A 256-bit vector of [8 x float]; the second source vector.
				2635	/// \returns The ZF flag: 1 if no pair of elements has both sign bits set, else 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2636	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2637	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2638	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2639	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2640	}
				2641
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2642	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2643	/// element-by-element comparison of the single-precision elements in the
				2644	/// first source vector and the corresponding elements in the second source
				2645	/// vector. The EFLAGS register is updated as follows:
				2646	/// If there is at least one pair of single-precision elements where the
				2647	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2648	/// ZF flag is set to 1.
				2649	/// If there is at least one pair of single-precision elements where the
				2650	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2651	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2652	/// This intrinsic returns the value of the CF flag.
				2653	///
				2654	/// \headerfile <x86intrin.h>
				2655	///
				2656	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2657	///
				2658	/// \param __a
				2659	/// A 256-bit vector of [8 x float]; the first source vector.
				2660	/// \param __b
				2661	/// A 256-bit vector of [8 x float]; the second source vector.
				2662	/// \returns The CF flag: 0 if some pair has sign bit 0 in __a and 1 in __b, else 1.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2663	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2664	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2665	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2666	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2667	}
				2668
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2669	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2670	/// element-by-element comparison of the single-precision elements in the
				2671	/// first source vector and the corresponding elements in the second source
				2672	/// vector. The EFLAGS register is updated as follows:
				2673	/// If there is at least one pair of single-precision elements where the
				2674	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2675	/// ZF flag is set to 1.
				2676	/// If there is at least one pair of single-precision elements where the
				2677	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2678	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1.
				2679	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2680	/// otherwise it returns 0.
				2681	///
				2682	/// \headerfile <x86intrin.h>
				2683	///
				2684	/// This intrinsic corresponds to the \c VTESTPS instruction.
				2685	///
				2686	/// \param __a
				2687	/// A 256-bit vector of [8 x float]; the first source vector.
				2688	/// \param __b
				2689	/// A 256-bit vector of [8 x float]; the second source vector.
				2690	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2691	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2692	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2693	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2694	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2695	}
				2696
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2697	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2698	/// of the two source vectors and update the EFLAGS register as follows:
				2699	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2700	/// is set to 0. Otherwise the ZF flag is set to 1.
				2701	/// If there is at least one pair of bits where the bit from the first source
				2702	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2703	/// is set to 0. Otherwise the CF flag is set to 1.
				2704	/// This intrinsic returns the value of the ZF flag.
				2705	///
				2706	/// \headerfile <x86intrin.h>
				2707	///
				2708	/// This intrinsic corresponds to the \c VPTEST instruction.
				2709	///
				2710	/// \param __a
				2711	/// A 256-bit integer vector; the first source vector.
				2712	/// \param __b
				2713	/// A 256-bit integer vector; the second source vector.
				2714	/// \returns The ZF flag: 1 if no bit position has both bits set to 1, else 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2715	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2716	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2717	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2718	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2719	}
				2720
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2721	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2722	/// of the two source vectors and update the EFLAGS register as follows:
				2723	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2724	/// is set to 0. Otherwise the ZF flag is set to 1.
				2725	/// If there is at least one pair of bits where the bit from the first source
				2726	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2727	/// is set to 0. Otherwise the CF flag is set to 1.
				2728	/// This intrinsic returns the value of the CF flag.
				2729	///
				2730	/// \headerfile <x86intrin.h>
				2731	///
				2732	/// This intrinsic corresponds to the \c VPTEST instruction.
				2733	///
				2734	/// \param __a
				2735	/// A 256-bit integer vector; the first source vector.
				2736	/// \param __b
				2737	/// A 256-bit integer vector; the second source vector.
				2738	/// \returns The CF flag: 0 if some bit position is 0 in __a and 1 in __b, else 1.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2739	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2740	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2741	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2742	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2743	}
				2744
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2745	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2746	/// of the two source vectors and update the EFLAGS register as follows:
				2747	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2748	/// is set to 0. Otherwise the ZF flag is set to 1.
				2749	/// If there is at least one pair of bits where the bit from the first source
				2750	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2751	/// is set to 0. Otherwise the CF flag is set to 1.
				2752	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2753	/// otherwise it returns 0.
				2754	///
				2755	/// \headerfile <x86intrin.h>
				2756	///
				2757	/// This intrinsic corresponds to the \c VPTEST instruction.
				2758	///
				2759	/// \param __a
				2760	/// A 256-bit integer vector; the first source vector.
				2761	/// \param __b
				2762	/// A 256-bit integer vector; the second source vector.
				2763	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2764	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2765	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2766	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2767	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2768	}
				2769
				2770	/* Vector extract sign mask */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2771	/// \brief Extracts the sign bits of the double-precision floating-point elements
				2772	/// in a 256-bit vector of [4 x double] and writes them to the lower-order
				2773	/// bits of the return value.
				2774	///
				2775	/// \headerfile <x86intrin.h>
				2776	///
				2777	/// This intrinsic corresponds to the \c VMOVMSKPD instruction.
				2778	///
				2779	/// \param __a
				2780	/// A 256-bit vector of [4 x double] containing the double-precision
				2781	/// floating-point values with sign bits to be extracted.
				2782	/// \returns An integer holding the extracted sign bits in bits [3:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2783	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2784	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2785	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2786	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2787	}
				2788
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2789	/// \brief Extracts the sign bits of the single-precision floating-point elements
				2790	/// in a 256-bit vector of [8 x float] and writes them to the lower-order
				2791	/// bits of the return value.
				2792	///
				2793	/// \headerfile <x86intrin.h>
				2794	///
				2795	/// This intrinsic corresponds to the \c VMOVMSKPS instruction.
				2796	///
				2797	/// \param __a
				2798	/// A 256-bit vector of [8 x float] containing the single-precision floating-
				2799	/// point values with sign bits to be extracted.
				2800	/// \returns An integer holding the extracted sign bits in bits [7:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2801	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2802	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2803	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2804	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2805	}
				2806
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2807	/* Vector zero */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2808	/// \brief Zeroes the contents of all XMM or YMM registers. Takes no arguments and returns no value.
				2809	///
				2810	/// \headerfile <x86intrin.h>
				2811	///
				2812	/// This intrinsic corresponds to the \c VZEROALL instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2813	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2814	_mm256_zeroall(void)
				2815	{
				2816	__builtin_ia32_vzeroall();
				2817	}
				2818
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2819	/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. Takes no arguments and returns no value.
				2820	///
				2821	/// \headerfile <x86intrin.h>
				2822	///
				2823	/// This intrinsic corresponds to the \c VZEROUPPER instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2824	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2825	_mm256_zeroupper(void)
				2826	{
				2827	__builtin_ia32_vzeroupper();
				2828	}
				2829
				2830	/* Vector load with broadcast */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2831	/// \brief Loads a scalar single-precision floating-point value from the
				2832	/// address pointed to by __a and broadcasts it to all four elements of
				2833	/// a [4 x float] vector.
				2834	///
				2835	/// \headerfile <x86intrin.h>
				2836	///
				2837	/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
				2838	///
				2839	/// \param __a
				2840	/// A pointer to the single-precision floating-point value to be broadcast.
				2841	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
				2842	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2843	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2844	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2845	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2846	float __f = *__a; /* read the scalar once, then replicate it below */
				2847	return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2848	}
				2849
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2850	/// \brief Loads a scalar double-precision floating-point value from the
				2851	/// address pointed to by __a and broadcasts it to all four elements of
				2852	/// a [4 x double] vector.
				2853	///
				2854	/// \headerfile <x86intrin.h>
				2855	///
				2856	/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
				2857	///
				2858	/// \param __a
				2859	/// A pointer to the double-precision floating-point value to be broadcast.
				2860	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
				2861	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2862	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2863	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2864	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2865	double __d = *__a; /* read the scalar once, then replicate it below */
				2866	return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2867	}
				2868
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2869	/// \brief Loads a scalar single-precision floating-point value from the
				2870	/// address pointed to by __a and broadcasts it to all eight elements of
				2871	/// a [8 x float] vector.
				2872	///
				2873	/// \headerfile <x86intrin.h>
				2874	///
				2875	/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
				2876	///
				2877	/// \param __a
				2878	/// A pointer to the single-precision floating-point value to be broadcast.
				2879	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
				2880	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2881	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2882	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2883	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2884	float __f = *__a; /* read the scalar once, then replicate it below */
				2885	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2886	}
				2887
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2888	/// \brief Loads the data of a 128-bit vector of [2 x double] from the
				2889	/// address pointed to by __a and broadcasts it to both 128-bit
				2890	/// elements in a 256-bit vector of [4 x double].
				2891	///
				2892	/// \headerfile <x86intrin.h>
				2893	///
				2894	/// This intrinsic corresponds to the \c VBROADCASTF128 instruction.
				2895	///
				2896	/// \param __a
				2897	/// A pointer to the 128-bit vector of [2 x double] to be broadcast.
				2898	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
				2899	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2900	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2901	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2902	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2903	return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2904	}
				2905
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2906	/// \brief Loads the data of a 128-bit vector of [4 x float] from the
				2907	/// address pointed to by __a and broadcasts it to both 128-bit
				2908	/// elements in a 256-bit vector of [8 x float].
				2909	///
				2910	/// \headerfile <x86intrin.h>
				2911	///
				2912	/// This intrinsic corresponds to the \c VBROADCASTF128 instruction.
				2913	///
				2914	/// \param __a
				2915	/// A pointer to the 128-bit vector of [4 x float] to be broadcast.
				2916	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
				2917	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2918	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2919	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2920	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2921	return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2922	}
				2923
				2924	/* SIMD load ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2925	/// \brief Loads 4 double-precision floating-point values from a 32-byte aligned
				2926	/// memory location pointed to by __p into a vector of [4 x double].
				2927	///
				2928	/// \headerfile <x86intrin.h>
				2929	///
				2930	/// This intrinsic corresponds to the \c VMOVAPD instruction.
				2931	///
				2932	/// \param __p
				2933	/// A 32-byte aligned pointer to a memory location containing 4
				2934	/// double-precision floating-point values.
				2935	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2936	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2937	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2938	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2939	return *(const __m256d *)__p; /* dereference: load the vector __p points to */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2940	}
				2941
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2942	/// \brief Loads 8 single-precision floating-point values from a 32-byte aligned
				2943	/// memory location pointed to by __p into a vector of [8 x float].
				2944	///
				2945	/// \headerfile <x86intrin.h>
				2946	///
				2947	/// This intrinsic corresponds to the \c VMOVAPS instruction.
				2948	///
				2949	/// \param __p
				2950	/// A 32-byte aligned pointer to a memory location containing 8 float values.
				2951	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2952	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2953	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2954	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2955	return *(const __m256 *)__p; /* dereference: load the vector __p points to */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2956	}
				2957
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2958	/// \brief Loads 4 double-precision floating point values from an unaligned
				2959	/// memory location pointed to by __p into a vector of [4 x double].
				2960	///
				2961	/// \headerfile <x86intrin.h>
				2962	///
				2963	/// This intrinsic corresponds to the \c VMOVUPD instruction.
				2964	///
				2965	/// \param __p
				2966	/// A pointer to a memory location containing double-precision floating
				2967	/// point values.
				2968	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2969	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2970	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2971	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2972	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2973	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2974	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2975	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2976	}
				2977
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2978	/// \brief Loads 8 single-precision floating point values from an unaligned
				2979	/// memory location pointed to by __p into a vector of [8 x float].
				2980	///
				2981	/// \headerfile <x86intrin.h>
				2982	///
				2983	/// This intrinsic corresponds to the \c VMOVUPS instruction.
				2984	///
				2985	/// \param __p
				2986	/// A pointer to a memory location containing single-precision floating
				2987	/// point values.
				2988	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2989	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2990	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2991	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2992	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2993	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2994	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2995	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2996	}
				2997
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	2998	/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
				2999	/// location pointed to by __p into elements of a 256-bit integer vector.
				3000	///
				3001	/// \headerfile <x86intrin.h>
				3002	///
				3003	/// This intrinsic corresponds to the \c VMOVDQA instruction.
				3004	///
				3005	/// \param __p
				3006	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
				3007	/// values.
				3008	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3009	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3010	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3011	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3012	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3013	}
				3014
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3015	/// \brief Loads 256 bits of integer data from an unaligned memory location
				3016	/// pointed to by __p into a 256-bit integer vector.
				3017	///
				3018	/// \headerfile <x86intrin.h>
				3019	///
				3020	/// This intrinsic corresponds to the \c VMOVDQU instruction.
				3021	///
				3022	/// \param __p
				3023	/// A pointer to a 256-bit integer vector containing integer values.
				3024	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3025	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3026	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3027	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3028	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3029	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3030	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3031	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3032	}
				3033
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3034	/// \brief Loads 256 bits of integer data from an unaligned memory location
				3035	/// pointed to by __p into a 256-bit integer vector. This intrinsic may
				3036	/// perform better than _mm256_loadu_si256 when the data crosses a cache
				3037	/// line boundary.
				3038	///
				3039	/// \headerfile <x86intrin.h>
				3040	///
				3041	/// This intrinsic corresponds to the \c VLDDQU instruction.
				3042	///
				3043	/// \param __p
				3044	/// A pointer to a 256-bit integer vector containing integer values.
				3045	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3046	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3047	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3048	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3049	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3050	}
				3051
				3052	/* SIMD store ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3053	/// \brief Stores double-precision floating point values from a 256-bit vector
				3054	/// of [4 x double] to a 32-byte aligned memory location pointed to by __p.
				3055	///
				3056	/// \headerfile <x86intrin.h>
				3057	///
				3058	/// This intrinsic corresponds to the \c VMOVAPD instruction.
				3059	///
				3060	/// \param __p
				3061	/// A 32-byte aligned pointer to a memory location that will receive the
				3062	/// double-precision floaing point values.
				3063	/// \param __a
				3064	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3065	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3066	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3067	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3068	(__m256d )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3069	}
				3070
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3071	/// \brief Stores single-precision floating point values from a 256-bit vector
				3072	/// of [8 x float] to a 32-byte aligned memory location pointed to by __p.
				3073	///
				3074	/// \headerfile <x86intrin.h>
				3075	///
				3076	/// This intrinsic corresponds to the \c VMOVAPS instruction.
				3077	///
				3078	/// \param __p
				3079	/// A 32-byte aligned pointer to a memory location that will receive the
				3080	/// float values.
				3081	/// \param __a
				3082	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3083	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3084	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3085	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3086	(__m256 )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3087	}
				3088
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3089	/// \brief Stores double-precision floating point values from a 256-bit vector
				3090	/// of [4 x double] to an unaligned memory location pointed to by __p.
				3091	///
				3092	/// \headerfile <x86intrin.h>
				3093	///
				3094	/// This intrinsic corresponds to the \c VMOVUPD instruction.
				3095	///
				3096	/// \param __p
				3097	/// A pointer to a memory location that will receive the double-precision
				3098	/// floating point values.
				3099	/// \param __a
				3100	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3101	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3102	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3103	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3104	struct __storeu_pd {
				3105	__m256d __v;
				3106	} __attribute__((__packed__, __may_alias__));
				3107	((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3108	}
				3109
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3110	/// \brief Stores single-precision floating point values from a 256-bit vector
				3111	/// of [8 x float] to an unaligned memory location pointed to by __p.
				3112	///
				3113	/// \headerfile <x86intrin.h>
				3114	///
				3115	/// This intrinsic corresponds to the \c VMOVUPS instruction.
				3116	///
				3117	/// \param __p
				3118	/// A pointer to a memory location that will receive the float values.
				3119	/// \param __a
				3120	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3121	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3122	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3123	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3124	struct __storeu_ps {
				3125	__m256 __v;
				3126	} __attribute__((__packed__, __may_alias__));
				3127	((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3128	}
				3129
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3130	/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
				3131	/// aligned memory location pointed to by __p.
				3132	///
				3133	/// \headerfile <x86intrin.h>
				3134	///
				3135	/// This intrinsic corresponds to the \c VMOVDQA instruction.
				3136	///
				3137	/// \param __p
				3138	/// A 32-byte aligned pointer to a memory location that will receive the
				3139	/// integer values.
				3140	/// \param __a
				3141	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3142	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3143	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3144	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3145	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3146	}
				3147
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3148	/// \brief Stores integer values from a 256-bit integer vector to an unaligned
				3149	/// memory location pointed to by __p.
				3150	///
				3151	/// \headerfile <x86intrin.h>
				3152	///
				3153	/// This intrinsic corresponds to the \c VMOVDQU instruction.
				3154	///
				3155	/// \param __p
				3156	/// A pointer to a memory location that will receive the integer values.
				3157	/// \param __a
				3158	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3159	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3160	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3161	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3162	struct __storeu_si256 {
				3163	__m256i __v;
				3164	} __attribute__((__packed__, __may_alias__));
				3165	((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3166	}
				3167
				3168	/* Conditional load ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3169	/// \brief Conditionally loads double-precision floating point elements
				3170	/// from a memory location pointed to by __p into a 128-bit vector of
				3171	/// [2 x double], depending on the mask bits associated with each data
				3172	/// element.
				3173	///
				3174	/// \headerfile <x86intrin.h>
				3175	///
				3176	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3177	///
				3178	/// \param __p
				3179	/// A pointer to a memory location that contains the double-precision
				3180	/// floating point values.
				3181	/// \param __m
				3182	/// A 128-bit integer vector containing the mask. The most significant bit of
				3183	/// each data element represents the mask bits. If a mask bit is zero, the
				3184	/// corresponding value in the memory location is not loaded and the
				3185	/// corresponding field in the return value is set to zero.
				3186	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3187	static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3188	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3189	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3190	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3191	}
				3192
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3193	/// \brief Conditionally loads double-precision floating point elements
				3194	/// from a memory location pointed to by __p into a 256-bit vector of
				3195	/// [4 x double], depending on the mask bits associated with each data
				3196	/// element.
				3197	///
				3198	/// \headerfile <x86intrin.h>
				3199	///
				3200	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3201	///
				3202	/// \param __p
				3203	/// A pointer to a memory location that contains the double-precision
				3204	/// floating point values.
				3205	/// \param __m
				3206	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3207	/// significant bit of each quadword element represents the mask bits. If a
				3208	/// mask bit is zero, the corresponding value in the memory location is not
				3209	/// loaded and the corresponding field in the return value is set to zero.
				3210	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3211	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3212	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3213	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3214	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3215	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3216	}
				3217
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3218	/// \brief Conditionally loads single-precision floating point elements
				3219	/// from a memory location pointed to by __p into a 128-bit vector of
				3220	/// [4 x float], depending on the mask bits associated with each data
				3221	/// element.
				3222	///
				3223	/// \headerfile <x86intrin.h>
				3224	///
				3225	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3226	///
				3227	/// \param __p
				3228	/// A pointer to a memory location that contains the single-precision
				3229	/// floating point values.
				3230	/// \param __m
				3231	/// A 128-bit integer vector containing the mask. The most significant bit of
				3232	/// each data element represents the mask bits. If a mask bit is zero, the
				3233	/// corresponding value in the memory location is not loaded and the
				3234	/// corresponding field in the return value is set to zero.
				3235	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3236	static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3237	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3238	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3239	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3240	}
				3241
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3242	/// \brief Conditionally loads single-precision floating point elements from a
				3243	/// memory location pointed to by __p into a 256-bit vector of [8 x float],
				3244	/// depending on the mask bits associated with each data element.
				3245	///
				3246	/// \headerfile <x86intrin.h>
				3247	///
				3248	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3249	///
				3250	/// \param __p
				3251	/// A pointer to a memory location that contains the single-precision
				3252	/// floating point values.
				3253	/// \param __m
				3254	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3255	/// significant bit of each dword element represents the mask bits. If a mask
				3256	/// bit is zero, the corresponding value in the memory location is not loaded
				3257	/// and the corresponding field in the return value is set to zero.
				3258	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3259	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3260	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3261	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3262	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3263	}
				3264
				3265	/* Conditional store ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3266	/// \brief Moves single-precision floating point values from a 256-bit vector
				3267	/// of [8 x float] to a memory location pointed to by __p, according to the
				3268	/// specified mask.
				3269	///
				3270	/// \headerfile <x86intrin.h>
				3271	///
				3272	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3273	///
				3274	/// \param __p
				3275	/// A pointer to a memory location that will receive the float values.
				3276	/// \param __m
				3277	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3278	/// significant bit of each dword element in the mask vector represents the
				3279	/// mask bits. If a mask bit is zero, the corresponding value from vector __a
				3280	/// is not stored and the corresponding field in the memory location pointed
				3281	/// to by __p is not changed.
				3282	/// \param __a
				3283	/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3284	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3285	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3286	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3287	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3288	}
				3289
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3290	/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
				3291	/// to a memory location pointed to by __p, according to the specified mask.
				3292	///
				3293	/// \headerfile <x86intrin.h>
				3294	///
				3295	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3296	///
				3297	/// \param __p
				3298	/// A pointer to a memory location that will receive the float values.
				3299	/// \param __m
				3300	/// A 128-bit integer vector containing the mask. The most significant bit of
				3301	/// each field in the mask vector represents the mask bits. If a mask bit is
				3302	/// zero, the corresponding value from vector __a is not stored and the
				3303	/// corresponding field in the memory location pointed to by __p is not
				3304	/// changed.
				3305	/// \param __a
				3306	/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3307	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3308	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3309	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3310	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3311	}
				3312
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3313	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
				3314	/// to a memory location pointed to by __p, according to the specified mask.
				3315	///
				3316	/// \headerfile <x86intrin.h>
				3317	///
				3318	/// This intrinsic corresponds to the \c VMASKMOVPD instruction.
				3319	///
				3320	/// \param __p
				3321	/// A pointer to a memory location that will receive the float values.
				3322	/// \param __m
				3323	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3324	/// significant bit of each quadword element in the mask vector represents
				3325	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3326	/// __a is not stored and the corresponding field in the memory location
				3327	/// pointed to by __p is not changed.
				3328	/// \param __a
				3329	/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3330	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3331	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3332	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3333	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3334	}
				3335
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3336	/// \brief Moves single-precision floating point values from a 128-bit vector
				3337	/// of [4 x float] to a memory location pointed to by __p, according to the
				3338	/// specified mask.
				3339	///
				3340	/// \headerfile <x86intrin.h>
				3341	///
				3342	/// This intrinsic corresponds to the \c VMASKMOVPS instruction.
				3343	///
				3344	/// \param __p
				3345	/// A pointer to a memory location that will receive the float values.
				3346	/// \param __m
				3347	/// A 128-bit integer vector containing the mask. The most significant bit of
				3348	/// each field in the mask vector represents the mask bits. If a mask bit is
				3349	/// zero, the corresponding value from vector __a is not stored and the
				3350	/// corresponding field in the memory location pointed to by __p is not
				3351	/// changed.
				3352	/// \param __a
				3353	/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3354	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3355	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3356	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3357	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3358	}
				3359
				3360	/* Cacheability support ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3361	/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
				3362	/// aligned memory location. To minimize caching, the data is flagged as
				3363	/// non-temporal (unlikely to be used again soon).
				3364	///
				3365	/// \headerfile <x86intrin.h>
				3366	///
				3367	/// This intrinsic corresponds to the \c VMOVNTDQ instruction.
				3368	///
				3369	/// \param __a
				3370	/// A pointer to a 32-byte aligned memory location that will receive the
				3371	/// integer values.
				3372	/// \param __b
				3373	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3374	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3375	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3376	{
Simon Pilgrim	beca5f2	2016-06-13 09:57:52 +0000	[diff] [blame]	3377	__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3378	}
				3379
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3380	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
				3381	/// to a 32-byte aligned memory location. To minimize caching, the data is
				3382	/// flagged as non-temporal (unlikely to be used again soon).
				3383	///
				3384	/// \headerfile <x86intrin.h>
				3385	///
				3386	/// This intrinsic corresponds to the \c VMOVNTPD instruction.
				3387	///
				3388	/// \param __a
				3389	/// A pointer to a 32-byte aligned memory location that will receive the
				3390	/// integer values.
				3391	/// \param __b
				3392	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3393	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3394	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3395	{
Simon Pilgrim	beca5f2	2016-06-13 09:57:52 +0000	[diff] [blame]	3396	__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3397	}
				3398
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3399	/// \brief Moves single-precision floating point values from a 256-bit vector
				3400	/// of [8 x float] to a 32-byte aligned memory location. To minimize
				3401	/// caching, the data is flagged as non-temporal (unlikely to be used again
				3402	/// soon).
				3403	///
				3404	/// \headerfile <x86intrin.h>
				3405	///
				3406	/// This intrinsic corresponds to the \c VMOVNTPS instruction.
				3407	///
				3408	/// \param __p
				3409	/// A pointer to a 32-byte aligned memory location that will receive the
				3410	/// single-precision floating point values.
				3411	/// \param __a
				3412	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3413	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3414	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3415	{
Simon Pilgrim	beca5f2	2016-06-13 09:57:52 +0000	[diff] [blame]	3416	__builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3417	}
				3418
				3419	/* Create vectors */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3420	/// \brief Create a 256-bit vector of [4 x double] with undefined values.
				3421	///
				3422	/// \headerfile <x86intrin.h>
				3423	///
				3424	/// This intrinsic has no corresponding instruction.
				3425	///
				3426	/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3427	static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3428	_mm256_undefined_pd(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3429	{
				3430	return (__m256d)__builtin_ia32_undef256();
				3431	}
				3432
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3433	/// \brief Create a 256-bit vector of [8 x float] with undefined values.
				3434	///
				3435	/// \headerfile <x86intrin.h>
				3436	///
				3437	/// This intrinsic has no corresponding instruction.
				3438	///
				3439	/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3440	static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3441	_mm256_undefined_ps(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3442	{
				3443	return (__m256)__builtin_ia32_undef256();
				3444	}
				3445
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame^]	3446	/// \brief Create a 256-bit integer vector with undefined values.
				3447	///
				3448	/// \headerfile <x86intrin.h>
				3449	///
				3450	/// This intrinsic has no corresponding instruction.
				3451	///
				3452	/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3453	static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3454	_mm256_undefined_si256(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3455	{
				3456	return (__m256i)__builtin_ia32_undef256();
				3457	}
				3458
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3459	/// \brief Constructs a 256-bit floating-point vector of [4 x double]
				3460	/// initialized with the specified double-precision floating-point values.
				3461	///
				3462	/// \headerfile <x86intrin.h>
				3463	///
				3464	/// This intrinsic corresponds to the \c VUNPCKLPD+VINSERTF128 instruction.
				3465	///
				3466	/// \param __a
				3467	/// A double-precision floating-point value used to initialize bits [255:192]
				3468	/// of the result.
				3469	/// \param __b
				3470	/// A double-precision floating-point value used to initialize bits [191:128]
				3471	/// of the result.
				3472	/// \param __c
				3473	/// A double-precision floating-point value used to initialize bits [127:64]
				3474	/// of the result.
				3475	/// \param __d
				3476	/// A double-precision floating-point value used to initialize bits [63:0]
				3477	/// of the result.
				3478	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3479	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3480	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3481	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3482	return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3483	}
				3484
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3485	/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
				3486	/// with the specified single-precision floating-point values.
				3487	///
				3488	/// \headerfile <x86intrin.h>
				3489	///
				3490	/// This intrinsic is a utility function and does not correspond to a specific
				3491	/// instruction.
				3492	///
				3493	/// \param __a
				3494	/// A single-precision floating-point value used to initialize bits [255:224]
				3495	/// of the result.
				3496	/// \param __b
				3497	/// A single-precision floating-point value used to initialize bits [223:192]
				3498	/// of the result.
				3499	/// \param __c
				3500	/// A single-precision floating-point value used to initialize bits [191:160]
				3501	/// of the result.
				3502	/// \param __d
				3503	/// A single-precision floating-point value used to initialize bits [159:128]
				3504	/// of the result.
				3505	/// \param __e
				3506	/// A single-precision floating-point value used to initialize bits [127:96]
				3507	/// of the result.
				3508	/// \param __f
				3509	/// A single-precision floating-point value used to initialize bits [95:64]
				3510	/// of the result.
				3511	/// \param __g
				3512	/// A single-precision floating-point value used to initialize bits [63:32]
				3513	/// of the result.
				3514	/// \param __h
				3515	/// A single-precision floating-point value used to initialize bits [31:0]
				3516	/// of the result.
				3517	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3518	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3519	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3520	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3521	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3522	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3523	}
				3524
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3525	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3526	/// 32-bit integral values.
				3527	///
				3528	/// \headerfile <x86intrin.h>
				3529	///
				3530	/// This intrinsic is a utility function and does not correspond to a specific
				3531	/// instruction.
				3532	///
				3533	/// \param __i0
				3534	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3535	/// \param __i1
				3536	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3537	/// \param __i2
				3538	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3539	/// \param __i3
				3540	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3541	/// \param __i4
				3542	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3543	/// \param __i5
				3544	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3545	/// \param __i6
				3546	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3547	/// \param __i7
				3548	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3549	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3550	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3551	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3552	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3553	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3554	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3555	}
				3556
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3557	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3558	/// 16-bit integral values.
				3559	///
				3560	/// \headerfile <x86intrin.h>
				3561	///
				3562	/// This intrinsic is a utility function and does not correspond to a specific
				3563	/// instruction.
				3564	///
				3565	/// \param __w15
				3566	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3567	/// \param __w14
				3568	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3569	/// \param __w13
				3570	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3571	/// \param __w12
				3572	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3573	/// \param __w11
				3574	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3575	/// \param __w10
				3576	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3577	/// \param __w09
				3578	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3579	/// \param __w08
				3580	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3581	/// \param __w07
				3582	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3583	/// \param __w06
				3584	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3585	/// \param __w05
				3586	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3587	/// \param __w04
				3588	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3589	/// \param __w03
				3590	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3591	/// \param __w02
				3592	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3593	/// \param __w01
				3594	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3595	/// \param __w00
				3596	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3597	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3598	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3599	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3600	short __w11, short __w10, short __w09, short __w08,
				3601	short __w07, short __w06, short __w05, short __w04,
				3602	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3603	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3604	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
				3605	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3606	}
				3607
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3608	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3609	/// 8-bit integral values.
				3610	///
				3611	/// \headerfile <x86intrin.h>
				3612	///
				3613	/// This intrinsic is a utility function and does not correspond to a specific
				3614	/// instruction.
				3615	///
				3616	/// \param __b31
				3617	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3618	/// \param __b30
				3619	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3620	/// \param __b29
				3621	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3622	/// \param __b28
				3623	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3624	/// \param __b27
				3625	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3626	/// \param __b26
				3627	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3628	/// \param __b25
				3629	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3630	/// \param __b24
				3631	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3632	/// \param __b23
				3633	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3634	/// \param __b22
				3635	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3636	/// \param __b21
				3637	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3638	/// \param __b20
				3639	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3640	/// \param __b19
				3641	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3642	/// \param __b18
				3643	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3644	/// \param __b17
				3645	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3646	/// \param __b16
				3647	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3648	/// \param __b15
				3649	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3650	/// \param __b14
				3651	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3652	/// \param __b13
				3653	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3654	/// \param __b12
				3655	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3656	/// \param __b11
				3657	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3658	/// \param __b10
				3659	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3660	/// \param __b09
				3661	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3662	/// \param __b08
				3663	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3664	/// \param __b07
				3665	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3666	/// \param __b06
				3667	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3668	/// \param __b05
				3669	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3670	/// \param __b04
				3671	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3672	/// \param __b03
				3673	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3674	/// \param __b02
				3675	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3676	/// \param __b01
				3677	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3678	/// \param __b00
				3679	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3680	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3681	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3682	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3683	char __b27, char __b26, char __b25, char __b24,
				3684	char __b23, char __b22, char __b21, char __b20,
				3685	char __b19, char __b18, char __b17, char __b16,
				3686	char __b15, char __b14, char __b13, char __b12,
				3687	char __b11, char __b10, char __b09, char __b08,
				3688	char __b07, char __b06, char __b05, char __b04,
				3689	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3690	{
				3691	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3692	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				3693	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				3694	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				3695	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3696	};
				3697	}
				3698
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3699	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3700	/// 64-bit integral values.
				3701	///
				3702	/// \headerfile <x86intrin.h>
				3703	///
				3704	/// This intrinsic corresponds to the \c VPUNPCKLQDQ+VINSERTF128 instruction.
				3705	///
				3706	/// \param __a
				3707	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3708	/// \param __b
				3709	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3710	/// \param __c
				3711	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3712	/// \param __d
				3713	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3714	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3715	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3716	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3717	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3718	return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3719	}
				3720
				3721	/* Create vectors with elements in reverse order */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3722	/// \brief Constructs a 256-bit floating-point vector of [4 x double],
				3723	/// initialized in reverse order with the specified double-precision
				3724	/// floating-point values.
				3725	///
				3726	/// \headerfile <x86intrin.h>
				3727	///
				3728	/// This intrinsic corresponds to the \c VUNPCKLPD+VINSERTF128 instruction.
				3729	///
				3730	/// \param __a
				3731	/// A double-precision floating-point value used to initialize bits [63:0]
				3732	/// of the result.
				3733	/// \param __b
				3734	/// A double-precision floating-point value used to initialize bits [127:64]
				3735	/// of the result.
				3736	/// \param __c
				3737	/// A double-precision floating-point value used to initialize bits [191:128]
				3738	/// of the result.
				3739	/// \param __d
				3740	/// A double-precision floating-point value used to initialize bits [255:192]
				3741	/// of the result.
				3742	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3743	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3744	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3745	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3746	return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3747	}
				3748
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3749	/// \brief Constructs a 256-bit floating-point vector of [8 x float],
				3750	/// initialized in reverse order with the specified single-precision
				3751	/// float-point values.
				3752	///
				3753	/// \headerfile <x86intrin.h>
				3754	///
				3755	/// This intrinsic is a utility function and does not correspond to a specific
				3756	/// instruction.
				3757	///
				3758	/// \param __a
				3759	/// A single-precision floating-point value used to initialize bits [31:0]
				3760	/// of the result.
				3761	/// \param __b
				3762	/// A single-precision floating-point value used to initialize bits [63:32]
				3763	/// of the result.
				3764	/// \param __c
				3765	/// A single-precision floating-point value used to initialize bits [95:64]
				3766	/// of the result.
				3767	/// \param __d
				3768	/// A single-precision floating-point value used to initialize bits [127:96]
				3769	/// of the result.
				3770	/// \param __e
				3771	/// A single-precision floating-point value used to initialize bits [159:128]
				3772	/// of the result.
				3773	/// \param __f
				3774	/// A single-precision floating-point value used to initialize bits [191:160]
				3775	/// of the result.
				3776	/// \param __g
				3777	/// A single-precision floating-point value used to initialize bits [223:192]
				3778	/// of the result.
				3779	/// \param __h
				3780	/// A single-precision floating-point value used to initialize bits [255:224]
				3781	/// of the result.
				3782	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3783	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3784	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3785	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3786	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3787	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3788	}
				3789
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3790	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3791	/// with the specified 32-bit integral values.
				3792	///
				3793	/// \headerfile <x86intrin.h>
				3794	///
				3795	/// This intrinsic is a utility function and does not correspond to a specific
				3796	/// instruction.
				3797	///
				3798	/// \param __i0
				3799	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3800	/// \param __i1
				3801	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3802	/// \param __i2
				3803	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3804	/// \param __i3
				3805	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3806	/// \param __i4
				3807	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3808	/// \param __i5
				3809	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3810	/// \param __i6
				3811	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3812	/// \param __i7
				3813	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3814	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3815	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3816	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3817	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3818	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3819	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3820	}
				3821
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3822	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3823	/// with the specified 16-bit integral values.
				3824	///
				3825	/// \headerfile <x86intrin.h>
				3826	///
				3827	/// This intrinsic is a utility function and does not correspond to a specific
				3828	/// instruction.
				3829	///
				3830	/// \param __w15
				3831	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3832	/// \param __w14
				3833	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3834	/// \param __w13
				3835	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3836	/// \param __w12
				3837	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3838	/// \param __w11
				3839	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3840	/// \param __w10
				3841	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3842	/// \param __w09
				3843	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3844	/// \param __w08
				3845	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3846	/// \param __w07
				3847	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3848	/// \param __w06
				3849	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3850	/// \param __w05
				3851	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3852	/// \param __w04
				3853	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3854	/// \param __w03
				3855	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3856	/// \param __w02
				3857	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3858	/// \param __w01
				3859	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3860	/// \param __w00
				3861	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3862	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3863	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3864	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3865	short __w11, short __w10, short __w09, short __w08,
				3866	short __w07, short __w06, short __w05, short __w04,
				3867	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3868	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3869	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
				3870	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3871	}
				3872
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3873	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3874	/// with the specified 8-bit integral values.
				3875	///
				3876	/// \headerfile <x86intrin.h>
				3877	///
				3878	/// This intrinsic is a utility function and does not correspond to a specific
				3879	/// instruction.
				3880	///
				3881	/// \param __b31
				3882	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3883	/// \param __b30
				3884	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3885	/// \param __b29
				3886	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3887	/// \param __b28
				3888	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3889	/// \param __b27
				3890	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3891	/// \param __b26
				3892	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3893	/// \param __b25
				3894	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3895	/// \param __b24
				3896	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3897	/// \param __b23
				3898	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3899	/// \param __b22
				3900	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3901	/// \param __b21
				3902	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3903	/// \param __b20
				3904	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3905	/// \param __b19
				3906	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3907	/// \param __b18
				3908	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3909	/// \param __b17
				3910	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3911	/// \param __b16
				3912	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3913	/// \param __b15
				3914	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3915	/// \param __b14
				3916	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3917	/// \param __b13
				3918	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3919	/// \param __b12
				3920	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3921	/// \param __b11
				3922	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3923	/// \param __b10
				3924	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3925	/// \param __b09
				3926	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3927	/// \param __b08
				3928	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3929	/// \param __b07
				3930	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3931	/// \param __b06
				3932	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3933	/// \param __b05
				3934	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3935	/// \param __b04
				3936	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3937	/// \param __b03
				3938	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3939	/// \param __b02
				3940	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3941	/// \param __b01
				3942	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3943	/// \param __b00
				3944	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3945	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3946	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3947	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3948	char __b27, char __b26, char __b25, char __b24,
				3949	char __b23, char __b22, char __b21, char __b20,
				3950	char __b19, char __b18, char __b17, char __b16,
				3951	char __b15, char __b14, char __b13, char __b12,
				3952	char __b11, char __b10, char __b09, char __b08,
				3953	char __b07, char __b06, char __b05, char __b04,
				3954	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3955	{
				3956	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3957	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3958	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				3959	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				3960	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3961	}
				3962
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3963	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				3964	/// with the specified 64-bit integral values.
				3965	///
				3966	/// \headerfile <x86intrin.h>
				3967	///
				3968	/// This intrinsic corresponds to the \c VPUNPCKLQDQ+VINSERTF128 instruction.
				3969	///
				3970	/// \param __a
				3971	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3972	/// \param __b
				3973	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3974	/// \param __c
				3975	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3976	/// \param __d
				3977	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3978	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3979	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3980	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3981	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3982	return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3983	}
				3984
				3985	/* Create vectors with repeated elements */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3986	/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
				3987	/// of the four double-precision floating-point vector elements set to the
				3988	/// specified double-precision floating-point value.
				3989	///
				3990	/// \headerfile <x86intrin.h>
				3991	///
				3992	/// This intrinsic corresponds to the \c VMOVDDUP+VINSERTF128 instruction.
				3993	///
				3994	/// \param __w
				3995	/// A double-precision floating-point value used to initialize each vector
				3996	/// element of the result.
				3997	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3998	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3999	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4000	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4001	return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4002	}
				4003
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4004	/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
				4005	/// of the eight single-precision floating-point vector elements set to the
				4006	/// specified single-precision floating-point value.
				4007	///
				4008	/// \headerfile <x86intrin.h>
				4009	///
				4010	/// This intrinsic corresponds to the \c VPERMILPS+VINSERTF128 instruction.
				4011	///
				4012	/// \param __w
				4013	/// A single-precision floating-point value used to initialize each vector
				4014	/// element of the result.
				4015	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4016	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4017	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4018	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4019	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4020	}
				4021
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4022	/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
				4023	/// 32-bit integral vector elements set to the specified 32-bit integral
				4024	/// value.
				4025	///
				4026	/// \headerfile <x86intrin.h>
				4027	///
				4028	/// This intrinsic corresponds to the \c VPERMILPS+VINSERTF128 instruction.
				4029	///
				4030	/// \param __i
				4031	/// A 32-bit integral value used to initialize each vector element of the
				4032	/// result.
				4033	/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4034	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4035	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4036	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4037	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4038	}
				4039
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4040	/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
				4041	/// 16-bit integral vector elements set to the specified 16-bit integral
				4042	/// value.
				4043	///
				4044	/// \headerfile <x86intrin.h>
				4045	///
				4046	/// This intrinsic corresponds to the \c VPSHUFB+VINSERTF128 instruction.
				4047	///
				4048	/// \param __w
				4049	/// A 16-bit integral value used to initialize each vector element of the
				4050	/// result.
				4051	/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4052	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4053	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4054	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4055	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				4056	__w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4057	}
				4058
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4059	/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
				4060	/// 8-bit integral vector elements set to the specified 8-bit integral value.
				4061	///
				4062	/// \headerfile <x86intrin.h>
				4063	///
				4064	/// This intrinsic corresponds to the \c VPSHUFB+VINSERTF128 instruction.
				4065	///
				4066	/// \param __b
				4067	/// An 8-bit integral value used to initialize each vector element of the
				4068	/// result.
				4069	/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4070	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4071	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4072	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4073	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				4074	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				4075	__b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4076	}
				4077
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4078	/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
				4079	/// 64-bit integral vector elements set to the specified 64-bit integral
				4080	/// value.
				4081	///
				4082	/// \headerfile <x86intrin.h>
				4083	///
				4084	/// This intrinsic corresponds to the \c VMOVDDUP+VINSERTF128 instruction.
				4085	///
				4086	/// \param __q
				4087	/// A 64-bit integral value used to initialize each vector element of the
				4088	/// result.
				4089	/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4090	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4091	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4092	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4093	return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4094	}
				4095
/* Create zeroed vectors */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4097	/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
				4098	/// vector elements initialized to zero.
				4099	///
				4100	/// \headerfile <x86intrin.h>
				4101	///
				4102	/// This intrinsic corresponds to the \c VXORPS instruction.
				4103	///
				4104	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4105	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4106	_mm256_setzero_pd(void)
				4107	{
				4108	return (__m256d){ 0, 0, 0, 0 };
				4109	}
				4110
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4111	/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
				4112	/// vector elements initialized to zero.
				4113	///
				4114	/// \headerfile <x86intrin.h>
				4115	///
				4116	/// This intrinsic corresponds to the \c VXORPS instruction.
				4117	///
				4118	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4119	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4120	_mm256_setzero_ps(void)
				4121	{
				4122	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				4123	}
				4124
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4125	/// \brief Constructs a 256-bit integer vector initialized to zero.
				4126	///
				4127	/// \headerfile <x86intrin.h>
				4128	///
				4129	/// This intrinsic corresponds to the \c VXORPS instruction.
				4130	///
				4131	/// \returns A 256-bit integer vector initialized to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4132	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4133	_mm256_setzero_si256(void)
				4134	{
				4135	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				4136	}
				4137
				4138	/* Cast between vector types */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4139	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4140	/// floating-point vector of [8 x float].
				4141	///
				4142	/// \headerfile <x86intrin.h>
				4143	///
				4144	/// This intrinsic has no corresponding instruction.
				4145	///
				4146	/// \param __a
				4147	/// A 256-bit floating-point vector of [4 x double].
				4148	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4149	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4150	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4151	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4152	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4153	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4154	}
				4155
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4156	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4157	/// integer vector.
				4158	///
				4159	/// \headerfile <x86intrin.h>
				4160	///
				4161	/// This intrinsic has no corresponding instruction.
				4162	///
				4163	/// \param __a
				4164	/// A 256-bit floating-point vector of [4 x double].
				4165	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4166	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4167	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4168	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4169	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4170	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4171	}
				4172
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4173	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4174	/// floating-point vector of [4 x double].
				4175	///
				4176	/// \headerfile <x86intrin.h>
				4177	///
				4178	/// This intrinsic has no corresponding instruction.
				4179	///
				4180	/// \param __a
				4181	/// A 256-bit floating-point vector of [8 x float].
				4182	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4183	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4184	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4185	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4186	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4187	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4188	}
				4189
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4190	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4191	/// integer vector.
				4192	///
				4193	/// \headerfile <x86intrin.h>
				4194	///
				4195	/// This intrinsic has no corresponding instruction.
				4196	///
				4197	/// \param __a
				4198	/// A 256-bit floating-point vector of [8 x float].
				4199	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4200	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4201	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4202	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4203	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4204	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4205	}
				4206
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4207	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
				4208	/// of [8 x float].
				4209	///
				4210	/// \headerfile <x86intrin.h>
				4211	///
				4212	/// This intrinsic has no corresponding instruction.
				4213	///
				4214	/// \param __a
				4215	/// A 256-bit integer vector.
				4216	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4217	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4218	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4219	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4220	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4221	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4222	}
				4223
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4224	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
				4225	/// of [4 x double].
				4226	///
				4227	/// \headerfile <x86intrin.h>
				4228	///
				4229	/// This intrinsic has no corresponding instruction.
				4230	///
				4231	/// \param __a
				4232	/// A 256-bit integer vector.
				4233	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4234	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4235	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4236	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4237	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4238	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4239	}
				4240
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4241	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
				4242	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
				4243	///
				4244	/// \headerfile <x86intrin.h>
				4245	///
				4246	/// This intrinsic has no corresponding instruction.
				4247	///
				4248	/// \param __a
				4249	/// A 256-bit floating-point vector of [4 x double].
				4250	/// \returns A 128-bit floating-point vector of [2 x double] containing the
				4251	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4252	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4253	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4254	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4255	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4256	}
				4257
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4258	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
				4259	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
				4260	///
				4261	/// \headerfile <x86intrin.h>
				4262	///
				4263	/// This intrinsic has no corresponding instruction.
				4264	///
				4265	/// \param __a
				4266	/// A 256-bit floating-point vector of [8 x float].
				4267	/// \returns A 128-bit floating-point vector of [4 x float] containing the
				4268	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4269	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4270	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4271	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4272	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4273	}
				4274
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4275	/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
				4276	///
				4277	/// \headerfile <x86intrin.h>
				4278	///
				4279	/// This intrinsic has no corresponding instruction.
				4280	///
				4281	/// \param __a
				4282	/// A 256-bit integer vector.
				4283	/// \returns A 128-bit integer vector containing the lower 128 bits of the
				4284	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4285	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4286	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4287	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4288	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4289	}
				4290
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4291	/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
				4292	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
				4293	/// contain the value of the source vector. The contents of the upper 128
				4294	/// bits are undefined.
				4295	///
				4296	/// \headerfile <x86intrin.h>
				4297	///
				4298	/// This intrinsic has no corresponding instruction.
				4299	///
				4300	/// \param __a
				4301	/// A 128-bit vector of [2 x double].
				4302	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4303	/// contain the value of the parameter. The contents of the upper 128 bits
				4304	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4305	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4306	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4307	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4308	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4309	}
				4310
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4311	/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
				4312	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
				4313	/// the value of the source vector. The contents of the upper 128 bits are
				4314	/// undefined.
				4315	///
				4316	/// \headerfile <x86intrin.h>
				4317	///
				4318	/// This intrinsic has no corresponding instruction.
				4319	///
				4320	/// \param __a
				4321	/// A 128-bit vector of [4 x float].
				4322	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4323	/// contain the value of the parameter. The contents of the upper 128 bits
				4324	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4325	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4326	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4327	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4328	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4329	}
				4330
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4331	/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
				4332	/// The lower 128 bits contain the value of the source vector. The contents
				4333	/// of the upper 128 bits are undefined.
				4334	///
				4335	/// \headerfile <x86intrin.h>
				4336	///
				4337	/// This intrinsic has no corresponding instruction.
				4338	///
				4339	/// \param __a
				4340	/// A 128-bit integer vector.
				4341	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4342	/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4343	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4344	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4345	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4346	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4347	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4348
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4349	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4350	Vector insert.
				4351	We use macros rather than inlines because we only want to accept
				4352	invocations where the immediate M is a constant expression.
				4353	*/
/// \brief Builds a 256-bit vector of [8 x float] from a copy of the 256-bit
///    vector of [8 x float] in the first parameter, with either its upper or
///    its lower 128 bits replaced by the 128-bit vector of [4 x float] in the
///    second parameter. The immediate integer parameter selects which half is
///    replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. The half that is not replaced by V2 is
///    copied to the result unchanged.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to either the
///    upper or the lower 128 bits of the result, as selected by M.
/// \param M
///    An immediate integer. Only the least significant bit is used:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-7 name V1; indices 8-11 name the four floats of V2. */ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
      ((((M) & 1) == 0) ?  8 :  0), \
      ((((M) & 1) == 0) ?  9 :  1), \
      ((((M) & 1) == 0) ? 10 :  2), \
      ((((M) & 1) == 0) ? 11 :  3), \
      ((((M) & 1) == 0) ?  4 :  8), \
      ((((M) & 1) == 0) ?  5 :  9), \
      ((((M) & 1) == 0) ?  6 : 10), \
      ((((M) & 1) == 0) ?  7 : 11) );})
				4396
/// \brief Builds a 256-bit vector of [4 x double] from a copy of the 256-bit
///    vector of [4 x double] in the first parameter, with either its upper or
///    its lower 128 bits replaced by the 128-bit vector of [2 x double] in the
///    second parameter. The immediate integer parameter selects which half is
///    replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. The half that is not replaced by V2
///    is copied to the result unchanged.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to either
///    the upper or the lower 128 bits of the result, as selected by M.
/// \param M
///    An immediate integer. Only the least significant bit is used:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 name V1; indices 4-5 name the two doubles of V2. */ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V1), \
      (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
      ((((M) & 1) == 0) ? 4 : 0), \
      ((((M) & 1) == 0) ? 5 : 1), \
      ((((M) & 1) == 0) ? 2 : 4), \
      ((((M) & 1) == 0) ? 3 : 5) );})
				4435
/// \brief Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector in the first parameter, with either its upper or its lower 128
///    bits replaced by the 128-bit integer vector in the second parameter.
///    The immediate integer parameter selects which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTF128 instruction.
///
/// \param V1
///    A 256-bit integer vector. The half that is not replaced by V2 is copied
///    to the result unchanged.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the upper
///    or the lower 128 bits of the result, as selected by M.
/// \param M
///    An immediate integer. Only the least significant bit is used:
///    If bit [0] of M is 0, V2 is copied to bits [127:0] of the result, and
///    bits [255:128] of V1 are copied to bits [255:128] of the result.
///    If bit [0] of M is 1, V2 is copied to bits [255:128] of the result, and
///    bits [127:0] of V1 are copied to bits [127:0] of the result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 name V1; indices 4-5 name the two quadwords of V2. */ \
  (__m256i)__builtin_shufflevector( \
      (__v4di)(__m256i)(V1), \
      (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
      ((((M) & 1) == 0) ? 4 : 0), \
      ((((M) & 1) == 0) ? 5 : 1), \
      ((((M) & 1) == 0) ? 2 : 4), \
      ((((M) & 1) == 0) ? 3 : 5) );})
				4474
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4475	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4476	Vector extract.
				4477	We use macros rather than inlines because we only want to accept
				4478	invocations where the immediate M is a constant expression.
				4479	*/
/// \brief Returns either the upper or the lower 128 bits of a 256-bit vector
///    of [8 x float] as a 128-bit vector of [4 x float]. The immediate
///    integer parameter selects which half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only the least significant bit is used:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  /* Every index is below 8, so only V is read; the second operand is a */ \
  /* placeholder. */ \
  (__m128)__builtin_shufflevector( \
      (__v8sf)(__m256)(V), \
      (__v8sf)(_mm256_undefined_ps()), \
      ((((M) & 1) == 0) ? 0 : 4), \
      ((((M) & 1) == 0) ? 1 : 5), \
      ((((M) & 1) == 0) ? 2 : 6), \
      ((((M) & 1) == 0) ? 3 : 7) );})
				4508
/// \brief Returns either the upper or the lower 128 bits of a 256-bit vector
///    of [4 x double] as a 128-bit vector of [2 x double]. The immediate
///    integer parameter selects which half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only the least significant bit is used:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  /* Every index is below 4, so only V is read; the second operand is a */ \
  /* placeholder. */ \
  (__m128d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V), \
      (__v4df)(_mm256_undefined_pd()), \
      ((((M) & 1) == 0) ? 0 : 2), \
      ((((M) & 1) == 0) ? 1 : 3) );})
				4535
/// \brief Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. The half is selected by the immediate integer parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTF128 instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only the least significant bit is examined:
///    If bit [0] of M is 0, bits [127:0] of V are copied to the result.
///    If bit [0] of M is 1, bits [255:128] of V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  /* V is treated as four 64-bit lanes; bit 0 of M picks the first source \
     lane: 0 (low half) or 2 (high half). The second shuffle operand is \
     never selected by these indices. */ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) * 2 + 0), \
    (((M) & 1) * 2 + 1) );})
				4562
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4563	/* SIMD load ops (unaligned) */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4564	/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
				4565	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4566	/// of [8 x float] by concatenating the two 128-bit vectors.
				4567	///
				4568	/// \headerfile <x86intrin.h>
				4569	///
				4570	/// This intrinsic corresponds to load instructions followed by the
				4571	/// \c VINSERTF128 instruction.
				4572	///
				4573	/// \param __addr_hi
				4574	/// A pointer to a 128-bit memory location containing 4 consecutive
				4575	/// single-precision floating-point values. These values are to be copied
				4576	/// to bits[255:128] of the result. The address of the memory location does
				4577	/// not have to be aligned.
				4578	/// \param __addr_lo
				4579	/// A pointer to a 128-bit memory location containing 4 consecutive
				4580	/// single-precision floating-point values. These values are to be copied
				4581	/// to bits[127:0] of the result. The address of the memory location does not
				4582	/// have to be aligned.
				4583	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4584	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4585	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4586	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4587	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4588	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				4589	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4590	}
				4591
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4592	/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
				4593	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4594	/// of [4 x double] by concatenating the two 128-bit vectors.
				4595	///
				4596	/// \headerfile <x86intrin.h>
				4597	///
				4598	/// This intrinsic corresponds to load instructions followed by the
				4599	/// \c VINSERTF128 instruction.
				4600	///
				4601	/// \param __addr_hi
				4602	/// A pointer to a 128-bit memory location containing two consecutive
				4603	/// double-precision floating-point values. These values are to be copied
				4604	/// to bits[255:128] of the result. The address of the memory location does
				4605	/// not have to be aligned.
				4606	/// \param __addr_lo
				4607	/// A pointer to a 128-bit memory location containing two consecutive
				4608	/// double-precision floating-point values. These values are to be copied
				4609	/// to bits[127:0] of the result. The address of the memory location does not
				4610	/// have to be aligned.
				4611	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4612	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4613	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4614	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4615	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4616	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				4617	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4618	}
				4619
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4620	/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
				4621	/// constructs a 256-bit integer vector by concatenating the two 128-bit
				4622	/// vectors.
				4623	///
				4624	/// \headerfile <x86intrin.h>
				4625	///
				4626	/// This intrinsic corresponds to load instructions followed by the
				4627	/// \c VINSERTF128 instruction.
				4628	///
				4629	/// \param __addr_hi
				4630	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4631	/// vector. This vector is to be copied to bits[255:128] of the result. The
				4632	/// address of the memory location does not have to be aligned.
				4633	/// \param __addr_lo
				4634	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4635	/// vector. This vector is to be copied to bits[127:0] of the result. The
				4636	/// address of the memory location does not have to be aligned.
				4637	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4638	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4639	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4640	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4641	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				4642	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4643	}
				4644
				4645	/* SIMD store ops (unaligned) */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4646	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
				4647	/// vector of [8 x float] into two different unaligned memory locations.
				4648	///
				4649	/// \headerfile <x86intrin.h>
				4650	///
				4651	/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
				4652	/// instructions.
				4653	///
				4654	/// \param __addr_hi
				4655	/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
				4656	/// copied to this memory location. The address of this memory location does
				4657	/// not have to be aligned.
				4658	/// \param __addr_lo
				4659	/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
				4660	/// copied to this memory location. The address of this memory location does
				4661	/// not have to be aligned.
				4662	/// \param __a
				4663	/// A 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4664	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4665	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4666	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4667	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4668
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4669	__v128 = _mm256_castps256_ps128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4670	_mm_storeu_ps(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4671	__v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4672	_mm_storeu_ps(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4673	}
				4674
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4675	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
				4676	/// vector of [4 x double] into two different unaligned memory locations.
				4677	///
				4678	/// \headerfile <x86intrin.h>
				4679	///
				4680	/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
				4681	/// instructions.
				4682	///
				4683	/// \param __addr_hi
				4684	/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
				4685	/// copied to this memory location. The address of this memory location does
				4686	/// not have to be aligned.
				4687	/// \param __addr_lo
				4688	/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
				4689	/// copied to this memory location. The address of this memory location does
				4690	/// not have to be aligned.
				4691	/// \param __a
				4692	/// A 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4693	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4694	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4695	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4696	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4697
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4698	__v128 = _mm256_castpd256_pd128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4699	_mm_storeu_pd(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4700	__v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4701	_mm_storeu_pd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4702	}
				4703
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4704	/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
				4705	/// two different unaligned memory locations.
				4706	///
				4707	/// \headerfile <x86intrin.h>
				4708	///
				4709	/// This intrinsic corresponds to the \c VEXTRACTF128 instruction and the store
				4710	/// instructions.
				4711	///
				4712	/// \param __addr_hi
				4713	/// A pointer to a 128-bit memory location. Bits[255:128] of __a are to be
				4714	/// copied to this memory location. The address of this memory location does
				4715	/// not have to be aligned.
				4716	/// \param __addr_lo
				4717	/// A pointer to a 128-bit memory location. Bits[127:0] of __a are to be
				4718	/// copied to this memory location. The address of this memory location does
				4719	/// not have to be aligned.
				4720	/// \param __a
				4721	/// A 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4722	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4723	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4724	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4725	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4726
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4727	__v128 = _mm256_castsi256_si128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4728	_mm_storeu_si128(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4729	__v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4730	_mm_storeu_si128(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4731	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	4732
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4733	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
				4734	/// concatenating two 128-bit floating-point vectors of [4 x float].
				4735	///
				4736	/// \headerfile <x86intrin.h>
				4737	///
				4738	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4739	///
				4740	/// \param __hi
				4741	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4742	/// 128 bits of the result.
				4743	/// \param __lo
				4744	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4745	/// 128 bits of the result.
				4746	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4747	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4748	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4749	_mm256_set_m128 (__m128 __hi, __m128 __lo)
				4750	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4751	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4752	}
				4753
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4754	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
				4755	/// concatenating two 128-bit floating-point vectors of [2 x double].
				4756	///
				4757	/// \headerfile <x86intrin.h>
				4758	///
				4759	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4760	///
				4761	/// \param __hi
				4762	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4763	/// 128 bits of the result.
				4764	/// \param __lo
				4765	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4766	/// 128 bits of the result.
				4767	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4768	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4769	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4770	_mm256_set_m128d (__m128d __hi, __m128d __lo)
				4771	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4772	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4773	}
				4774
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4775	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
				4776	/// integer vectors.
				4777	///
				4778	/// \headerfile <x86intrin.h>
				4779	///
				4780	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4781	///
				4782	/// \param __hi
				4783	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4784	/// result.
				4785	/// \param __lo
				4786	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4787	/// result.
				4788	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4789	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4790	_mm256_set_m128i (__m128i __hi, __m128i __lo)
				4791	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4792	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4793	}
				4794
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4795	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
				4796	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
				4797	/// similar to _mm256_set_m128, but the order of the input parameters is
				4798	/// swapped.
				4799	///
				4800	/// \headerfile <x86intrin.h>
				4801	///
				4802	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4803	///
				4804	/// \param __lo
				4805	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4806	/// 128 bits of the result.
				4807	/// \param __hi
				4808	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4809	/// 128 bits of the result.
				4810	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4811	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4812	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4813	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
				4814	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4815	return _mm256_set_m128(__hi, __lo);
				4816	}
				4817
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4818	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
				4819	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
				4820	/// similar to _mm256_set_m128d, but the order of the input parameters is
				4821	/// swapped.
				4822	///
				4823	/// \headerfile <x86intrin.h>
				4824	///
				4825	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4826	///
				4827	/// \param __lo
				4828	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4829	/// 128 bits of the result.
				4830	/// \param __hi
				4831	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4832	/// 128 bits of the result.
				4833	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4834	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4835	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4836	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
				4837	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4838	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4839	}
				4840
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4841	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
				4842	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
				4843	/// the input parameters is swapped.
				4844	///
				4845	/// \headerfile <x86intrin.h>
				4846	///
				4847	/// This intrinsic corresponds to the \c VINSERTF128 instruction.
				4848	///
				4849	/// \param __lo
				4850	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4851	/// result.
				4852	/// \param __hi
				4853	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4854	/// result.
				4855	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4856	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4857	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
				4858	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4859	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4860	}
				4861
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4862	#undef __DEFAULT_FN_ATTRS
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	4863
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	4864	#endif /* __AVXINTRIN_H */