Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: 27dc64424ae076d9e7636ecafc69c72d44f2334b [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	31	typedef double __v4df __attribute__ ((__vector_size__ (32)));
				32	typedef float __v8sf __attribute__ ((__vector_size__ (32)));
				33	typedef long long __v4di __attribute__ ((__vector_size__ (32)));
				34	typedef int __v8si __attribute__ ((__vector_size__ (32)));
				35	typedef short __v16hi __attribute__ ((__vector_size__ (32)));
				36	typedef char __v32qi __attribute__ ((__vector_size__ (32)));
				37
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	38	/* Unsigned types */
				39	typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
				40	typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
				41	typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
				42	typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
				43
Chandler Carruth	cbe6411	2015-10-01 23:40:12 +0000	[diff] [blame]	44	/* We need an explicitly signed variant for char. Note that this shouldn't
				45	* appear in the interface though. */
				46	typedef signed char __v32qs __attribute__((__vector_size__(32)));
				47
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	48	typedef float __m256 __attribute__ ((__vector_size__ (32)));
				49	typedef double __m256d __attribute__((__vector_size__(32)));
				50	typedef long long __m256i __attribute__((__vector_size__(32)));
				51
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	52	/* Define the default attributes for the functions in this file. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	53	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	54
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	55	/* Arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	56	/// \brief Adds two 256-bit vectors of [4 x double].
				57	///
				58	/// \headerfile <x86intrin.h>
				59	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	60	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	61	///
				62	/// \param __a
				63	/// A 256-bit vector of [4 x double] containing one of the source operands.
				64	/// \param __b
				65	/// A 256-bit vector of [4 x double] containing one of the source operands.
				66	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				67	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	68	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	69	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	70	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	71	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	72	}
				73
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	74	/// \brief Adds two 256-bit vectors of [8 x float].
				75	///
				76	/// \headerfile <x86intrin.h>
				77	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	78	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	79	///
				80	/// \param __a
				81	/// A 256-bit vector of [8 x float] containing one of the source operands.
				82	/// \param __b
				83	/// A 256-bit vector of [8 x float] containing one of the source operands.
				84	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				85	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	86	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	87	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	88	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	89	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	90	}
				91
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	92	/// \brief Subtracts two 256-bit vectors of [4 x double].
				93	///
				94	/// \headerfile <x86intrin.h>
				95	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	96	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	97	///
				98	/// \param __a
				99	/// A 256-bit vector of [4 x double] containing the minuend.
				100	/// \param __b
				101	/// A 256-bit vector of [4 x double] containing the subtrahend.
				102	/// \returns A 256-bit vector of [4 x double] containing the differences between
				103	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	104	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	105	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	106	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	107	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	108	}
				109
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	110	/// \brief Subtracts two 256-bit vectors of [8 x float].
				111	///
				112	/// \headerfile <x86intrin.h>
				113	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	114	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	115	///
				116	/// \param __a
				117	/// A 256-bit vector of [8 x float] containing the minuend.
				118	/// \param __b
				119	/// A 256-bit vector of [8 x float] containing the subtrahend.
				120	/// \returns A 256-bit vector of [8 x float] containing the differences between
				121	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	122	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	123	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	124	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	125	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	126	}
				127
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	128	/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
				129	/// two 256-bit vectors of [4 x double].
				130	///
				131	/// \headerfile <x86intrin.h>
				132	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	133	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	134	///
				135	/// \param __a
				136	/// A 256-bit vector of [4 x double] containing the left source operand.
				137	/// \param __b
				138	/// A 256-bit vector of [4 x double] containing the right source operand.
				139	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				140	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	141	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	142	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	143	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	144	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	145	}
				146
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	147	/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
				148	/// two 256-bit vectors of [8 x float].
				149	///
				150	/// \headerfile <x86intrin.h>
				151	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	152	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	153	///
				154	/// \param __a
				155	/// A 256-bit vector of [8 x float] containing the left source operand.
				156	/// \param __b
				157	/// A 256-bit vector of [8 x float] containing the right source operand.
				158	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				159	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	160	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	161	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	162	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	163	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	164	}
				165
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	166	/// \brief Divides two 256-bit vectors of [4 x double].
				167	///
				168	/// \headerfile <x86intrin.h>
				169	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	170	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	171	///
				172	/// \param __a
				173	/// A 256-bit vector of [4 x double] containing the dividend.
				174	/// \param __b
				175	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	176	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				177	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	178	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	179	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	180	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	181	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	182	}
				183
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	184	/// \brief Divides two 256-bit vectors of [8 x float].
				185	///
				186	/// \headerfile <x86intrin.h>
				187	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	188	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	189	///
				190	/// \param __a
				191	/// A 256-bit vector of [8 x float] containing the dividend.
				192	/// \param __b
				193	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	194	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				195	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	196	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	197	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	198	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	199	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	200	}
				201
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	202	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				203	/// of each pair of values.
				204	///
				205	/// \headerfile <x86intrin.h>
				206	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	207	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	208	///
				209	/// \param __a
				210	/// A 256-bit vector of [4 x double] containing one of the operands.
				211	/// \param __b
				212	/// A 256-bit vector of [4 x double] containing one of the operands.
				213	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				214	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	215	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	216	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	217	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	218	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	219	}
				220
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	221	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				222	/// of each pair of values.
				223	///
				224	/// \headerfile <x86intrin.h>
				225	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	226	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	227	///
				228	/// \param __a
				229	/// A 256-bit vector of [8 x float] containing one of the operands.
				230	/// \param __b
				231	/// A 256-bit vector of [8 x float] containing one of the operands.
				232	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				233	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	234	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	235	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	236	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	237	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	238	}
				239
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	240	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				241	/// of each pair of values.
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	245	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	246	///
				247	/// \param __a
				248	/// A 256-bit vector of [4 x double] containing one of the operands.
				249	/// \param __b
				250	/// A 256-bit vector of [4 x double] containing one of the operands.
				251	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				252	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	253	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	254	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	255	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	256	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	257	}
				258
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	259	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				260	/// of each pair of values.
				261	///
				262	/// \headerfile <x86intrin.h>
				263	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	264	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	265	///
				266	/// \param __a
				267	/// A 256-bit vector of [8 x float] containing one of the operands.
				268	/// \param __b
				269	/// A 256-bit vector of [8 x float] containing one of the operands.
				270	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				271	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	272	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	273	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	274	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	275	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	276	}
				277
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	278	/// \brief Multiplies two 256-bit vectors of [4 x double].
				279	///
				280	/// \headerfile <x86intrin.h>
				281	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	282	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	283	///
				284	/// \param __a
				285	/// A 256-bit vector of [4 x double] containing one of the operands.
				286	/// \param __b
				287	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	288	/// \returns A 256-bit vector of [4 x double] containing the products of both
				289	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	290	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	291	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	292	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	293	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	294	}
				295
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	296	/// \brief Multiplies two 256-bit vectors of [8 x float].
				297	///
				298	/// \headerfile <x86intrin.h>
				299	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	300	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	301	///
				302	/// \param __a
				303	/// A 256-bit vector of [8 x float] containing one of the operands.
				304	/// \param __b
				305	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	306	/// \returns A 256-bit vector of [8 x float] containing the products of both
				307	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	308	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	309	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	311	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	312	}
				313
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	314	/// \brief Calculates the square roots of the values in a 256-bit vector of
				315	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	316	///
				317	/// \headerfile <x86intrin.h>
				318	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	319	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	320	///
				321	/// \param __a
				322	/// A 256-bit vector of [4 x double].
				323	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				324	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	325	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	326	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	327	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	328	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	329	}
				330
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	331	/// \brief Calculates the square roots of the values in a 256-bit vector of
				332	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	333	///
				334	/// \headerfile <x86intrin.h>
				335	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	336	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	337	///
				338	/// \param __a
				339	/// A 256-bit vector of [8 x float].
				340	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				341	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	342	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	343	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	344	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	345	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	346	}
				347
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	348	/// \brief Calculates the approximate reciprocal square roots of the values in
				349	/// a 256-bit vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	350	///
				351	/// \headerfile <x86intrin.h>
				352	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	353	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	354	///
				355	/// \param __a
				356	/// A 256-bit vector of [8 x float].
				357	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				358	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	359	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	360	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	361	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	362	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	363	}
				364
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	365	/// \brief Calculates the approximate reciprocals of the values in a 256-bit
				366	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	367	///
				368	/// \headerfile <x86intrin.h>
				369	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	370	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	371	///
				372	/// \param __a
				373	/// A 256-bit vector of [8 x float].
				374	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				375	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	376	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	377	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	378	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	379	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	380	}
				381
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	382	/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
				383	/// by the byte operand. The source values are rounded to integer values and
				384	/// returned as 64-bit double-precision floating-point values.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	385	///
				386	/// \headerfile <x86intrin.h>
				387	///
				388	/// \code
				389	/// __m256d _mm256_round_pd(__m256d V, const int M);
				390	/// \endcode
				391	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	392	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	393	///
				394	/// \param V
				395	/// A 256-bit vector of [4 x double].
				396	/// \param M
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	397	/// An integer value that specifies the rounding operation. \n
				398	/// Bits [7:4] are reserved. \n
				399	/// Bit [3] is a precision exception value: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	400	/// 0: A normal PE exception is used. \n
				401	/// 1: The PE field is not updated. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	402	/// Bit [2] is the rounding control source: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	403	/// 0: Use bits [1:0] of \a M. \n
				404	/// 1: Use the current MXCSR setting. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	405	/// Bits [1:0] contain the rounding control definition: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	406	/// 00: Nearest. \n
				407	/// 01: Downward (toward negative infinity). \n
				408	/// 10: Upward (toward positive infinity). \n
				409	/// 11: Truncated.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	410	/// \returns A 256-bit vector of [4 x double] containing the rounded values.
Chad Rosier	060d03b	2011-12-17 00:15:26 +0000	[diff] [blame]	411	#define _mm256_round_pd(V, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	412	(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	413
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	414	/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
				415	/// specified by the byte operand. The source values are rounded to integer
				416	/// values and returned as floating-point values.
				417	///
				418	/// \headerfile <x86intrin.h>
				419	///
				420	/// \code
				421	/// __m256 _mm256_round_ps(__m256 V, const int M);
				422	/// \endcode
				423	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	424	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	425	///
				426	/// \param V
				427	/// A 256-bit vector of [8 x float].
				428	/// \param M
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	429	/// An integer value that specifies the rounding operation. \n
				430	/// Bits [7:4] are reserved. \n
				431	/// Bit [3] is a precision exception value: \n
				432	/// 0: A normal PE exception is used. \n
				433	/// 1: The PE field is not updated. \n
				434	/// Bit [2] is the rounding control source: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	435	/// 0: Use bits [1:0] of \a M. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	436	/// 1: Use the current MXCSR setting. \n
				437	/// Bits [1:0] contain the rounding control definition: \n
				438	/// 00: Nearest. \n
				439	/// 01: Downward (toward negative infinity). \n
				440	/// 10: Upward (toward positive infinity). \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	441	/// 11: Truncated.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	442	/// \returns A 256-bit vector of [8 x float] containing the rounded values.
Chad Rosier	060d03b	2011-12-17 00:15:26 +0000	[diff] [blame]	443	#define _mm256_round_ps(V, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	444	(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	445
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	446	/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	447	/// source values are rounded up to integer values and returned as 64-bit
				448	/// double-precision floating-point values.
				449	///
				450	/// \headerfile <x86intrin.h>
				451	///
				452	/// \code
				453	/// __m256d _mm256_ceil_pd(__m256d V);
				454	/// \endcode
				455	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	456	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	457	///
				458	/// \param V
				459	/// A 256-bit vector of [4 x double].
				460	/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	461	#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	462
/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	480
/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	497
/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
				514
/* Logical */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	516	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				517	///
				518	/// \headerfile <x86intrin.h>
				519	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	520	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	521	///
				522	/// \param __a
				523	/// A 256-bit vector of [4 x double] containing one of the source operands.
				524	/// \param __b
				525	/// A 256-bit vector of [4 x double] containing one of the source operands.
				526	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				527	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	528	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	529	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	530	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	531	return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	532	}
				533
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	534	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				535	///
				536	/// \headerfile <x86intrin.h>
				537	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	538	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	539	///
				540	/// \param __a
				541	/// A 256-bit vector of [8 x float] containing one of the source operands.
				542	/// \param __b
				543	/// A 256-bit vector of [8 x float] containing one of the source operands.
				544	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				545	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	546	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	547	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	548	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	549	return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	550	}
				551
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	552	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				553	/// the one's complement of the values contained in the first source operand.
				554	///
				555	/// \headerfile <x86intrin.h>
				556	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	557	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	558	///
				559	/// \param __a
				560	/// A 256-bit vector of [4 x double] containing the left source operand. The
				561	/// one's complement of this value is used in the bitwise AND.
				562	/// \param __b
				563	/// A 256-bit vector of [4 x double] containing the right source operand.
				564	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				565	/// values of the second operand and the one's complement of the first
				566	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	567	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	568	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	569	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	570	return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	571	}
				572
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	573	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				574	/// the one's complement of the values contained in the first source operand.
				575	///
				576	/// \headerfile <x86intrin.h>
				577	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	578	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	579	///
				580	/// \param __a
				581	/// A 256-bit vector of [8 x float] containing the left source operand. The
				582	/// one's complement of this value is used in the bitwise AND.
				583	/// \param __b
				584	/// A 256-bit vector of [8 x float] containing the right source operand.
				585	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				586	/// values of the second operand and the one's complement of the first
				587	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	588	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	589	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	590	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	591	return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	592	}
				593
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	594	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				595	///
				596	/// \headerfile <x86intrin.h>
				597	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	598	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	599	///
				600	/// \param __a
				601	/// A 256-bit vector of [4 x double] containing one of the source operands.
				602	/// \param __b
				603	/// A 256-bit vector of [4 x double] containing one of the source operands.
				604	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				605	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	606	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	607	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	608	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	609	return (__m256d)((__v4du)__a \| (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	610	}
				611
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	612	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				613	///
				614	/// \headerfile <x86intrin.h>
				615	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	616	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	617	///
				618	/// \param __a
				619	/// A 256-bit vector of [8 x float] containing one of the source operands.
				620	/// \param __b
				621	/// A 256-bit vector of [8 x float] containing one of the source operands.
				622	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				623	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	624	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	625	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	626	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	627	return (__m256)((__v8su)__a \| (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	628	}
				629
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	630	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				631	///
				632	/// \headerfile <x86intrin.h>
				633	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	634	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	635	///
				636	/// \param __a
				637	/// A 256-bit vector of [4 x double] containing one of the source operands.
				638	/// \param __b
				639	/// A 256-bit vector of [4 x double] containing one of the source operands.
				640	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				641	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	642	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	643	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	644	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	645	return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	646	}
				647
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	648	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				649	///
				650	/// \headerfile <x86intrin.h>
				651	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	652	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	653	///
				654	/// \param __a
				655	/// A 256-bit vector of [8 x float] containing one of the source operands.
				656	/// \param __b
				657	/// A 256-bit vector of [8 x float] containing one of the source operands.
				658	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				659	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	660	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	661	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	662	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	663	return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	664	}
				665
/* Horizontal arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	667	/// \brief Horizontally adds the adjacent pairs of values contained in two
				668	/// 256-bit vectors of [4 x double].
				669	///
				670	/// \headerfile <x86intrin.h>
				671	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	672	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	673	///
				674	/// \param __a
				675	/// A 256-bit vector of [4 x double] containing one of the source operands.
				676	/// The horizontal sums of the values are returned in the even-indexed
				677	/// elements of a vector of [4 x double].
				678	/// \param __b
				679	/// A 256-bit vector of [4 x double] containing one of the source operands.
				680	/// The horizontal sums of the values are returned in the odd-indexed
				681	/// elements of a vector of [4 x double].
				682	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				683	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	684	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	685	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	686	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	687	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	688	}
				689
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	690	/// \brief Horizontally adds the adjacent pairs of values contained in two
				691	/// 256-bit vectors of [8 x float].
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	695	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	696	///
				697	/// \param __a
				698	/// A 256-bit vector of [8 x float] containing one of the source operands.
				699	/// The horizontal sums of the values are returned in the elements with
				700	/// index 0, 1, 4, 5 of a vector of [8 x float].
				701	/// \param __b
				702	/// A 256-bit vector of [8 x float] containing one of the source operands.
				703	/// The horizontal sums of the values are returned in the elements with
				704	/// index 2, 3, 6, 7 of a vector of [8 x float].
				705	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				706	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	707	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	708	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	709	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	710	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	711	}
				712
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	713	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				714	/// 256-bit vectors of [4 x double].
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	718	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	719	///
				720	/// \param __a
				721	/// A 256-bit vector of [4 x double] containing one of the source operands.
				722	/// The horizontal differences between the values are returned in the
				723	/// even-indexed elements of a vector of [4 x double].
				724	/// \param __b
				725	/// A 256-bit vector of [4 x double] containing one of the source operands.
				726	/// The horizontal differences between the values are returned in the
				727	/// odd-indexed elements of a vector of [4 x double].
				728	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				729	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	730	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	731	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	732	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	733	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	734	}
				735
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	736	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				737	/// 256-bit vectors of [8 x float].
				738	///
				739	/// \headerfile <x86intrin.h>
				740	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	741	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	742	///
				743	/// \param __a
				744	/// A 256-bit vector of [8 x float] containing one of the source operands.
				745	/// The horizontal differences between the values are returned in the
				746	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				747	/// \param __b
				748	/// A 256-bit vector of [8 x float] containing one of the source operands.
				749	/// The horizontal differences between the values are returned in the
				750	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				751	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				752	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	753	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	754	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	755	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	756	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	757	}
				758
/* Vector permutations */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	760	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				761	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	762	///
				763	/// \headerfile <x86intrin.h>
				764	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	765	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	766	///
				767	/// \param __a
				768	/// A 128-bit vector of [2 x double].
				769	/// \param __c
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	770	/// A 128-bit integer vector operand specifying how the values are to be
				771	/// copied. \n
				772	/// Bit [1]: \n
				773	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				774	/// vector. \n
				775	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				776	/// returned vector. \n
				777	/// Bit [65]: \n
				778	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				779	/// returned vector. \n
				780	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				781	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	782	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	783	static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	784	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	785	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	786	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	787	}
				788
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	789	/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
				790	/// by the 256-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	791	///
				792	/// \headerfile <x86intrin.h>
				793	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	794	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	795	///
				796	/// \param __a
				797	/// A 256-bit vector of [4 x double].
				798	/// \param __c
				799	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	800	/// copied. \n
				801	/// Bit [1]: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	802	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				803	/// vector. \n
				804	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				805	/// returned vector. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	806	/// Bit [65]: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	807	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				808	/// returned vector. \n
				809	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				810	/// returned vector. \n
				811	/// Bit [129]: \n
				812	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				813	/// returned vector. \n
				814	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				815	/// returned vector. \n
				816	/// Bit [193]: \n
				817	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				818	/// returned vector. \n
				819	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	820	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	821	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	822	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	823	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	824	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	825	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	826	}
				827
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	828	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				829	/// specified by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	830	/// \headerfile <x86intrin.h>
				831	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	832	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	833	///
				834	/// \param __a
				835	/// A 128-bit vector of [4 x float].
				836	/// \param __c
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	837	/// A 128-bit integer vector operand specifying how the values are to be
				838	/// copied. \n
				839	/// Bits [1:0]: \n
				840	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				841	/// returned vector. \n
				842	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				843	/// returned vector. \n
				844	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				845	/// returned vector. \n
				846	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				847	/// returned vector. \n
				848	/// Bits [33:32]: \n
				849	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				850	/// returned vector. \n
				851	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				852	/// returned vector. \n
				853	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				854	/// returned vector. \n
				855	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				856	/// returned vector. \n
				857	/// Bits [65:64]: \n
				858	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				859	/// returned vector. \n
				860	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				861	/// returned vector. \n
				862	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				863	/// returned vector. \n
				864	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				865	/// returned vector. \n
				866	/// Bits [97:96]: \n
				867	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				868	/// returned vector. \n
				869	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				870	/// returned vector. \n
				871	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				872	/// returned vector. \n
				873	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				874	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	875	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	876	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	877	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	878	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	879	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	880	}
				881
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	882	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				883	/// specified by the 256-bit integer vector operand.
				884	///
				885	/// \headerfile <x86intrin.h>
				886	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	887	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	888	///
				889	/// \param __a
				890	/// A 256-bit vector of [8 x float].
				891	/// \param __c
				892	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	893	/// copied. \n
				894	/// Bits [1:0]: \n
				895	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				896	/// returned vector. \n
				897	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				898	/// returned vector. \n
				899	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				900	/// returned vector. \n
				901	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				902	/// returned vector. \n
				903	/// Bits [33:32]: \n
				904	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				905	/// returned vector. \n
				906	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				907	/// returned vector. \n
				908	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				909	/// returned vector. \n
				910	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				911	/// returned vector. \n
				912	/// Bits [65:64]: \n
				913	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				914	/// returned vector. \n
				915	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				916	/// returned vector. \n
				917	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				918	/// returned vector. \n
				919	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				920	/// returned vector. \n
				921	/// Bits [97:96]: \n
				922	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				923	/// returned vector. \n
				924	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				925	/// returned vector. \n
				926	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				927	/// returned vector. \n
				928	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				929	/// returned vector. \n
				930	/// Bits [129:128]: \n
				931	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				932	/// returned vector. \n
				933	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				934	/// returned vector. \n
				935	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				936	/// returned vector. \n
				937	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				938	/// returned vector. \n
				939	/// Bits [161:160]: \n
				940	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				941	/// returned vector. \n
				942	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				943	/// returned vector. \n
				944	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				945	/// returned vector. \n
				946	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				947	/// returned vector. \n
				948	/// Bits [193:192]: \n
				949	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				950	/// returned vector. \n
				951	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				952	/// returned vector. \n
				953	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				954	/// returned vector. \n
				955	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				956	/// returned vector. \n
				957	/// Bits [225:224]: \n
				958	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				959	/// returned vector. \n
				960	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				961	/// returned vector. \n
				962	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				963	/// returned vector. \n
				964	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				965	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	966	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	967	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	968	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	969	{
					/* Reinterpret __a as __v8sf and the control vector __c as __v8si, then
					 * forward both to the vpermilvarps256 builtin; the result is cast back
					 * to __m256. */
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	970	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	971	}
				972
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	973	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				974	/// by the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	975	///
				976	/// \headerfile <x86intrin.h>
				977	///
				978	/// \code
				979	/// __m128d _mm_permute_pd(__m128d A, const int C);
				980	/// \endcode
				981	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	982	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	983	///
				984	/// \param A
				985	/// A 128-bit vector of [2 x double].
				986	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	987	/// An immediate integer operand specifying how the values are to be
				988	/// copied. \n
				989	/// Bit [0]: \n
				990	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				991	/// vector. \n
				992	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				993	/// returned vector. \n
				994	/// Bit [1]: \n
				995	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				996	/// returned vector. \n
				997	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				998	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	999	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1000	#define _mm_permute_pd(A, C) __extension__ ({ \
					/* Bit 0 of C is the source index for result element 0, bit 1 of C is */ \
					/* the source index for result element 1. Both indices are masked to */ \
					/* 0..1, so they only ever name elements of A; the undefined second */ \
					/* shuffle operand is never selected. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1001	(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1002	(__v2df)_mm_undefined_pd(), \
				1003	((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1004
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1005	/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
				1006	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1007	///
				1008	/// \headerfile <x86intrin.h>
				1009	///
				1010	/// \code
				1011	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1012	/// \endcode
				1013	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1014	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1015	///
				1016	/// \param A
				1017	/// A 256-bit vector of [4 x double].
				1018	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1019	/// An immediate integer operand specifying how the values are to be
				1020	/// copied. \n
				1021	/// Bit [0]: \n
				1022	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				1023	/// vector. \n
				1024	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				1025	/// returned vector. \n
				1026	/// Bit [1]: \n
				1027	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				1028	/// returned vector. \n
				1029	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				1030	/// returned vector. \n
				1031	/// Bit [2]: \n
				1032	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				1033	/// returned vector. \n
				1034	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				1035	/// returned vector. \n
				1036	/// Bit [3]: \n
				1037	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				1038	/// returned vector. \n
				1039	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				1040	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1041	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1042	#define _mm256_permute_pd(A, C) __extension__ ({ \
					/* Bit i of C (i = 0..3) selects the source for result element i. */ \
					/* The "0 +" / "2 +" bases keep every selection inside its own 128-bit */ \
					/* half: elements 0-1 choose among source elements 0-1, elements 2-3 */ \
					/* among source elements 2-3. All indices are < 4, so only A is read. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1043	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1044	(__v4df)_mm256_undefined_pd(), \
				1045	0 + (((C) >> 0) & 0x1), \
				1046	0 + (((C) >> 1) & 0x1), \
				1047	2 + (((C) >> 2) & 0x1), \
				1048	2 + (((C) >> 3) & 0x1)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1049
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1050	/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
				1051	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1052	///
				1053	/// \headerfile <x86intrin.h>
				1054	///
				1055	/// \code
				1056	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1057	/// \endcode
				1058	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1059	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1060	///
				1061	/// \param A
				1062	/// A 128-bit vector of [4 x float].
				1063	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1064	/// An immediate integer operand specifying how the values are to be
				1065	/// copied. \n
				1066	/// Bits [1:0]: \n
				1067	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1068	/// returned vector. \n
				1069	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1070	/// returned vector. \n
				1071	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1072	/// returned vector. \n
				1073	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1074	/// returned vector. \n
				1075	/// Bits [3:2]: \n
				1076	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1077	/// returned vector. \n
				1078	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1079	/// returned vector. \n
				1080	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1081	/// returned vector. \n
				1082	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1083	/// returned vector. \n
				1084	/// Bits [5:4]: \n
				1085	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1086	/// returned vector. \n
				1087	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1088	/// returned vector. \n
				1089	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1090	/// returned vector. \n
				1091	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1092	/// returned vector. \n
				1093	/// Bits [7:6]: \n
				1094	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1095	/// returned vector. \n
				1096	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1097	/// returned vector. \n
				1098	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1099	/// returned vector. \n
				1100	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1101	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1102	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1103	#define _mm_permute_ps(A, C) __extension__ ({ \
					/* C is read as four 2-bit fields: bits [1:0], [3:2], [5:4] and [7:6] */ \
					/* give the source element index (0..3) for result elements 0, 1, 2 */ \
					/* and 3 respectively. All indices are < 4, so only A is read. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1104	(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1105	(__v4sf)_mm_undefined_ps(), \
				1106	((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
				1107	((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1108
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1109	/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
				1110	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1111	///
				1112	/// \headerfile <x86intrin.h>
				1113	///
				1114	/// \code
				1115	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1116	/// \endcode
				1117	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1118	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1119	///
				1120	/// \param A
				1121	/// A 256-bit vector of [8 x float].
				1122	/// \param C
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	1123	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1124	/// copied. \n
				1125	/// Bits [1:0]: \n
				1126	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1127	/// returned vector. \n
				1128	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1129	/// returned vector. \n
				1130	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1131	/// returned vector. \n
				1132	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1133	/// returned vector. \n
				1134	/// Bits [3:2]: \n
				1135	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1136	/// returned vector. \n
				1137	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1138	/// returned vector. \n
				1139	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1140	/// returned vector. \n
				1141	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1142	/// returned vector. \n
				1143	/// Bits [5:4]: \n
				1144	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1145	/// returned vector. \n
				1146	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1147	/// returned vector. \n
				1148	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1149	/// returned vector. \n
				1150	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1151	/// returned vector. \n
				1152	/// Bits [7:6]: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	1153	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1154	/// returned vector. \n
				1155	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1156	/// returned vector. \n
				1157	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1158	/// returned vector. \n
				1159	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1160	/// returned vector. \n
					/// The same four bit fields are reused to select the elements of the
					/// upper 128 bits: \n
				1161	/// Bits [1:0]: \n
				1162	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				1163	/// returned vector. \n
				1164	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				1165	/// returned vector. \n
				1166	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				1167	/// returned vector. \n
				1168	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				1169	/// returned vector. \n
				1170	/// Bits [3:2]: \n
				1171	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				1172	/// returned vector. \n
				1173	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				1174	/// returned vector. \n
				1175	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				1176	/// returned vector. \n
				1177	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				1178	/// returned vector. \n
				1179	/// Bits [5:4]: \n
				1180	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				1181	/// returned vector. \n
				1182	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				1183	/// returned vector. \n
				1184	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				1185	/// returned vector. \n
				1186	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				1187	/// returned vector. \n
				1188	/// Bits [7:6]: \n
				1189	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				1190	/// returned vector. \n
				1191	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				1192	/// returned vector. \n
				1193	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				1194	/// returned vector. \n
				1195	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				1196	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1197	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1198	#define _mm256_permute_ps(A, C) __extension__ ({ \
					/* The same four 2-bit fields of C are applied twice: with base 0 they */ \
					/* pick result elements 0-3 from source elements 0-3, and with base 4 */ \
					/* they pick result elements 4-7 from source elements 4-7. All indices */ \
					/* are < 8, so only A is read. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1199	(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1200	(__v8sf)_mm256_undefined_ps(), \
				1201	0 + (((C) >> 0) & 0x3), \
				1202	0 + (((C) >> 2) & 0x3), \
				1203	0 + (((C) >> 4) & 0x3), \
				1204	0 + (((C) >> 6) & 0x3), \
				1205	4 + (((C) >> 0) & 0x3), \
				1206	4 + (((C) >> 2) & 0x3), \
				1207	4 + (((C) >> 4) & 0x3), \
				1208	4 + (((C) >> 6) & 0x3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1209
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1210	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1211	/// [4 x double], as specified by the immediate integer operand.
				1212	///
				1213	/// \headerfile <x86intrin.h>
				1214	///
				1215	/// \code
				1216	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1217	/// \endcode
				1218	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1219	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1220	///
				1221	/// \param V1
				1222	/// A 256-bit vector of [4 x double].
				1223	/// \param V2
				1224	/// A 256-bit vector of [4 x double].
				1225	/// \param M
				1226	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1227	/// permuted. \n
				1228	/// Bits [1:0]: \n
				1229	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1230	/// destination. \n
				1231	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1232	/// destination. \n
				1233	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1234	/// destination. \n
				1235	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1236	/// destination. \n
				1237	/// Bits [5:4]: \n
				1238	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1239	/// destination. \n
				1240	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1241	/// destination. \n
				1242	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1243	/// destination. \n
				1244	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
				1245	/// destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1246	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1247	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
					/* Casts V1 and V2 to __v4df and passes them, together with M */ \
					/* unchanged, to the vperm2f128_pd256 builtin; the result is cast */ \
					/* back to __m256d. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1248	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
				1249	(__v4df)(__m256d)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1250
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1251	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1252	/// [8 x float], as specified by the immediate integer operand.
				1253	///
				1254	/// \headerfile <x86intrin.h>
				1255	///
				1256	/// \code
				1257	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1258	/// \endcode
				1259	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1260	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1261	///
				1262	/// \param V1
				1263	/// A 256-bit vector of [8 x float].
				1264	/// \param V2
				1265	/// A 256-bit vector of [8 x float].
				1266	/// \param M
				1267	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1268	/// permuted. \n
				1269	/// Bits [1:0]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1270	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1271	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1272	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1273	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1274	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1275	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1276	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1277	/// destination. \n
				1278	/// Bits [5:4]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1279	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1280	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1281	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1282	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1283	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1284	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1285	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1286	/// destination.
				1287	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1288	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
					/* Casts V1 and V2 to __v8sf and passes them, together with M */ \
					/* unchanged, to the vperm2f128_ps256 builtin; the result is cast */ \
					/* back to __m256. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1289	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
				1290	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1291
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1292	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
				1293	/// as specified by the immediate integer operand.
				1294	///
				1295	/// \headerfile <x86intrin.h>
				1296	///
				1297	/// \code
				1298	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1299	/// \endcode
				1300	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1301	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1302	///
				1303	/// \param V1
				1304	/// A 256-bit integer vector.
				1305	/// \param V2
				1306	/// A 256-bit integer vector.
				1307	/// \param M
				1308	/// An immediate integer operand specifying how the values are to be copied. \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1309	/// Bits [1:0]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1310	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1311	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1312	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1313	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1314	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1315	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1316	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1317	/// destination. \n
				1318	/// Bits [5:4]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1319	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1320	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1321	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1322	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1323	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1324	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1325	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1326	/// destination.
				1327	/// \returns A 256-bit integer vector containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1328	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
					/* Casts V1 and V2 to __v8si and passes them, together with M */ \
					/* unchanged, to the vperm2f128_si256 builtin; the result is cast */ \
					/* back to __m256i. */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1329	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
				1330	(__v8si)(__m256i)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1331
				1332	/* Vector Blend */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1333	/// \brief Merges 64-bit double-precision data values stored in either of the
				1334	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1335	/// integer operand.
				1336	///
				1337	/// \headerfile <x86intrin.h>
				1338	///
				1339	/// \code
				1340	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1341	/// \endcode
				1342	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1343	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1344	///
				1345	/// \param V1
				1346	/// A 256-bit vector of [4 x double].
				1347	/// \param V2
				1348	/// A 256-bit vector of [4 x double].
				1349	/// \param M
				1350	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1351	/// values are to be copied. The position of the mask bit corresponds to the
				1352	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1353	/// element in operand \a V1 is copied to the same position in the
				1354	/// destination. When a mask bit is 1, the corresponding 64-bit element in
				1355	/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1356	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1357	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
					/* __builtin_shufflevector numbers the elements of V1 as 0-3 and */ \
					/* those of V2 as 4-7. For result element i, a set bit i of M yields */ \
					/* index 4 + i (element i of V2); a clear bit yields index i (V1). */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1358	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				1359	(__v4df)(__m256d)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1360	(((M) & 0x01) ? 4 : 0), \
				1361	(((M) & 0x02) ? 5 : 1), \
				1362	(((M) & 0x04) ? 6 : 2), \
				1363	(((M) & 0x08) ? 7 : 3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1364
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1365	/// \brief Merges 32-bit single-precision data values stored in either of the
				1366	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1367	/// integer operand.
				1368	///
				1369	/// \headerfile <x86intrin.h>
				1370	///
				1371	/// \code
				1372	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1373	/// \endcode
				1374	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1375	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1376	///
				1377	/// \param V1
				1378	/// A 256-bit vector of [8 x float].
				1379	/// \param V2
				1380	/// A 256-bit vector of [8 x float].
				1381	/// \param M
				1382	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1383	/// values are to be copied. The position of the mask bit corresponds to the
				1384	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1385	/// element in operand \a V1 is copied to the same position in the
				1386	/// destination. When a mask bit is 1, the corresponding 32-bit element in
				1387	/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1388	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1389	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
					/* __builtin_shufflevector numbers the elements of V1 as 0-7 and */ \
					/* those of V2 as 8-15. For result element i, a set bit i of M yields */ \
					/* index 8 + i (element i of V2); a clear bit yields index i (V1). */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1390	(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				1391	(__v8sf)(__m256)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1392	(((M) & 0x01) ? 8 : 0), \
				1393	(((M) & 0x02) ? 9 : 1), \
				1394	(((M) & 0x04) ? 10 : 2), \
				1395	(((M) & 0x08) ? 11 : 3), \
				1396	(((M) & 0x10) ? 12 : 4), \
				1397	(((M) & 0x20) ? 13 : 5), \
				1398	(((M) & 0x40) ? 14 : 6), \
				1399	(((M) & 0x80) ? 15 : 7)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1400
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1401	/// \brief Merges 64-bit double-precision data values stored in either of the
				1402	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1403	/// operand.
				1404	///
				1405	/// \headerfile <x86intrin.h>
				1406	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1407	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1408	///
				1409	/// \param __a
				1410	/// A 256-bit vector of [4 x double].
				1411	/// \param __b
				1412	/// A 256-bit vector of [4 x double].
				1413	/// \param __c
				1414	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1415	/// how the values are to be copied. The position of the mask bit corresponds
				1416	/// to the most significant bit of a copied value. When a mask bit is 0, the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1417	/// corresponding 64-bit element in operand \a __a is copied to the same
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1418	/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	1419	/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1420	/// destination.
				1421	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1422	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1423	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1424	{
					/* All three operands, including the mask __c, are passed to the
					 * blendvpd256 builtin as __v4df; the result is cast back to __m256d. */
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1425	return (__m256d)__builtin_ia32_blendvpd256(
				1426	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1427	}
				1428
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1429	/// \brief Merges 32-bit single-precision data values stored in either of the
				1430	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1431	/// operand.
				1432	///
				1433	/// \headerfile <x86intrin.h>
				1434	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1435	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1436	///
				1437	/// \param __a
				1438	/// A 256-bit vector of [8 x float].
				1439	/// \param __b
				1440	/// A 256-bit vector of [8 x float].
				1441	/// \param __c
				1442	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1443	/// and 31 specifying how the values are to be copied. The position of the
				1444	/// mask bit corresponds to the most significant bit of a copied value. When
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1445	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1446	/// copied to the same position in the destination. When a mask bit is 1, the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1447	/// corresponding 32-bit element in operand \a __b is copied to the same
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1448	/// position in the destination.
				1449	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1450	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1451	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1452	{ /* Thin wrapper: forwards the three operands to the blendvps256 builtin as [8 x float] vectors. */
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1453	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1454	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c); /* __c is the selector: per 32-bit element, mask bit 0 copies from __a and mask bit 1 copies from __b. */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1455	}
				1456
				1457	/* Vector Dot Product */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1458	/// \brief Computes two dot products in parallel, using the lower and upper
				1459	/// halves of two [8 x float] vectors as input to the two computations, and
				1460	/// returning the two dot products in the lower and upper halves of the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1461	/// [8 x float] result.
				1462	///
				1463	/// The immediate integer operand controls which input elements will
				1464	/// contribute to the dot product, and where the final results are returned.
				1465	/// In general, for each dot product, the four corresponding elements of the
				1466	/// input vectors are multiplied; the first two and second two products are
				1467	/// summed, then the two sums are added to form the final result.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1468	///
				1469	/// \headerfile <x86intrin.h>
				1470	///
				1471	/// \code
				1472	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1473	/// \endcode
				1474	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1475	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1476	///
				1477	/// \param V1
				1478	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1479	/// \param V2
				1480	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1481	/// \param M
				1482	/// An immediate integer argument. Bits [7:4] determine which elements of
				1483	/// the input vectors are used, with bit [4] corresponding to the lowest
				1484	/// element and bit [7] corresponding to the highest element of each [4 x
				1485	/// float] subvector. If a bit is set, the corresponding elements from the
				1486	/// two input vectors are used as an input for dot product; otherwise that
				1487	/// input is treated as zero. Bits [3:0] determine which elements of the
				1488	/// result will receive a copy of the final dot product, with bit [0]
				1489	/// corresponding to the lowest element and bit [3] corresponding to the
				1490	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1491	/// product is returned in the corresponding element; otherwise that element
				1492	/// is set to zero. The bitmask is applied in the same way to each of the
				1493	/// two parallel dot product computations.
				1494	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1495	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ /* macro rather than a function: M is documented as an immediate operand */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1496	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), /* each vector argument is cast to __m256 first, then to the builtin's __v8sf operand type */ \
				1497	(__v8sf)(__m256)(V2), (M)); }) /* M is forwarded unchanged: bits [7:4] pick the inputs, bits [3:0] pick the result slots */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1498
				1499	/* Vector shuffle */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1500	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1501	/// specified by the immediate value operand.
				1502	///
				1503	/// The four selected elements in each operand are copied to the destination
				1504	/// according to the bits specified in the immediate operand. The selected
				1505	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1506	/// bits [191:128] of the destination, and the selected elements from the
				1507	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
				1508	/// the destination. For example, if bits [7:0] of the immediate operand
				1509	/// contain a value of 0xFF, the 256-bit destination vector would contain the
				1510	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1511	///
				1512	/// \headerfile <x86intrin.h>
				1513	///
				1514	/// \code
				1515	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1516	/// \endcode
				1517	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1518	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1519	///
				1520	/// \param a
				1521	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1522	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1523	/// according to the bits specified in the immediate operand.
				1524	/// \param b
				1525	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1526	/// operand are copied to bits [127:64] and bits [255:192] in the
				1527	/// destination, according to the bits specified in the immediate operand.
				1528	/// \param mask
				1529	/// An immediate value containing an 8-bit value specifying which elements to
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1530	/// copy from \a a and \a b. \n
				1531	/// Bits [3:0] specify the values copied from operand \a a. \n
				1532	/// Bits [7:4] specify the values copied from operand \a b. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1533	/// The destinations within the 256-bit destination are assigned values as
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1534	/// follows, according to the bit value assignments described below: \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1535	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1536	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1537	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1538	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1539	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1540	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1541	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1542	/// the destination. \n
				1543	/// Bit value assignments: \n
				1544	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
				1545	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
				1546	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1547	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1548	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1549	#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ /* shuffle indices 0-7 address elements of a, 8-15 address elements of b */ \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1550	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1551	(__v8sf)(__m256)(b), \
				1552	0 + (((mask) >> 0) & 0x3), /* dst[0]: one of a[0..3], chosen by mask bits [1:0] */ \
				1553	0 + (((mask) >> 2) & 0x3), /* dst[1]: one of a[0..3], chosen by mask bits [3:2] */ \
				1554	8 + (((mask) >> 4) & 0x3), /* dst[2]: one of b[0..3], chosen by mask bits [5:4] */ \
				1555	8 + (((mask) >> 6) & 0x3), /* dst[3]: one of b[0..3], chosen by mask bits [7:6] */ \
				1556	4 + (((mask) >> 0) & 0x3), /* dst[4]: one of a[4..7], chosen by mask bits [1:0] */ \
				1557	4 + (((mask) >> 2) & 0x3), /* dst[5]: one of a[4..7], chosen by mask bits [3:2] */ \
				1558	12 + (((mask) >> 4) & 0x3), /* dst[6]: one of b[4..7], chosen by mask bits [5:4] */ \
				1559	12 + (((mask) >> 6) & 0x3)); }) /* dst[7]: one of b[4..7], chosen by mask bits [7:6]; the same mask is applied to both 128-bit halves */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1560
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1561	/// \brief Selects four double-precision values from the 256-bit operands of
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1562	/// [4 x double], as specified by the immediate value operand.
				1563	///
				1564	/// The selected elements from the first 256-bit operand are copied to bits
				1565	/// [63:0] and bits [191:128] in the destination, and the selected elements
				1566	/// from the second 256-bit operand are copied to bits [127:64] and bits
				1567	/// [255:192] in the destination. For example, if bits [3:0] of the immediate
				1568	/// operand contain a value of 0xF, the 256-bit destination vector would
				1569	/// contain the following values: b[3], a[3], b[1], a[1].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1570	///
				1571	/// \headerfile <x86intrin.h>
				1572	///
				1573	/// \code
				1574	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1575	/// \endcode
				1576	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1577	/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1578	///
				1579	/// \param a
				1580	/// A 256-bit vector of [4 x double].
				1581	/// \param b
				1582	/// A 256-bit vector of [4 x double].
				1583	/// \param mask
				1584	/// An immediate value whose bits [3:0] specify which elements to
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1585	/// copy from \a a and \a b: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1586	/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1587	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1588	/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1589	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1590	/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1591	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1592	/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1593	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1594	/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1595	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1596	/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1597	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1598	/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1599	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1600	/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1601	/// destination.
				1602	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1603	#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ /* shuffle indices 0-3 address elements of a, 4-7 address elements of b */ \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1604	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1605	(__v4df)(__m256d)(b), \
				1606	0 + (((mask) >> 0) & 0x1), /* dst[0]: a[0] or a[1], per mask bit 0 */ \
				1607	4 + (((mask) >> 1) & 0x1), /* dst[1]: b[0] or b[1], per mask bit 1 */ \
				1608	2 + (((mask) >> 2) & 0x1), /* dst[2]: a[2] or a[3], per mask bit 2 */ \
				1609	6 + (((mask) >> 3) & 0x1)); }) /* dst[3]: b[2] or b[3], per mask bit 3; only mask bits [3:0] are read */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1610
				1611	/* Compare predicates: values for bits [4:0] of the immediate operand of the _mm_cmp_* and _mm256_cmp_* macros. Values 0x10-0x1f repeat 0x00-0x0f with the signaling/non-signaling behavior inverted. */
				1612	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1613	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1614	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1615	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1616	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1617	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1618	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1619	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1620	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1621	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1622	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1623	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1624	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1625	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1626	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1627	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1628	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1629	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1630	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1631	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1632	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1633	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1634	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1635	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1636	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1637	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1638	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1639	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1640	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1641	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1642	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1643	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1644
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1645	/// \brief Compares each of the corresponding double-precision values of two
				1646	/// 128-bit vectors of [2 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1647	/// immediate integer operand.
				1648	///
				1649	/// Returns a [2 x double] vector consisting of two doubles corresponding to
				1650	/// the two comparison results: zero if the comparison is false, and all 1's
				1651	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1652	///
				1653	/// \headerfile <x86intrin.h>
				1654	///
				1655	/// \code
				1656	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1657	/// \endcode
				1658	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1659	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1660	///
				1661	/// \param a
				1662	/// A 128-bit vector of [2 x double].
				1663	/// \param b
				1664	/// A 128-bit vector of [2 x double].
				1665	/// \param c
				1666	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1667	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	1668	/// 0x00: Equal (ordered, non-signaling) \n
				1669	/// 0x01: Less-than (ordered, signaling) \n
				1670	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1671	/// 0x03: Unordered (non-signaling) \n
				1672	/// 0x04: Not-equal (unordered, non-signaling) \n
				1673	/// 0x05: Not-less-than (unordered, signaling) \n
				1674	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1675	/// 0x07: Ordered (non-signaling) \n
				1676	/// 0x08: Equal (unordered, non-signaling) \n
				1677	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1678	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1679	/// 0x0B: False (ordered, non-signaling) \n
				1680	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1681	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1682	/// 0x0E: Greater-than (ordered, signaling) \n
				1683	/// 0x0F: True (unordered, non-signaling) \n
				1684	/// 0x10: Equal (ordered, signaling) \n
				1685	/// 0x11: Less-than (ordered, non-signaling) \n
				1686	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1687	/// 0x13: Unordered (signaling) \n
				1688	/// 0x14: Not-equal (unordered, signaling) \n
				1689	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1690	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1691	/// 0x17: Ordered (signaling) \n
				1692	/// 0x18: Equal (unordered, signaling) \n
				1693	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1694	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1695	/// 0x1B: False (ordered, signaling) \n
				1696	/// 0x1C: Not-equal (ordered, signaling) \n
				1697	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1698	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1699	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1700	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1701	#define _mm_cmp_pd(a, b, c) __extension__ ({ /* macro rather than a function: c is documented as an immediate operand */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1702	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), /* each vector argument is cast to __m128d first, then to the builtin's __v2df operand type */ \
				1703	(__v2df)(__m128d)(b), (c)); }) /* c is forwarded unchanged as the predicate; the _CMP_* constants name its values */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1704
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1705	/// \brief Compares each of the corresponding values of two 128-bit vectors of
				1706	/// [4 x float], using the operation specified by the immediate integer
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1707	/// operand.
				1708	///
				1709	/// Returns a [4 x float] vector consisting of four floats corresponding to
				1710	/// the four comparison results: zero if the comparison is false, and all 1's
				1711	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1712	///
				1713	/// \headerfile <x86intrin.h>
				1714	///
				1715	/// \code
				1716	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1717	/// \endcode
				1718	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1719	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1720	///
				1721	/// \param a
				1722	/// A 128-bit vector of [4 x float].
				1723	/// \param b
				1724	/// A 128-bit vector of [4 x float].
				1725	/// \param c
				1726	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1727	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	1728	/// 0x00: Equal (ordered, non-signaling) \n
				1729	/// 0x01: Less-than (ordered, signaling) \n
				1730	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1731	/// 0x03: Unordered (non-signaling) \n
				1732	/// 0x04: Not-equal (unordered, non-signaling) \n
				1733	/// 0x05: Not-less-than (unordered, signaling) \n
				1734	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1735	/// 0x07: Ordered (non-signaling) \n
				1736	/// 0x08: Equal (unordered, non-signaling) \n
				1737	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1738	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1739	/// 0x0B: False (ordered, non-signaling) \n
				1740	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1741	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1742	/// 0x0E: Greater-than (ordered, signaling) \n
				1743	/// 0x0F: True (unordered, non-signaling) \n
				1744	/// 0x10: Equal (ordered, signaling) \n
				1745	/// 0x11: Less-than (ordered, non-signaling) \n
				1746	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1747	/// 0x13: Unordered (signaling) \n
				1748	/// 0x14: Not-equal (unordered, signaling) \n
				1749	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1750	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1751	/// 0x17: Ordered (signaling) \n
				1752	/// 0x18: Equal (unordered, signaling) \n
				1753	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1754	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1755	/// 0x1B: False (ordered, signaling) \n
				1756	/// 0x1C: Not-equal (ordered, signaling) \n
				1757	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1758	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1759	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1760	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1761	#define _mm_cmp_ps(a, b, c) __extension__ ({ /* macro rather than a function: c is documented as an immediate operand */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1762	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), /* each vector argument is cast to __m128 first, then to the builtin's __v4sf operand type */ \
				1763	(__v4sf)(__m128)(b), (c)); }) /* c is forwarded unchanged as the predicate; the _CMP_* constants name its values */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1764
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1765	/// \brief Compares each of the corresponding double-precision values of two
				1766	/// 256-bit vectors of [4 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1767	/// immediate integer operand.
				1768	///
				1769	/// Returns a [4 x double] vector consisting of four doubles corresponding to
				1770	/// the four comparison results: zero if the comparison is false, and all 1's
				1771	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1772	///
				1773	/// \headerfile <x86intrin.h>
				1774	///
				1775	/// \code
				1776	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1777	/// \endcode
				1778	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1779	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1780	///
				1781	/// \param a
				1782	/// A 256-bit vector of [4 x double].
				1783	/// \param b
				1784	/// A 256-bit vector of [4 x double].
				1785	/// \param c
				1786	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1787	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	1788	/// 0x00: Equal (ordered, non-signaling) \n
				1789	/// 0x01: Less-than (ordered, signaling) \n
				1790	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1791	/// 0x03: Unordered (non-signaling) \n
				1792	/// 0x04: Not-equal (unordered, non-signaling) \n
				1793	/// 0x05: Not-less-than (unordered, signaling) \n
				1794	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1795	/// 0x07: Ordered (non-signaling) \n
				1796	/// 0x08: Equal (unordered, non-signaling) \n
				1797	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1798	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1799	/// 0x0B: False (ordered, non-signaling) \n
				1800	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1801	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1802	/// 0x0E: Greater-than (ordered, signaling) \n
				1803	/// 0x0F: True (unordered, non-signaling) \n
				1804	/// 0x10: Equal (ordered, signaling) \n
				1805	/// 0x11: Less-than (ordered, non-signaling) \n
				1806	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1807	/// 0x13: Unordered (signaling) \n
				1808	/// 0x14: Not-equal (unordered, signaling) \n
				1809	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1810	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1811	/// 0x17: Ordered (signaling) \n
				1812	/// 0x18: Equal (unordered, signaling) \n
				1813	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1814	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1815	/// 0x1B: False (ordered, signaling) \n
				1816	/// 0x1C: Not-equal (ordered, signaling) \n
				1817	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1818	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1819	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1820	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1821	#define _mm256_cmp_pd(a, b, c) __extension__ ({ /* macro rather than a function: c is documented as an immediate operand */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1822	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), /* each vector argument is cast to __m256d first, then to the builtin's __v4df operand type */ \
				1823	(__v4df)(__m256d)(b), (c)); }) /* c is forwarded unchanged as the predicate; the _CMP_* constants name its values */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1824
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1825	/// \brief Compares each of the corresponding values of two 256-bit vectors of
				1826	/// [8 x float], using the operation specified by the immediate integer
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1827	/// operand.
				1828	///
				1829	/// Returns an [8 x float] vector consisting of eight floats corresponding to
				1830	/// the eight comparison results: zero if the comparison is false, and all
				1831	/// 1's if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1832	///
				1833	/// \headerfile <x86intrin.h>
				1834	///
				1835	/// \code
				1836	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1837	/// \endcode
				1838	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1839	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1840	///
				1841	/// \param a
				1842	/// A 256-bit vector of [8 x float].
				1843	/// \param b
				1844	/// A 256-bit vector of [8 x float].
				1845	/// \param c
				1846	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1847	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	1848	/// 0x00: Equal (ordered, non-signaling) \n
				1849	/// 0x01: Less-than (ordered, signaling) \n
				1850	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1851	/// 0x03: Unordered (non-signaling) \n
				1852	/// 0x04: Not-equal (unordered, non-signaling) \n
				1853	/// 0x05: Not-less-than (unordered, signaling) \n
				1854	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1855	/// 0x07: Ordered (non-signaling) \n
				1856	/// 0x08: Equal (unordered, non-signaling) \n
				1857	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1858	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1859	/// 0x0B: False (ordered, non-signaling) \n
				1860	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1861	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1862	/// 0x0E: Greater-than (ordered, signaling) \n
				1863	/// 0x0F: True (unordered, non-signaling) \n
				1864	/// 0x10: Equal (ordered, signaling) \n
				1865	/// 0x11: Less-than (ordered, non-signaling) \n
				1866	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1867	/// 0x13: Unordered (signaling) \n
				1868	/// 0x14: Not-equal (unordered, signaling) \n
				1869	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1870	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1871	/// 0x17: Ordered (signaling) \n
				1872	/// 0x18: Equal (unordered, signaling) \n
				1873	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1874	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1875	/// 0x1B: False (ordered, signaling) \n
				1876	/// 0x1C: Not-equal (ordered, signaling) \n
				1877	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1878	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1879	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1880	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1881	#define _mm256_cmp_ps(a, b, c) __extension__ ({ /* macro rather than a function: c is documented as an immediate operand */ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1882	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), /* each vector argument is cast to __m256 first, then to the builtin's __v8sf operand type */ \
				1883	(__v8sf)(__m256)(b), (c)); }) /* c is forwarded unchanged as the predicate; the _CMP_* constants name its values */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1884
/// \brief Performs a scalar double-precision comparison between two 128-bit
///    vectors of [2 x double]; the comparison predicate is selected by an
///    immediate integer operand.
///
///    A true result sets all 64 bits of the destination element; a false
///    result clears them.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    The first 128-bit vector of [2 x double] to compare.
/// \param b
///    The second 128-bit vector of [2 x double] to compare.
/// \param c
///    An immediate integer operand whose bits [4:0] select the comparison
///    operation: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] holding the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1943
/// \brief Performs a scalar single-precision comparison between two 128-bit
///    vectors of [4 x float]; the comparison predicate is selected by an
///    immediate integer operand.
///
///    A true result sets all 32 bits of the destination element; a false
///    result clears them.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    The first 128-bit vector of [4 x float] to compare.
/// \param b
///    The second 128-bit vector of [4 x float] to compare.
/// \param c
///    An immediate integer operand whose bits [4:0] select the comparison
///    operation: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] holding the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2002
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2003	/// \brief Takes a [8 x i32] vector and returns the vector element value
				2004	/// indexed by the immediate constant operand.
				2005	///
				2006	/// \headerfile <x86intrin.h>
				2007	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2008	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2009	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2010	///
				2011	/// \param __a
				2012	/// A 256-bit vector of [8 x i32].
				2013	/// \param __imm
				2014	/// An immediate integer operand with bits [2:0] determining which vector
				2015	/// element is extracted and returned.
				2016	/// \returns A 32-bit integer containing the extracted 32 bits of extended
				2017	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2018	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	2019	_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2020	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2021	__v8si __b = (__v8si)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	2022	return __b[__imm & 7];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2023	}
				2024
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2025	/// \brief Takes a [16 x i16] vector and returns the vector element value
				2026	/// indexed by the immediate constant operand.
				2027	///
				2028	/// \headerfile <x86intrin.h>
				2029	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2030	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2031	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2032	///
				2033	/// \param __a
				2034	/// A 256-bit integer vector of [16 x i16].
				2035	/// \param __imm
				2036	/// An immediate integer operand with bits [3:0] determining which vector
				2037	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	2038	/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2039	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2040	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	2041	_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2042	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2043	__v16hi __b = (__v16hi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	2044	return (unsigned short)__b[__imm & 15];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2045	}
				2046
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2047	/// \brief Takes a [32 x i8] vector and returns the vector element value
				2048	/// indexed by the immediate constant operand.
				2049	///
				2050	/// \headerfile <x86intrin.h>
				2051	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2052	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2053	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2054	///
				2055	/// \param __a
				2056	/// A 256-bit integer vector of [32 x i8].
				2057	/// \param __imm
				2058	/// An immediate integer operand with bits [4:0] determining which vector
				2059	/// element is extracted and returned.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	2060	/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
				2061	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2062	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	2063	_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2064	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2065	__v32qi __b = (__v32qi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	2066	return (unsigned char)__b[__imm & 31];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2067	}
				2068
				2069	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2070	/// \brief Takes a [4 x i64] vector and returns the vector element value
				2071	/// indexed by the immediate constant operand.
				2072	///
				2073	/// \headerfile <x86intrin.h>
				2074	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2075	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2076	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2077	///
				2078	/// \param __a
				2079	/// A 256-bit integer vector of [4 x i64].
				2080	/// \param __imm
				2081	/// An immediate integer operand with bits [1:0] determining which vector
				2082	/// element is extracted and returned.
				2083	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				2084	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2085	static __inline long long __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2086	_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2087	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2088	__v4di __b = (__v4di)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	2089	return __b[__imm & 3];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2090	}
				2091	#endif
				2092
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2093	/// \brief Takes a [8 x i32] vector and replaces the vector element value
				2094	/// indexed by the immediate constant operand by a new value. Returns the
				2095	/// modified vector.
				2096	///
				2097	/// \headerfile <x86intrin.h>
				2098	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2099	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2100	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2101	///
				2102	/// \param __a
				2103	/// A vector of [8 x i32] to be used by the insert operation.
				2104	/// \param __b
				2105	/// An integer value. The replacement value for the insert operation.
				2106	/// \param __imm
				2107	/// An immediate integer specifying the index of the vector element to be
				2108	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2109	/// \returns A copy of vector \a __a, after replacing its element indexed by
				2110	/// \a __imm with \a __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2111	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2112	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2113	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2114	__v8si __c = (__v8si)__a;
				2115	__c[__imm & 7] = __b;
				2116	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2117	}
				2118
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2119
				2120	/// \brief Takes a [16 x i16] vector and replaces the vector element value
				2121	/// indexed by the immediate constant operand with a new value. Returns the
				2122	/// modified vector.
				2123	///
				2124	/// \headerfile <x86intrin.h>
				2125	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2126	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2127	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2128	///
				2129	/// \param __a
				2130	/// A vector of [16 x i16] to be used by the insert operation.
				2131	/// \param __b
				2132	/// An i16 integer value. The replacement value for the insert operation.
				2133	/// \param __imm
				2134	/// An immediate integer specifying the index of the vector element to be
				2135	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2136	/// \returns A copy of vector \a __a, after replacing its element indexed by
				2137	/// \a __imm with \a __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2138	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2139	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2140	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2141	__v16hi __c = (__v16hi)__a;
				2142	__c[__imm & 15] = __b;
				2143	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2144	}
				2145
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2146	/// \brief Takes a [32 x i8] vector and replaces the vector element value
				2147	/// indexed by the immediate constant operand with a new value. Returns the
				2148	/// modified vector.
				2149	///
				2150	/// \headerfile <x86intrin.h>
				2151	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2152	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2153	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2154	///
				2155	/// \param __a
				2156	/// A vector of [32 x i8] to be used by the insert operation.
				2157	/// \param __b
				2158	/// An i8 integer value. The replacement value for the insert operation.
				2159	/// \param __imm
				2160	/// An immediate integer specifying the index of the vector element to be
				2161	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2162	/// \returns A copy of vector \a __a, after replacing its element indexed by
				2163	/// \a __imm with \a __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2164	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2165	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2166	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2167	__v32qi __c = (__v32qi)__a;
				2168	__c[__imm & 31] = __b;
				2169	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2170	}
				2171
				2172	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2173	/// \brief Takes a [4 x i64] vector and replaces the vector element value
				2174	/// indexed by the immediate constant operand with a new value. Returns the
				2175	/// modified vector.
				2176	///
				2177	/// \headerfile <x86intrin.h>
				2178	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2179	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2180	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2181	///
				2182	/// \param __a
				2183	/// A vector of [4 x i64] to be used by the insert operation.
				2184	/// \param __b
				2185	/// A 64-bit integer value. The replacement value for the insert operation.
				2186	/// \param __imm
				2187	/// An immediate integer specifying the index of the vector element to be
				2188	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2189	/// \returns A copy of vector \a __a, after replacing its element indexed by
				2190	/// \a __imm with \a __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2191	static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhas	d740029	2015-02-19 19:00:33 +0000	[diff] [blame]	2192	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2193	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2194	__v4di __c = (__v4di)__a;
				2195	__c[__imm & 3] = __b;
				2196	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2197	}
				2198	#endif
				2199
				2200	/* Conversion */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2201	/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
				2202	///
				2203	/// \headerfile <x86intrin.h>
				2204	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2205	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2206	///
				2207	/// \param __a
				2208	/// A 128-bit integer vector of [4 x i32].
				2209	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2210	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2211	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2212	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2213	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2214	}
				2215
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2216	/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
				2217	///
				2218	/// \headerfile <x86intrin.h>
				2219	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2220	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2221	///
				2222	/// \param __a
				2223	/// A 256-bit integer vector.
				2224	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2225	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2226	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2227	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2228	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2229	}
				2230
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2231	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2232	/// [4 x float].
				2233	///
				2234	/// \headerfile <x86intrin.h>
				2235	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2236	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2237	///
				2238	/// \param __a
				2239	/// A 256-bit vector of [4 x double].
				2240	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2241	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2242	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2243	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2244	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2245	}
				2246
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2247	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
				2248	///
				2249	/// \headerfile <x86intrin.h>
				2250	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2251	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2252	///
				2253	/// \param __a
				2254	/// A 256-bit vector of [8 x float].
				2255	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2256	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2257	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2258	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2259	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2260	}
				2261
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2262	/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
				2263	/// x double].
				2264	///
				2265	/// \headerfile <x86intrin.h>
				2266	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2267	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2268	///
				2269	/// \param __a
				2270	/// A 128-bit vector of [4 x float].
				2271	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2272	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2273	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2274	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2275	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2276	}
				2277
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2278	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
				2279	/// x i32], truncating the result by rounding towards zero when it is
				2280	/// inexact.
				2281	///
				2282	/// \headerfile <x86intrin.h>
				2283	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2284	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2285	///
				2286	/// \param __a
				2287	/// A 256-bit vector of [4 x double].
				2288	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2289	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2290	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2291	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2292	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2293	}
				2294
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2295	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
				2296	/// x i32]. When a conversion is inexact, the value returned is rounded
				2297	/// according to the rounding control bits in the MXCSR register.
				2298	///
				2299	/// \headerfile <x86intrin.h>
				2300	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2301	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2302	///
				2303	/// \param __a
				2304	/// A 256-bit vector of [4 x double].
				2305	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2306	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2307	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2308	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2309	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2310	}
				2311
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2312	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
				2313	/// truncating the result by rounding towards zero when it is inexact.
				2314	///
				2315	/// \headerfile <x86intrin.h>
				2316	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2317	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2318	///
				2319	/// \param __a
				2320	/// A 256-bit vector of [8 x float].
				2321	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2322	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2323	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2324	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2325	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2326	}
				2327
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2328	/// \brief Returns the first element of the input vector of [4 x double].
				2329	///
				2330	/// \headerfile <avxintrin.h>
				2331	///
				2332	/// This intrinsic is a utility function and does not correspond to a specific
				2333	/// instruction.
				2334	///
				2335	/// \param __a
				2336	/// A 256-bit vector of [4 x double].
				2337	/// \returns A 64 bit double containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2338	static __inline double __DEFAULT_FN_ATTRS
				2339	_mm256_cvtsd_f64(__m256d __a)
				2340	{
				2341	return __a[0];
				2342	}
				2343
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2344	/// \brief Returns the first element of the input vector of [8 x i32].
				2345	///
				2346	/// \headerfile <avxintrin.h>
				2347	///
				2348	/// This intrinsic is a utility function and does not correspond to a specific
				2349	/// instruction.
				2350	///
				2351	/// \param __a
				2352	/// A 256-bit vector of [8 x i32].
				2353	/// \returns A 32 bit integer containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2354	static __inline int __DEFAULT_FN_ATTRS
				2355	_mm256_cvtsi256_si32(__m256i __a)
				2356	{
				2357	__v8si __b = (__v8si)__a;
				2358	return __b[0];
				2359	}
				2360
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2361	/// \brief Returns the first element of the input vector of [8 x float].
				2362	///
				2363	/// \headerfile <avxintrin.h>
				2364	///
				2365	/// This intrinsic is a utility function and does not correspond to a specific
				2366	/// instruction.
				2367	///
				2368	/// \param __a
				2369	/// A 256-bit vector of [8 x float].
				2370	/// \returns A 32 bit float containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2371	static __inline float __DEFAULT_FN_ATTRS
				2372	_mm256_cvtss_f32(__m256 __a)
				2373	{
				2374	return __a[0];
				2375	}
				2376
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2377	/* Vector replicate */
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	2378	/// \brief Moves and duplicates odd-indexed values from a 256-bit vector of
				2379	/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2380	///
				2381	/// \headerfile <x86intrin.h>
				2382	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2383	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2384	///
				2385	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2386	/// A 256-bit vector of [8 x float]. \n
				2387	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
				2388	/// the return value. \n
				2389	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
				2390	/// the return value. \n
				2391	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
				2392	/// return value. \n
				2393	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
				2394	/// return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2395	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2396	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2397	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2398	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2399	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2400	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2401	}
				2402
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	2403	/// \brief Moves and duplicates even-indexed values from a 256-bit vector of
				2404	/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2405	///
				2406	/// \headerfile <x86intrin.h>
				2407	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2408	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2409	///
				2410	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2411	/// A 256-bit vector of [8 x float]. \n
				2412	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
				2413	/// the return value. \n
				2414	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
				2415	/// the return value. \n
				2416	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
				2417	/// return value. \n
				2418	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
				2419	/// return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2420	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2421	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2422	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2423	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2424	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2425	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2426	}
				2427
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2428	/// \brief Moves and duplicates double-precision floating point values from a
				2429	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
				2430	/// vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2431	///
				2432	/// \headerfile <x86intrin.h>
				2433	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2434	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2435	///
				2436	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2437	/// A 256-bit vector of [4 x double]. \n
				2438	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
				2439	/// return value. \n
				2440	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
				2441	/// the return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2442	/// \returns A 256-bit vector of [4 x double] containing the moved and
				2443	/// duplicated values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2444	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2445	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2446	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2447	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2448	}
				2449
				2450	/* Unpack and Interleave */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2451	/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
				2452	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2453	///
				2454	/// \headerfile <x86intrin.h>
				2455	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2456	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2457	///
				2458	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2459	/// A 256-bit floating-point vector of [4 x double]. \n
				2460	/// Bits [127:64] are written to bits [63:0] of the return value. \n
				2461	/// Bits [255:192] are written to bits [191:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2462	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2463	/// A 256-bit floating-point vector of [4 x double]. \n
				2464	/// Bits [127:64] are written to bits [127:64] of the return value. \n
				2465	/// Bits [255:192] are written to bits [255:192] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2466	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2467	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2468	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2469	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2470	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2471	}
				2472
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2473	/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
				2474	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2475	///
				2476	/// \headerfile <x86intrin.h>
				2477	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2478	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2479	///
				2480	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2481	/// A 256-bit floating-point vector of [4 x double]. \n
				2482	/// Bits [63:0] are written to bits [63:0] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2483	/// Bits [191:128] are written to bits [191:128] of the return value.
				2484	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2485	/// A 256-bit floating-point vector of [4 x double]. \n
				2486	/// Bits [63:0] are written to bits [127:64] of the return value. \n
				2487	/// Bits [191:128] are written to bits [255:192] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2488	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2489	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2490	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2491	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2492	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2493	}
				2494
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2495	/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
				2496	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2497	/// vector of [8 x float].
				2498	///
				2499	/// \headerfile <x86intrin.h>
				2500	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2501	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2502	///
				2503	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2504	/// A 256-bit vector of [8 x float]. \n
				2505	/// Bits [95:64] are written to bits [31:0] of the return value. \n
				2506	/// Bits [127:96] are written to bits [95:64] of the return value. \n
				2507	/// Bits [223:192] are written to bits [159:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2508	/// Bits [255:224] are written to bits [223:192] of the return value.
				2509	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2510	/// A 256-bit vector of [8 x float]. \n
				2511	/// Bits [95:64] are written to bits [63:32] of the return value. \n
				2512	/// Bits [127:96] are written to bits [127:96] of the return value. \n
				2513	/// Bits [223:192] are written to bits [191:160] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2514	/// Bits [255:224] are written to bits [255:224] of the return value.
				2515	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2516	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2517	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2518	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2519	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2520	}
				2521
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2522	/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
				2523	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2524	/// vector of [8 x float].
				2525	///
				2526	/// \headerfile <x86intrin.h>
				2527	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2528	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2529	///
				2530	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2531	/// A 256-bit vector of [8 x float]. \n
				2532	/// Bits [31:0] are written to bits [31:0] of the return value. \n
				2533	/// Bits [63:32] are written to bits [95:64] of the return value. \n
				2534	/// Bits [159:128] are written to bits [159:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2535	/// Bits [191:160] are written to bits [223:192] of the return value.
				2536	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2537	/// A 256-bit vector of [8 x float]. \n
				2538	/// Bits [31:0] are written to bits [63:32] of the return value. \n
				2539	/// Bits [63:32] are written to bits [127:96] of the return value. \n
				2540	/// Bits [159:128] are written to bits [191:160] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2541	/// Bits [191:160] are written to bits [255:224] of the return value.
				2542	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2543	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2544	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2545	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2546	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2547	}
				2548
				2549	/* Bit Test */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2550	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2551	/// element-by-element comparison of the double-precision element in the
				2552	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2553	/// vector.
				2554	///
				2555	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2556	/// If there is at least one pair of double-precision elements where the
				2557	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2558	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2559	/// If there is at least one pair of double-precision elements where the
				2560	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2561	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2562	/// This intrinsic returns the value of the ZF flag.
				2563	///
				2564	/// \headerfile <x86intrin.h>
				2565	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2566	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2567	///
				2568	/// \param __a
				2569	/// A 128-bit vector of [2 x double].
				2570	/// \param __b
				2571	/// A 128-bit vector of [2 x double].
				2572	/// \returns the ZF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2573	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2574	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2575	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2576	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2577	}
				2578
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2579	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2580	/// element-by-element comparison of the double-precision element in the
				2581	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2582	/// vector.
				2583	///
				2584	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2585	/// If there is at least one pair of double-precision elements where the
				2586	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2587	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2588	/// If there is at least one pair of double-precision elements where the
				2589	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2590	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2591	/// This intrinsic returns the value of the CF flag.
				2592	///
				2593	/// \headerfile <x86intrin.h>
				2594	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2595	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2596	///
				2597	/// \param __a
				2598	/// A 128-bit vector of [2 x double].
				2599	/// \param __b
				2600	/// A 128-bit vector of [2 x double].
				2601	/// \returns the CF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2602	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2603	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2604	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2605	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2606	}
				2607
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2608	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
				2609	/// element-by-element comparison of the double-precision element in the
				2610	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2611	/// vector.
				2612	///
				2613	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2614	/// If there is at least one pair of double-precision elements where the
				2615	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2616	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2617	/// If there is at least one pair of double-precision elements where the
				2618	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2619	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2620	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2621	/// otherwise it returns 0.
				2622	///
				2623	/// \headerfile <x86intrin.h>
				2624	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2625	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2626	///
				2627	/// \param __a
				2628	/// A 128-bit vector of [2 x double].
				2629	/// \param __b
				2630	/// A 128-bit vector of [2 x double].
				2631	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2632	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2633	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2634	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2635	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2636	}
				2637
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2638	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2639	/// element-by-element comparison of the single-precision element in the
				2640	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2641	/// vector.
				2642	///
				2643	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2644	/// If there is at least one pair of single-precision elements where the
				2645	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2646	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2647	/// If there is at least one pair of single-precision elements where the
				2648	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2649	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2650	/// This intrinsic returns the value of the ZF flag.
				2651	///
				2652	/// \headerfile <x86intrin.h>
				2653	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2654	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2655	///
				2656	/// \param __a
				2657	/// A 128-bit vector of [4 x float].
				2658	/// \param __b
				2659	/// A 128-bit vector of [4 x float].
				2660	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2661	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2662	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2663	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2664	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2665	}
				2666
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2667	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2668	/// element-by-element comparison of the single-precision element in the
				2669	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2670	/// vector.
				2671	///
				2672	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2673	/// If there is at least one pair of single-precision elements where the
				2674	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2675	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2676	/// If there is at least one pair of single-precision elements where the
				2677	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2678	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2679	/// This intrinsic returns the value of the CF flag.
				2680	///
				2681	/// \headerfile <x86intrin.h>
				2682	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2683	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2684	///
				2685	/// \param __a
				2686	/// A 128-bit vector of [4 x float].
				2687	/// \param __b
				2688	/// A 128-bit vector of [4 x float].
				2689	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2690	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2691	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2692	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2693	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2694	}
				2695
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2696	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
				2697	/// element-by-element comparison of the single-precision element in the
				2698	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2699	/// vector.
				2700	///
				2701	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2702	/// If there is at least one pair of single-precision elements where the
				2703	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2704	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2705	/// If there is at least one pair of single-precision elements where the
				2706	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2707	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2708	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2709	/// otherwise it returns 0.
				2710	///
				2711	/// \headerfile <x86intrin.h>
				2712	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2713	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2714	///
				2715	/// \param __a
				2716	/// A 128-bit vector of [4 x float].
				2717	/// \param __b
				2718	/// A 128-bit vector of [4 x float].
				2719	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2720	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2721	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2722	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2723	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2724	}
				2725
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2726	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2727	/// element-by-element comparison of the double-precision elements in the
				2728	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2729	/// vector.
				2730	///
				2731	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2732	/// If there is at least one pair of double-precision elements where the
				2733	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2734	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2735	/// If there is at least one pair of double-precision elements where the
				2736	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2737	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2738	/// This intrinsic returns the value of the ZF flag.
				2739	///
				2740	/// \headerfile <x86intrin.h>
				2741	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2742	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2743	///
				2744	/// \param __a
				2745	/// A 256-bit vector of [4 x double].
				2746	/// \param __b
				2747	/// A 256-bit vector of [4 x double].
				2748	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2749	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2750	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2751	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2752	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2753	}
				2754
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2755	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2756	/// element-by-element comparison of the double-precision elements in the
				2757	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2758	/// vector.
				2759	///
				2760	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2761	/// If there is at least one pair of double-precision elements where the
				2762	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2763	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2764	/// If there is at least one pair of double-precision elements where the
				2765	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2766	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2767	/// This intrinsic returns the value of the CF flag.
				2768	///
				2769	/// \headerfile <x86intrin.h>
				2770	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2771	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2772	///
				2773	/// \param __a
				2774	/// A 256-bit vector of [4 x double].
				2775	/// \param __b
				2776	/// A 256-bit vector of [4 x double].
				2777	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2778	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2779	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2780	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2781	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2782	}
				2783
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2784	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
				2785	/// element-by-element comparison of the double-precision elements in the
				2786	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2787	/// vector.
				2788	///
				2789	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2790	/// If there is at least one pair of double-precision elements where the
				2791	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2792	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2793	/// If there is at least one pair of double-precision elements where the
				2794	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2795	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2796	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2797	/// otherwise it returns 0.
				2798	///
				2799	/// \headerfile <x86intrin.h>
				2800	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2801	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2802	///
				2803	/// \param __a
				2804	/// A 256-bit vector of [4 x double].
				2805	/// \param __b
				2806	/// A 256-bit vector of [4 x double].
				2807	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2808	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2809	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2810	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2811	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2812	}
				2813
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2814	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2815	/// element-by-element comparison of the single-precision element in the
				2816	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2817	/// vector.
				2818	///
				2819	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2820	/// If there is at least one pair of single-precision elements where the
				2821	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2822	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2823	/// If there is at least one pair of single-precision elements where the
				2824	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2825	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2826	/// This intrinsic returns the value of the ZF flag.
				2827	///
				2828	/// \headerfile <x86intrin.h>
				2829	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2830	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2831	///
				2832	/// \param __a
				2833	/// A 256-bit vector of [8 x float].
				2834	/// \param __b
				2835	/// A 256-bit vector of [8 x float].
				2836	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2837	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2838	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2839	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2840	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2841	}
				2842
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2843	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2844	/// element-by-element comparison of the single-precision element in the
				2845	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2846	/// vector.
				2847	///
				2848	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2849	/// If there is at least one pair of single-precision elements where the
				2850	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2851	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2852	/// If there is at least one pair of single-precision elements where the
				2853	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2854	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2855	/// This intrinsic returns the value of the CF flag.
				2856	///
				2857	/// \headerfile <x86intrin.h>
				2858	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2859	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2860	///
				2861	/// \param __a
				2862	/// A 256-bit vector of [8 x float].
				2863	/// \param __b
				2864	/// A 256-bit vector of [8 x float].
				2865	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2866	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2867	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2868	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2869	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2870	}
				2871
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2872	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
				2873	/// element-by-element comparison of the single-precision elements in the
				2874	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2875	/// vector.
				2876	///
				2877	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2878	/// If there is at least one pair of single-precision elements where the
				2879	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2880	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2881	/// If there is at least one pair of single-precision elements where the
				2882	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2883	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2884	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2885	/// otherwise it returns 0.
				2886	///
				2887	/// \headerfile <x86intrin.h>
				2888	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2889	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2890	///
				2891	/// \param __a
				2892	/// A 256-bit vector of [8 x float].
				2893	/// \param __b
				2894	/// A 256-bit vector of [8 x float].
				2895	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2896	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2897	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2898	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2899	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2900	}
				2901
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2902	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2903	/// of the two source vectors.
				2904	///
				2905	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2906	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2907	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2908	/// If there is at least one pair of bits where the bit from the first source
				2909	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2910	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2911	/// This intrinsic returns the value of the ZF flag.
				2912	///
				2913	/// \headerfile <x86intrin.h>
				2914	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2915	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2916	///
				2917	/// \param __a
				2918	/// A 256-bit integer vector.
				2919	/// \param __b
				2920	/// A 256-bit integer vector.
				2921	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2922	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2923	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2924	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2925	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2926	}
				2927
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2928	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2929	/// of the two source vectors.
				2930	///
				2931	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2932	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2933	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2934	/// If there is at least one pair of bits where the bit from the first source
				2935	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2936	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2937	/// This intrinsic returns the value of the CF flag.
				2938	///
				2939	/// \headerfile <x86intrin.h>
				2940	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2941	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2942	///
				2943	/// \param __a
				2944	/// A 256-bit integer vector.
				2945	/// \param __b
				2946	/// A 256-bit integer vector.
				2947	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2948	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2949	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2950	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2951	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2952	}
				2953
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2954	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2955	/// of the two source vectors.
				2956	///
				2957	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2958	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2959	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2960	/// If there is at least one pair of bits where the bit from the first source
				2961	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2962	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2963	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2964	/// otherwise it returns 0.
				2965	///
				2966	/// \headerfile <x86intrin.h>
				2967	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2968	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2969	///
				2970	/// \param __a
				2971	/// A 256-bit integer vector.
				2972	/// \param __b
				2973	/// A 256-bit integer vector.
				2974	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2975	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2976	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2977	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2978	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2979	}
				2980
				2981	/* Vector extract sign mask */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2982	/// \brief Extracts the sign bits of double-precision floating point elements
				2983	/// in a 256-bit vector of [4 x double] and writes them to the lower order
				2984	/// bits of the return value.
				2985	///
				2986	/// \headerfile <x86intrin.h>
				2987	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2988	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2989	///
				2990	/// \param __a
				2991	/// A 256-bit vector of [4 x double] containing the double-precision
				2992	/// floating point values with sign bits to be extracted.
				2993	/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2994	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2995	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2996	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2997	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2998	}
				2999
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	3000	/// \brief Extracts the sign bits of single-precision floating point elements
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3001	/// in a 256-bit vector of [8 x float] and writes them to the lower order
				3002	/// bits of the return value.
				3003	///
				3004	/// \headerfile <x86intrin.h>
				3005	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3006	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3007	///
				3008	/// \param __a
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame^]	3009	/// A 256-bit vector of [8 x float] containing the single-precision floating
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3010	/// point values with sign bits to be extracted.
				3011	/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3012	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3013	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3014	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3015	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3016	}
				3017
/* Vector zero */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3019	/// \brief Zeroes the contents of all XMM or YMM registers.
				3020	///
				3021	/// \headerfile <x86intrin.h>
				3022	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3023	/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3024	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3025	_mm256_zeroall(void)
				3026	{
				3027	__builtin_ia32_vzeroall();
				3028	}
				3029
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3030	/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
				3031	///
				3032	/// \headerfile <x86intrin.h>
				3033	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3034	/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3035	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3036	_mm256_zeroupper(void)
				3037	{
				3038	__builtin_ia32_vzeroupper();
				3039	}
				3040
				3041	/* Vector load with broadcast */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3042	/// \brief Loads a scalar single-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3043	/// specified address pointed to by \a __a and broadcasts it to the elements
				3044	/// of a [4 x float] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3045	///
				3046	/// \headerfile <x86intrin.h>
				3047	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3048	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3049	///
				3050	/// \param __a
				3051	/// The single-precision floating point value to be broadcast.
				3052	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
				3053	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3054	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3055	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3056	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3057	float __f = *__a;
				3058	return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3059	}
				3060
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3061	/// \brief Loads a scalar double-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3062	/// specified address pointed to by \a __a and broadcasts it to the elements
				3063	/// of a [4 x double] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3064	///
				3065	/// \headerfile <x86intrin.h>
				3066	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3067	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3068	///
				3069	/// \param __a
				3070	/// The double-precision floating point value to be broadcast.
				3071	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
				3072	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3073	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3074	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3075	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3076	double __d = *__a;
				3077	return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3078	}
				3079
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3080	/// \brief Loads a scalar single-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3081	/// specified address pointed to by \a __a and broadcasts it to the elements
				3082	/// of a [8 x float] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3083	///
				3084	/// \headerfile <x86intrin.h>
				3085	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3086	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3087	///
				3088	/// \param __a
				3089	/// The single-precision floating point value to be broadcast.
				3090	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
				3091	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3092	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3093	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3094	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3095	float __f = *__a;
				3096	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3097	}
				3098
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3099	/// \brief Loads the data from a 128-bit vector of [2 x double] from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3100	/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3101	/// elements in a 256-bit vector of [4 x double].
				3102	///
				3103	/// \headerfile <x86intrin.h>
				3104	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3105	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3106	///
				3107	/// \param __a
				3108	/// The 128-bit vector of [2 x double] to be broadcast.
				3109	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
				3110	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3111	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3112	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3113	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	3114	return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3115	}
				3116
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3117	/// \brief Loads the data from a 128-bit vector of [4 x float] from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3118	/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3119	/// elements in a 256-bit vector of [8 x float].
				3120	///
				3121	/// \headerfile <x86intrin.h>
				3122	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3123	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3124	///
				3125	/// \param __a
				3126	/// The 128-bit vector of [4 x float] to be broadcast.
				3127	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
				3128	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3129	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3130	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3131	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	3132	return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3133	}
				3134
				3135	/* SIMD load ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3136	/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3137	/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3138	///
				3139	/// \headerfile <x86intrin.h>
				3140	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3141	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3142	///
				3143	/// \param __p
				3144	/// A 32-byte aligned pointer to a memory location containing
				3145	/// double-precision floating point values.
				3146	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3147	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3148	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3149	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3150	return (__m256d )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3151	}
				3152
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3153	/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3154	/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3155	///
				3156	/// \headerfile <x86intrin.h>
				3157	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3158	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3159	///
				3160	/// \param __p
				3161	/// A 32-byte aligned pointer to a memory location containing float values.
				3162	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3163	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3164	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3165	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3166	return (__m256 )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3167	}
				3168
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3169	/// \brief Loads 4 double-precision floating point values from an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3170	/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3171	///
				3172	/// \headerfile <x86intrin.h>
				3173	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3174	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3175	///
				3176	/// \param __p
				3177	/// A pointer to a memory location containing double-precision floating
				3178	/// point values.
				3179	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3180	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3181	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3182	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3183	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3184	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3185	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3186	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3187	}
				3188
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3189	/// \brief Loads 8 single-precision floating point values from an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3190	/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3191	///
				3192	/// \headerfile <x86intrin.h>
				3193	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3194	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3195	///
				3196	/// \param __p
				3197	/// A pointer to a memory location containing single-precision floating
				3198	/// point values.
				3199	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3200	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3201	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3202	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3203	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3204	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3205	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3206	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3207	}
				3208
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3209	/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3210	/// location pointed to by \a __p into elements of a 256-bit integer vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3211	///
				3212	/// \headerfile <x86intrin.h>
				3213	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3214	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3215	///
				3216	/// \param __p
				3217	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
				3218	/// values.
				3219	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3220	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3221	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3222	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3223	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3224	}
				3225
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3226	/// \brief Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3227	/// pointed to by \a __p into a 256-bit integer vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3228	///
				3229	/// \headerfile <x86intrin.h>
				3230	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3231	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3232	///
				3233	/// \param __p
				3234	/// A pointer to a 256-bit integer vector containing integer values.
				3235	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3236	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3237	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3238	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3239	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3240	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3241	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3242	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3243	}
				3244
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3245	/// \brief Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3246	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
				3247	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3248	/// line boundary.
				3249	///
				3250	/// \headerfile <x86intrin.h>
				3251	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3252	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3253	///
				3254	/// \param __p
				3255	/// A pointer to a 256-bit integer vector containing integer values.
				3256	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3257	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3258	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3259	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3260	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3261	}
				3262
				3263	/* SIMD store ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3264	/// \brief Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3265	/// of [4 x double] to a 32-byte aligned memory location pointed to by
				3266	/// \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3267	///
				3268	/// \headerfile <x86intrin.h>
				3269	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3270	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3271	///
				3272	/// \param __p
				3273	/// A 32-byte aligned pointer to a memory location that will receive the
				3274	/// double-precision floating point values.
				3275	/// \param __a
				3276	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3277	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3278	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3279	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3280	*(__m256d *)__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3281	}
				3282
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3283	/// \brief Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3284	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3285	///
				3286	/// \headerfile <x86intrin.h>
				3287	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3288	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3289	///
				3290	/// \param __p
				3291	/// A 32-byte aligned pointer to a memory location that will receive the
				3292	/// float values.
				3293	/// \param __a
				3294	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3295	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3296	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3297	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3298	*(__m256 *)__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3299	}
				3300
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3301	/// \brief Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3302	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3303	///
				3304	/// \headerfile <x86intrin.h>
				3305	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3306	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3307	///
				3308	/// \param __p
				3309	/// A pointer to a memory location that will receive the double-precision
				3310	/// floating point values.
				3311	/// \param __a
				3312	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3313	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3314	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3315	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3316	struct __storeu_pd {
				3317	__m256d __v;
				3318	} __attribute__((__packed__, __may_alias__));
				3319	((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3320	}
				3321
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3322	/// \brief Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3323	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3324	///
				3325	/// \headerfile <x86intrin.h>
				3326	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3327	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3328	///
				3329	/// \param __p
				3330	/// A pointer to a memory location that will receive the float values.
				3331	/// \param __a
				3332	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3333	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3334	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3335	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3336	struct __storeu_ps {
				3337	__m256 __v;
				3338	} __attribute__((__packed__, __may_alias__));
				3339	((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3340	}
				3341
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3342	/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3343	/// aligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3344	///
				3345	/// \headerfile <x86intrin.h>
				3346	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3347	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3348	///
				3349	/// \param __p
				3350	/// A 32-byte aligned pointer to a memory location that will receive the
				3351	/// integer values.
				3352	/// \param __a
				3353	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3354	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3355	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3356	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3357	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3358	}
				3359
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3360	/// \brief Stores integer values from a 256-bit integer vector to an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3361	/// memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3362	///
				3363	/// \headerfile <x86intrin.h>
				3364	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3365	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3366	///
				3367	/// \param __p
				3368	/// A pointer to a memory location that will receive the integer values.
				3369	/// \param __a
				3370	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3371	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3372	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3373	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3374	struct __storeu_si256 {
				3375	__m256i __v;
				3376	} __attribute__((__packed__, __may_alias__));
				3377	((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3378	}
				3379
				3380	/* Conditional load ops */
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3381	/// \brief Conditionally loads double-precision floating point elements from a
				3382	/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3383	/// [2 x double], depending on the mask bits associated with each data
				3384	/// element.
				3385	///
				3386	/// \headerfile <x86intrin.h>
				3387	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3388	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3389	///
				3390	/// \param __p
				3391	/// A pointer to a memory location that contains the double-precision
				3392	/// floating point values.
				3393	/// \param __m
				3394	/// A 128-bit integer vector containing the mask. The most significant bit of
				3395	/// each data element represents the mask bits. If a mask bit is zero, the
				3396	/// corresponding value in the memory location is not loaded and the
				3397	/// corresponding field in the return value is set to zero.
				3398	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3399	static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3400	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3401	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3402	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3403	}
				3404
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3405	/// \brief Conditionally loads double-precision floating point elements from a
				3406	/// memory location pointed to by \a __p into a 256-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3407	/// [4 x double], depending on the mask bits associated with each data
				3408	/// element.
				3409	///
				3410	/// \headerfile <x86intrin.h>
				3411	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3412	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3413	///
				3414	/// \param __p
				3415	/// A pointer to a memory location that contains the double-precision
				3416	/// floating point values.
				3417	/// \param __m
				3418	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3419	/// significant bit of each quadword element represents the mask bits. If a
				3420	/// mask bit is zero, the corresponding value in the memory location is not
				3421	/// loaded and the corresponding field in the return value is set to zero.
				3422	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3423	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3424	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3425	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3426	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3427	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3428	}
				3429
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3430	/// \brief Conditionally loads single-precision floating point elements from a
				3431	/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3432	/// [4 x float], depending on the mask bits associated with each data
				3433	/// element.
				3434	///
				3435	/// \headerfile <x86intrin.h>
				3436	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3437	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3438	///
				3439	/// \param __p
				3440	/// A pointer to a memory location that contains the single-precision
				3441	/// floating point values.
				3442	/// \param __m
				3443	/// A 128-bit integer vector containing the mask. The most significant bit of
				3444	/// each data element represents the mask bits. If a mask bit is zero, the
				3445	/// corresponding value in the memory location is not loaded and the
				3446	/// corresponding field in the return value is set to zero.
				3447	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3448	static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3449	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3450	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3451	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3452	}
				3453
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3454	/// \brief Conditionally loads single-precision floating point elements from a
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3455	/// memory location pointed to by \a __p into a 256-bit vector of
				3456	/// [8 x float], depending on the mask bits associated with each data
				3457	/// element.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3458	///
				3459	/// \headerfile <x86intrin.h>
				3460	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3461	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3462	///
				3463	/// \param __p
				3464	/// A pointer to a memory location that contains the single-precision
				3465	/// floating point values.
				3466	/// \param __m
				3467	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3468	/// significant bit of each dword element represents the mask bits. If a mask
				3469	/// bit is zero, the corresponding value in the memory location is not loaded
				3470	/// and the corresponding field in the return value is set to zero.
				3471	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3472	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3473	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3474	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3475	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3476	}
				3477
				3478	/* Conditional store ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3479	/// \brief Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3480	/// of [8 x float] to a memory location pointed to by \a __p, according to
				3481	/// the specified mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3482	///
				3483	/// \headerfile <x86intrin.h>
				3484	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3485	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3486	///
				3487	/// \param __p
				3488	/// A pointer to a memory location that will receive the float values.
				3489	/// \param __m
				3490	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3491	/// significant bit of each dword element in the mask vector represents the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3492	/// mask bits. If a mask bit is zero, the corresponding value from vector
				3493	/// \a __a is not stored and the corresponding field in the memory location
				3494	/// pointed to by \a __p is not changed.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3495	/// \param __a
				3496	/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3497	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3498	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3499	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3500	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3501	}
				3502
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3503	/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3504	/// to a memory location pointed to by \a __p, according to the specified
				3505	/// mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3506	///
				3507	/// \headerfile <x86intrin.h>
				3508	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3509	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3510	///
				3511	/// \param __p
				3512	/// A pointer to a memory location that will receive double-precision values.
				3513	/// \param __m
				3514	/// A 128-bit integer vector containing the mask. The most significant bit of
				3515	/// each field in the mask vector represents the mask bits. If a mask bit is
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3516	/// zero, the corresponding value from vector \a __a is not stored and the
				3517	/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3518	/// changed.
				3519	/// \param __a
				3520	/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3521	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3522	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3523	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3524	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3525	}
				3526
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3527	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3528	/// to a memory location pointed to by \a __p, according to the specified
				3529	/// mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3530	///
				3531	/// \headerfile <x86intrin.h>
				3532	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3533	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3534	///
				3535	/// \param __p
				3536	/// A pointer to a memory location that will receive double-precision values.
				3537	/// \param __m
				3538	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3539	/// significant bit of each quadword element in the mask vector represents
				3540	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3541	/// \a __a is not stored and the corresponding field in the memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3542	/// pointed to by \a __p is not changed.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3543	/// \param __a
				3544	/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3545	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3546	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3547	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3548	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3549	}
				3550
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3551	/// \brief Moves single-precision floating point values from a 128-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3552	/// of [4 x float] to a memory location pointed to by \a __p, according to
				3553	/// the specified mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3554	///
				3555	/// \headerfile <x86intrin.h>
				3556	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3557	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3558	///
				3559	/// \param __p
				3560	/// A pointer to a memory location that will receive the float values.
				3561	/// \param __m
				3562	/// A 128-bit integer vector containing the mask. The most significant bit of
				3563	/// each field in the mask vector represents the mask bits. If a mask bit is
				3564	/// zero, the corresponding value from vector \a __a is not stored and the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3565	/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3566	/// changed.
				3567	/// \param __a
				3568	/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3569	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3570	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3571	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3572	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3573	}
				3574
				3575	/* Cacheability support ops */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3576	/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
				3577	/// aligned memory location. To minimize caching, the data is flagged as
				3578	/// non-temporal (unlikely to be used again soon).
				3579	///
				3580	/// \headerfile <x86intrin.h>
				3581	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3582	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3583	///
				3584	/// \param __a
				3585	/// A pointer to a 32-byte aligned memory location that will receive the
				3586	/// integer values.
				3587	/// \param __b
				3588	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3589	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3590	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3591	{
Simon Pilgrim	c14865c	2017-07-29 15:33:34 +0000	[diff] [blame]	3592	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
				3593	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3594	}
				3595
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3596	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
				3597	/// to a 32-byte aligned memory location. To minimize caching, the data is
				3598	/// flagged as non-temporal (unlikely to be used again soon).
				3599	///
				3600	/// \headerfile <x86intrin.h>
				3601	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3602	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3603	///
				3604	/// \param __a
				3605	/// A pointer to a 32-byte aligned memory location that will receive the
Ekaterina Romanova	cb3603a	2017-06-06 22:58:01 +0000	[diff] [blame]	3606	/// double-precision floating-point values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3607	/// \param __b
				3608	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3609	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3610	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3611	{
Simon Pilgrim	c14865c	2017-07-29 15:33:34 +0000	[diff] [blame]	3612	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
				3613	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3614	}
				3615
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3616	/// \brief Moves single-precision floating point values from a 256-bit vector
				3617	/// of [8 x float] to a 32-byte aligned memory location. To minimize
				3618	/// caching, the data is flagged as non-temporal (unlikely to be used again
				3619	/// soon).
				3620	///
				3621	/// \headerfile <x86intrin.h>
				3622	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3623	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3624	///
				3625	/// \param __p
				3626	/// A pointer to a 32-byte aligned memory location that will receive the
				3627	/// single-precision floating point values.
				3628	/// \param __a
				3629	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3630	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3631	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3632	{
Simon Pilgrim	c14865c	2017-07-29 15:33:34 +0000	[diff] [blame]	3633	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
				3634	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3635	}
				3636
				3637	/* Create vectors */
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3638	/// \brief Create a 256-bit vector of [4 x double] with undefined values.
				3639	///
				3640	/// \headerfile <x86intrin.h>
				3641	///
				3642	/// This intrinsic has no corresponding instruction.
				3643	///
				3644	/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3645	static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3646	_mm256_undefined_pd(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3647	{
				3648	return (__m256d)__builtin_ia32_undef256(); /* builtin result is only cast to __m256d; its contents are documented as undefined, so callers must overwrite it before reading */
				3649	}
				3650
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3651	/// \brief Create a 256-bit vector of [8 x float] with undefined values.
				3652	///
				3653	/// \headerfile <x86intrin.h>
				3654	///
				3655	/// This intrinsic has no corresponding instruction.
				3656	///
				3657	/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3658	static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3659	_mm256_undefined_ps(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3660	{
				3661	return (__m256)__builtin_ia32_undef256(); /* same builtin as the pd/si256 variants, cast to __m256; contents are documented as undefined */
				3662	}
				3663
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3664	/// \brief Create a 256-bit integer vector with undefined values.
				3665	///
				3666	/// \headerfile <x86intrin.h>
				3667	///
				3668	/// This intrinsic has no corresponding instruction.
				3669	///
				3670	/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3671	static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3672	_mm256_undefined_si256(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3673	{
				3674	return (__m256i)__builtin_ia32_undef256(); /* same builtin as the pd/ps variants, cast to __m256i; contents are documented as undefined */
				3675	}
				3676
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3677	/// \brief Constructs a 256-bit floating-point vector of [4 x double]
				3678	/// initialized with the specified double-precision floating-point values.
				3679	///
				3680	/// \headerfile <x86intrin.h>
				3681	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3682	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3683	/// instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3684	///
				3685	/// \param __a
				3686	/// A double-precision floating-point value used to initialize bits [255:192]
				3687	/// of the result.
				3688	/// \param __b
				3689	/// A double-precision floating-point value used to initialize bits [191:128]
				3690	/// of the result.
				3691	/// \param __c
				3692	/// A double-precision floating-point value used to initialize bits [127:64]
				3693	/// of the result.
				3694	/// \param __d
				3695	/// A double-precision floating-point value used to initialize bits [63:0]
				3696	/// of the result.
				3697	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3698	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3699	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3700	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3701	return (__m256d){ __d, __c, __b, __a }; /* initializer lists element 0 (bits [63:0]) first, so the arguments appear reversed: __d lowest, __a highest */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3702	}
				3703
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3704	/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
				3705	/// with the specified single-precision floating-point values.
				3706	///
				3707	/// \headerfile <x86intrin.h>
				3708	///
				3709	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3710	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3711	///
				3712	/// \param __a
				3713	/// A single-precision floating-point value used to initialize bits [255:224]
				3714	/// of the result.
				3715	/// \param __b
				3716	/// A single-precision floating-point value used to initialize bits [223:192]
				3717	/// of the result.
				3718	/// \param __c
				3719	/// A single-precision floating-point value used to initialize bits [191:160]
				3720	/// of the result.
				3721	/// \param __d
				3722	/// A single-precision floating-point value used to initialize bits [159:128]
				3723	/// of the result.
				3724	/// \param __e
				3725	/// A single-precision floating-point value used to initialize bits [127:96]
				3726	/// of the result.
				3727	/// \param __f
				3728	/// A single-precision floating-point value used to initialize bits [95:64]
				3729	/// of the result.
				3730	/// \param __g
				3731	/// A single-precision floating-point value used to initialize bits [63:32]
				3732	/// of the result.
				3733	/// \param __h
				3734	/// A single-precision floating-point value used to initialize bits [31:0]
				3735	/// of the result.
				3736	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3737	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3738	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3739	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3740	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3741	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; /* initializer lists element 0 (bits [31:0]) first, so the arguments appear reversed: __h lowest, __a highest */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3742	}
				3743
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3744	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3745	/// 32-bit integral values.
				3746	///
				3747	/// \headerfile <x86intrin.h>
				3748	///
				3749	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3750	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3751	///
				3752	/// \param __i0
				3753	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3754	/// \param __i1
				3755	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3756	/// \param __i2
				3757	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3758	/// \param __i3
				3759	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3760	/// \param __i4
				3761	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3762	/// \param __i5
				3763	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3764	/// \param __i6
				3765	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3766	/// \param __i7
				3767	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3768	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3769	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3770	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3771	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3772	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3773	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; /* built lane-by-lane as __v8si, then cast to __m256i; element 0 (bits [31:0]) comes first, so __i7 is lowest and __i0 highest */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3774	}
				3775
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3776	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3777	/// 16-bit integral values.
				3778	///
				3779	/// \headerfile <x86intrin.h>
				3780	///
				3781	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3782	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3783	///
				3784	/// \param __w15
				3785	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3786	/// \param __w14
				3787	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3788	/// \param __w13
				3789	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3790	/// \param __w12
				3791	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3792	/// \param __w11
				3793	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3794	/// \param __w10
				3795	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3796	/// \param __w09
				3797	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3798	/// \param __w08
				3799	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3800	/// \param __w07
				3801	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3802	/// \param __w06
				3803	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3804	/// \param __w05
				3805	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3806	/// \param __w04
				3807	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3808	/// \param __w03
				3809	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3810	/// \param __w02
				3811	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3812	/// \param __w01
				3813	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3814	/// \param __w00
				3815	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3816	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3817	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3818	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3819	short __w11, short __w10, short __w09, short __w08,
				3820	short __w07, short __w06, short __w05, short __w04,
				3821	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3822	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3823	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, /* element 0 (bits [15:0]) first: the last parameter __w00 is lowest, the first parameter __w15 highest */
				3824	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3825	}
				3826
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3827	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3828	/// 8-bit integral values.
				3829	///
				3830	/// \headerfile <x86intrin.h>
				3831	///
				3832	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3833	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3834	///
				3835	/// \param __b31
				3836	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3837	/// \param __b30
				3838	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3839	/// \param __b29
				3840	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3841	/// \param __b28
				3842	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3843	/// \param __b27
				3844	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3845	/// \param __b26
				3846	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3847	/// \param __b25
				3848	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3849	/// \param __b24
				3850	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3851	/// \param __b23
				3852	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3853	/// \param __b22
				3854	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3855	/// \param __b21
				3856	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3857	/// \param __b20
				3858	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3859	/// \param __b19
				3860	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3861	/// \param __b18
				3862	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3863	/// \param __b17
				3864	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3865	/// \param __b16
				3866	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3867	/// \param __b15
				3868	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3869	/// \param __b14
				3870	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3871	/// \param __b13
				3872	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3873	/// \param __b12
				3874	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3875	/// \param __b11
				3876	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3877	/// \param __b10
				3878	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3879	/// \param __b09
				3880	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3881	/// \param __b08
				3882	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3883	/// \param __b07
				3884	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3885	/// \param __b06
				3886	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3887	/// \param __b05
				3888	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3889	/// \param __b04
				3890	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3891	/// \param __b03
				3892	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3893	/// \param __b02
				3894	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3895	/// \param __b01
				3896	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3897	/// \param __b00
				3898	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3899	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3900	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3901	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3902	char __b27, char __b26, char __b25, char __b24,
				3903	char __b23, char __b22, char __b21, char __b20,
				3904	char __b19, char __b18, char __b17, char __b16,
				3905	char __b15, char __b14, char __b13, char __b12,
				3906	char __b11, char __b10, char __b09, char __b08,
				3907	char __b07, char __b06, char __b05, char __b04,
				3908	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3909	{
				3910	return (__m256i)(__v32qi){ /* element 0 (bits [7:0]) first: the last parameter __b00 is lowest, the first parameter __b31 highest */
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3911	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				3912	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				3913	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				3914	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3915	};
				3916	}
				3917
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3918	/// \brief Constructs a 256-bit integer vector initialized with the specified
				3919	/// 64-bit integral values.
				3920	///
				3921	/// \headerfile <x86intrin.h>
				3922	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3923	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				3924	/// instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3925	///
				3926	/// \param __a
				3927	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3928	/// \param __b
				3929	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3930	/// \param __c
				3931	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3932	/// \param __d
				3933	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3934	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3935	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3936	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3937	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3938	return (__m256i)(__v4di){ __d, __c, __b, __a }; /* built lane-by-lane as __v4di, then cast to __m256i; element 0 (bits [63:0]) comes first, so __d is lowest and __a highest */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3939	}
				3940
				3941	/* Create vectors with elements in reverse order */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3942	/// \brief Constructs a 256-bit floating-point vector of [4 x double],
				3943	/// initialized in reverse order with the specified double-precision
				3944	/// floating-point values.
				3945	///
				3946	/// \headerfile <x86intrin.h>
				3947	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3948	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3949	/// instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3950	///
				3951	/// \param __a
				3952	/// A double-precision floating-point value used to initialize bits [63:0]
				3953	/// of the result.
				3954	/// \param __b
				3955	/// A double-precision floating-point value used to initialize bits [127:64]
				3956	/// of the result.
				3957	/// \param __c
				3958	/// A double-precision floating-point value used to initialize bits [191:128]
				3959	/// of the result.
				3960	/// \param __d
				3961	/// A double-precision floating-point value used to initialize bits [255:192]
				3962	/// of the result.
				3963	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3964	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3965	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3966	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3967	return (__m256d){ __a, __b, __c, __d }; /* "r" variant: arguments are already in element order, so __a lands in element 0 (bits [63:0]) and __d in the top element */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3968	}
				3969
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3970	/// \brief Constructs a 256-bit floating-point vector of [8 x float],
				3971	/// initialized in reverse order with the specified single-precision
				3972	/// floating-point values.
				3973	///
				3974	/// \headerfile <x86intrin.h>
				3975	///
				3976	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3977	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3978	///
				3979	/// \param __a
				3980	/// A single-precision floating-point value used to initialize bits [31:0]
				3981	/// of the result.
				3982	/// \param __b
				3983	/// A single-precision floating-point value used to initialize bits [63:32]
				3984	/// of the result.
				3985	/// \param __c
				3986	/// A single-precision floating-point value used to initialize bits [95:64]
				3987	/// of the result.
				3988	/// \param __d
				3989	/// A single-precision floating-point value used to initialize bits [127:96]
				3990	/// of the result.
				3991	/// \param __e
				3992	/// A single-precision floating-point value used to initialize bits [159:128]
				3993	/// of the result.
				3994	/// \param __f
				3995	/// A single-precision floating-point value used to initialize bits [191:160]
				3996	/// of the result.
				3997	/// \param __g
				3998	/// A single-precision floating-point value used to initialize bits [223:192]
				3999	/// of the result.
				4000	/// \param __h
				4001	/// A single-precision floating-point value used to initialize bits [255:224]
				4002	/// of the result.
				4003	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4004	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4005	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4006	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4007	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4008	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; /* "r" variant: arguments are already in element order, so __a lands in element 0 (bits [31:0]) and __h in the top element */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4009	}
				4010
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4011	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				4012	/// with the specified 32-bit integral values.
				4013	///
				4014	/// \headerfile <x86intrin.h>
				4015	///
				4016	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4017	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4018	///
				4019	/// \param __i0
				4020	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				4021	/// \param __i1
				4022	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				4023	/// \param __i2
				4024	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				4025	/// \param __i3
				4026	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				4027	/// \param __i4
				4028	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				4029	/// \param __i5
				4030	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				4031	/// \param __i6
				4032	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				4033	/// \param __i7
				4034	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				4035	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4036	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4037	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4038	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4039	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4040	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; /* "r" variant: arguments are already in element order, so __i0 lands in element 0 (bits [31:0]); result is cast from __v8si to __m256i */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4041	}
				4042
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4043	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				4044	/// with the specified 16-bit integral values.
				4045	///
				4046	/// \headerfile <x86intrin.h>
				4047	///
				4048	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4049	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4050	///
				4051	/// \param __w15
				4052	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				4053	/// \param __w14
				4054	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				4055	/// \param __w13
				4056	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				4057	/// \param __w12
				4058	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				4059	/// \param __w11
				4060	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				4061	/// \param __w10
				4062	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				4063	/// \param __w09
				4064	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				4065	/// \param __w08
				4066	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				4067	/// \param __w07
				4068	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				4069	/// \param __w06
				4070	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				4071	/// \param __w05
				4072	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				4073	/// \param __w04
				4074	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				4075	/// \param __w03
				4076	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				4077	/// \param __w02
				4078	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				4079	/// \param __w01
				4080	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				4081	/// \param __w00
				4082	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				4083	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4084	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4085	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4086	short __w11, short __w10, short __w09, short __w08,
				4087	short __w07, short __w06, short __w05, short __w04,
				4088	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4089	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4090	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, /* parameter names count down but are passed through in order: the first parameter __w15 is element 0 (bits [15:0]), __w00 the top element */
				4091	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4092	}
				4093
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4094	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				4095	/// with the specified 8-bit integral values.
				4096	///
				4097	/// \headerfile <x86intrin.h>
				4098	///
				4099	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4100	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4101	///
				4102	/// \param __b31
				4103	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				4104	/// \param __b30
				4105	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				4106	/// \param __b29
				4107	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				4108	/// \param __b28
				4109	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				4110	/// \param __b27
				4111	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				4112	/// \param __b26
				4113	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				4114	/// \param __b25
				4115	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				4116	/// \param __b24
				4117	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				4118	/// \param __b23
				4119	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				4120	/// \param __b22
				4121	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				4122	/// \param __b21
				4123	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				4124	/// \param __b20
				4125	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				4126	/// \param __b19
				4127	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				4128	/// \param __b18
				4129	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				4130	/// \param __b17
				4131	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				4132	/// \param __b16
				4133	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				4134	/// \param __b15
				4135	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				4136	/// \param __b14
				4137	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				4138	/// \param __b13
				4139	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				4140	/// \param __b12
				4141	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				4142	/// \param __b11
				4143	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				4144	/// \param __b10
				4145	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				4146	/// \param __b09
				4147	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				4148	/// \param __b08
				4149	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				4150	/// \param __b07
				4151	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				4152	/// \param __b06
				4153	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				4154	/// \param __b05
				4155	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				4156	/// \param __b04
				4157	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				4158	/// \param __b03
				4159	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				4160	/// \param __b02
				4161	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				4162	/// \param __b01
				4163	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				4164	/// \param __b00
				4165	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				4166	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4167	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4168	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4169	char __b27, char __b26, char __b25, char __b24,
				4170	char __b23, char __b22, char __b21, char __b20,
				4171	char __b19, char __b18, char __b17, char __b16,
				4172	char __b15, char __b14, char __b13, char __b12,
				4173	char __b11, char __b10, char __b09, char __b08,
				4174	char __b07, char __b06, char __b05, char __b04,
				4175	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4176	{
				4177	return (__m256i)(__v32qi){ /* parameter names count down but are passed through in order: the first parameter __b31 is element 0 (bits [7:0]), __b00 the top element */
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4178	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4179	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				4180	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				4181	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4182	}
				4183
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4184	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
				4185	/// with the specified 64-bit integral values.
				4186	///
				4187	/// \headerfile <x86intrin.h>
				4188	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4189	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				4190	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4191	///
				4192	/// \param __a
				4193	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				4194	/// \param __b
				4195	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				4196	/// \param __c
				4197	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				4198	/// \param __d
				4199	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				4200	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4201	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4202	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4203	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4204	return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4205	}
				4206
				4207	/* Create vectors with repeated elements */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4208	/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
				4209	/// of the four double-precision floating-point vector elements set to the
				4210	/// specified double-precision floating-point value.
				4211	///
				4212	/// \headerfile <x86intrin.h>
				4213	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4214	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4215	///
				4216	/// \param __w
				4217	/// A double-precision floating-point value used to initialize each vector
				4218	/// element of the result.
				4219	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4220	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4221	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4222	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4223	return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4224	}
				4225
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4226	/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
				4227	/// of the eight single-precision floating-point vector elements set to the
				4228	/// specified single-precision floating-point value.
				4229	///
				4230	/// \headerfile <x86intrin.h>
				4231	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4232	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4233	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4234	///
				4235	/// \param __w
				4236	/// A single-precision floating-point value used to initialize each vector
				4237	/// element of the result.
				4238	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4239	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4240	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4241	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4242	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4243	}
				4244
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4245	/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
				4246	/// 32-bit integral vector elements set to the specified 32-bit integral
				4247	/// value.
				4248	///
				4249	/// \headerfile <x86intrin.h>
				4250	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4251	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4252	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4253	///
				4254	/// \param __i
				4255	/// A 32-bit integral value used to initialize each vector element of the
				4256	/// result.
				4257	/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4258	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4259	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4260	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4261	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4262	}
				4263
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4264	/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
				4265	/// 16-bit integral vector elements set to the specified 16-bit integral
				4266	/// value.
				4267	///
				4268	/// \headerfile <x86intrin.h>
				4269	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4270	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4271	///
				4272	/// \param __w
				4273	/// A 16-bit integral value used to initialize each vector element of the
				4274	/// result.
				4275	/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4276	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4277	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4278	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4279	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				4280	__w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4281	}
				4282
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4283	/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
				4284	/// 8-bit integral vector elements set to the specified 8-bit integral value.
				4285	///
				4286	/// \headerfile <x86intrin.h>
				4287	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4288	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4289	///
				4290	/// \param __b
				4291	/// An 8-bit integral value used to initialize each vector element of the
				4292	/// result.
				4293	/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4294	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4295	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4296	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4297	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				4298	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				4299	__b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4300	}
				4301
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4302	/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
				4303	/// 64-bit integral vector elements set to the specified 64-bit integral
				4304	/// value.
				4305	///
				4306	/// \headerfile <x86intrin.h>
				4307	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4308	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4309	///
				4310	/// \param __q
				4311	/// A 64-bit integral value used to initialize each vector element of the
				4312	/// result.
				4313	/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4314	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4315	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4316	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4317	return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4318	}
				4319
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4320	/* Create zeroed vectors */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4321	/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
				4322	/// vector elements initialized to zero.
				4323	///
				4324	/// \headerfile <x86intrin.h>
				4325	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4326	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4327	///
				4328	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4329	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4330	_mm256_setzero_pd(void)
				4331	{
				4332	return (__m256d){ 0, 0, 0, 0 };
				4333	}
				4334
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4335	/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
				4336	/// vector elements initialized to zero.
				4337	///
				4338	/// \headerfile <x86intrin.h>
				4339	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4340	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4341	///
				4342	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4343	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4344	_mm256_setzero_ps(void)
				4345	{
				4346	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				4347	}
				4348
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4349	/// \brief Constructs a 256-bit integer vector initialized to zero.
				4350	///
				4351	/// \headerfile <x86intrin.h>
				4352	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4353	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4354	///
				4355	/// \returns A 256-bit integer vector initialized to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4356	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4357	_mm256_setzero_si256(void)
				4358	{
				4359	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				4360	}
				4361
				4362	/* Cast between vector types */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4363	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4364	/// floating-point vector of [8 x float].
				4365	///
				4366	/// \headerfile <x86intrin.h>
				4367	///
				4368	/// This intrinsic has no corresponding instruction.
				4369	///
				4370	/// \param __a
				4371	/// A 256-bit floating-point vector of [4 x double].
				4372	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4373	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4374	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4375	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4376	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4377	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4378	}
				4379
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4380	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4381	/// integer vector.
				4382	///
				4383	/// \headerfile <x86intrin.h>
				4384	///
				4385	/// This intrinsic has no corresponding instruction.
				4386	///
				4387	/// \param __a
				4388	/// A 256-bit floating-point vector of [4 x double].
				4389	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4390	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4391	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4392	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4393	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4394	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4395	}
				4396
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4397	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4398	/// floating-point vector of [4 x double].
				4399	///
				4400	/// \headerfile <x86intrin.h>
				4401	///
				4402	/// This intrinsic has no corresponding instruction.
				4403	///
				4404	/// \param __a
				4405	/// A 256-bit floating-point vector of [8 x float].
				4406	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4407	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4408	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4409	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4410	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4411	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4412	}
				4413
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4414	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4415	/// integer vector.
				4416	///
				4417	/// \headerfile <x86intrin.h>
				4418	///
				4419	/// This intrinsic has no corresponding instruction.
				4420	///
				4421	/// \param __a
				4422	/// A 256-bit floating-point vector of [8 x float].
				4423	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4424	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4425	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4426	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4427	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4428	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4429	}
				4430
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4431	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
				4432	/// of [8 x float].
				4433	///
				4434	/// \headerfile <x86intrin.h>
				4435	///
				4436	/// This intrinsic has no corresponding instruction.
				4437	///
				4438	/// \param __a
				4439	/// A 256-bit integer vector.
				4440	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4441	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4442	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4443	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4444	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4445	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4446	}
				4447
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4448	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
				4449	/// of [4 x double].
				4450	///
				4451	/// \headerfile <x86intrin.h>
				4452	///
				4453	/// This intrinsic has no corresponding instruction.
				4454	///
				4455	/// \param __a
				4456	/// A 256-bit integer vector.
				4457	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4458	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4459	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4460	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4461	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4462	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4463	}
				4464
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4465	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
				4466	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
				4467	///
				4468	/// \headerfile <x86intrin.h>
				4469	///
				4470	/// This intrinsic has no corresponding instruction.
				4471	///
				4472	/// \param __a
				4473	/// A 256-bit floating-point vector of [4 x double].
				4474	/// \returns A 128-bit floating-point vector of [2 x double] containing the
				4475	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4476	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4477	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4478	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4479	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4480	}
				4481
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4482	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
				4483	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
				4484	///
				4485	/// \headerfile <x86intrin.h>
				4486	///
				4487	/// This intrinsic has no corresponding instruction.
				4488	///
				4489	/// \param __a
				4490	/// A 256-bit floating-point vector of [8 x float].
				4491	/// \returns A 128-bit floating-point vector of [4 x float] containing the
				4492	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4493	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4494	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4495	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4496	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4497	}
				4498
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4499	/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
				4500	///
				4501	/// \headerfile <x86intrin.h>
				4502	///
				4503	/// This intrinsic has no corresponding instruction.
				4504	///
				4505	/// \param __a
				4506	/// A 256-bit integer vector.
				4507	/// \returns A 128-bit integer vector containing the lower 128 bits of the
				4508	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4509	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4510	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4511	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4512	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4513	}
				4514
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4515	/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4516	/// 128-bit floating-point vector of [2 x double].
				4517	///
				4518	/// The lower 128 bits contain the value of the source vector. The contents
				4519	/// of the upper 128 bits are undefined.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4520	///
				4521	/// \headerfile <x86intrin.h>
				4522	///
				4523	/// This intrinsic has no corresponding instruction.
				4524	///
				4525	/// \param __a
				4526	/// A 128-bit vector of [2 x double].
				4527	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4528	/// contain the value of the parameter. The contents of the upper 128 bits
				4529	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4530	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4531	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4532	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4533	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4534	}
				4535
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4536	/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4537	/// 128-bit floating-point vector of [4 x float].
				4538	///
				4539	/// The lower 128 bits contain the value of the source vector. The contents
				4540	/// of the upper 128 bits are undefined.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4541	///
				4542	/// \headerfile <x86intrin.h>
				4543	///
				4544	/// This intrinsic has no corresponding instruction.
				4545	///
				4546	/// \param __a
				4547	/// A 128-bit vector of [4 x float].
				4548	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4549	/// contain the value of the parameter. The contents of the upper 128 bits
				4550	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4551	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4552	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4553	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4554	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4555	}
				4556
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4557	/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4558	///
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4559	/// The lower 128 bits contain the value of the source vector. The contents
				4560	/// of the upper 128 bits are undefined.
				4561	///
				4562	/// \headerfile <x86intrin.h>
				4563	///
				4564	/// This intrinsic has no corresponding instruction.
				4565	///
				4566	/// \param __a
				4567	/// A 128-bit integer vector.
				4568	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4569	/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4570	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4571	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4572	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4573	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4574	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4575
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4576	/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
				4577	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
				4578	/// contain the value of the source vector. The upper 128 bits are set
				4579	/// to zero.
				4580	///
				4581	/// \headerfile <x86intrin.h>
				4582	///
				4583	/// This intrinsic has no corresponding instruction.
				4584	///
				4585	/// \param __a
				4586	/// A 128-bit vector of [2 x double].
				4587	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4588	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4589	static __inline __m256d __DEFAULT_FN_ATTRS
				4590	_mm256_zextpd128_pd256(__m128d __a)
				4591	{
				4592	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
				4593	}
				4594
				4595	/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
				4596	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
				4597	/// the value of the source vector. The upper 128 bits are set to zero.
				4598	///
				4599	/// \headerfile <x86intrin.h>
				4600	///
				4601	/// This intrinsic has no corresponding instruction.
				4602	///
				4603	/// \param __a
				4604	/// A 128-bit vector of [4 x float].
				4605	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4606	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4607	static __inline __m256 __DEFAULT_FN_ATTRS
				4608	_mm256_zextps128_ps256(__m128 __a)
				4609	{
				4610	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
				4611	}
				4612
				4613	/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
				4614	/// The lower 128 bits contain the value of the source vector. The upper
				4615	/// 128 bits are set to zero.
				4616	///
				4617	/// \headerfile <x86intrin.h>
				4618	///
				4619	/// This intrinsic has no corresponding instruction.
				4620	///
				4621	/// \param __a
				4622	/// A 128-bit integer vector.
				4623	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4624	/// the parameter. The upper 128 bits are set to zero.
				4625	static __inline __m256i __DEFAULT_FN_ATTRS
				4626	_mm256_zextsi128_si256(__m128i __a)
				4627	{
				4628	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
				4629	}
				4630
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4631	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4632	Vector insert.
				4633	We use macros rather than inlines because we only want to accept
				4634	invocations where the immediate M is a constant expression.
				4635	*/
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4636	/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
				4637	/// a 256-bit vector of [8 x float] given in the first parameter, and then
				4638	/// replacing either the upper or the lower 128 bits with the contents of a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4639	/// 128-bit vector of [4 x float] in the second parameter.
				4640	///
				4641	/// The immediate integer parameter selects whether the upper or the lower
				4642	/// 128 bits are replaced.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4643	///
				4644	/// \headerfile <x86intrin.h>
				4645	///
				4646	/// \code
				4647	/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
				4648	/// \endcode
				4649	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4650	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4651	///
				4652	/// \param V1
				4653	/// A 256-bit vector of [8 x float]. This vector is copied to the result
				4654	/// first, and then either the upper or the lower 128 bits of the result will
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4655	/// be replaced by the contents of \a V2.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4656	/// \param V2
				4657	/// A 128-bit vector of [4 x float]. The contents of this parameter are
				4658	/// written to either the upper or the lower 128 bits of the result depending
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4659	/// on the value of parameter \a M.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4660	/// \param M
				4661	/// An immediate integer. The least significant bit determines how the values
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4662	/// from the two parameters are interleaved: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4663	/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4664	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
				4665	/// result. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4666	/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
				4667	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
				4668	/// result.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4669	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-7 name the floats of V1; 8-11 name the floats of V2. */ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
      ((((M) & 1) == 0) ?  8 :  0), \
      ((((M) & 1) == 0) ?  9 :  1), \
      ((((M) & 1) == 0) ? 10 :  2), \
      ((((M) & 1) == 0) ? 11 :  3), \
      ((((M) & 1) == 0) ?  4 :  8), \
      ((((M) & 1) == 0) ?  5 :  9), \
      ((((M) & 1) == 0) ?  6 : 10), \
      ((((M) & 1) == 0) ?  7 : 11)); })
				4682
/// \brief Builds a 256-bit vector of [4 x double] from a copy of the 256-bit
///    vector of [4 x double] in the first parameter in which one 128-bit half
///    has been overwritten with the 128-bit vector of [2 x double] in the
///    second parameter.
///
///    Bit [0] of the immediate integer parameter chooses which half is
///    overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
///    result that is not taken from \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to either the
///    upper or the lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 name the doubles of V1; 4-5 name the doubles of V2. */ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V1), \
      (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
      ((((M) & 1) == 0) ? 4 : 0), \
      ((((M) & 1) == 0) ? 5 : 1), \
      ((((M) & 1) == 0) ? 2 : 4), \
      ((((M) & 1) == 0) ? 3 : 5)); })
				4725
/// \brief Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector in the first parameter in which one 128-bit half has been
///    overwritten with the 128-bit integer vector in the second parameter.
///
///    Bit [0] of the immediate integer parameter chooses which half is
///    overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not taken from \a V2.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the upper
///    or the lower 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  /* Shuffle indices 0-3 name the 64-bit lanes of V1; 4-5 name those of V2. */ \
  (__m256i)__builtin_shufflevector( \
      (__v4di)(__m256i)(V1), \
      (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
      ((((M) & 1) == 0) ? 4 : 0), \
      ((((M) & 1) == 0) ? 5 : 1), \
      ((((M) & 1) == 0) ? 2 : 4), \
      ((((M) & 1) == 0) ? 3 : 5)); })
				4768
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4769	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4770	Vector extract.
				4771	We use macros rather than inlines because we only want to accept
				4772	invocations where the immediate M is a constant expression.
				4773	*/
/// \brief Returns one 128-bit half of a 256-bit vector of [8 x float] as a
///    128-bit vector of [4 x float]. The immediate integer parameter selects
///    the upper or the lower half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  /* Every index is below 8, so only V is read; the second operand is a */ \
  /* placeholder. */ \
  (__m128)__builtin_shufflevector( \
      (__v8sf)(__m256)(V), \
      (__v8sf)(_mm256_undefined_ps()), \
      ((((M) & 1) == 0) ? 0 : 4), \
      ((((M) & 1) == 0) ? 1 : 5), \
      ((((M) & 1) == 0) ? 2 : 6), \
      ((((M) & 1) == 0) ? 3 : 7)); })
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4803
/// \brief Returns one 128-bit half of a 256-bit vector of [4 x double] as a
///    128-bit vector of [2 x double]. The immediate integer parameter selects
///    the upper or the lower half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  /* Every index is below 4, so only V is read; the second operand is a */ \
  /* placeholder. */ \
  (__m128d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V), \
      (__v4df)(_mm256_undefined_pd()), \
      ((((M) & 1) == 0) ? 0 : 2), \
      ((((M) & 1) == 0) ? 1 : 3)); })
				4831
/// \brief Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. The immediate integer parameter selects the upper or
///    the lower half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. Only the least significant bit is used: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  /* Every index is below 4, so only V is read; the second operand is a */ \
  /* placeholder. */ \
  (__m128i)__builtin_shufflevector( \
      (__v4di)(__m256i)(V), \
      (__v4di)(_mm256_undefined_si256()), \
      ((((M) & 1) == 0) ? 0 : 2), \
      ((((M) & 1) == 0) ? 1 : 3)); })
				4859
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4860	/* SIMD load ops (unaligned) */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4861	/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
				4862	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4863	/// of [8 x float] by concatenating the two 128-bit vectors.
				4864	///
				4865	/// \headerfile <x86intrin.h>
				4866	///
				4867	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4868	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4869	///
				4870	/// \param __addr_hi
				4871	/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4872	/// single-precision floating-point values. These values are to be copied to
				4873	/// bits[255:128] of the result. The address of the memory location does not
				4874	/// have to be aligned.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4875	/// \param __addr_lo
				4876	/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4877	/// single-precision floating-point values. These values are to be copied to
				4878	/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4879	/// have to be aligned.
				4880	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4881	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4882	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4883	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4884	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4885	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				4886	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4887	}
				4888
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4889	/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
				4890	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4891	/// of [4 x double] by concatenating the two 128-bit vectors.
				4892	///
				4893	/// \headerfile <x86intrin.h>
				4894	///
				4895	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4896	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4897	///
				4898	/// \param __addr_hi
				4899	/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4900	/// double-precision floating-point values. These values are to be copied to
				4901	/// bits[255:128] of the result. The address of the memory location does not
				4902	/// have to be aligned.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4903	/// \param __addr_lo
				4904	/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4905	/// double-precision floating-point values. These values are to be copied to
				4906	/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4907	/// have to be aligned.
				4908	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4909	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4910	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4911	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4912	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4913	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				4914	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4915	}
				4916
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4917	/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
				4918	/// constructs a 256-bit integer vector by concatenating the two 128-bit
				4919	/// vectors.
				4920	///
				4921	/// \headerfile <x86intrin.h>
				4922	///
				4923	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4924	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4925	///
				4926	/// \param __addr_hi
				4927	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4928	/// vector. This vector is to be copied to bits[255:128] of the result. The
				4929	/// address of the memory location does not have to be aligned.
				4930	/// \param __addr_lo
				4931	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4932	/// vector. This vector is to be copied to bits[127:0] of the result. The
				4933	/// address of the memory location does not have to be aligned.
				4934	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4935	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4936	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4937	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4938	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				4939	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4940	}
				4941
				4942	/* SIMD store ops (unaligned) */
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4943	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
				4944	/// vector of [8 x float] into two different unaligned memory locations.
				4945	///
				4946	/// \headerfile <x86intrin.h>
				4947	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4948	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4949	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4950	///
				4951	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4952	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4953	/// copied to this memory location. The address of this memory location does
				4954	/// not have to be aligned.
				4955	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4956	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4957	/// copied to this memory location. The address of this memory location does
				4958	/// not have to be aligned.
				4959	/// \param __a
				4960	/// A 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4961	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4962	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4963	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4964	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4965
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4966	__v128 = _mm256_castps256_ps128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4967	_mm_storeu_ps(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4968	__v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4969	_mm_storeu_ps(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4970	}
				4971
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4972	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
				4973	/// vector of [4 x double] into two different unaligned memory locations.
				4974	///
				4975	/// \headerfile <x86intrin.h>
				4976	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4977	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4978	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4979	///
				4980	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4981	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4982	/// copied to this memory location. The address of this memory location does
				4983	/// not have to be aligned.
				4984	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4985	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4986	/// copied to this memory location. The address of this memory location does
				4987	/// not have to be aligned.
				4988	/// \param __a
				4989	/// A 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4990	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4991	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4992	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4993	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4994
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4995	__v128 = _mm256_castpd256_pd128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4996	_mm_storeu_pd(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4997	__v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4998	_mm_storeu_pd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4999	}
				5000
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5001	/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
				5002	/// two different unaligned memory locations.
				5003	///
				5004	/// \headerfile <x86intrin.h>
				5005	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	5006	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				5007	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5008	///
				5009	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	5010	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5011	/// copied to this memory location. The address of this memory location does
				5012	/// not have to be aligned.
				5013	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	5014	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5015	/// copied to this memory location. The address of this memory location does
				5016	/// not have to be aligned.
				5017	/// \param __a
				5018	/// A 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5019	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	5020	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	5021	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	5022	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	5023
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	5024	__v128 = _mm256_castsi256_si128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	5025	_mm_storeu_si128(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	5026	__v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	5027	_mm_storeu_si128(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	5028	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	5029
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5030	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
				5031	/// concatenating two 128-bit floating-point vectors of [4 x float].
				5032	///
				5033	/// \headerfile <x86intrin.h>
				5034	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5035	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5036	///
				5037	/// \param __hi
				5038	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				5039	/// 128 bits of the result.
				5040	/// \param __lo
				5041	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				5042	/// 128 bits of the result.
				5043	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				5044	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5045	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5046	_mm256_set_m128 (__m128 __hi, __m128 __lo)
				5047	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	5048	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5049	}
				5050
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5051	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
				5052	/// concatenating two 128-bit floating-point vectors of [2 x double].
				5053	///
				5054	/// \headerfile <x86intrin.h>
				5055	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5056	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5057	///
				5058	/// \param __hi
				5059	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				5060	/// 128 bits of the result.
				5061	/// \param __lo
				5062	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				5063	/// 128 bits of the result.
				5064	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				5065	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5066	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5067	_mm256_set_m128d (__m128d __hi, __m128d __lo)
				5068	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5069	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5070	}
				5071
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5072	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
				5073	/// integer vectors.
				5074	///
				5075	/// \headerfile <x86intrin.h>
				5076	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5077	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5078	///
				5079	/// \param __hi
				5080	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				5081	/// result.
				5082	/// \param __lo
				5083	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				5084	/// result.
				5085	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5086	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5087	_mm256_set_m128i (__m128i __hi, __m128i __lo)
				5088	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5089	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5090	}
				5091
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5092	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
				5093	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
				5094	/// similar to _mm256_set_m128, but the order of the input parameters is
				5095	/// swapped.
				5096	///
				5097	/// \headerfile <x86intrin.h>
				5098	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5099	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5100	///
				5101	/// \param __lo
				5102	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				5103	/// 128 bits of the result.
				5104	/// \param __hi
				5105	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				5106	/// 128 bits of the result.
				5107	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				5108	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5109	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5110	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
				5111	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5112	return _mm256_set_m128(__hi, __lo);
				5113	}
				5114
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5115	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
				5116	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
				5117	/// similar to _mm256_set_m128d, but the order of the input parameters is
				5118	/// swapped.
				5119	///
				5120	/// \headerfile <x86intrin.h>
				5121	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5122	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5123	///
				5124	/// \param __lo
				5125	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				5126	/// 128 bits of the result.
				5127	/// \param __hi
				5128	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				5129	/// 128 bits of the result.
				5130	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				5131	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5132	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5133	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
				5134	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5135	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5136	}
				5137
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5138	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
				5139	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
				5140	/// the input parameters is swapped.
				5141	///
				5142	/// \headerfile <x86intrin.h>
				5143	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5144	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5145	///
				5146	/// \param __lo
				5147	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				5148	/// result.
				5149	/// \param __hi
				5150	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				5151	/// result.
				5152	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5153	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5154	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
				5155	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5156	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5157	}
				5158
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5159	#undef __DEFAULT_FN_ATTRS
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	5160
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	5161	#endif /* __AVXINTRIN_H */