Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: 0b7813526eafd7a581b5215e081702f65b156195 [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
/* Internal 256-bit (32-byte) vector types, one per element type. These are
 * the types the intrinsic implementations cast to before operating on the
 * individual elements. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned types */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types used in the intrinsic signatures: packed float,
 * packed double, and integer data. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	54
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	55	/* Arithmetic */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	56	/// Adds two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	57	///
				58	/// \headerfile <x86intrin.h>
				59	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	60	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	61	///
				62	/// \param __a
				63	/// A 256-bit vector of [4 x double] containing one of the source operands.
				64	/// \param __b
				65	/// A 256-bit vector of [4 x double] containing one of the source operands.
				66	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				67	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	68	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	69	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	70	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	71	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	72	}
				73
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	74	/// Adds two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	75	///
				76	/// \headerfile <x86intrin.h>
				77	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	78	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	79	///
				80	/// \param __a
				81	/// A 256-bit vector of [8 x float] containing one of the source operands.
				82	/// \param __b
				83	/// A 256-bit vector of [8 x float] containing one of the source operands.
				84	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				85	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	86	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	87	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	88	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	89	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	90	}
				91
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	92	/// Subtracts two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	93	///
				94	/// \headerfile <x86intrin.h>
				95	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	96	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	97	///
				98	/// \param __a
				99	/// A 256-bit vector of [4 x double] containing the minuend.
				100	/// \param __b
				101	/// A 256-bit vector of [4 x double] containing the subtrahend.
				102	/// \returns A 256-bit vector of [4 x double] containing the differences between
				103	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	104	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	105	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	106	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	107	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	108	}
				109
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	110	/// Subtracts two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	111	///
				112	/// \headerfile <x86intrin.h>
				113	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	114	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	115	///
				116	/// \param __a
				117	/// A 256-bit vector of [8 x float] containing the minuend.
				118	/// \param __b
				119	/// A 256-bit vector of [8 x float] containing the subtrahend.
				120	/// \returns A 256-bit vector of [8 x float] containing the differences between
				121	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	122	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	123	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	124	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	125	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	126	}
				127
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	128	/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	129	/// two 256-bit vectors of [4 x double].
				130	///
				131	/// \headerfile <x86intrin.h>
				132	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	133	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	134	///
				135	/// \param __a
				136	/// A 256-bit vector of [4 x double] containing the left source operand.
				137	/// \param __b
				138	/// A 256-bit vector of [4 x double] containing the right source operand.
				139	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				140	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	141	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	142	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	143	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	144	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	145	}
				146
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	147	/// Adds the even-indexed values and subtracts the odd-indexed values of
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	148	/// two 256-bit vectors of [8 x float].
				149	///
				150	/// \headerfile <x86intrin.h>
				151	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	152	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	153	///
				154	/// \param __a
				155	/// A 256-bit vector of [8 x float] containing the left source operand.
				156	/// \param __b
				157	/// A 256-bit vector of [8 x float] containing the right source operand.
				158	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				159	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	160	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	161	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	162	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	163	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	164	}
				165
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	166	/// Divides two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	167	///
				168	/// \headerfile <x86intrin.h>
				169	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	170	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	171	///
				172	/// \param __a
				173	/// A 256-bit vector of [4 x double] containing the dividend.
				174	/// \param __b
				175	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	176	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				177	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	178	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	179	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	180	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	181	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	182	}
				183
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	184	/// Divides two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	185	///
				186	/// \headerfile <x86intrin.h>
				187	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	188	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	189	///
				190	/// \param __a
				191	/// A 256-bit vector of [8 x float] containing the dividend.
				192	/// \param __b
				193	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	194	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				195	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	196	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	197	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	198	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	199	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	200	}
				201
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	202	/// Compares two 256-bit vectors of [4 x double] and returns the greater
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	203	/// of each pair of values.
				204	///
				205	/// \headerfile <x86intrin.h>
				206	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	207	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	208	///
				209	/// \param __a
				210	/// A 256-bit vector of [4 x double] containing one of the operands.
				211	/// \param __b
				212	/// A 256-bit vector of [4 x double] containing one of the operands.
				213	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				214	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	215	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	216	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	217	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	218	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	219	}
				220
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	221	/// Compares two 256-bit vectors of [8 x float] and returns the greater
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	222	/// of each pair of values.
				223	///
				224	/// \headerfile <x86intrin.h>
				225	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	226	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	227	///
				228	/// \param __a
				229	/// A 256-bit vector of [8 x float] containing one of the operands.
				230	/// \param __b
				231	/// A 256-bit vector of [8 x float] containing one of the operands.
				232	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				233	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	234	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	235	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	236	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	237	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	238	}
				239
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	240	/// Compares two 256-bit vectors of [4 x double] and returns the lesser
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	241	/// of each pair of values.
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	245	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	246	///
				247	/// \param __a
				248	/// A 256-bit vector of [4 x double] containing one of the operands.
				249	/// \param __b
				250	/// A 256-bit vector of [4 x double] containing one of the operands.
				251	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				252	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	253	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	254	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	255	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	256	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	257	}
				258
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	259	/// Compares two 256-bit vectors of [8 x float] and returns the lesser
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	260	/// of each pair of values.
				261	///
				262	/// \headerfile <x86intrin.h>
				263	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	264	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	265	///
				266	/// \param __a
				267	/// A 256-bit vector of [8 x float] containing one of the operands.
				268	/// \param __b
				269	/// A 256-bit vector of [8 x float] containing one of the operands.
				270	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				271	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	272	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	273	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	274	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	275	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	276	}
				277
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	278	/// Multiplies two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	279	///
				280	/// \headerfile <x86intrin.h>
				281	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	282	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	283	///
				284	/// \param __a
				285	/// A 256-bit vector of [4 x double] containing one of the operands.
				286	/// \param __b
				287	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	288	/// \returns A 256-bit vector of [4 x double] containing the products of both
				289	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	290	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	291	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	292	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	293	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	294	}
				295
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	296	/// Multiplies two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	297	///
				298	/// \headerfile <x86intrin.h>
				299	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	300	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	301	///
				302	/// \param __a
				303	/// A 256-bit vector of [8 x float] containing one of the operands.
				304	/// \param __b
				305	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	306	/// \returns A 256-bit vector of [8 x float] containing the products of both
				307	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	308	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	309	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	311	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	312	}
				313
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	314	/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	315	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	316	///
				317	/// \headerfile <x86intrin.h>
				318	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	319	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	320	///
				321	/// \param __a
				322	/// A 256-bit vector of [4 x double].
				323	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				324	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	325	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	326	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	327	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	328	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	329	}
				330
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	331	/// Calculates the square roots of the values in a 256-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	332	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	333	///
				334	/// \headerfile <x86intrin.h>
				335	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	336	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	337	///
				338	/// \param __a
				339	/// A 256-bit vector of [8 x float].
				340	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				341	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	342	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	343	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	344	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	345	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	346	}
				347
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	348	/// Calculates the reciprocal square roots of the values in a 256-bit
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	349	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	350	///
				351	/// \headerfile <x86intrin.h>
				352	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	353	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	354	///
				355	/// \param __a
				356	/// A 256-bit vector of [8 x float].
				357	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				358	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	359	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	360	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	361	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	362	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	363	}
				364
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	365	/// Calculates the reciprocals of the values in a 256-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	366	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	367	///
				368	/// \headerfile <x86intrin.h>
				369	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	370	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	371	///
				372	/// \param __a
				373	/// A 256-bit vector of [8 x float].
				374	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				375	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	376	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	377	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	378	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	379	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	380	}
				381
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro result (for example a subscript) binds to the converted value
 * rather than to the builtin call underneath the cast. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	413
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro result (for example a subscript) binds to the converted value
 * rather than to the builtin call underneath the cast. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	445
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Forwards to _mm256_round_pd with _MM_FROUND_CEIL. The outer parentheses
 * keep the result a single primary expression regardless of how the
 * underlying macro expands. */
#define _mm256_ceil_pd(V)  (_mm256_round_pd((V), _MM_FROUND_CEIL))
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	462
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	463	/// Rounds down the values stored in a 256-bit vector of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	464	/// The source values are rounded down to integer values and returned as
				465	/// 64-bit double-precision floating-point values.
				466	///
				467	/// \headerfile <x86intrin.h>
				468	///
				469	/// \code
				470	/// __m256d _mm256_floor_pd(__m256d V);
				471	/// \endcode
				472	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	473	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	474	///
				475	/// \param V
				476	/// A 256-bit vector of [4 x double].
				477	/// \returns A 256-bit vector of [4 x double] containing the rounded down
				478	/// values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	479	#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	480
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	481	/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	482	/// source values are rounded up to integer values and returned as
				483	/// floating-point values.
				484	///
				485	/// \headerfile <x86intrin.h>
				486	///
				487	/// \code
				488	/// __m256 _mm256_ceil_ps(__m256 V);
				489	/// \endcode
				490	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	491	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	492	///
				493	/// \param V
				494	/// A 256-bit vector of [8 x float].
				495	/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	496	#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	497
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	498	/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	499	/// source values are rounded down to integer values and returned as
				500	/// floating-point values.
				501	///
				502	/// \headerfile <x86intrin.h>
				503	///
				504	/// \code
				505	/// __m256 _mm256_floor_ps(__m256 V);
				506	/// \endcode
				507	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	508	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	509	///
				510	/// \param V
				511	/// A 256-bit vector of [8 x float].
				512	/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	513	#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
				514
				515	/* Logical */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	516	/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	517	///
				518	/// \headerfile <x86intrin.h>
				519	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	520	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	521	///
				522	/// \param __a
				523	/// A 256-bit vector of [4 x double] containing one of the source operands.
				524	/// \param __b
				525	/// A 256-bit vector of [4 x double] containing one of the source operands.
				526	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				527	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	528	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	529	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	530	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	531	return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	532	}
				533
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	534	/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	535	///
				536	/// \headerfile <x86intrin.h>
				537	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	538	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	539	///
				540	/// \param __a
				541	/// A 256-bit vector of [8 x float] containing one of the source operands.
				542	/// \param __b
				543	/// A 256-bit vector of [8 x float] containing one of the source operands.
				544	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				545	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	546	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	547	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	548	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	549	return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	550	}
				551
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	552	/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	553	/// the one's complement of the values contained in the first source operand.
				554	///
				555	/// \headerfile <x86intrin.h>
				556	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	557	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	558	///
				559	/// \param __a
				560	/// A 256-bit vector of [4 x double] containing the left source operand. The
				561	/// one's complement of this value is used in the bitwise AND.
				562	/// \param __b
				563	/// A 256-bit vector of [4 x double] containing the right source operand.
				564	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				565	/// values of the second operand and the one's complement of the first
				566	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	567	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	568	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	569	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	570	return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	571	}
				572
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	573	/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	574	/// the one's complement of the values contained in the first source operand.
				575	///
				576	/// \headerfile <x86intrin.h>
				577	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	578	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	579	///
				580	/// \param __a
				581	/// A 256-bit vector of [8 x float] containing the left source operand. The
				582	/// one's complement of this value is used in the bitwise AND.
				583	/// \param __b
				584	/// A 256-bit vector of [8 x float] containing the right source operand.
				585	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				586	/// values of the second operand and the one's complement of the first
				587	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	588	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	589	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	590	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	591	return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	592	}
				593
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	594	/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	595	///
				596	/// \headerfile <x86intrin.h>
				597	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	598	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	599	///
				600	/// \param __a
				601	/// A 256-bit vector of [4 x double] containing one of the source operands.
				602	/// \param __b
				603	/// A 256-bit vector of [4 x double] containing one of the source operands.
				604	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				605	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	606	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	607	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	608	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	609	return (__m256d)((__v4du)__a | (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	610	}
				611
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	612	/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	613	///
				614	/// \headerfile <x86intrin.h>
				615	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	616	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	617	///
				618	/// \param __a
				619	/// A 256-bit vector of [8 x float] containing one of the source operands.
				620	/// \param __b
				621	/// A 256-bit vector of [8 x float] containing one of the source operands.
				622	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				623	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	624	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	625	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	626	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	627	return (__m256)((__v8su)__a | (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	628	}
				629
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	630	/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	631	///
				632	/// \headerfile <x86intrin.h>
				633	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	634	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	635	///
				636	/// \param __a
				637	/// A 256-bit vector of [4 x double] containing one of the source operands.
				638	/// \param __b
				639	/// A 256-bit vector of [4 x double] containing one of the source operands.
				640	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				641	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	642	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	643	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	644	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	645	return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	646	}
				647
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	648	/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	649	///
				650	/// \headerfile <x86intrin.h>
				651	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	652	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	653	///
				654	/// \param __a
				655	/// A 256-bit vector of [8 x float] containing one of the source operands.
				656	/// \param __b
				657	/// A 256-bit vector of [8 x float] containing one of the source operands.
				658	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				659	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	660	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	661	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	662	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	663	return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	664	}
				665
				666	/* Horizontal arithmetic */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	667	/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	668	/// 256-bit vectors of [4 x double].
				669	///
				670	/// \headerfile <x86intrin.h>
				671	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	672	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	673	///
				674	/// \param __a
				675	/// A 256-bit vector of [4 x double] containing one of the source operands.
				676	/// The horizontal sums of the values are returned in the even-indexed
				677	/// elements of a vector of [4 x double].
				678	/// \param __b
				679	/// A 256-bit vector of [4 x double] containing one of the source operands.
				680	/// The horizontal sums of the values are returned in the odd-indexed
				681	/// elements of a vector of [4 x double].
				682	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				683	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	684	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	685	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	686	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	687	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	688	}
				689
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	690	/// Horizontally adds the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	691	/// 256-bit vectors of [8 x float].
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	695	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	696	///
				697	/// \param __a
				698	/// A 256-bit vector of [8 x float] containing one of the source operands.
				699	/// The horizontal sums of the values are returned in the elements with
				700	/// index 0, 1, 4, 5 of a vector of [8 x float].
				701	/// \param __b
				702	/// A 256-bit vector of [8 x float] containing one of the source operands.
				703	/// The horizontal sums of the values are returned in the elements with
				704	/// index 2, 3, 6, 7 of a vector of [8 x float].
				705	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				706	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	707	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	708	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	709	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	710	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	711	}
				712
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	713	/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	714	/// 256-bit vectors of [4 x double].
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	718	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	719	///
				720	/// \param __a
				721	/// A 256-bit vector of [4 x double] containing one of the source operands.
				722	/// The horizontal differences between the values are returned in the
				723	/// even-indexed elements of a vector of [4 x double].
				724	/// \param __b
				725	/// A 256-bit vector of [4 x double] containing one of the source operands.
				726	/// The horizontal differences between the values are returned in the
				727	/// odd-indexed elements of a vector of [4 x double].
				728	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				729	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	730	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	731	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	732	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	733	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	734	}
				735
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	736	/// Horizontally subtracts the adjacent pairs of values contained in two
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	737	/// 256-bit vectors of [8 x float].
				738	///
				739	/// \headerfile <x86intrin.h>
				740	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	741	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	742	///
				743	/// \param __a
				744	/// A 256-bit vector of [8 x float] containing one of the source operands.
				745	/// The horizontal differences between the values are returned in the
				746	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				747	/// \param __b
				748	/// A 256-bit vector of [8 x float] containing one of the source operands.
				749	/// The horizontal differences between the values are returned in the
				750	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				751	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				752	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	753	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	754	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	755	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	756	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	757	}
				758
				759	/* Vector permutations */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	760	/// Copies the values in a 128-bit vector of [2 x double] as specified
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	761	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	762	///
				763	/// \headerfile <x86intrin.h>
				764	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	765	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	766	///
				767	/// \param __a
				768	/// A 128-bit vector of [2 x double].
				769	/// \param __c
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	770	/// A 128-bit integer vector operand specifying how the values are to be
				771	/// copied. \n
				772	/// Bit [1]: \n
				773	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				774	/// vector. \n
				775	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				776	/// returned vector. \n
				777	/// Bit [65]: \n
				778	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				779	/// returned vector. \n
				780	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				781	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	782	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	783	static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	784	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	785	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	786	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	787	}
				788
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	789	/// Copies the values in a 256-bit vector of [4 x double] as specified
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	790	/// by the 256-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	791	///
				792	/// \headerfile <x86intrin.h>
				793	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	794	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	795	///
				796	/// \param __a
				797	/// A 256-bit vector of [4 x double].
				798	/// \param __c
				799	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	800	/// copied. \n
				801	/// Bit [1]: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	802	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				803	/// vector. \n
				804	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				805	/// returned vector. \n
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	806	/// Bit [65]: \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	807	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				808	/// returned vector. \n
				809	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				810	/// returned vector. \n
				811	/// Bit [129]: \n
				812	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				813	/// returned vector. \n
				814	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				815	/// returned vector. \n
				816	/// Bit [193]: \n
				817	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				818	/// returned vector. \n
				819	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	820	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	821	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	822	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	823	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	824	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	825	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	826	}
				827
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	828	/// Copies the values stored in a 128-bit vector of [4 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	829	/// specified by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	830	/// \headerfile <x86intrin.h>
				831	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	832	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	833	///
				834	/// \param __a
				835	/// A 128-bit vector of [4 x float].
				836	/// \param __c
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	837	/// A 128-bit integer vector operand specifying how the values are to be
				838	/// copied. \n
				839	/// Bits [1:0]: \n
				840	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				841	/// returned vector. \n
				842	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				843	/// returned vector. \n
				844	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				845	/// returned vector. \n
				846	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				847	/// returned vector. \n
				848	/// Bits [33:32]: \n
				849	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				850	/// returned vector. \n
				851	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				852	/// returned vector. \n
				853	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				854	/// returned vector. \n
				855	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				856	/// returned vector. \n
				857	/// Bits [65:64]: \n
				858	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				859	/// returned vector. \n
				860	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				861	/// returned vector. \n
				862	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				863	/// returned vector. \n
				864	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				865	/// returned vector. \n
				866	/// Bits [97:96]: \n
				867	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				868	/// returned vector. \n
				869	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				870	/// returned vector. \n
				871	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				872	/// returned vector. \n
				873	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				874	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	875	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	876	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	877	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	878	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	879	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	880	}
				881
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	882	/// Copies the values stored in a 256-bit vector of [8 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	883	/// specified by the 256-bit integer vector operand.
				884	///
				885	/// \headerfile <x86intrin.h>
				886	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	887	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	888	///
				889	/// \param __a
				890	/// A 256-bit vector of [8 x float].
				891	/// \param __c
				892	/// A 256-bit integer vector operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	893	/// copied. \n
				894	/// Bits [1:0]: \n
				895	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				896	/// returned vector. \n
				897	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				898	/// returned vector. \n
				899	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				900	/// returned vector. \n
				901	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				902	/// returned vector. \n
				903	/// Bits [33:32]: \n
				904	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				905	/// returned vector. \n
				906	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				907	/// returned vector. \n
				908	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				909	/// returned vector. \n
				910	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				911	/// returned vector. \n
				912	/// Bits [65:64]: \n
				913	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				914	/// returned vector. \n
				915	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				916	/// returned vector. \n
				917	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				918	/// returned vector. \n
				919	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				920	/// returned vector. \n
				921	/// Bits [97:96]: \n
				922	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				923	/// returned vector. \n
				924	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				925	/// returned vector. \n
				926	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				927	/// returned vector. \n
				928	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				929	/// returned vector. \n
				930	/// Bits [129:128]: \n
				931	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				932	/// returned vector. \n
				933	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				934	/// returned vector. \n
				935	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				936	/// returned vector. \n
				937	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				938	/// returned vector. \n
				939	/// Bits [161:160]: \n
				940	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				941	/// returned vector. \n
				942	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				943	/// returned vector. \n
				944	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				945	/// returned vector. \n
				946	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				947	/// returned vector. \n
				948	/// Bits [193:192]: \n
				949	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				950	/// returned vector. \n
				951	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				952	/// returned vector. \n
				953	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				954	/// returned vector. \n
				955	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				956	/// returned vector. \n
				957	/// Bits [225:224]: \n
				958	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				959	/// returned vector. \n
				960	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				961	/// returned vector. \n
				962	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				963	/// returned vector. \n
				964	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				965	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	966	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	967	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	968	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	969	{
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	970	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	971	}
				972
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	973	/// Copies the values in a 128-bit vector of [2 x double] as specified
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	974	/// by the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	975	///
				976	/// \headerfile <x86intrin.h>
				977	///
				978	/// \code
				979	/// __m128d _mm_permute_pd(__m128d A, const int C);
				980	/// \endcode
				981	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	982	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	983	///
				984	/// \param A
				985	/// A 128-bit vector of [2 x double].
				986	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	987	/// An immediate integer operand specifying how the values are to be
				988	/// copied. \n
				989	/// Bit [0]: \n
				990	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				991	/// vector. \n
				992	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				993	/// returned vector. \n
				994	/// Bit [1]: \n
				995	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				996	/// returned vector. \n
				997	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				998	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	999	/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* The whole expansion is parenthesized so the (__m128d) result cast still
   applies when the macro is followed by a postfix operator or used inside a
   larger expression. A is converted to __v2df and C to int before the call. */
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1002
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1003	/// Copies the values in a 256-bit vector of [4 x double] as specified by
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1004	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1005	///
				1006	/// \headerfile <x86intrin.h>
				1007	///
				1008	/// \code
				1009	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1010	/// \endcode
				1011	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1012	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1013	///
				1014	/// \param A
				1015	/// A 256-bit vector of [4 x double].
				1016	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1017	/// An immediate integer operand specifying how the values are to be
				1018	/// copied. \n
				1019	/// Bit [0]: \n
				1020	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				1021	/// vector. \n
				1022	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				1023	/// returned vector. \n
				1024	/// Bit [1]: \n
				1025	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				1026	/// returned vector. \n
				1027	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				1028	/// returned vector. \n
				1029	/// Bit [2]: \n
				1030	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				1031	/// returned vector. \n
				1032	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				1033	/// returned vector. \n
				1034	/// Bit [3]: \n
				1035	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				1036	/// returned vector. \n
				1037	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				1038	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1039	/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The whole expansion is parenthesized so the (__m256d) result cast still
   applies when the macro is followed by a postfix operator or used inside a
   larger expression. A is converted to __v4df and C to int before the call. */
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1042
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1043	/// Copies the values in a 128-bit vector of [4 x float] as specified by
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1044	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1045	///
				1046	/// \headerfile <x86intrin.h>
				1047	///
				1048	/// \code
				1049	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1050	/// \endcode
				1051	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1052	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1053	///
				1054	/// \param A
				1055	/// A 128-bit vector of [4 x float].
				1056	/// \param C
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1057	/// An immediate integer operand specifying how the values are to be
				1058	/// copied. \n
				1059	/// Bits [1:0]: \n
				1060	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1061	/// returned vector. \n
				1062	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1063	/// returned vector. \n
				1064	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1065	/// returned vector. \n
				1066	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1067	/// returned vector. \n
				1068	/// Bits [3:2]: \n
				1069	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1070	/// returned vector. \n
				1071	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1072	/// returned vector. \n
				1073	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1074	/// returned vector. \n
				1075	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1076	/// returned vector. \n
				1077	/// Bits [5:4]: \n
				1078	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1079	/// returned vector. \n
				1080	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1081	/// returned vector. \n
				1082	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1083	/// returned vector. \n
				1084	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1085	/// returned vector. \n
				1086	/// Bits [7:6]: \n
				1087	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1088	/// returned vector. \n
				1089	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1090	/// returned vector. \n
				1091	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1092	/// returned vector. \n
				1093	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1094	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1095	/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* The whole expansion is parenthesized so the (__m128) result cast still
   applies when the macro is followed by a postfix operator or used inside a
   larger expression. A is converted to __v4sf and C to int before the call. */
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1098
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1099	/// Copies the values in a 256-bit vector of [8 x float] as specified by
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1100	/// the immediate integer operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1101	///
				1102	/// \headerfile <x86intrin.h>
				1103	///
				1104	/// \code
				1105	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1106	/// \endcode
				1107	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1108	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1109	///
				1110	/// \param A
				1111	/// A 256-bit vector of [8 x float].
				1112	/// \param C
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1113	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1114	/// copied. \n
				1115	/// Bits [1:0]: \n
				1116	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1117	/// returned vector. \n
				1118	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1119	/// returned vector. \n
				1120	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1121	/// returned vector. \n
				1122	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1123	/// returned vector. \n
				1124	/// Bits [3:2]: \n
				1125	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1126	/// returned vector. \n
				1127	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1128	/// returned vector. \n
				1129	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1130	/// returned vector. \n
				1131	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1132	/// returned vector. \n
				1133	/// Bits [5:4]: \n
				1134	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1135	/// returned vector. \n
				1136	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1137	/// returned vector. \n
				1138	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1139	/// returned vector. \n
				1140	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1141	/// returned vector. \n
				1142	/// Bits [7:6]: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1143	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1144	/// returned vector. \n
				1145	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1146	/// returned vector. \n
				1147	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1148	/// returned vector. \n
				1149	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1150	/// returned vector. \n
				1151	/// Bits [1:0]: \n
				1152	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				1153	/// returned vector. \n
				1154	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				1155	/// returned vector. \n
				1156	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				1157	/// returned vector. \n
				1158	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				1159	/// returned vector. \n
				1160	/// Bits [3:2]: \n
				1161	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				1162	/// returned vector. \n
				1163	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				1164	/// returned vector. \n
				1165	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				1166	/// returned vector. \n
				1167	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				1168	/// returned vector. \n
				1169	/// Bits [5:4]: \n
				1170	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				1171	/// returned vector. \n
				1172	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				1173	/// returned vector. \n
				1174	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				1175	/// returned vector. \n
				1176	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				1177	/// returned vector. \n
				1178	/// Bits [7:6]: \n
				1179	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				1180	/// returned vector. \n
				1181	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				1182	/// returned vector. \n
				1183	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				1184	/// returned vector. \n
				1185	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				1186	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1187	/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the (__m256) result cast still
   applies when the macro is followed by a postfix operator or used inside a
   larger expression. A is converted to __v8sf and C to int before the call. */
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1190
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1191	/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1192	/// [4 x double], as specified by the immediate integer operand.
				1193	///
				1194	/// \headerfile <x86intrin.h>
				1195	///
				1196	/// \code
				1197	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1198	/// \endcode
				1199	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1200	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1201	///
				1202	/// \param V1
				1203	/// A 256-bit vector of [4 x double].
				1204	/// \param V2
				1205	/// A 256-bit vector of [4 x double].
				1206	/// \param M
				1207	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1208	/// permuted. \n
				1209	/// Bits [1:0]: \n
				1210	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1211	/// destination. \n
				1212	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1213	/// destination. \n
				1214	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1215	/// destination. \n
				1216	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1217	/// destination. \n
				1218	/// Bits [5:4]: \n
				1219	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1220	/// destination. \n
				1221	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1222	/// destination. \n
				1223	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1224	/// destination. \n
				1225	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
				1226	/// destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1227	/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The whole expansion is parenthesized so the (__m256d) result cast still
   applies inside a larger expression. V1 and V2 are converted to __v4df, and
   M is explicitly converted to int before being passed as the immediate. */
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1231
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1232	/// Permutes 128-bit data values stored in two 256-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1233	/// [8 x float], as specified by the immediate integer operand.
				1234	///
				1235	/// \headerfile <x86intrin.h>
				1236	///
				1237	/// \code
				1238	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1239	/// \endcode
				1240	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1241	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1242	///
				1243	/// \param V1
				1244	/// A 256-bit vector of [8 x float].
				1245	/// \param V2
				1246	/// A 256-bit vector of [8 x float].
				1247	/// \param M
				1248	/// An immediate integer operand specifying how the values are to be
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1249	/// permuted. \n
				1250	/// Bits [1:0]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1251	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1252	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1253	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1254	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1255	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1256	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1257	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1258	/// destination. \n
				1259	/// Bits [5:4]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1260	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1261	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1262	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1263	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1264	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1265	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1266	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1267	/// destination.
				1268	/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the (__m256) result cast still
   applies inside a larger expression. V1 and V2 are converted to __v8sf, and
   M is explicitly converted to int before being passed as the immediate. */
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1272
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1273	/// Permutes 128-bit data values stored in two 256-bit integer vectors,
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1274	/// as specified by the immediate integer operand.
				1275	///
				1276	/// \headerfile <x86intrin.h>
				1277	///
				1278	/// \code
				1279	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1280	/// \endcode
				1281	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1282	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1283	///
				1284	/// \param V1
				1285	/// A 256-bit integer vector.
				1286	/// \param V2
				1287	/// A 256-bit integer vector.
				1288	/// \param M
				1289	/// An immediate integer operand specifying how the values are to be copied. \n
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1290	/// Bits [1:0]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1291	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1292	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1293	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1294	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1295	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1296	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1297	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1298	/// destination. \n
				1299	/// Bits [5:4]: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1300	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1301	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1302	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1303	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1304	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1305	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1306	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1307	/// destination.
				1308	/// \returns A 256-bit integer vector containing the copied values.
/* The whole expansion is parenthesized so the (__m256i) result cast still
   applies inside a larger expression. V1 and V2 are converted to __v8si, and
   M is explicitly converted to int before being passed as the immediate. */
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1312
				1313	/* Vector Blend */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1314	/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1315	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1316	/// integer operand.
				1317	///
				1318	/// \headerfile <x86intrin.h>
				1319	///
				1320	/// \code
				1321	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1322	/// \endcode
				1323	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1324	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1325	///
				1326	/// \param V1
				1327	/// A 256-bit vector of [4 x double].
				1328	/// \param V2
				1329	/// A 256-bit vector of [4 x double].
				1330	/// \param M
				1331	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1332	/// values are to be copied. The position of the mask bit corresponds to the
				1333	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1334	/// element in operand \a V1 is copied to the same position in the
				1335	/// destination. When a mask bit is 1, the corresponding 64-bit element in
				1336	/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1337	/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The whole expansion is parenthesized so the (__m256d) result cast still
   applies when the macro is followed by a postfix operator or used inside a
   larger expression. V1 and V2 are converted to __v4df and M to int. */
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1341
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1342	/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1343	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1344	/// integer operand.
				1345	///
				1346	/// \headerfile <x86intrin.h>
				1347	///
				1348	/// \code
				1349	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1350	/// \endcode
				1351	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1352	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1353	///
				1354	/// \param V1
				1355	/// A 256-bit vector of [8 x float].
				1356	/// \param V2
				1357	/// A 256-bit vector of [8 x float].
				1358	/// \param M
				1359	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1360	/// values are to be copied. The position of the mask bit corresponds to the
				1361	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1362	/// element in operand \a V1 is copied to the same position in the
				1363	/// destination. When a mask bit is 1, the corresponding 32-bit element in
				1364	/// operand \a V2 is copied to the same position in the destination.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1365	/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The whole expansion is parenthesized so the (__m256) result cast still
   applies when the macro is followed by a postfix operator or used inside a
   larger expression. V1 and V2 are converted to __v8sf and M to int. */
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1369
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1370	/// Merges 64-bit double-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1371	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1372	/// operand.
				1373	///
				1374	/// \headerfile <x86intrin.h>
				1375	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1376	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1377	///
				1378	/// \param __a
				1379	/// A 256-bit vector of [4 x double].
				1380	/// \param __b
				1381	/// A 256-bit vector of [4 x double].
				1382	/// \param __c
				1383	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1384	/// how the values are to be copied. The position of the mask bit corresponds
				1385	/// to the most significant bit of a copied value. When a mask bit is 0, the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1386	/// corresponding 64-bit element in operand \a __a is copied to the same
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1387	/// position in the destination. When a mask bit is 1, the corresponding
Ekaterina Romanova	4c77e89	2016-11-26 19:38:19 +0000	[diff] [blame]	1388	/// 64-bit element in operand \a __b is copied to the same position in the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1389	/// destination.
				1390	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1391	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1392	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1393	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1394	return (__m256d)__builtin_ia32_blendvpd256(
				1395	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1396	}
				1397
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1398	/// Merges 32-bit single-precision data values stored in either of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1399	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1400	/// operand.
				1401	///
				1402	/// \headerfile <x86intrin.h>
				1403	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1404	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1405	///
				1406	/// \param __a
				1407	/// A 256-bit vector of [8 x float].
				1408	/// \param __b
				1409	/// A 256-bit vector of [8 x float].
				1410	/// \param __c
				1411	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1412	/// and 31 specifying how the values are to be copied. The position of the
				1413	/// mask bit corresponds to the most significant bit of a copied value. When
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1414	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1415	/// copied to the same position in the destination. When a mask bit is 1, the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1416	/// corresponding 32-bit element in operand \a __b is copied to the same
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1417	/// position in the destination.
				1418	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1419	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1420	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1421	{
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1422	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1423	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1424	}
				1425
				1426	/* Vector Dot Product */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1427	/// Computes two dot products in parallel, using the lower and upper
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1428	/// halves of two [8 x float] vectors as input to the two computations, and
				1429	/// returning the two dot products in the lower and upper halves of the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1430	/// [8 x float] result.
				1431	///
				1432	/// The immediate integer operand controls which input elements will
				1433	/// contribute to the dot product, and where the final results are returned.
				1434	/// In general, for each dot product, the four corresponding elements of the
				1435	/// input vectors are multiplied; the first two and second two products are
				1436	/// summed, then the two sums are added to form the final result.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1437	///
				1438	/// \headerfile <x86intrin.h>
				1439	///
				1440	/// \code
				1441	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1442	/// \endcode
				1443	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1444	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1445	///
				1446	/// \param V1
				1447	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1448	/// \param V2
				1449	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1450	/// \param M
				1451	/// An immediate integer argument. Bits [7:4] determine which elements of
				1452	/// the input vectors are used, with bit [4] corresponding to the lowest
				1453	/// element and bit [7] corresponding to the highest element of each [4 x
				1454	/// float] subvector. If a bit is set, the corresponding elements from the
				1455	/// two input vectors are used as an input for dot product; otherwise that
				1456	/// input is treated as zero. Bits [3:0] determine which elements of the
				1457	/// result will receive a copy of the final dot product, with bit [0]
				1458	/// corresponding to the lowest element and bit [3] corresponding to the
				1459	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1460	/// product is returned in the corresponding element; otherwise that element
				1461	/// is set to zero. The bitmask is applied in the same way to each of the
				1462	/// two parallel dot product computations.
				1463	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* The whole expansion is parenthesized so that a postfix operator written
   after the macro call (e.g. a subscript) applies to the cast result rather
   than to the builtin call inside it. */
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1467
				1468	/* Vector shuffle */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1469	/// Selects 8 float values from the 256-bit operands of [8 x float], as
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1470	/// specified by the immediate value operand.
				1471	///
				1472	/// The four selected elements in each operand are copied to the destination
				1473	/// according to the bits specified in the immediate operand. The selected
				1474	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1475	/// bits [191:128] of the destination, and the selected elements from the
				1476	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
				1477	/// the destination. For example, if bits [7:0] of the immediate operand
				1478	/// contain a value of 0xFF, the 256-bit destination vector would contain the
				1479	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1480	///
				1481	/// \headerfile <x86intrin.h>
				1482	///
				1483	/// \code
				1484	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1485	/// \endcode
				1486	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1487	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1488	///
				1489	/// \param a
				1490	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1491	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1492	/// according to the bits specified in the immediate operand.
				1493	/// \param b
				1494	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1495	/// operand are copied to bits [127:64] and bits [255:192] in the
				1496	/// destination, according to the bits specified in the immediate operand.
				1497	/// \param mask
				1498	/// An immediate value containing an 8-bit value specifying which elements to
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1499	/// copy from \a a and \a b. \n
				1500	/// Bits [3:0] specify the values copied from operand \a a. \n
				1501	/// Bits [7:4] specify the values copied from operand \a b. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1502	/// The destinations within the 256-bit destination are assigned values as
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1503	/// follows, according to the bit value assignments described below: \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1504	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1505	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1506	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1507	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1508	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1509	/// destination. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1510	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1511	/// the destination. \n
				1512	/// Bit value assignments: \n
				1513	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
				1514	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
				1515	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1516	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1517	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1518	#define _mm256_shuffle_ps(a, b, mask) \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1519	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1520	(__v8sf)(__m256)(b), \
				1521	0 + (((mask) >> 0) & 0x3), \
				1522	0 + (((mask) >> 2) & 0x3), \
				1523	8 + (((mask) >> 4) & 0x3), \
				1524	8 + (((mask) >> 6) & 0x3), \
				1525	4 + (((mask) >> 0) & 0x3), \
				1526	4 + (((mask) >> 2) & 0x3), \
				1527	12 + (((mask) >> 4) & 0x3), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1528	12 + (((mask) >> 6) & 0x3))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1529
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1530	/// Selects four double-precision values from the 256-bit operands of
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1531	/// [4 x double], as specified by the immediate value operand.
				1532	///
				1533	/// The selected elements from the first 256-bit operand are copied to bits
				1534	/// [63:0] and bits [191:128] in the destination, and the selected elements
				1535	/// from the second 256-bit operand are copied to bits [127:64] and bits
				1536	/// [255:192] in the destination. For example, if bits [3:0] of the immediate
				1537	/// operand contain a value of 0xF, the 256-bit destination vector would
				1538	/// contain the following values: b[3], a[3], b[1], a[1].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1539	///
				1540	/// \headerfile <x86intrin.h>
				1541	///
				1542	/// \code
				1543	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1544	/// \endcode
				1545	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1546	/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1547	///
				1548	/// \param a
				1549	/// A 256-bit vector of [4 x double].
				1550	/// \param b
				1551	/// A 256-bit vector of [4 x double].
				1552	/// \param mask
				1553	/// An immediate value containing 8-bit values specifying which elements to
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1554	/// copy from \a a and \a b: \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1555	/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1556	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1557	/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1558	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1559	/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1560	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1561	/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1562	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1563	/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1564	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1565	/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1566	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1567	/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1568	/// destination. \n
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	1569	/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1570	/// destination.
				1571	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1572	#define _mm256_shuffle_pd(a, b, mask) \
Craig Topper	2a383c9	2016-07-04 22:18:01 +0000	[diff] [blame]	1573	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1574	(__v4df)(__m256d)(b), \
				1575	0 + (((mask) >> 0) & 0x1), \
				1576	4 + (((mask) >> 1) & 0x1), \
				1577	2 + (((mask) >> 2) & 0x1), \
Craig Topper	c633867	2018-05-31 00:51:20 +0000	[diff] [blame]	1578	6 + (((mask) >> 3) & 0x1))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1579
				1580	/* Compare */
				1581	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1582	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1583	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1584	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1585	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1586	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1587	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1588	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1589	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1590	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1591	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1592	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1593	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1594	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1595	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1596	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1597	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1598	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1599	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1600	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1601	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1602	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1603	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1604	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1605	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
Sanjay Patel	bd0d006	2017-04-12 15:19:08 +0000	[diff] [blame]	1606	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1607	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1608	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1609	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1610	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1611	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1612	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1613
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1614	/// Compares each of the corresponding double-precision values of two
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1615	/// 128-bit vectors of [2 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1616	/// immediate integer operand.
				1617	///
				1618	/// Returns a [2 x double] vector consisting of two doubles corresponding to
				1619	/// the two comparison results: zero if the comparison is false, and all 1's
				1620	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1621	///
				1622	/// \headerfile <x86intrin.h>
				1623	///
				1624	/// \code
				1625	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1626	/// \endcode
				1627	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1628	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1629	///
				1630	/// \param a
				1631	/// A 128-bit vector of [2 x double].
				1632	/// \param b
				1633	/// A 128-bit vector of [2 x double].
				1634	/// \param c
				1635	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1636	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1637	/// 0x00: Equal (ordered, non-signaling) \n
				1638	/// 0x01: Less-than (ordered, signaling) \n
				1639	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1640	/// 0x03: Unordered (non-signaling) \n
				1641	/// 0x04: Not-equal (unordered, non-signaling) \n
				1642	/// 0x05: Not-less-than (unordered, signaling) \n
				1643	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1644	/// 0x07: Ordered (non-signaling) \n
				1645	/// 0x08: Equal (unordered, non-signaling) \n
				1646	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1647	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1648	/// 0x0B: False (ordered, non-signaling) \n
				1649	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1650	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1651	/// 0x0E: Greater-than (ordered, signaling) \n
				1652	/// 0x0F: True (unordered, non-signaling) \n
				1653	/// 0x10: Equal (ordered, signaling) \n
				1654	/// 0x11: Less-than (ordered, non-signaling) \n
				1655	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1656	/// 0x13: Unordered (signaling) \n
				1657	/// 0x14: Not-equal (unordered, signaling) \n
				1658	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1659	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1660	/// 0x17: Ordered (signaling) \n
				1661	/// 0x18: Equal (unordered, signaling) \n
				1662	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1663	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1664	/// 0x1B: False (ordered, signaling) \n
				1665	/// 0x1C: Not-equal (ordered, signaling) \n
				1666	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1667	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1668	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1669	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/* The whole expansion is parenthesized so that a postfix operator written
   after the macro call (e.g. a subscript) applies to the cast result rather
   than to the builtin call inside it. */
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1673
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1674	/// Compares each of the corresponding values of two 128-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1675	/// [4 x float], using the operation specified by the immediate integer
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1676	/// operand.
				1677	///
				1678	/// Returns a [4 x float] vector consisting of four floats corresponding to
				1679	/// the four comparison results: zero if the comparison is false, and all 1's
				1680	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1681	///
				1682	/// \headerfile <x86intrin.h>
				1683	///
				1684	/// \code
				1685	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1686	/// \endcode
				1687	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1688	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1689	///
				1690	/// \param a
				1691	/// A 128-bit vector of [4 x float].
				1692	/// \param b
				1693	/// A 128-bit vector of [4 x float].
				1694	/// \param c
				1695	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1696	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1697	/// 0x00: Equal (ordered, non-signaling) \n
				1698	/// 0x01: Less-than (ordered, signaling) \n
				1699	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1700	/// 0x03: Unordered (non-signaling) \n
				1701	/// 0x04: Not-equal (unordered, non-signaling) \n
				1702	/// 0x05: Not-less-than (unordered, signaling) \n
				1703	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1704	/// 0x07: Ordered (non-signaling) \n
				1705	/// 0x08: Equal (unordered, non-signaling) \n
				1706	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1707	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1708	/// 0x0B: False (ordered, non-signaling) \n
				1709	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1710	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1711	/// 0x0E: Greater-than (ordered, signaling) \n
				1712	/// 0x0F: True (unordered, non-signaling) \n
				1713	/// 0x10: Equal (ordered, signaling) \n
				1714	/// 0x11: Less-than (ordered, non-signaling) \n
				1715	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1716	/// 0x13: Unordered (signaling) \n
				1717	/// 0x14: Not-equal (unordered, signaling) \n
				1718	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1719	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1720	/// 0x17: Ordered (signaling) \n
				1721	/// 0x18: Equal (unordered, signaling) \n
				1722	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1723	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1724	/// 0x1B: False (ordered, signaling) \n
				1725	/// 0x1C: Not-equal (ordered, signaling) \n
				1726	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1727	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1728	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1729	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* The whole expansion is parenthesized so that a postfix operator written
   after the macro call (e.g. a subscript) applies to the cast result rather
   than to the builtin call inside it. */
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1733
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1734	/// Compares each of the corresponding double-precision values of two
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1735	/// 256-bit vectors of [4 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1736	/// immediate integer operand.
				1737	///
				1738	/// Returns a [4 x double] vector consisting of four doubles corresponding to
				1739	/// the four comparison results: zero if the comparison is false, and all 1's
				1740	/// if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1741	///
				1742	/// \headerfile <x86intrin.h>
				1743	///
				1744	/// \code
				1745	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1746	/// \endcode
				1747	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1748	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1749	///
				1750	/// \param a
				1751	/// A 256-bit vector of [4 x double].
				1752	/// \param b
				1753	/// A 256-bit vector of [4 x double].
				1754	/// \param c
				1755	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1756	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1757	/// 0x00: Equal (ordered, non-signaling) \n
				1758	/// 0x01: Less-than (ordered, signaling) \n
				1759	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1760	/// 0x03: Unordered (non-signaling) \n
				1761	/// 0x04: Not-equal (unordered, non-signaling) \n
				1762	/// 0x05: Not-less-than (unordered, signaling) \n
				1763	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1764	/// 0x07: Ordered (non-signaling) \n
				1765	/// 0x08: Equal (unordered, non-signaling) \n
				1766	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1767	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1768	/// 0x0B: False (ordered, non-signaling) \n
				1769	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1770	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1771	/// 0x0E: Greater-than (ordered, signaling) \n
				1772	/// 0x0F: True (unordered, non-signaling) \n
				1773	/// 0x10: Equal (ordered, signaling) \n
				1774	/// 0x11: Less-than (ordered, non-signaling) \n
				1775	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1776	/// 0x13: Unordered (signaling) \n
				1777	/// 0x14: Not-equal (unordered, signaling) \n
				1778	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1779	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1780	/// 0x17: Ordered (signaling) \n
				1781	/// 0x18: Equal (unordered, signaling) \n
				1782	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1783	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1784	/// 0x1B: False (ordered, signaling) \n
				1785	/// 0x1C: Not-equal (ordered, signaling) \n
				1786	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1787	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1788	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1789	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
/* The whole expansion is parenthesized so that a postfix operator written
   after the macro call (e.g. a subscript) applies to the cast result rather
   than to the builtin call inside it. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1793
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1794	/// Compares each of the corresponding values of two 256-bit vectors of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1795	/// [8 x float], using the operation specified by the immediate integer
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1796	/// operand.
				1797	///
				1798	/// Returns an [8 x float] vector consisting of eight floats corresponding to
				1799	/// the eight comparison results: zero if the comparison is false, and all
				1800	/// 1's if the comparison is true.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1801	///
				1802	/// \headerfile <x86intrin.h>
				1803	///
				1804	/// \code
				1805	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1806	/// \endcode
				1807	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1808	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1809	///
				1810	/// \param a
				1811	/// A 256-bit vector of [8 x float].
				1812	/// \param b
				1813	/// A 256-bit vector of [8 x float].
				1814	/// \param c
				1815	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1816	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1817	/// 0x00: Equal (ordered, non-signaling) \n
				1818	/// 0x01: Less-than (ordered, signaling) \n
				1819	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1820	/// 0x03: Unordered (non-signaling) \n
				1821	/// 0x04: Not-equal (unordered, non-signaling) \n
				1822	/// 0x05: Not-less-than (unordered, signaling) \n
				1823	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1824	/// 0x07: Ordered (non-signaling) \n
				1825	/// 0x08: Equal (unordered, non-signaling) \n
				1826	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1827	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1828	/// 0x0B: False (ordered, non-signaling) \n
				1829	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1830	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1831	/// 0x0E: Greater-than (ordered, signaling) \n
				1832	/// 0x0F: True (unordered, non-signaling) \n
				1833	/// 0x10: Equal (ordered, signaling) \n
				1834	/// 0x11: Less-than (ordered, non-signaling) \n
				1835	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1836	/// 0x13: Unordered (signaling) \n
				1837	/// 0x14: Not-equal (unordered, signaling) \n
				1838	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1839	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1840	/// 0x17: Ordered (signaling) \n
				1841	/// 0x18: Equal (unordered, signaling) \n
				1842	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1843	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1844	/// 0x1B: False (ordered, signaling) \n
				1845	/// 0x1C: Not-equal (ordered, signaling) \n
				1846	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1847	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1848	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1849	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
/* The whole expansion is parenthesized so that a postfix operator written
   after the macro call (e.g. a subscript) applies to the cast result rather
   than to the builtin call inside it. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1853
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1854	/// Compares each of the corresponding scalar double-precision values of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1855	/// two 128-bit vectors of [2 x double], using the operation specified by the
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1856	/// immediate integer operand.
				1857	///
				1858	/// If the result is true, all 64 bits of the destination vector are set;
				1859	/// otherwise they are cleared.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1860	///
				1861	/// \headerfile <x86intrin.h>
				1862	///
				1863	/// \code
				1864	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1865	/// \endcode
				1866	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1867	/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1868	///
				1869	/// \param a
				1870	/// A 128-bit vector of [2 x double].
				1871	/// \param b
				1872	/// A 128-bit vector of [2 x double].
				1873	/// \param c
				1874	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1875	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1876	/// 0x00: Equal (ordered, non-signaling) \n
				1877	/// 0x01: Less-than (ordered, signaling) \n
				1878	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1879	/// 0x03: Unordered (non-signaling) \n
				1880	/// 0x04: Not-equal (unordered, non-signaling) \n
				1881	/// 0x05: Not-less-than (unordered, signaling) \n
				1882	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1883	/// 0x07: Ordered (non-signaling) \n
				1884	/// 0x08: Equal (unordered, non-signaling) \n
				1885	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1886	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1887	/// 0x0B: False (ordered, non-signaling) \n
				1888	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1889	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1890	/// 0x0E: Greater-than (ordered, signaling) \n
				1891	/// 0x0F: True (unordered, non-signaling) \n
				1892	/// 0x10: Equal (ordered, signaling) \n
				1893	/// 0x11: Less-than (ordered, non-signaling) \n
				1894	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1895	/// 0x13: Unordered (signaling) \n
				1896	/// 0x14: Not-equal (unordered, signaling) \n
				1897	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1898	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1899	/// 0x17: Ordered (signaling) \n
				1900	/// 0x18: Equal (unordered, signaling) \n
				1901	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1902	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1903	/// 0x1B: False (ordered, signaling) \n
				1904	/// 0x1C: Not-equal (ordered, signaling) \n
				1905	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1906	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1907	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1908	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/* The whole expansion is parenthesized so that the (__m128d) cast stays bound
 * to the builtin call when the macro is followed by a postfix operator or
 * embedded in a larger expression. */
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1912
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	1913	/// Compares each of the corresponding scalar values of two 128-bit
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1914	/// vectors of [4 x float], using the operation specified by the immediate
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	1915	/// integer operand.
				1916	///
				1917	/// If the result is true, all 32 bits of the destination vector are set;
				1918	/// otherwise they are cleared.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1919	///
				1920	/// \headerfile <x86intrin.h>
				1921	///
				1922	/// \code
				1923	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1924	/// \endcode
				1925	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	1926	/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1927	///
				1928	/// \param a
				1929	/// A 128-bit vector of [4 x float].
				1930	/// \param b
				1931	/// A 128-bit vector of [4 x float].
				1932	/// \param c
				1933	/// An immediate integer operand, with bits [4:0] specifying which comparison
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	1934	/// operation to use: \n
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	1935	/// 0x00: Equal (ordered, non-signaling) \n
				1936	/// 0x01: Less-than (ordered, signaling) \n
				1937	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1938	/// 0x03: Unordered (non-signaling) \n
				1939	/// 0x04: Not-equal (unordered, non-signaling) \n
				1940	/// 0x05: Not-less-than (unordered, signaling) \n
				1941	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1942	/// 0x07: Ordered (non-signaling) \n
				1943	/// 0x08: Equal (unordered, non-signaling) \n
				1944	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1945	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1946	/// 0x0B: False (ordered, non-signaling) \n
				1947	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1948	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1949	/// 0x0E: Greater-than (ordered, signaling) \n
				1950	/// 0x0F: True (unordered, non-signaling) \n
				1951	/// 0x10: Equal (ordered, signaling) \n
				1952	/// 0x11: Less-than (ordered, non-signaling) \n
				1953	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1954	/// 0x13: Unordered (signaling) \n
				1955	/// 0x14: Not-equal (unordered, signaling) \n
				1956	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1957	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1958	/// 0x17: Ordered (signaling) \n
				1959	/// 0x18: Equal (unordered, signaling) \n
				1960	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1961	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1962	/// 0x1B: False (ordered, signaling) \n
				1963	/// 0x1C: Not-equal (ordered, signaling) \n
				1964	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1965	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1966	/// 0x1F: True (unordered, signaling)
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1967	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* The whole expansion is parenthesized so that the (__m128) cast stays bound
 * to the builtin call when the macro is followed by a postfix operator or
 * embedded in a larger expression. */
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1971
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1989
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2008
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2027
				2028	#ifdef __x86_64__
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2029	/// Takes a [4 x i64] vector and returns the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2030	/// indexed by the immediate constant operand.
				2031	///
				2032	/// \headerfile <x86intrin.h>
				2033	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2034	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2035	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2036	///
				2037	/// \param __a
				2038	/// A 256-bit integer vector of [4 x i64].
				2039	/// \param __imm
				2040	/// An immediate integer operand with bits [1:0] determining which vector
				2041	/// element is extracted and returned.
				2042	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				2043	/// packed data.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2044	#define _mm256_extract_epi64(X, N) \
				2045	(long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2046	#endif
				2047
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2069
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2070
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2092
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2114
				2115	#ifdef __x86_64__
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2116	/// Takes a [4 x i64] vector and replaces the vector element value
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2117	/// indexed by the immediate constant operand with a new value. Returns the
				2118	/// modified vector.
				2119	///
				2120	/// \headerfile <x86intrin.h>
				2121	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2122	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2123	/// instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2124	///
				2125	/// \param __a
				2126	/// A vector of [4 x i64] to be used by the insert operation.
				2127	/// \param __b
				2128	/// A 64-bit integer value. The replacement value for the insert operation.
				2129	/// \param __imm
				2130	/// An immediate integer specifying the index of the vector element to be
				2131	/// replaced.
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2132	/// \returns A copy of vector \a __a, after replacing its element indexed by
				2133	/// \a __imm with \a __b.
Craig Topper	f3914b7	2018-06-06 00:24:55 +0000	[diff] [blame]	2134	#define _mm256_insert_epi64(X, I, N) \
				2135	(__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
				2136	(long long)(I), (int)(N))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2137	#endif
				2138
				2139	/* Conversion */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2140	/// Converts a vector of [4 x i32] into a vector of [4 x double].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2141	///
				2142	/// \headerfile <x86intrin.h>
				2143	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2144	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2145	///
				2146	/// \param __a
				2147	/// A 128-bit integer vector of [4 x i32].
				2148	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2149	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2150	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2151	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2152	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2153	}
				2154
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2155	/// Converts a vector of [8 x i32] into a vector of [8 x float].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2156	///
				2157	/// \headerfile <x86intrin.h>
				2158	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2159	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2160	///
				2161	/// \param __a
				2162	/// A 256-bit integer vector.
				2163	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2164	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2165	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2166	{
Craig Topper	842171d	2018-05-21 20:19:17 +0000	[diff] [blame]	2167	return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2168	}
				2169
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2170	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2171	/// [4 x float].
				2172	///
				2173	/// \headerfile <x86intrin.h>
				2174	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2175	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2176	///
				2177	/// \param __a
				2178	/// A 256-bit vector of [4 x double].
				2179	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2180	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2181	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2182	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2183	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2184	}
				2185
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2186	/// Converts a vector of [8 x float] into a vector of [8 x i32].
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2187	///
				2188	/// \headerfile <x86intrin.h>
				2189	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2190	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2191	///
				2192	/// \param __a
				2193	/// A 256-bit vector of [8 x float].
				2194	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2195	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2196	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2197	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2198	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2199	}
				2200
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2201	/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2202	/// x double].
				2203	///
				2204	/// \headerfile <x86intrin.h>
				2205	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2206	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2207	///
				2208	/// \param __a
				2209	/// A 128-bit vector of [4 x float].
				2210	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2211	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2212	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2213	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2214	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2215	}
				2216
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2217	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2218	/// x i32], truncating the result by rounding towards zero when it is
				2219	/// inexact.
				2220	///
				2221	/// \headerfile <x86intrin.h>
				2222	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2223	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2224	///
				2225	/// \param __a
				2226	/// A 256-bit vector of [4 x double].
				2227	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2228	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2229	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2230	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2231	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2232	}
				2233
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2234	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2235	/// x i32]. When a conversion is inexact, the value returned is rounded
				2236	/// according to the rounding control bits in the MXCSR register.
				2237	///
				2238	/// \headerfile <x86intrin.h>
				2239	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2240	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2241	///
				2242	/// \param __a
				2243	/// A 256-bit vector of [4 x double].
				2244	/// \returns A 128-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2245	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2246	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2247	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2248	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2249	}
				2250
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2251	/// Converts a vector of [8 x float] into a vector of [8 x i32],
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2252	/// truncating the result by rounding towards zero when it is inexact.
				2253	///
				2254	/// \headerfile <x86intrin.h>
				2255	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2256	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2257	///
				2258	/// \param __a
				2259	/// A 256-bit vector of [8 x float].
				2260	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2261	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2262	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2263	{
Simon Pilgrim	e3b9ee0	2016-07-20 10:18:01 +0000	[diff] [blame]	2264	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2265	}
				2266
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2267	/// Returns the first element of the input vector of [4 x double].
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2268	///
				2269	/// \headerfile <avxintrin.h>
				2270	///
				2271	/// This intrinsic is a utility function and does not correspond to a specific
				2272	/// instruction.
				2273	///
				2274	/// \param __a
				2275	/// A 256-bit vector of [4 x double].
				2276	/// \returns A 64 bit double containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2277	static __inline double __DEFAULT_FN_ATTRS
				2278	_mm256_cvtsd_f64(__m256d __a)
				2279	{
				2280	return __a[0];
				2281	}
				2282
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2283	/// Returns the first element of the input vector of [8 x i32].
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2284	///
				2285	/// \headerfile <avxintrin.h>
				2286	///
				2287	/// This intrinsic is a utility function and does not correspond to a specific
				2288	/// instruction.
				2289	///
				2290	/// \param __a
				2291	/// A 256-bit vector of [8 x i32].
				2292	/// \returns A 32 bit integer containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2293	static __inline int __DEFAULT_FN_ATTRS
				2294	_mm256_cvtsi256_si32(__m256i __a)
				2295	{
				2296	__v8si __b = (__v8si)__a;
				2297	return __b[0];
				2298	}
				2299
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2300	/// Returns the first element of the input vector of [8 x float].
Ekaterina Romanova	2e041c9	2017-01-13 01:14:08 +0000	[diff] [blame]	2301	///
				2302	/// \headerfile <avxintrin.h>
				2303	///
				2304	/// This intrinsic is a utility function and does not correspond to a specific
				2305	/// instruction.
				2306	///
				2307	/// \param __a
				2308	/// A 256-bit vector of [8 x float].
				2309	/// \returns A 32 bit float containing the first element of the input vector.
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2310	static __inline float __DEFAULT_FN_ATTRS
				2311	_mm256_cvtss_f32(__m256 __a)
				2312	{
				2313	return __a[0];
				2314	}
				2315
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2316	/* Vector replicate */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2317	/// Moves and duplicates odd-indexed values from a 256-bit vector of
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	2318	/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2319	///
				2320	/// \headerfile <x86intrin.h>
				2321	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2322	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2323	///
				2324	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2325	/// A 256-bit vector of [8 x float]. \n
				2326	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
				2327	/// the return value. \n
				2328	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
				2329	/// the return value. \n
				2330	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
				2331	/// return value. \n
				2332	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
				2333	/// return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2334	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2335	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2336	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2337	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2338	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2339	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2340	}
				2341
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2342	/// Moves and duplicates even-indexed values from a 256-bit vector of
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	2343	/// [8 x float] to float values in a 256-bit vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2344	///
				2345	/// \headerfile <x86intrin.h>
				2346	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2347	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2348	///
				2349	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2350	/// A 256-bit vector of [8 x float]. \n
				2351	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
				2352	/// the return value. \n
				2353	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
				2354	/// the return value. \n
				2355	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
				2356	/// return value. \n
				2357	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
				2358	/// return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2359	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2360	/// values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2361	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2362	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2363	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2364	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2365	}
				2366
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2367	/// Moves and duplicates double-precision floating point values from a
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2368	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
				2369	/// vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2370	///
				2371	/// \headerfile <x86intrin.h>
				2372	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2373	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2374	///
				2375	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2376	/// A 256-bit vector of [4 x double]. \n
				2377	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
				2378	/// return value. \n
				2379	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
				2380	/// the return value.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2381	/// \returns A 256-bit vector of [4 x double] containing the moved and
				2382	/// duplicated values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2383	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2384	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2385	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2386	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2387	}
				2388
				2389	/* Unpack and Interleave */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2390	/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2391	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2392	///
				2393	/// \headerfile <x86intrin.h>
				2394	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2395	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2396	///
				2397	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2398	/// A 256-bit floating-point vector of [4 x double]. \n
				2399	/// Bits [127:64] are written to bits [63:0] of the return value. \n
				2400	/// Bits [255:192] are written to bits [191:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2401	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2402	/// A 256-bit floating-point vector of [4 x double]. \n
				2403	/// Bits [127:64] are written to bits [127:64] of the return value. \n
				2404	/// Bits [255:192] are written to bits [255:192] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2405	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2406	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2407	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2408	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2409	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2410	}
				2411
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2412	/// Unpacks the even-indexed vector elements from two 256-bit vectors of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2413	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2414	///
				2415	/// \headerfile <x86intrin.h>
				2416	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2417	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2418	///
				2419	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2420	/// A 256-bit floating-point vector of [4 x double]. \n
				2421	/// Bits [63:0] are written to bits [63:0] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2422	/// Bits [191:128] are written to bits [191:128] of the return value.
				2423	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2424	/// A 256-bit floating-point vector of [4 x double]. \n
				2425	/// Bits [63:0] are written to bits [127:64] of the return value. \n
				2426	/// Bits [191:128] are written to bits [255:192] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2427	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2428	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2429	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2430	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2431	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2432	}
				2433
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2434	/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2435	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2436	/// vector of [8 x float].
				2437	///
				2438	/// \headerfile <x86intrin.h>
				2439	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2440	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2441	///
				2442	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2443	/// A 256-bit vector of [8 x float]. \n
				2444	/// Bits [95:64] are written to bits [31:0] of the return value. \n
				2445	/// Bits [127:96] are written to bits [95:64] of the return value. \n
				2446	/// Bits [223:192] are written to bits [159:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2447	/// Bits [255:224] are written to bits [223:192] of the return value.
				2448	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2449	/// A 256-bit vector of [8 x float]. \n
				2450	/// Bits [95:64] are written to bits [63:32] of the return value. \n
				2451	/// Bits [127:96] are written to bits [127:96] of the return value. \n
				2452	/// Bits [223:192] are written to bits [191:160] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2453	/// Bits [255:224] are written to bits [255:224] of the return value.
				2454	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2455	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2456	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2457	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2458	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2459	}
				2460
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2461	/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2462	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2463	/// vector of [8 x float].
				2464	///
				2465	/// \headerfile <x86intrin.h>
				2466	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2467	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2468	///
				2469	/// \param __a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2470	/// A 256-bit vector of [8 x float]. \n
				2471	/// Bits [31:0] are written to bits [31:0] of the return value. \n
				2472	/// Bits [63:32] are written to bits [95:64] of the return value. \n
				2473	/// Bits [159:128] are written to bits [159:128] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2474	/// Bits [191:160] are written to bits [223:192] of the return value.
				2475	/// \param __b
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2476	/// A 256-bit vector of [8 x float]. \n
				2477	/// Bits [31:0] are written to bits [63:32] of the return value. \n
				2478	/// Bits [63:32] are written to bits [127:96] of the return value. \n
				2479	/// Bits [159:128] are written to bits [191:160] of the return value. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2480	/// Bits [191:160] are written to bits [255:224] of the return value.
				2481	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2482	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2483	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2484	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2485	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2486	}
				2487
				2488	/* Bit Test */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2489	/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2490	/// element-by-element comparison of the double-precision element in the
				2491	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2492	/// vector.
				2493	///
				2494	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2495	/// If there is at least one pair of double-precision elements where the
				2496	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2497	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2498	/// If there is at least one pair of double-precision elements where the
				2499	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2500	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2501	/// This intrinsic returns the value of the ZF flag.
				2502	///
				2503	/// \headerfile <x86intrin.h>
				2504	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2505	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2506	///
				2507	/// \param __a
				2508	/// A 128-bit vector of [2 x double].
				2509	/// \param __b
				2510	/// A 128-bit vector of [2 x double].
				2511	/// \returns the ZF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2512	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2513	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2514	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2515	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2516	}
				2517
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2518	/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2519	/// element-by-element comparison of the double-precision element in the
				2520	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2521	/// vector.
				2522	///
				2523	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2524	/// If there is at least one pair of double-precision elements where the
				2525	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2526	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2527	/// If there is at least one pair of double-precision elements where the
				2528	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2529	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2530	/// This intrinsic returns the value of the CF flag.
				2531	///
				2532	/// \headerfile <x86intrin.h>
				2533	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2534	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2535	///
				2536	/// \param __a
				2537	/// A 128-bit vector of [2 x double].
				2538	/// \param __b
				2539	/// A 128-bit vector of [2 x double].
				2540	/// \returns the CF flag in the EFLAGS register.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2541	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2542	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2543	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2544	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2545	}
				2546
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2547	/// Given two 128-bit floating-point vectors of [2 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2548	/// element-by-element comparison of the double-precision element in the
				2549	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2550	/// vector.
				2551	///
				2552	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2553	/// If there is at least one pair of double-precision elements where the
				2554	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2555	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2556	/// If there is at least one pair of double-precision elements where the
				2557	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2558	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2559	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2560	/// otherwise it returns 0.
				2561	///
				2562	/// \headerfile <x86intrin.h>
				2563	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2564	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2565	///
				2566	/// \param __a
				2567	/// A 128-bit vector of [2 x double].
				2568	/// \param __b
				2569	/// A 128-bit vector of [2 x double].
				2570	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2571	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2572	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2573	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2574	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2575	}
				2576
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2577	/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2578	/// element-by-element comparison of the single-precision element in the
				2579	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2580	/// vector.
				2581	///
				2582	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2583	/// If there is at least one pair of single-precision elements where the
				2584	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2585	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2586	/// If there is at least one pair of single-precision elements where the
				2587	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2588	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2589	/// This intrinsic returns the value of the ZF flag.
				2590	///
				2591	/// \headerfile <x86intrin.h>
				2592	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2593	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2594	///
				2595	/// \param __a
				2596	/// A 128-bit vector of [4 x float].
				2597	/// \param __b
				2598	/// A 128-bit vector of [4 x float].
				2599	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2600	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2601	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2602	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2603	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2604	}
				2605
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2606	/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2607	/// element-by-element comparison of the single-precision element in the
				2608	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2609	/// vector.
				2610	///
				2611	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2612	/// If there is at least one pair of single-precision elements where the
				2613	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2614	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2615	/// If there is at least one pair of single-precision elements where the
				2616	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2617	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2618	/// This intrinsic returns the value of the CF flag.
				2619	///
				2620	/// \headerfile <x86intrin.h>
				2621	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2622	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2623	///
				2624	/// \param __a
				2625	/// A 128-bit vector of [4 x float].
				2626	/// \param __b
				2627	/// A 128-bit vector of [4 x float].
				2628	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2629	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2630	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2631	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2632	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2633	}
				2634
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2635	/// Given two 128-bit floating-point vectors of [4 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2636	/// element-by-element comparison of the single-precision element in the
				2637	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2638	/// vector.
				2639	///
				2640	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2641	/// If there is at least one pair of single-precision elements where the
				2642	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2643	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2644	/// If there is at least one pair of single-precision elements where the
				2645	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2646	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2647	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2648	/// otherwise it returns 0.
				2649	///
				2650	/// \headerfile <x86intrin.h>
				2651	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2652	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2653	///
				2654	/// \param __a
				2655	/// A 128-bit vector of [4 x float].
				2656	/// \param __b
				2657	/// A 128-bit vector of [4 x float].
				2658	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2659	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2660	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2661	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2662	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2663	}
				2664
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2665	/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2666	/// element-by-element comparison of the double-precision elements in the
				2667	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2668	/// vector.
				2669	///
				2670	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2671	/// If there is at least one pair of double-precision elements where the
				2672	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2673	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2674	/// If there is at least one pair of double-precision elements where the
				2675	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2676	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2677	/// This intrinsic returns the value of the ZF flag.
				2678	///
				2679	/// \headerfile <x86intrin.h>
				2680	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2681	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2682	///
				2683	/// \param __a
				2684	/// A 256-bit vector of [4 x double].
				2685	/// \param __b
				2686	/// A 256-bit vector of [4 x double].
				2687	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2688	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2689	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2690	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2691	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2692	}
				2693
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2694	/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2695	/// element-by-element comparison of the double-precision elements in the
				2696	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2697	/// vector.
				2698	///
				2699	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2700	/// If there is at least one pair of double-precision elements where the
				2701	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2702	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2703	/// If there is at least one pair of double-precision elements where the
				2704	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2705	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2706	/// This intrinsic returns the value of the CF flag.
				2707	///
				2708	/// \headerfile <x86intrin.h>
				2709	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2710	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2711	///
				2712	/// \param __a
				2713	/// A 256-bit vector of [4 x double].
				2714	/// \param __b
				2715	/// A 256-bit vector of [4 x double].
				2716	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2717	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2718	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2719	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2720	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2721	}
				2722
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2723	/// Given two 256-bit floating-point vectors of [4 x double], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2724	/// element-by-element comparison of the double-precision elements in the
				2725	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2726	/// vector.
				2727	///
				2728	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2729	/// If there is at least one pair of double-precision elements where the
				2730	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2731	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2732	/// If there is at least one pair of double-precision elements where the
				2733	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2734	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2735	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2736	/// otherwise it returns 0.
				2737	///
				2738	/// \headerfile <x86intrin.h>
				2739	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2740	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2741	///
				2742	/// \param __a
				2743	/// A 256-bit vector of [4 x double].
				2744	/// \param __b
				2745	/// A 256-bit vector of [4 x double].
				2746	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2747	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2748	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2749	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2750	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2751	}
				2752
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2753	/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2754	/// element-by-element comparison of the single-precision element in the
				2755	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2756	/// vector.
				2757	///
				2758	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2759	/// If there is at least one pair of single-precision elements where the
				2760	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2761	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2762	/// If there is at least one pair of single-precision elements where the
				2763	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2764	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2765	/// This intrinsic returns the value of the ZF flag.
				2766	///
				2767	/// \headerfile <x86intrin.h>
				2768	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2769	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2770	///
				2771	/// \param __a
				2772	/// A 256-bit vector of [8 x float].
				2773	/// \param __b
				2774	/// A 256-bit vector of [8 x float].
				2775	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2776	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2777	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2778	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2779	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2780	}
				2781
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2782	/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2783	/// element-by-element comparison of the single-precision element in the
				2784	/// first source vector and the corresponding element in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2785	/// vector.
				2786	///
				2787	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2788	/// If there is at least one pair of single-precision elements where the
				2789	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2790	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2791	/// If there is at least one pair of single-precision elements where the
				2792	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2793	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2794	/// This intrinsic returns the value of the CF flag.
				2795	///
				2796	/// \headerfile <x86intrin.h>
				2797	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2798	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2799	///
				2800	/// \param __a
				2801	/// A 256-bit vector of [8 x float].
				2802	/// \param __b
				2803	/// A 256-bit vector of [8 x float].
				2804	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2805	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2806	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2807	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2808	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2809	}
				2810
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2811	/// Given two 256-bit floating-point vectors of [8 x float], perform an
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2812	/// element-by-element comparison of the single-precision elements in the
				2813	/// first source vector and the corresponding elements in the second source
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2814	/// vector.
				2815	///
				2816	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2817	/// If there is at least one pair of single-precision elements where the
				2818	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2819	/// ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2820	/// If there is at least one pair of single-precision elements where the
				2821	/// sign-bit of the first element is 0 and the sign-bit of the second element
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2822	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2823	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2824	/// otherwise it returns 0.
				2825	///
				2826	/// \headerfile <x86intrin.h>
				2827	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2828	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2829	///
				2830	/// \param __a
				2831	/// A 256-bit vector of [8 x float].
				2832	/// \param __b
				2833	/// A 256-bit vector of [8 x float].
				2834	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2835	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2836	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2837	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2838	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2839	}
				2840
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2841	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2842	/// of the two source vectors.
				2843	///
				2844	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2845	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2846	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2847	/// If there is at least one pair of bits where the bit from the first source
				2848	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2849	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2850	/// This intrinsic returns the value of the ZF flag.
				2851	///
				2852	/// \headerfile <x86intrin.h>
				2853	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2854	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2855	///
				2856	/// \param __a
				2857	/// A 256-bit integer vector.
				2858	/// \param __b
				2859	/// A 256-bit integer vector.
				2860	/// \returns the ZF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2861	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2862	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2863	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2864	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2865	}
				2866
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2867	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2868	/// of the two source vectors.
				2869	///
				2870	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2871	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2872	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2873	/// If there is at least one pair of bits where the bit from the first source
				2874	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2875	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2876	/// This intrinsic returns the value of the CF flag.
				2877	///
				2878	/// \headerfile <x86intrin.h>
				2879	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2880	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2881	///
				2882	/// \param __a
				2883	/// A 256-bit integer vector.
				2884	/// \param __b
				2885	/// A 256-bit integer vector.
				2886	/// \returns the CF flag.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2887	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2888	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2889	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2890	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2891	}
				2892
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2893	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	2894	/// of the two source vectors.
				2895	///
				2896	/// The EFLAGS register is updated as follows: \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2897	/// If there is at least one pair of bits where both bits are 1, the ZF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2898	/// is set to 0. Otherwise the ZF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2899	/// If there is at least one pair of bits where the bit from the first source
				2900	/// vector is 0 and the bit from the second source vector is 1, the CF flag
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	2901	/// is set to 0. Otherwise the CF flag is set to 1. \n
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2902	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2903	/// otherwise it returns 0.
				2904	///
				2905	/// \headerfile <x86intrin.h>
				2906	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2907	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2908	///
				2909	/// \param __a
				2910	/// A 256-bit integer vector.
				2911	/// \param __b
				2912	/// A 256-bit integer vector.
				2913	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2914	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2915	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2916	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2917	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2918	}
				2919
				2920	/* Vector extract sign mask */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2921	/// Extracts the sign bits of double-precision floating point elements
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2922	/// in a 256-bit vector of [4 x double] and writes them to the lower order
				2923	/// bits of the return value.
				2924	///
				2925	/// \headerfile <x86intrin.h>
				2926	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2927	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2928	///
				2929	/// \param __a
				2930	/// A 256-bit vector of [4 x double] containing the double-precision
				2931	/// floating point values with sign bits to be extracted.
				2932	/// \returns The sign bits from the operand, written to bits [3:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2933	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2934	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2935	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2936	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2937	}
				2938
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2939	/// Extracts the sign bits of single-precision floating point elements
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2940	/// in a 256-bit vector of [8 x float] and writes them to the lower order
				2941	/// bits of the return value.
				2942	///
				2943	/// \headerfile <x86intrin.h>
				2944	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2945	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2946	///
				2947	/// \param __a
Douglas Yung	7ff9142	2018-01-08 21:21:17 +0000	[diff] [blame]	2948	/// A 256-bit vector of [8 x float] containing the single-precision floating
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2949	/// point values with sign bits to be extracted.
				2950	/// \returns The sign bits from the operand, written to bits [7:0].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2951	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2952	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2953	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2954	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2955	}
				2956
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2957	/* Vector __zero */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2958	/// Zeroes the contents of all XMM or YMM registers.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2959	///
				2960	/// \headerfile <x86intrin.h>
				2961	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2962	/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2963	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2964	_mm256_zeroall(void)
				2965	{
				2966	__builtin_ia32_vzeroall();
				2967	}
				2968
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2969	/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2970	///
				2971	/// \headerfile <x86intrin.h>
				2972	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2973	/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2974	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2975	_mm256_zeroupper(void)
				2976	{
				2977	__builtin_ia32_vzeroupper();
				2978	}
				2979
				2980	/* Vector load with broadcast */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	2981	/// Loads a scalar single-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	2982	/// specified address pointed to by \a __a and broadcasts it to the elements
				2983	/// of a [4 x float] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2984	///
				2985	/// \headerfile <x86intrin.h>
				2986	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	2987	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	2988	///
				2989	/// \param __a
				2990	/// The single-precision floating point value to be broadcast.
				2991	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
				2992	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2993	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2994	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2995	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2996	float __f = *__a;
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	2997	return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2998	}
				2999
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3000	/// Loads a scalar double-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3001	/// specified address pointed to by \a __a and broadcasts it to the elements
				3002	/// of a [4 x double] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3003	///
				3004	/// \headerfile <x86intrin.h>
				3005	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3006	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3007	///
				3008	/// \param __a
				3009	/// The double-precision floating point value to be broadcast.
				3010	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
				3011	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3012	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3013	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3014	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3015	double __d = *__a;
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3016	return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3017	}
				3018
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3019	/// Loads a scalar single-precision floating point value from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3020	/// specified address pointed to by \a __a and broadcasts it to the elements
				3021	/// of a [8 x float] vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3022	///
				3023	/// \headerfile <x86intrin.h>
				3024	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3025	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3026	///
				3027	/// \param __a
				3028	/// The single-precision floating point value to be broadcast.
				3029	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
				3030	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3031	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3032	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3033	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	3034	float __f = *__a;
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3035	return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3036	}
				3037
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3038	/// Loads the data from a 128-bit vector of [2 x double] from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3039	/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3040	/// elements in a 256-bit vector of [4 x double].
				3041	///
				3042	/// \headerfile <x86intrin.h>
				3043	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3044	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3045	///
				3046	/// \param __a
				3047	/// The 128-bit vector of [2 x double] to be broadcast.
				3048	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
				3049	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3050	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3051	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3052	{
Craig Topper	6fb26f9	2018-06-03 19:42:59 +0000	[diff] [blame]	3053	__m128d __b = _mm_loadu_pd((const double *)__a);
				3054	return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
				3055	0, 1, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3056	}
				3057
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3058	/// Loads the data from a 128-bit vector of [4 x float] from the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3059	/// specified address pointed to by \a __a and broadcasts it to 128-bit
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3060	/// elements in a 256-bit vector of [8 x float].
				3061	///
				3062	/// \headerfile <x86intrin.h>
				3063	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3064	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3065	///
				3066	/// \param __a
				3067	/// The 128-bit vector of [4 x float] to be broadcast.
				3068	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
				3069	/// equal to the broadcast value.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3070	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3071	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3072	{
Craig Topper	6fb26f9	2018-06-03 19:42:59 +0000	[diff] [blame]	3073	__m128 __b = _mm_loadu_ps((const float *)__a);
				3074	return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
				3075	0, 1, 2, 3, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3076	}
				3077
				3078	/* SIMD load ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3079	/// Loads 4 double-precision floating point values from a 32-byte aligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3080	/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3081	///
				3082	/// \headerfile <x86intrin.h>
				3083	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3084	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3085	///
				3086	/// \param __p
				3087	/// A 32-byte aligned pointer to a memory location containing
				3088	/// double-precision floating point values.
				3089	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3090	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3091	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3092	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3093	return (__m256d )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3094	}
				3095
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3096	/// Loads 8 single-precision floating point values from a 32-byte aligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3097	/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3098	///
				3099	/// \headerfile <x86intrin.h>
				3100	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3101	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3102	///
				3103	/// \param __p
				3104	/// A 32-byte aligned pointer to a memory location containing float values.
				3105	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3106	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3107	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3108	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3109	return (__m256 )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3110	}
				3111
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3112	/// Loads 4 double-precision floating point values from an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3113	/// memory location pointed to by \a __p into a vector of [4 x double].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3114	///
				3115	/// \headerfile <x86intrin.h>
				3116	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3117	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3118	///
				3119	/// \param __p
				3120	/// A pointer to a memory location containing double-precision floating
				3121	/// point values.
				3122	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3123	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3124	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3125	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3126	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3127	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3128	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3129	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3130	}
				3131
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3132	/// Loads 8 single-precision floating point values from an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3133	/// memory location pointed to by \a __p into a vector of [8 x float].
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3134	///
				3135	/// \headerfile <x86intrin.h>
				3136	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3137	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3138	///
				3139	/// \param __p
				3140	/// A pointer to a memory location containing single-precision floating
				3141	/// point values.
				3142	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3143	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3144	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3145	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3146	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3147	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3148	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3149	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3150	}
				3151
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3152	/// Loads 256 bits of integer data from a 32-byte aligned memory
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3153	/// location pointed to by \a __p into elements of a 256-bit integer vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3154	///
				3155	/// \headerfile <x86intrin.h>
				3156	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3157	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3158	///
				3159	/// \param __p
				3160	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
				3161	/// values.
				3162	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3163	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3164	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3165	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3166	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3167	}
				3168
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3169	/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3170	/// pointed to by \a __p into a 256-bit integer vector.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3171	///
				3172	/// \headerfile <x86intrin.h>
				3173	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3174	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3175	///
				3176	/// \param __p
				3177	/// A pointer to a 256-bit integer vector containing integer values.
				3178	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3179	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3180	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3181	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	3182	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3183	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	3184	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3185	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3186	}
				3187
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3188	/// Loads 256 bits of integer data from an unaligned memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3189	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
				3190	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3191	/// line boundary.
				3192	///
				3193	/// \headerfile <x86intrin.h>
				3194	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3195	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3196	///
				3197	/// \param __p
				3198	/// A pointer to a 256-bit integer vector containing integer values.
				3199	/// \returns A 256-bit integer vector containing the moved values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3200	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3201	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3202	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3203	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3204	}
				3205
				3206	/* SIMD store ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3207	/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3208	/// of [4 x double] to a 32-byte aligned memory location pointed to by
				3209	/// \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3210	///
				3211	/// \headerfile <x86intrin.h>
				3212	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3213	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3214	///
				3215	/// \param __p
				3216	/// A 32-byte aligned pointer to a memory location that will receive the
				3217	/// double-precision floating point values.
				3218	/// \param __a
				3219	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3220	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3221	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3222	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3223	(__m256d )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3224	}
				3225
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3226	/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3227	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3228	///
				3229	/// \headerfile <x86intrin.h>
				3230	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3231	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3232	///
				3233	/// \param __p
				3234	/// A 32-byte aligned pointer to a memory location that will receive the
				3235	/// float values.
				3236	/// \param __a
				3237	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3238	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3239	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3240	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3241	(__m256 )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3242	}
				3243
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3244	/// Stores double-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3245	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3246	///
				3247	/// \headerfile <x86intrin.h>
				3248	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3249	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3250	///
				3251	/// \param __p
				3252	/// A pointer to a memory location that will receive the double-precision
				3253	/// floating point values.
				3254	/// \param __a
				3255	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3256	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3257	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3258	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3259	struct __storeu_pd {
				3260	__m256d __v;
				3261	} __attribute__((__packed__, __may_alias__));
				3262	((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3263	}
				3264
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3265	/// Stores single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3266	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3267	///
				3268	/// \headerfile <x86intrin.h>
				3269	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3270	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3271	///
				3272	/// \param __p
				3273	/// A pointer to a memory location that will receive the float values.
				3274	/// \param __a
				3275	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3276	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3277	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3278	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3279	struct __storeu_ps {
				3280	__m256 __v;
				3281	} __attribute__((__packed__, __may_alias__));
				3282	((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3283	}
				3284
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3285	/// Stores integer values from a 256-bit integer vector to a 32-byte
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3286	/// aligned memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3287	///
				3288	/// \headerfile <x86intrin.h>
				3289	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3290	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3291	///
				3292	/// \param __p
				3293	/// A 32-byte aligned pointer to a memory location that will receive the
				3294	/// integer values.
				3295	/// \param __a
				3296	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3297	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3298	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3299	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3300	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3301	}
				3302
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3303	/// Stores integer values from a 256-bit integer vector to an unaligned
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3304	/// memory location pointed to by \a __p.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3305	///
				3306	/// \headerfile <x86intrin.h>
				3307	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3308	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3309	///
				3310	/// \param __p
				3311	/// A pointer to a memory location that will receive the integer values.
				3312	/// \param __a
				3313	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3314	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3315	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3316	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	3317	struct __storeu_si256 {
				3318	__m256i __v;
				3319	} __attribute__((__packed__, __may_alias__));
				3320	((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3321	}
				3322
				3323	/* Conditional load ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3324	/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3325	/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3326	/// [2 x double], depending on the mask bits associated with each data
				3327	/// element.
				3328	///
				3329	/// \headerfile <x86intrin.h>
				3330	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3331	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3332	///
				3333	/// \param __p
				3334	/// A pointer to a memory location that contains the double-precision
				3335	/// floating point values.
				3336	/// \param __m
				3337	/// A 128-bit integer vector containing the mask. The most significant bit of
				3338	/// each data element represents the mask bits. If a mask bit is zero, the
				3339	/// corresponding value in the memory location is not loaded and the
				3340	/// corresponding field in the return value is set to zero.
				3341	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3342	static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3343	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3344	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3345	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3346	}
				3347
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3348	/// Conditionally loads double-precision floating point elements from a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3349	/// memory location pointed to by \a __p into a 256-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3350	/// [4 x double], depending on the mask bits associated with each data
				3351	/// element.
				3352	///
				3353	/// \headerfile <x86intrin.h>
				3354	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3355	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3356	///
				3357	/// \param __p
				3358	/// A pointer to a memory location that contains the double-precision
				3359	/// floating point values.
				3360	/// \param __m
				3361	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3362	/// significant bit of each quadword element represents the mask bits. If a
				3363	/// mask bit is zero, the corresponding value in the memory location is not
				3364	/// loaded and the corresponding field in the return value is set to zero.
				3365	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3366	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3367	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3368	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3369	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3370	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3371	}
				3372
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3373	/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3374	/// memory location pointed to by \a __p into a 128-bit vector of
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3375	/// [4 x float], depending on the mask bits associated with each data
				3376	/// element.
				3377	///
				3378	/// \headerfile <x86intrin.h>
				3379	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3380	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3381	///
				3382	/// \param __p
				3383	/// A pointer to a memory location that contains the single-precision
				3384	/// floating point values.
				3385	/// \param __m
				3386	/// A 128-bit integer vector containing the mask. The most significant bit of
				3387	/// each data element represents the mask bits. If a mask bit is zero, the
				3388	/// corresponding value in the memory location is not loaded and the
				3389	/// corresponding field in the return value is set to zero.
				3390	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3391	static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3392	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3393	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3394	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3395	}
				3396
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3397	/// Conditionally loads single-precision floating point elements from a
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3398	/// memory location pointed to by \a __p into a 256-bit vector of
				3399	/// [8 x float], depending on the mask bits associated with each data
				3400	/// element.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3401	///
				3402	/// \headerfile <x86intrin.h>
				3403	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3404	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3405	///
				3406	/// \param __p
				3407	/// A pointer to a memory location that contains the single-precision
				3408	/// floating point values.
				3409	/// \param __m
				3410	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3411	/// significant bit of each dword element represents the mask bits. If a mask
				3412	/// bit is zero, the corresponding value in the memory location is not loaded
				3413	/// and the corresponding field in the return value is set to zero.
				3414	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3415	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3416	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3417	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3418	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3419	}
				3420
				3421	/* Conditional store ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3422	/// Moves single-precision floating point values from a 256-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3423	/// of [8 x float] to a memory location pointed to by \a __p, according to
				3424	/// the specified mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3425	///
				3426	/// \headerfile <x86intrin.h>
				3427	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3428	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3429	///
				3430	/// \param __p
				3431	/// A pointer to a memory location that will receive the float values.
				3432	/// \param __m
				3433	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3434	/// significant bit of each dword element in the mask vector represents the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3435	/// mask bits. If a mask bit is zero, the corresponding value from vector
				3436	/// \a __a is not stored and the corresponding field in the memory location
				3437	/// pointed to by \a __p is not changed.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3438	/// \param __a
				3439	/// A 256-bit vector of [8 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3440	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3441	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3442	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3443	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3444	}
				3445
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3446	/// Moves double-precision values from a 128-bit vector of [2 x double]
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3447	/// to a memory location pointed to by \a __p, according to the specified
				3448	/// mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3449	///
				3450	/// \headerfile <x86intrin.h>
				3451	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3452	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3453	///
				3454	/// \param __p
				3455	/// A pointer to a memory location that will receive the double-precision values.
				3456	/// \param __m
				3457	/// A 128-bit integer vector containing the mask. The most significant bit of
				3458	/// each field in the mask vector represents the mask bits. If a mask bit is
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3459	/// zero, the corresponding value from vector \a __a is not stored and the
				3460	/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3461	/// changed.
				3462	/// \param __a
				3463	/// A 128-bit vector of [2 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3464	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3465	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3466	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3467	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3468	}
				3469
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3470	/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3471	/// to a memory location pointed to by \a __p, according to the specified
				3472	/// mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3473	///
				3474	/// \headerfile <x86intrin.h>
				3475	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3476	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3477	///
				3478	/// \param __p
				3479	/// A pointer to a memory location that will receive the double-precision values.
				3480	/// \param __m
				3481	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3482	/// significant bit of each quadword element in the mask vector represents
				3483	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3484	/// \a __a is not stored and the corresponding field in the memory location
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3485	/// pointed to by \a __p is not changed.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3486	/// \param __a
				3487	/// A 256-bit vector of [4 x double] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3488	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3489	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3490	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3491	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3492	}
				3493
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3494	/// Moves single-precision floating point values from a 128-bit vector
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3495	/// of [4 x float] to a memory location pointed to by \a __p, according to
				3496	/// the specified mask.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3497	///
				3498	/// \headerfile <x86intrin.h>
				3499	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3500	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3501	///
				3502	/// \param __p
				3503	/// A pointer to a memory location that will receive the float values.
				3504	/// \param __m
				3505	/// A 128-bit integer vector containing the mask. The most significant bit of
				3506	/// each field in the mask vector represents the mask bits. If a mask bit is
				3507	/// zero, the corresponding value from vector \a __a is not stored and the
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	3508	/// corresponding field in the memory location pointed to by \a __p is not
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3509	/// changed.
				3510	/// \param __a
				3511	/// A 128-bit vector of [4 x float] containing the values to be stored.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3512	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3513	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3514	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	3515	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3516	}
				3517
				3518	/* Cacheability support ops */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3519	/// Moves integer data from a 256-bit integer vector to a 32-byte
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3520	/// aligned memory location. To minimize caching, the data is flagged as
				3521	/// non-temporal (unlikely to be used again soon).
				3522	///
				3523	/// \headerfile <x86intrin.h>
				3524	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3525	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3526	///
				3527	/// \param __a
				3528	/// A pointer to a 32-byte aligned memory location that will receive the
				3529	/// integer values.
				3530	/// \param __b
				3531	/// A 256-bit integer vector containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3532	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3533	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3534	{
Reid Kleckner	89fbd55	2018-06-04 21:39:20 +0000	[diff] [blame]	3535	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
				3536	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3537	}
				3538
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3539	/// Moves double-precision values from a 256-bit vector of [4 x double]
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3540	/// to a 32-byte aligned memory location. To minimize caching, the data is
				3541	/// flagged as non-temporal (unlikely to be used again soon).
				3542	///
				3543	/// \headerfile <x86intrin.h>
				3544	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3545	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3546	///
				3547	/// \param __a
				3548	/// A pointer to a 32-byte aligned memory location that will receive the
Ekaterina Romanova	cb3603a	2017-06-06 22:58:01 +0000	[diff] [blame]	3549	/// double-precision floating-point values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3550	/// \param __b
				3551	/// A 256-bit vector of [4 x double] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3552	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3553	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3554	{
Reid Kleckner	89fbd55	2018-06-04 21:39:20 +0000	[diff] [blame]	3555	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
				3556	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3557	}
				3558
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3559	/// Moves single-precision floating-point values from a 256-bit vector
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3560	/// of [8 x float] to a 32-byte aligned memory location. To minimize
				3561	/// caching, the data is flagged as non-temporal (unlikely to be used again
				3562	/// soon).
				3563	///
				3564	/// \headerfile <x86intrin.h>
				3565	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	3566	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3567	///
				3568	/// \param __p
				3569	/// A pointer to a 32-byte aligned memory location that will receive the
				3570	/// single-precision floating-point values.
				3571	/// \param __a
				3572	/// A 256-bit vector of [8 x float] containing the values to be moved.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3573	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3574	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3575	{
Reid Kleckner	89fbd55	2018-06-04 21:39:20 +0000	[diff] [blame]	3576	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
				3577	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3578	}
				3579
				3580	/* Create vectors */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3581	/// Create a 256-bit vector of [4 x double] with undefined values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3582	///
				3583	/// \headerfile <x86intrin.h>
				3584	///
				3585	/// This intrinsic has no corresponding instruction.
				3586	///
				3587	/// \returns A 256-bit vector of [4 x double] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3588	static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3589	_mm256_undefined_pd(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3590	{
				3591	return (__m256d)__builtin_ia32_undef256();
				3592	}
				3593
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3594	/// Create a 256-bit vector of [8 x float] with undefined values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3595	///
				3596	/// \headerfile <x86intrin.h>
				3597	///
				3598	/// This intrinsic has no corresponding instruction.
				3599	///
				3600	/// \returns A 256-bit vector of [8 x float] containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3601	static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3602	_mm256_undefined_ps(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3603	{
				3604	return (__m256)__builtin_ia32_undef256();
				3605	}
				3606
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3607	/// Create a 256-bit integer vector with undefined values.
Ekaterina Romanova	0a70076	2016-11-19 04:59:08 +0000	[diff] [blame]	3608	///
				3609	/// \headerfile <x86intrin.h>
				3610	///
				3611	/// This intrinsic has no corresponding instruction.
				3612	///
				3613	/// \returns A 256-bit integer vector containing undefined values.
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3614	static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame]	3615	_mm256_undefined_si256(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	3616	{
				3617	return (__m256i)__builtin_ia32_undef256();
				3618	}
				3619
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3620	/// Constructs a 256-bit floating-point vector of [4 x double]
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3621	/// initialized with the specified double-precision floating-point values.
				3622	///
				3623	/// \headerfile <x86intrin.h>
				3624	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3625	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3626	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3627	///
				3628	/// \param __a
				3629	/// A double-precision floating-point value used to initialize bits [255:192]
				3630	/// of the result.
				3631	/// \param __b
				3632	/// A double-precision floating-point value used to initialize bits [191:128]
				3633	/// of the result.
				3634	/// \param __c
				3635	/// A double-precision floating-point value used to initialize bits [127:64]
				3636	/// of the result.
				3637	/// \param __d
				3638	/// A double-precision floating-point value used to initialize bits [63:0]
				3639	/// of the result.
				3640	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3641	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3642	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3643	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3644	return __extension__ (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3645	}
				3646
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3647	/// Constructs a 256-bit floating-point vector of [8 x float] initialized
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3648	/// with the specified single-precision floating-point values.
				3649	///
				3650	/// \headerfile <x86intrin.h>
				3651	///
				3652	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3653	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3654	///
				3655	/// \param __a
				3656	/// A single-precision floating-point value used to initialize bits [255:224]
				3657	/// of the result.
				3658	/// \param __b
				3659	/// A single-precision floating-point value used to initialize bits [223:192]
				3660	/// of the result.
				3661	/// \param __c
				3662	/// A single-precision floating-point value used to initialize bits [191:160]
				3663	/// of the result.
				3664	/// \param __d
				3665	/// A single-precision floating-point value used to initialize bits [159:128]
				3666	/// of the result.
				3667	/// \param __e
				3668	/// A single-precision floating-point value used to initialize bits [127:96]
				3669	/// of the result.
				3670	/// \param __f
				3671	/// A single-precision floating-point value used to initialize bits [95:64]
				3672	/// of the result.
				3673	/// \param __g
				3674	/// A single-precision floating-point value used to initialize bits [63:32]
				3675	/// of the result.
				3676	/// \param __h
				3677	/// A single-precision floating-point value used to initialize bits [31:0]
				3678	/// of the result.
				3679	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3680	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3681	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3682	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3683	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3684	return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3685	}
				3686
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3687	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3688	/// 32-bit integral values.
				3689	///
				3690	/// \headerfile <x86intrin.h>
				3691	///
				3692	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3693	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3694	///
				3695	/// \param __i0
				3696	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3697	/// \param __i1
				3698	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3699	/// \param __i2
				3700	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3701	/// \param __i3
				3702	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3703	/// \param __i4
				3704	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3705	/// \param __i5
				3706	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3707	/// \param __i6
				3708	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3709	/// \param __i7
				3710	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3711	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3712	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3713	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3714	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3715	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3716	return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3717	}
				3718
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3719	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3720	/// 16-bit integral values.
				3721	///
				3722	/// \headerfile <x86intrin.h>
				3723	///
				3724	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3725	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3726	///
				3727	/// \param __w15
				3728	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3729	/// \param __w14
				3730	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3731	/// \param __w13
				3732	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3733	/// \param __w12
				3734	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3735	/// \param __w11
				3736	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3737	/// \param __w10
				3738	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3739	/// \param __w09
				3740	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3741	/// \param __w08
				3742	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3743	/// \param __w07
				3744	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3745	/// \param __w06
				3746	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3747	/// \param __w05
				3748	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3749	/// \param __w04
				3750	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3751	/// \param __w03
				3752	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3753	/// \param __w02
				3754	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3755	/// \param __w01
				3756	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3757	/// \param __w00
				3758	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3759	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3760	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3761	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3762	short __w11, short __w10, short __w09, short __w08,
				3763	short __w07, short __w06, short __w05, short __w04,
				3764	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3765	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3766	return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3767	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3768	}
				3769
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3770	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3771	/// 8-bit integral values.
				3772	///
				3773	/// \headerfile <x86intrin.h>
				3774	///
				3775	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3776	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3777	///
				3778	/// \param __b31
				3779	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3780	/// \param __b30
				3781	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3782	/// \param __b29
				3783	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3784	/// \param __b28
				3785	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3786	/// \param __b27
				3787	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3788	/// \param __b26
				3789	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3790	/// \param __b25
				3791	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3792	/// \param __b24
				3793	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3794	/// \param __b23
				3795	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3796	/// \param __b22
				3797	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3798	/// \param __b21
				3799	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3800	/// \param __b20
				3801	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3802	/// \param __b19
				3803	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3804	/// \param __b18
				3805	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3806	/// \param __b17
				3807	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3808	/// \param __b16
				3809	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3810	/// \param __b15
				3811	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3812	/// \param __b14
				3813	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3814	/// \param __b13
				3815	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3816	/// \param __b12
				3817	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3818	/// \param __b11
				3819	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3820	/// \param __b10
				3821	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3822	/// \param __b09
				3823	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3824	/// \param __b08
				3825	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3826	/// \param __b07
				3827	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3828	/// \param __b06
				3829	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3830	/// \param __b05
				3831	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3832	/// \param __b04
				3833	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3834	/// \param __b03
				3835	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3836	/// \param __b02
				3837	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3838	/// \param __b01
				3839	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3840	/// \param __b00
				3841	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3842	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3843	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3844	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3845	char __b27, char __b26, char __b25, char __b24,
				3846	char __b23, char __b22, char __b21, char __b20,
				3847	char __b19, char __b18, char __b17, char __b16,
				3848	char __b15, char __b14, char __b13, char __b12,
				3849	char __b11, char __b10, char __b09, char __b08,
				3850	char __b07, char __b06, char __b05, char __b04,
				3851	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3852	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3853	return __extension__ (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3854	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				3855	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				3856	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				3857	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3858	};
				3859	}
				3860
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3861	/// Constructs a 256-bit integer vector initialized with the specified
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3862	/// 64-bit integral values.
				3863	///
				3864	/// \headerfile <x86intrin.h>
				3865	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3866	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				3867	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3868	///
				3869	/// \param __a
				3870	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3871	/// \param __b
				3872	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3873	/// \param __c
				3874	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3875	/// \param __d
				3876	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3877	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3878	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3879	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3880	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	3881	return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3882	}
				3883
				3884	/* Create vectors with elements in reverse order */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3885	/// Constructs a 256-bit floating-point vector of [4 x double],
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3886	/// initialized in reverse order with the specified double-precision
				3887	/// floating-point values.
				3888	///
				3889	/// \headerfile <x86intrin.h>
				3890	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3891	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3892	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3893	///
				3894	/// \param __a
				3895	/// A double-precision floating-point value used to initialize bits [63:0]
				3896	/// of the result.
				3897	/// \param __b
				3898	/// A double-precision floating-point value used to initialize bits [127:64]
				3899	/// of the result.
				3900	/// \param __c
				3901	/// A double-precision floating-point value used to initialize bits [191:128]
				3902	/// of the result.
				3903	/// \param __d
				3904	/// A double-precision floating-point value used to initialize bits [255:192]
				3905	/// of the result.
				3906	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3907	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3908	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3909	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	3910	return _mm256_set_pd(__d, __c, __b, __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3911	}
				3912
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3913	/// Constructs a 256-bit floating-point vector of [8 x float],
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3914	/// initialized in reverse order with the specified single-precision
				3915	/// floating-point values.
				3916	///
				3917	/// \headerfile <x86intrin.h>
				3918	///
				3919	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3920	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3921	///
				3922	/// \param __a
				3923	/// A single-precision floating-point value used to initialize bits [31:0]
				3924	/// of the result.
				3925	/// \param __b
				3926	/// A single-precision floating-point value used to initialize bits [63:32]
				3927	/// of the result.
				3928	/// \param __c
				3929	/// A single-precision floating-point value used to initialize bits [95:64]
				3930	/// of the result.
				3931	/// \param __d
				3932	/// A single-precision floating-point value used to initialize bits [127:96]
				3933	/// of the result.
				3934	/// \param __e
				3935	/// A single-precision floating-point value used to initialize bits [159:128]
				3936	/// of the result.
				3937	/// \param __f
				3938	/// A single-precision floating-point value used to initialize bits [191:160]
				3939	/// of the result.
				3940	/// \param __g
				3941	/// A single-precision floating-point value used to initialize bits [223:192]
				3942	/// of the result.
				3943	/// \param __h
				3944	/// A single-precision floating-point value used to initialize bits [255:224]
				3945	/// of the result.
				3946	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3947	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3948	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3949	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3950	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	3951	return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3952	}
				3953
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3954	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3955	/// with the specified 32-bit integral values.
				3956	///
				3957	/// \headerfile <x86intrin.h>
				3958	///
				3959	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3960	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3961	///
				3962	/// \param __i0
				3963	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3964	/// \param __i1
				3965	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3966	/// \param __i2
				3967	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3968	/// \param __i3
				3969	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3970	/// \param __i4
				3971	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3972	/// \param __i5
				3973	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3974	/// \param __i6
				3975	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3976	/// \param __i7
				3977	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3978	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	3979	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	3980	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	3981	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3982	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	3983	return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	3984	}
				3985
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	3986	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3987	/// with the specified 16-bit integral values.
				3988	///
				3989	/// \headerfile <x86intrin.h>
				3990	///
				3991	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	3992	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	3993	///
				3994	/// \param __w15
				3995	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3996	/// \param __w14
				3997	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3998	/// \param __w13
				3999	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				4000	/// \param __w12
				4001	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				4002	/// \param __w11
				4003	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				4004	/// \param __w10
				4005	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				4006	/// \param __w09
				4007	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				4008	/// \param __w08
				4009	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				4010	/// \param __w07
				4011	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				4012	/// \param __w06
				4013	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				4014	/// \param __w05
				4015	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				4016	/// \param __w04
				4017	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				4018	/// \param __w03
				4019	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				4020	/// \param __w02
				4021	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				4022	/// \param __w01
				4023	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				4024	/// \param __w00
				4025	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				4026	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4027	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4028	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4029	short __w11, short __w10, short __w09, short __w08,
				4030	short __w07, short __w06, short __w05, short __w04,
				4031	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4032	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	4033	return _mm256_set_epi16(__w00, __w01, __w02, __w03,
				4034	__w04, __w05, __w06, __w07,
				4035	__w08, __w09, __w10, __w11,
				4036	__w12, __w13, __w14, __w15);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4037	}
				4038
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4039	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4040	/// with the specified 8-bit integral values.
				4041	///
				4042	/// \headerfile <x86intrin.h>
				4043	///
				4044	/// This intrinsic is a utility function and does not correspond to a specific
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4045	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4046	///
				4047	/// \param __b31
				4048	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				4049	/// \param __b30
				4050	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				4051	/// \param __b29
				4052	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				4053	/// \param __b28
				4054	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				4055	/// \param __b27
				4056	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				4057	/// \param __b26
				4058	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				4059	/// \param __b25
				4060	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				4061	/// \param __b24
				4062	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				4063	/// \param __b23
				4064	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				4065	/// \param __b22
				4066	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				4067	/// \param __b21
				4068	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				4069	/// \param __b20
				4070	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				4071	/// \param __b19
				4072	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				4073	/// \param __b18
				4074	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				4075	/// \param __b17
				4076	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				4077	/// \param __b16
				4078	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				4079	/// \param __b15
				4080	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				4081	/// \param __b14
				4082	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				4083	/// \param __b13
				4084	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				4085	/// \param __b12
				4086	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				4087	/// \param __b11
				4088	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				4089	/// \param __b10
				4090	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				4091	/// \param __b09
				4092	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				4093	/// \param __b08
				4094	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				4095	/// \param __b07
				4096	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				4097	/// \param __b06
				4098	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				4099	/// \param __b05
				4100	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				4101	/// \param __b04
				4102	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				4103	/// \param __b03
				4104	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				4105	/// \param __b02
				4106	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				4107	/// \param __b01
				4108	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				4109	/// \param __b00
				4110	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				4111	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4112	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4113	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	4114	char __b27, char __b26, char __b25, char __b24,
				4115	char __b23, char __b22, char __b21, char __b20,
				4116	char __b19, char __b18, char __b17, char __b16,
				4117	char __b15, char __b14, char __b13, char __b12,
				4118	char __b11, char __b10, char __b09, char __b08,
				4119	char __b07, char __b06, char __b05, char __b04,
				4120	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4121	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	4122	return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				4123	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				4124	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				4125	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4126	}
				4127
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4128	/// Constructs a 256-bit integer vector, initialized in reverse order
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4129	/// with the specified 64-bit integral values.
				4130	///
				4131	/// \headerfile <x86intrin.h>
				4132	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4133	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				4134	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4135	///
				4136	/// \param __a
				4137	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				4138	/// \param __b
				4139	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				4140	/// \param __c
				4141	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				4142	/// \param __d
				4143	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				4144	/// \returns An initialized 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4145	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4146	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4147	{
Tim Shen	f811de4	2018-05-31 01:51:07 +0000	[diff] [blame]	4148	return _mm256_set_epi64x(__d, __c, __b, __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4149	}
				4150
				4151	/* Create vectors with repeated elements */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4152	/// Constructs a 256-bit floating-point vector of [4 x double], with each
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4153	/// of the four double-precision floating-point vector elements set to the
				4154	/// specified double-precision floating-point value.
				4155	///
				4156	/// \headerfile <x86intrin.h>
				4157	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4158	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4159	///
				4160	/// \param __w
				4161	/// A double-precision floating-point value used to initialize each vector
				4162	/// element of the result.
				4163	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4164	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4165	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4166	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4167	return _mm256_set_pd(__w, __w, __w, __w);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4168	}
				4169
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4170	/// Constructs a 256-bit floating-point vector of [8 x float], with each
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4171	/// of the eight single-precision floating-point vector elements set to the
				4172	/// specified single-precision floating-point value.
				4173	///
				4174	/// \headerfile <x86intrin.h>
				4175	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4176	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4177	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4178	///
				4179	/// \param __w
				4180	/// A single-precision floating-point value used to initialize each vector
				4181	/// element of the result.
				4182	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4183	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4184	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4185	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4186	return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4187	}
				4188
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4189	/// Constructs a 256-bit integer vector of [8 x i32], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4190	/// 32-bit integral vector elements set to the specified 32-bit integral
				4191	/// value.
				4192	///
				4193	/// \headerfile <x86intrin.h>
				4194	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4195	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4196	/// instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4197	///
				4198	/// \param __i
				4199	/// A 32-bit integral value used to initialize each vector element of the
				4200	/// result.
				4201	/// \returns An initialized 256-bit integer vector of [8 x i32].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4202	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4203	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4204	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4205	return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4206	}
				4207
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4208	/// Constructs a 256-bit integer vector of [16 x i16], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4209	/// 16-bit integral vector elements set to the specified 16-bit integral
				4210	/// value.
				4211	///
				4212	/// \headerfile <x86intrin.h>
				4213	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4214	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4215	///
				4216	/// \param __w
				4217	/// A 16-bit integral value used to initialize each vector element of the
				4218	/// result.
				4219	/// \returns An initialized 256-bit integer vector of [16 x i16].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4220	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4221	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4222	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4223	return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
				4224	__w, __w, __w, __w, __w, __w, __w, __w);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4225	}
				4226
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4227	/// Constructs a 256-bit integer vector of [32 x i8], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4228	/// 8-bit integral vector elements set to the specified 8-bit integral value.
				4229	///
				4230	/// \headerfile <x86intrin.h>
				4231	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4232	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4233	///
				4234	/// \param __b
				4235	/// An 8-bit integral value used to initialize each vector element of the
				4236	/// result.
				4237	/// \returns An initialized 256-bit integer vector of [32 x i8].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4238	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4239	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4240	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4241	return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
				4242	__b, __b, __b, __b, __b, __b, __b, __b,
				4243	__b, __b, __b, __b, __b, __b, __b, __b,
				4244	__b, __b, __b, __b, __b, __b, __b, __b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4245	}
				4246
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4247	/// Constructs a 256-bit integer vector of [4 x i64], with each of the
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4248	/// 64-bit integral vector elements set to the specified 64-bit integral
				4249	/// value.
				4250	///
				4251	/// \headerfile <x86intrin.h>
				4252	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4253	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4254	///
				4255	/// \param __q
				4256	/// A 64-bit integral value used to initialize each vector element of the
				4257	/// result.
				4258	/// \returns An initialized 256-bit integer vector of [4 x i64].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4259	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4260	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4261	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4262	return _mm256_set_epi64x(__q, __q, __q, __q);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4263	}
				4264
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4265	/* Create zeroed vectors */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4266	/// Constructs a 256-bit floating-point vector of [4 x double] with all
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4267	/// vector elements initialized to zero.
				4268	///
				4269	/// \headerfile <x86intrin.h>
				4270	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4271	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4272	///
				4273	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4274	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4275	_mm256_setzero_pd(void)
				4276	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4277	return __extension__ (__m256d){ 0, 0, 0, 0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4278	}
				4279
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4280	/// Constructs a 256-bit floating-point vector of [8 x float] with all
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4281	/// vector elements initialized to zero.
				4282	///
				4283	/// \headerfile <x86intrin.h>
				4284	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4285	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4286	///
				4287	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4288	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4289	_mm256_setzero_ps(void)
				4290	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4291	return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4292	}
				4293
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4294	/// Constructs a 256-bit integer vector initialized to zero.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4295	///
				4296	/// \headerfile <x86intrin.h>
				4297	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4298	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4299	///
				4300	/// \returns A 256-bit integer vector initialized to zero.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4301	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4302	_mm256_setzero_si256(void)
				4303	{
Craig Topper	63ec0ea	2018-05-30 21:08:27 +0000	[diff] [blame]	4304	return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4305	}
				4306
				4307	/* Cast between vector types */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4308	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4309	/// floating-point vector of [8 x float].
				4310	///
				4311	/// \headerfile <x86intrin.h>
				4312	///
				4313	/// This intrinsic has no corresponding instruction.
				4314	///
				4315	/// \param __a
				4316	/// A 256-bit floating-point vector of [4 x double].
				4317	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4318	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4319	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4320	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4321	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4322	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4323	}
				4324
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4325	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4326	/// integer vector.
				4327	///
				4328	/// \headerfile <x86intrin.h>
				4329	///
				4330	/// This intrinsic has no corresponding instruction.
				4331	///
				4332	/// \param __a
				4333	/// A 256-bit floating-point vector of [4 x double].
				4334	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4335	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4336	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4337	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4338	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4339	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4340	}
				4341
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4342	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4343	/// floating-point vector of [4 x double].
				4344	///
				4345	/// \headerfile <x86intrin.h>
				4346	///
				4347	/// This intrinsic has no corresponding instruction.
				4348	///
				4349	/// \param __a
				4350	/// A 256-bit floating-point vector of [8 x float].
				4351	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4352	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4353	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4354	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4355	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4356	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4357	}
				4358
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4359	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4360	/// integer vector.
				4361	///
				4362	/// \headerfile <x86intrin.h>
				4363	///
				4364	/// This intrinsic has no corresponding instruction.
				4365	///
				4366	/// \param __a
				4367	/// A 256-bit floating-point vector of [8 x float].
				4368	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4369	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4370	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4371	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4372	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4373	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4374	}
				4375
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4376	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4377	/// of [8 x float].
				4378	///
				4379	/// \headerfile <x86intrin.h>
				4380	///
				4381	/// This intrinsic has no corresponding instruction.
				4382	///
				4383	/// \param __a
				4384	/// A 256-bit integer vector.
				4385	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4386	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4387	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4388	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4389	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4390	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4391	}
				4392
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4393	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4394	/// of [4 x double].
				4395	///
				4396	/// \headerfile <x86intrin.h>
				4397	///
				4398	/// This intrinsic has no corresponding instruction.
				4399	///
				4400	/// \param __a
				4401	/// A 256-bit integer vector.
				4402	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4403	/// bitwise pattern as the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4404	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4405	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4406	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4407	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4408	}
				4409
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4410	/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4411	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
				4412	///
				4413	/// \headerfile <x86intrin.h>
				4414	///
				4415	/// This intrinsic has no corresponding instruction.
				4416	///
				4417	/// \param __a
				4418	/// A 256-bit floating-point vector of [4 x double].
				4419	/// \returns A 128-bit floating-point vector of [2 x double] containing the
				4420	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4421	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4422	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4423	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4424	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4425	}
				4426
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4427	/// Returns the lower 128 bits of a 256-bit floating-point vector of
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4428	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
				4429	///
				4430	/// \headerfile <x86intrin.h>
				4431	///
				4432	/// This intrinsic has no corresponding instruction.
				4433	///
				4434	/// \param __a
				4435	/// A 256-bit floating-point vector of [8 x float].
				4436	/// \returns A 128-bit floating-point vector of [4 x float] containing the
				4437	/// lower 128 bits of the parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4438	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4439	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4440	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4441	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4442	}
				4443
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4444	/// Truncates a 256-bit integer vector into a 128-bit integer vector.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4445	///
				4446	/// \headerfile <x86intrin.h>
				4447	///
				4448	/// This intrinsic has no corresponding instruction.
				4449	///
				4450	/// \param __a
				4451	/// A 256-bit integer vector.
				4452	/// \returns A 128-bit integer vector containing the lower 128 bits of the
				4453	/// parameter.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4454	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4455	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4456	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4457	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4458	}
				4459
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4460	/// Constructs a 256-bit floating-point vector of [4 x double] from a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4461	/// 128-bit floating-point vector of [2 x double].
				4462	///
				4463	/// The lower 128 bits contain the value of the source vector. The contents
				4464	/// of the upper 128 bits are undefined.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4465	///
				4466	/// \headerfile <x86intrin.h>
				4467	///
				4468	/// This intrinsic has no corresponding instruction.
				4469	///
				4470	/// \param __a
				4471	/// A 128-bit vector of [2 x double].
				4472	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4473	/// contain the value of the parameter. The contents of the upper 128 bits
				4474	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4475	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4476	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4477	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4478	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4479	}
				4480
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4481	/// Constructs a 256-bit floating-point vector of [8 x float] from a
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4482	/// 128-bit floating-point vector of [4 x float].
				4483	///
				4484	/// The lower 128 bits contain the value of the source vector. The contents
				4485	/// of the upper 128 bits are undefined.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4486	///
				4487	/// \headerfile <x86intrin.h>
				4488	///
				4489	/// This intrinsic has no corresponding instruction.
				4490	///
				4491	/// \param __a
				4492	/// A 128-bit vector of [4 x float].
				4493	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4494	/// contain the value of the parameter. The contents of the upper 128 bits
				4495	/// are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4496	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4497	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4498	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4499	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4500	}
				4501
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4502	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Ekaterina Romanova	1d4a0f2	2017-05-15 03:25:04 +0000	[diff] [blame]	4503	///
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4504	/// The lower 128 bits contain the value of the source vector. The contents
				4505	/// of the upper 128 bits are undefined.
				4506	///
				4507	/// \headerfile <x86intrin.h>
				4508	///
				4509	/// This intrinsic has no corresponding instruction.
				4510	///
				4511	/// \param __a
				4512	/// A 128-bit integer vector.
				4513	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4514	/// the parameter. The contents of the upper 128 bits are undefined.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4515	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	4516	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4517	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4518	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	4519	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4520
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4521	/// Constructs a 256-bit floating-point vector of [4 x double] from a
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4522	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
				4523	/// contain the value of the source vector. The upper 128 bits are set
				4524	/// to zero.
				4525	///
				4526	/// \headerfile <x86intrin.h>
				4527	///
				4528	/// This intrinsic has no corresponding instruction.
				4529	///
				4530	/// \param __a
				4531	/// A 128-bit vector of [2 x double].
				4532	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4533	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4534	static __inline __m256d __DEFAULT_FN_ATTRS
				4535	_mm256_zextpd128_pd256(__m128d __a)
				4536	{
				4537	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
				4538	}
				4539
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4540	/// Constructs a 256-bit floating-point vector of [8 x float] from a
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4541	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
				4542	/// the value of the source vector. The upper 128 bits are set to zero.
				4543	///
				4544	/// \headerfile <x86intrin.h>
				4545	///
				4546	/// This intrinsic has no corresponding instruction.
				4547	///
				4548	/// \param __a
				4549	/// A 128-bit vector of [4 x float].
				4550	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4551	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4552	static __inline __m256 __DEFAULT_FN_ATTRS
				4553	_mm256_zextps128_ps256(__m128 __a)
				4554	{
				4555	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
				4556	}
				4557
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4558	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
Simon Pilgrim	96d02f5	2017-04-29 17:17:06 +0000	[diff] [blame]	4559	/// The lower 128 bits contain the value of the source vector. The upper
				4560	/// 128 bits are set to zero.
				4561	///
				4562	/// \headerfile <x86intrin.h>
				4563	///
				4564	/// This intrinsic has no corresponding instruction.
				4565	///
				4566	/// \param __a
				4567	/// A 128-bit integer vector.
				4568	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4569	/// the parameter. The upper 128 bits are set to zero.
				4570	static __inline __m256i __DEFAULT_FN_ATTRS
				4571	_mm256_zextsi128_si256(__m128i __a)
				4572	{
				4573	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
				4574	}
				4575
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4576	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4577	Vector insert.
				4578	We use macros rather than inlines because we only want to accept
				4579	invocations where the immediate M is a constant expression.
				4580	*/
/// Constructs a new 256-bit vector of [8 x float] by first duplicating
///    a 256-bit vector of [8 x float] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [4 x float] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
// The whole expansion is parenthesized so that operators written after the
// invocation apply to the cast result rather than to the builtin call.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4618
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
// The whole expansion is parenthesized so that operators written after the
// invocation apply to the cast result rather than to the builtin call.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4656
/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
// The whole expansion is parenthesized so that operators written after the
// invocation apply to the cast result rather than to the builtin call.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	4694
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	4695	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4696	Vector extract.
				4697	We use macros rather than inlines because we only want to accept
				4698	invocations where the immediate M is a constant expression.
				4699	*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
// The whole expansion is parenthesized so that operators written after the
// invocation apply to the cast result rather than to the builtin call.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4723
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
// The whole expansion is parenthesized so that operators written after the
// invocation apply to the cast result rather than to the builtin call.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4747
/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
// The whole expansion is parenthesized so that operators written after the
// invocation apply to the cast result rather than to the builtin call.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	4771
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4772	/* SIMD load ops (unaligned) */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4773	/// Loads two 128-bit floating-point vectors of [4 x float] from
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4774	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4775	/// of [8 x float] by concatenating the two 128-bit vectors.
				4776	///
				4777	/// \headerfile <x86intrin.h>
				4778	///
				4779	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4780	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4781	///
				4782	/// \param __addr_hi
				4783	/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4784	/// single-precision floating-point values. These values are to be copied to
				4785	/// bits[255:128] of the result. The address of the memory location does not
				4786	/// have to be aligned.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4787	/// \param __addr_lo
				4788	/// A pointer to a 128-bit memory location containing 4 consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4789	/// single-precision floating-point values. These values are to be copied to
				4790	/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4791	/// have to be aligned.
				4792	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4793	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4794	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4795	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4796	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4797	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				4798	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4799	}
				4800
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4801	/// Loads two 128-bit floating-point vectors of [2 x double] from
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4802	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4803	/// of [4 x double] by concatenating the two 128-bit vectors.
				4804	///
				4805	/// \headerfile <x86intrin.h>
				4806	///
				4807	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4808	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4809	///
				4810	/// \param __addr_hi
				4811	/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4812	/// double-precision floating-point values. These values are to be copied to
				4813	/// bits[255:128] of the result. The address of the memory location does not
				4814	/// have to be aligned.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4815	/// \param __addr_lo
				4816	/// A pointer to a 128-bit memory location containing two consecutive
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4817	/// double-precision floating-point values. These values are to be copied to
				4818	/// bits[127:0] of the result. The address of the memory location does not
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4819	/// have to be aligned.
				4820	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4821	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4822	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4823	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4824	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4825	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				4826	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4827	}
				4828
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4829	/// Loads two 128-bit integer vectors from unaligned memory locations and
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4830	/// constructs a 256-bit integer vector by concatenating the two 128-bit
				4831	/// vectors.
				4832	///
				4833	/// \headerfile <x86intrin.h>
				4834	///
				4835	/// This intrinsic corresponds to load instructions followed by the
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4836	/// <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4837	///
				4838	/// \param __addr_hi
				4839	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4840	/// vector. This vector is to be copied to bits[255:128] of the result. The
				4841	/// address of the memory location does not have to be aligned.
				4842	/// \param __addr_lo
				4843	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4844	/// vector. This vector is to be copied to bits[127:0] of the result. The
				4845	/// address of the memory location does not have to be aligned.
				4846	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4847	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4848	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4849	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	4850	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				4851	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4852	}
				4853
				4854	/* SIMD store ops (unaligned) */
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4855	/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4856	/// vector of [8 x float] into two different unaligned memory locations.
				4857	///
				4858	/// \headerfile <x86intrin.h>
				4859	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4860	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4861	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4862	///
				4863	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4864	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4865	/// copied to this memory location. The address of this memory location does
				4866	/// not have to be aligned.
				4867	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4868	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4869	/// copied to this memory location. The address of this memory location does
				4870	/// not have to be aligned.
				4871	/// \param __a
				4872	/// A 256-bit floating-point vector of [8 x float].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4873	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4874	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4875	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4876	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4877
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4878	__v128 = _mm256_castps256_ps128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4879	_mm_storeu_ps(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4880	__v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4881	_mm_storeu_ps(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4882	}
				4883
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4884	/// Stores the upper and lower 128 bits of a 256-bit floating-point
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4885	/// vector of [4 x double] into two different unaligned memory locations.
				4886	///
				4887	/// \headerfile <x86intrin.h>
				4888	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4889	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4890	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4891	///
				4892	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4893	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4894	/// copied to this memory location. The address of this memory location does
				4895	/// not have to be aligned.
				4896	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4897	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4898	/// copied to this memory location. The address of this memory location does
				4899	/// not have to be aligned.
				4900	/// \param __a
				4901	/// A 256-bit floating-point vector of [4 x double].
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4902	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4903	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4904	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4905	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4906
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4907	__v128 = _mm256_castpd256_pd128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4908	_mm_storeu_pd(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4909	__v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4910	_mm_storeu_pd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4911	}
				4912
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4913	/// Stores the upper and lower 128 bits of a 256-bit integer vector into
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4914	/// two different unaligned memory locations.
				4915	///
				4916	/// \headerfile <x86intrin.h>
				4917	///
Ekaterina Romanova	16166a4	2016-12-23 23:36:26 +0000	[diff] [blame]	4918	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4919	/// store instructions.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4920	///
				4921	/// \param __addr_hi
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4922	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4923	/// copied to this memory location. The address of this memory location does
				4924	/// not have to be aligned.
				4925	/// \param __addr_lo
Ekaterina Romanova	d604219	2016-12-08 04:09:17 +0000	[diff] [blame]	4926	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4927	/// copied to this memory location. The address of this memory location does
				4928	/// not have to be aligned.
				4929	/// \param __a
				4930	/// A 256-bit integer vector.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4931	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4932	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4933	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4934	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4935
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4936	__v128 = _mm256_castsi256_si128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4937	_mm_storeu_si128(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	4938	__v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	4939	_mm_storeu_si128(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	4940	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	4941
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4942	/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4943	/// concatenating two 128-bit floating-point vectors of [4 x float].
				4944	///
				4945	/// \headerfile <x86intrin.h>
				4946	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4947	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4948	///
				4949	/// \param __hi
				4950	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4951	/// 128 bits of the result.
				4952	/// \param __lo
				4953	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4954	/// 128 bits of the result.
				4955	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4956	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4957	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4958	_mm256_set_m128 (__m128 __hi, __m128 __lo)
				4959	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	4960	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4961	}
				4962
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4963	/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4964	/// concatenating two 128-bit floating-point vectors of [2 x double].
				4965	///
				4966	/// \headerfile <x86intrin.h>
				4967	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4968	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4969	///
				4970	/// \param __hi
				4971	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4972	/// 128 bits of the result.
				4973	/// \param __lo
				4974	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4975	/// 128 bits of the result.
				4976	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4977	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4978	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4979	_mm256_set_m128d (__m128d __hi, __m128d __lo)
				4980	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	4981	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				4982	}
				4983
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	4984	/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4985	/// integer vectors.
				4986	///
				4987	/// \headerfile <x86intrin.h>
				4988	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	4989	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	4990	///
				4991	/// \param __hi
				4992	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4993	/// result.
				4994	/// \param __lo
				4995	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4996	/// result.
				4997	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	4998	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	4999	_mm256_set_m128i (__m128i __hi, __m128i __lo)
				5000	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5001	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5002	}
				5003
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	5004	/// Constructs a 256-bit floating-point vector of [8 x float] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5005	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
				5006	/// similar to _mm256_set_m128, but the order of the input parameters is
				5007	/// swapped.
				5008	///
				5009	/// \headerfile <x86intrin.h>
				5010	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5011	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5012	///
				5013	/// \param __lo
				5014	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				5015	/// 128 bits of the result.
				5016	/// \param __hi
				5017	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				5018	/// 128 bits of the result.
				5019	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				5020	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5021	static __inline __m256 __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5022	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
				5023	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5024	return _mm256_set_m128(__hi, __lo);
				5025	}
				5026
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	5027	/// Constructs a 256-bit floating-point vector of [4 x double] by
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5028	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
				5029	/// similar to _mm256_set_m128d, but the order of the input parameters is
				5030	/// swapped.
				5031	///
				5032	/// \headerfile <x86intrin.h>
				5033	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5034	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5035	///
				5036	/// \param __lo
				5037	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				5038	/// 128 bits of the result.
				5039	/// \param __hi
				5040	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				5041	/// 128 bits of the result.
				5042	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				5043	/// concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5044	static __inline __m256d __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5045	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
				5046	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5047	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5048	}
				5049
Adrian Prantl	9fc8faf	2018-05-09 01:00:01 +0000	[diff] [blame]	5050	/// Constructs a 256-bit integer vector by concatenating two 128-bit
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5051	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
				5052	/// the input parameters is swapped.
				5053	///
				5054	/// \headerfile <x86intrin.h>
				5055	///
Ekaterina Romanova	0c1c3bb	2016-12-09 18:35:50 +0000	[diff] [blame]	5056	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
Ekaterina Romanova	64adc38	2016-11-09 03:58:30 +0000	[diff] [blame]	5057	///
				5058	/// \param __lo
				5059	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				5060	/// result.
				5061	/// \param __hi
				5062	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				5063	/// result.
				5064	/// \returns A 256-bit integer vector containing the concatenated result.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5065	static __inline __m256i __DEFAULT_FN_ATTRS
Ekaterina Romanova	2174b6f	2016-11-17 23:02:00 +0000	[diff] [blame]	5066	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
				5067	{
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	5068	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				5069	}
				5070
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	5071	#undef __DEFAULT_FN_ATTRS
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	5072
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	5073	#endif /* __AVXINTRIN_H */