Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: 20e9ef4e57600c86f620accb9a093a10e0bf382e [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
/* Internal 256-bit (32-byte) vector types, named by element type. */
typedef double    __v4df  __attribute__((__vector_size__(32))); /* double    */
typedef float     __v8sf  __attribute__((__vector_size__(32))); /* float     */
typedef long long __v4di  __attribute__((__vector_size__(32))); /* long long */
typedef int       __v8si  __attribute__((__vector_size__(32))); /* int       */
typedef short     __v16hi __attribute__((__vector_size__(32))); /* short     */
typedef char      __v32qi __attribute__((__vector_size__(32))); /* char      */

/* Plain char has implementation-defined signedness, so an explicitly signed
 * byte vector is needed as well. It is internal only and should not appear in
 * the public interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types used in the intrinsic signatures. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
				45
/* Attribute set applied to every intrinsic function defined in this file:
 * always inlined, no debug info, and compiled with the "avx" target feature
 * enabled. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	48
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	49	/* Arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	50	/// \brief Adds two 256-bit vectors of [4 x double].
				51	///
				52	/// \headerfile <x86intrin.h>
				53	///
				54	/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
				55	///
				56	/// \param __a
				57	/// A 256-bit vector of [4 x double] containing one of the source operands.
				58	/// \param __b
				59	/// A 256-bit vector of [4 x double] containing one of the source operands.
				60	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				61	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	62	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	63	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	64	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	65	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	66	}
				67
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	68	/// \brief Adds two 256-bit vectors of [8 x float].
				69	///
				70	/// \headerfile <x86intrin.h>
				71	///
				72	/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
				73	///
				74	/// \param __a
				75	/// A 256-bit vector of [8 x float] containing one of the source operands.
				76	/// \param __b
				77	/// A 256-bit vector of [8 x float] containing one of the source operands.
				78	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				79	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	80	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	81	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	82	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	83	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	84	}
				85
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	86	/// \brief Subtracts two 256-bit vectors of [4 x double].
				87	///
				88	/// \headerfile <x86intrin.h>
				89	///
				90	/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
				91	///
				92	/// \param __a
				93	/// A 256-bit vector of [4 x double] containing the minuend.
				94	/// \param __b
				95	/// A 256-bit vector of [4 x double] containing the subtrahend.
				96	/// \returns A 256-bit vector of [4 x double] containing the differences between
				97	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	98	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	99	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	100	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	101	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	102	}
				103
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	104	/// \brief Subtracts two 256-bit vectors of [8 x float].
				105	///
				106	/// \headerfile <x86intrin.h>
				107	///
				108	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
				109	///
				110	/// \param __a
				111	/// A 256-bit vector of [8 x float] containing the minuend.
				112	/// \param __b
				113	/// A 256-bit vector of [8 x float] containing the subtrahend.
				114	/// \returns A 256-bit vector of [8 x float] containing the differences between
				115	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	116	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	117	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	118	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	119	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	120	}
				121
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	122	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				123	/// two 256-bit vectors of [4 x double].
				124	///
				125	/// \headerfile <x86intrin.h>
				126	///
				127	/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
				128	///
				129	/// \param __a
				130	/// A 256-bit vector of [4 x double] containing the left source operand.
				131	/// \param __b
				132	/// A 256-bit vector of [4 x double] containing the right source operand.
				133	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				134	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	135	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	136	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	137	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	138	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	139	}
				140
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	141	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				142	/// two 256-bit vectors of [8 x float].
				143	///
				144	/// \headerfile <x86intrin.h>
				145	///
				146	/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
				147	///
				148	/// \param __a
				149	/// A 256-bit vector of [8 x float] containing the left source operand.
				150	/// \param __b
				151	/// A 256-bit vector of [8 x float] containing the right source operand.
				152	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				153	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	154	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	155	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	156	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	157	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	158	}
				159
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	160	/// \brief Divides two 256-bit vectors of [4 x double].
				161	///
				162	/// \headerfile <x86intrin.h>
				163	///
				164	/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
				165	///
				166	/// \param __a
				167	/// A 256-bit vector of [4 x double] containing the dividend.
				168	/// \param __b
				169	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	170	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				171	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	172	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	173	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	174	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	175	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	176	}
				177
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	178	/// \brief Divides two 256-bit vectors of [8 x float].
				179	///
				180	/// \headerfile <x86intrin.h>
				181	///
				182	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
				183	///
				184	/// \param __a
				185	/// A 256-bit vector of [8 x float] containing the dividend.
				186	/// \param __b
				187	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	188	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				189	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	190	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	191	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	192	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	193	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	194	}
				195
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	196	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				197	/// of each pair of values.
				198	///
				199	/// \headerfile <x86intrin.h>
				200	///
				201	/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
				202	///
				203	/// \param __a
				204	/// A 256-bit vector of [4 x double] containing one of the operands.
				205	/// \param __b
				206	/// A 256-bit vector of [4 x double] containing one of the operands.
				207	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				208	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	209	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	210	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	211	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	212	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	213	}
				214
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	215	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				216	/// of each pair of values.
				217	///
				218	/// \headerfile <x86intrin.h>
				219	///
				220	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
				221	///
				222	/// \param __a
				223	/// A 256-bit vector of [8 x float] containing one of the operands.
				224	/// \param __b
				225	/// A 256-bit vector of [8 x float] containing one of the operands.
				226	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				227	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	228	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	229	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	230	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	231	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	232	}
				233
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	234	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				235	/// of each pair of values.
				236	///
				237	/// \headerfile <x86intrin.h>
				238	///
				239	/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
				240	///
				241	/// \param __a
				242	/// A 256-bit vector of [4 x double] containing one of the operands.
				243	/// \param __b
				244	/// A 256-bit vector of [4 x double] containing one of the operands.
				245	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				246	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	247	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	248	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	249	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	250	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	251	}
				252
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	253	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				254	/// of each pair of values.
				255	///
				256	/// \headerfile <x86intrin.h>
				257	///
				258	/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
				259	///
				260	/// \param __a
				261	/// A 256-bit vector of [8 x float] containing one of the operands.
				262	/// \param __b
				263	/// A 256-bit vector of [8 x float] containing one of the operands.
				264	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				265	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	266	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	267	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	268	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	269	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	270	}
				271
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	272	/// \brief Multiplies two 256-bit vectors of [4 x double].
				273	///
				274	/// \headerfile <x86intrin.h>
				275	///
				276	/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
				277	///
				278	/// \param __a
				279	/// A 256-bit vector of [4 x double] containing one of the operands.
				280	/// \param __b
				281	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	282	/// \returns A 256-bit vector of [4 x double] containing the products of both
				283	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	284	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	285	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	286	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	287	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	288	}
				289
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	290	/// \brief Multiplies two 256-bit vectors of [8 x float].
				291	///
				292	/// \headerfile <x86intrin.h>
				293	///
				294	/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
				295	///
				296	/// \param __a
				297	/// A 256-bit vector of [8 x float] containing one of the operands.
				298	/// \param __b
				299	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	300	/// \returns A 256-bit vector of [8 x float] containing the products of both
				301	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	302	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	303	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	304	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	305	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	306	}
				307
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	308	/// \brief Calculates the square roots of the values in a 256-bit vector of
				309	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	310	///
				311	/// \headerfile <x86intrin.h>
				312	///
				313	/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
				314	///
				315	/// \param __a
				316	/// A 256-bit vector of [4 x double].
				317	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				318	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	319	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	320	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	321	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	322	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	323	}
				324
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	325	/// \brief Calculates the square roots of the values in a 256-bit vector of
				326	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	327	///
				328	/// \headerfile <x86intrin.h>
				329	///
				330	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
				331	///
				332	/// \param __a
				333	/// A 256-bit vector of [8 x float].
				334	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				335	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	336	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	337	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	338	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	339	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	340	}
				341
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	342	/// \brief Calculates the reciprocal square roots of the values in a 256-bit
				343	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	344	///
				345	/// \headerfile <x86intrin.h>
				346	///
				347	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
				348	///
				349	/// \param __a
				350	/// A 256-bit vector of [8 x float].
				351	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				352	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	353	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	354	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	355	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	356	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	357	}
				358
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	359	/// \brief Calculates the reciprocals of the values in a 256-bit vector of
				360	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	361	///
				362	/// \headerfile <x86intrin.h>
				363	///
				364	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
				365	///
				366	/// \param __a
				367	/// A 256-bit vector of [8 x float].
				368	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				369	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	370	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	371	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	372	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	373	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	374	}
				375
/// \brief Rounds every element of a 256-bit vector of [4 x double] to an
///    integer value, using the rounding behavior selected by the byte operand.
///    The results are returned as 64-bit double-precision floating-point
///    values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  __extension__ ({                                                             \
    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M));             \
  })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	407
/// \brief Rounds every element of a 256-bit vector of [8 x float] to an
///    integer value, using the rounding behavior selected by the byte operand.
///    The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  __extension__ ({                                                             \
    (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M));               \
  })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	439
/// \brief Rounds every element of a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	456
/// \brief Rounds every element of a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	474
/// \brief Rounds every element of a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	491
/// \brief Rounds every element of a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
				508
				509	/* Logical */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	510	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				511	///
				512	/// \headerfile <x86intrin.h>
				513	///
				514	/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
				515	///
				516	/// \param __a
				517	/// A 256-bit vector of [4 x double] containing one of the source operands.
				518	/// \param __b
				519	/// A 256-bit vector of [4 x double] containing one of the source operands.
				520	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				521	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	522	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	523	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	524	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	525	return (__m256d)((__v4di)__a & (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	526	}
				527
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	528	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				529	///
				530	/// \headerfile <x86intrin.h>
				531	///
				532	/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
				533	///
				534	/// \param __a
				535	/// A 256-bit vector of [8 x float] containing one of the source operands.
				536	/// \param __b
				537	/// A 256-bit vector of [8 x float] containing one of the source operands.
				538	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				539	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	540	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	541	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	542	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	543	return (__m256)((__v8si)__a & (__v8si)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	544	}
				545
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	546	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				547	/// the one's complement of the values contained in the first source operand.
				548	///
				549	/// \headerfile <x86intrin.h>
				550	///
				551	/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
				552	///
				553	/// \param __a
				554	/// A 256-bit vector of [4 x double] containing the left source operand. The
				555	/// one's complement of this value is used in the bitwise AND.
				556	/// \param __b
				557	/// A 256-bit vector of [4 x double] containing the right source operand.
				558	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				559	/// values of the second operand and the one's complement of the first
				560	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	561	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	562	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	563	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	564	return (__m256d)(~(__v4di)__a & (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	565	}
				566
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	567	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				568	/// the one's complement of the values contained in the first source operand.
				569	///
				570	/// \headerfile <x86intrin.h>
				571	///
				572	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
				573	///
				574	/// \param __a
				575	/// A 256-bit vector of [8 x float] containing the left source operand. The
				576	/// one's complement of this value is used in the bitwise AND.
				577	/// \param __b
				578	/// A 256-bit vector of [8 x float] containing the right source operand.
				579	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				580	/// values of the second operand and the one's complement of the first
				581	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	582	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	583	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	584	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	585	return (__m256)(~(__v8si)__a & (__v8si)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	586	}
				587
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	588	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				589	///
				590	/// \headerfile <x86intrin.h>
				591	///
				592	/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
				593	///
				594	/// \param __a
				595	/// A 256-bit vector of [4 x double] containing one of the source operands.
				596	/// \param __b
				597	/// A 256-bit vector of [4 x double] containing one of the source operands.
				598	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				599	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	600	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	601	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	602	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	603	return (__m256d)((__v4di)__a \| (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	604	}
				605
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	606	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				607	///
				608	/// \headerfile <x86intrin.h>
				609	///
				610	/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
				611	///
				612	/// \param __a
				613	/// A 256-bit vector of [8 x float] containing one of the source operands.
				614	/// \param __b
				615	/// A 256-bit vector of [8 x float] containing one of the source operands.
				616	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				617	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	618	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	619	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	620	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	621	return (__m256)((__v8si)__a \| (__v8si)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	622	}
				623
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	624	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				625	///
				626	/// \headerfile <x86intrin.h>
				627	///
				628	/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
				629	///
				630	/// \param __a
				631	/// A 256-bit vector of [4 x double] containing one of the source operands.
				632	/// \param __b
				633	/// A 256-bit vector of [4 x double] containing one of the source operands.
				634	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				635	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	636	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	637	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	638	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	639	return (__m256d)((__v4di)__a ^ (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	640	}
				641
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	642	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				643	///
				644	/// \headerfile <x86intrin.h>
				645	///
				646	/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
				647	///
				648	/// \param __a
				649	/// A 256-bit vector of [8 x float] containing one of the source operands.
				650	/// \param __b
				651	/// A 256-bit vector of [8 x float] containing one of the source operands.
				652	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				653	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	654	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	655	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	656	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	657	return (__m256)((__v8si)__a ^ (__v8si)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	658	}
				659
				660	/* Horizontal arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	661	/// \brief Horizontally adds the adjacent pairs of values contained in two
				662	/// 256-bit vectors of [4 x double].
				663	///
				664	/// \headerfile <x86intrin.h>
				665	///
				666	/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
				667	///
				668	/// \param __a
				669	/// A 256-bit vector of [4 x double] containing one of the source operands.
				670	/// The horizontal sums of the values are returned in the even-indexed
				671	/// elements of a vector of [4 x double].
				672	/// \param __b
				673	/// A 256-bit vector of [4 x double] containing one of the source operands.
				674	/// The horizontal sums of the values are returned in the odd-indexed
				675	/// elements of a vector of [4 x double].
				676	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				677	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	678	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	679	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	680	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	681	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	682	}
				683
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	684	/// \brief Horizontally adds the adjacent pairs of values contained in two
				685	/// 256-bit vectors of [8 x float].
				686	///
				687	/// \headerfile <x86intrin.h>
				688	///
				689	/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
				690	///
				691	/// \param __a
				692	/// A 256-bit vector of [8 x float] containing one of the source operands.
				693	/// The horizontal sums of the values are returned in the elements with
				694	/// index 0, 1, 4, 5 of a vector of [8 x float].
				695	/// \param __b
				696	/// A 256-bit vector of [8 x float] containing one of the source operands.
				697	/// The horizontal sums of the values are returned in the elements with
				698	/// index 2, 3, 6, 7 of a vector of [8 x float].
				699	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				700	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	701	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	702	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	703	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	704	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	705	}
				706
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	707	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				708	/// 256-bit vectors of [4 x double].
				709	///
				710	/// \headerfile <x86intrin.h>
				711	///
				712	/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
				713	///
				714	/// \param __a
				715	/// A 256-bit vector of [4 x double] containing one of the source operands.
				716	/// The horizontal differences between the values are returned in the
				717	/// even-indexed elements of a vector of [4 x double].
				718	/// \param __b
				719	/// A 256-bit vector of [4 x double] containing one of the source operands.
				720	/// The horizontal differences between the values are returned in the
				721	/// odd-indexed elements of a vector of [4 x double].
				722	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				723	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	724	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	725	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	726	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	727	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	728	}
				729
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	730	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				731	/// 256-bit vectors of [8 x float].
				732	///
				733	/// \headerfile <x86intrin.h>
				734	///
				735	/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
				736	///
				737	/// \param __a
				738	/// A 256-bit vector of [8 x float] containing one of the source operands.
				739	/// The horizontal differences between the values are returned in the
				740	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				741	/// \param __b
				742	/// A 256-bit vector of [8 x float] containing one of the source operands.
				743	/// The horizontal differences between the values are returned in the
				744	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				745	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				746	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	747	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	748	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	749	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	750	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	751	}
				752
				753	/* Vector permutations */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	754	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				755	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	756	///
				757	/// \headerfile <x86intrin.h>
				758	///
				759	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				760	///
				761	/// \param __a
				762	/// A 128-bit vector of [2 x double].
				763	/// \param __c
				764	/// A 128-bit integer vector operand specifying how the values are to be
				765	/// copied.
				766	/// Bit [1]:
				767	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	768	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	769	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	770	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	771	/// Bit [65]:
				772	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	773	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	774	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	775	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	776	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	777	static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	778	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	779	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	780	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	781	}
				782
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	783	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	784	/// specified by the 256-bit integer vector operand.
				785	///
				786	/// \headerfile <x86intrin.h>
				787	///
				788	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				789	///
				790	/// \param __a
				791	/// A 256-bit vector of [4 x double].
				792	/// \param __c
				793	/// A 256-bit integer vector operand specifying how the values are to be
				794	/// copied.
				795	/// Bit [1]:
				796	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	797	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	798	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	799	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	800	/// Bit [65]:
				801	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	802	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	803	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	804	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	805	/// Bit [129]:
				806	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	807	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	808	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	809	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	810	/// Bit [193]:
				811	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	812	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	813	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	814	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	815	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	816	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	817	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	818	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	819	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	820	}
				821
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	822	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				823	/// specified by the 128-bit integer vector operand.
				824	///
				825	/// \headerfile <x86intrin.h>
				826	///
				827	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				828	///
				829	/// \param __a
				830	/// A 128-bit vector of [4 x float].
				831	/// \param __c
				832	/// A 128-bit integer vector operand specifying how the values are to be
				833	/// copied.
				834	/// Bits [1:0]:
				835	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	836	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	837	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	838	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	839	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	840	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	841	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	842	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	843	/// Bits [33:32]:
				844	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	845	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	846	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	847	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	848	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	849	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	850	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	851	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	852	/// Bits [65:64]:
				853	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	854	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	855	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	856	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	857	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	858	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	859	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	860	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	861	/// Bits [97:96]:
				862	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	863	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	864	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	865	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	866	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	867	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	868	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	869	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	870	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	871	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	872	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	873	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	874	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	875	}
				876
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	877	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				878	/// specified by the 256-bit integer vector operand.
				879	///
				880	/// \headerfile <x86intrin.h>
				881	///
				882	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				883	///
				884	/// \param __a
				885	/// A 256-bit vector of [8 x float].
				886	/// \param __c
				887	/// A 256-bit integer vector operand specifying how the values are to be
				888	/// copied.
				889	/// Bits [1:0]:
				890	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	891	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	892	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	893	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	894	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	895	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	896	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	897	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	898	/// Bits [33:32]:
				899	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	900	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	901	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	902	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	903	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	904	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	905	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	906	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	907	/// Bits [65:64]:
				908	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	909	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	910	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	911	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	912	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	913	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	914	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	915	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	916	/// Bits [97:96]:
				917	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	918	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	919	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	920	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	921	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	922	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	923	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	924	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	925	/// Bits [129:128]:
				926	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	927	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	928	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	929	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	930	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	931	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	932	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	933	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	934	/// Bits [161:160]:
				935	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	936	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	937	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	938	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	939	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	940	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	941	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	942	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	943	/// Bits [193:192]:
				944	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	945	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	946	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	947	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	948	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	949	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	950	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	951	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	952	/// Bits [225:224]:
				953	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	954	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	955	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	956	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	957	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	958	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	959	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	960	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	961	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	962	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	963	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	964	{ /* Thin wrapper: __a and __c are cast to the __v8sf and __v8si types handed to the builtin, and the builtin result is cast back to __m256. */
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	965	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	966	}
				967
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	968	/// \brief Copies the values in a 128-bit vector of [2 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	969	/// specified by the immediate integer operand.
				970	///
				971	/// \headerfile <x86intrin.h>
				972	///
				973	/// \code
				974	/// __m128d _mm_permute_pd(__m128d A, const int C);
				975	/// \endcode
				976	///
				977	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				978	///
				979	/// \param A
				980	/// A 128-bit vector of [2 x double].
				981	/// \param C
				982	/// An immediate integer operand specifying how the values are to be copied.
				983	/// Bit [0]:
				984	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	985	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	986	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	987	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	988	/// Bit [1]:
				989	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	990	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	991	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	992	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	993	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	994	#define _mm_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	995	(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
				996	(__v2df)_mm_setzero_pd(), /* filler second operand: both shuffle indices below are 0 or 1, which name elements of A */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	997	(C) & 0x1, ((C) & 0x2) >> 1); }) /* result element i is the element of A indexed by bit i of C; C is expanded once per index */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	998
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	999	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1000	/// specified by the immediate integer operand.
				1001	///
				1002	/// \headerfile <x86intrin.h>
				1003	///
				1004	/// \code
				1005	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1006	/// \endcode
				1007	///
				1008	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				1009	///
				1010	/// \param A
				1011	/// A 256-bit vector of [4 x double].
				1012	/// \param C
				1013	/// An immediate integer operand specifying how the values are to be copied.
				1014	/// Bit [0]:
				1015	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1016	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1017	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1018	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1019	/// Bit [1]:
				1020	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1021	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1022	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1023	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1024	/// Bit [2]:
				1025	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1026	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1027	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1028	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1029	/// Bit [3]:
				1030	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1031	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1032	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1033	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1034	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1035	#define _mm256_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1036	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
				1037	(__v4df)_mm256_setzero_pd(), /* filler second operand: every shuffle index below is in 0..3, which names elements of A */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1038	(C) & 0x1, ((C) & 0x2) >> 1, /* result elements 0 and 1: bits 0 and 1 of C pick element 0 or 1 */ \
				1039	2 + (((C) & 0x4) >> 2), /* result element 2: bit 2 of C picks element 2 or 3 */ \
				1040	2 + (((C) & 0x8) >> 3)); }) /* result element 3: bit 3 of C picks element 2 or 3, so picks never cross the 128-bit halves */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1041
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1042	/// \brief Copies the values in a 128-bit vector of [4 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1043	/// specified by the immediate integer operand.
				1044	///
				1045	/// \headerfile <x86intrin.h>
				1046	///
				1047	/// \code
				1048	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1049	/// \endcode
				1050	///
				1051	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1052	///
				1053	/// \param A
				1054	/// A 128-bit vector of [4 x float].
				1055	/// \param C
				1056	/// An immediate integer operand specifying how the values are to be copied.
				1057	/// Bits [1:0]:
				1058	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1059	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1060	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1061	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1062	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1063	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1064	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1065	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1066	/// Bits [3:2]:
				1067	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1068	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1069	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1070	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1071	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1072	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1073	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1074	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1075	/// Bits [5:4]:
				1076	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1077	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1078	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1079	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1080	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1081	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1082	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1083	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1084	/// Bits [7:6]:
				1085	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1086	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1087	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1088	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1089	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1090	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1091	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1092	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1093	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1094	#define _mm_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1095	(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
				1096	(__v4sf)_mm_setzero_ps(), /* filler second operand: every shuffle index below is in 0..3, which names elements of A */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1097	(C) & 0x3, ((C) & 0xc) >> 2, /* result elements 0 and 1: 2-bit fields [1:0] and [3:2] of C are source element indices */ \
Craig Topper	678a53c	2012-03-30 05:09:18 +0000	[diff] [blame]	1098	((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) /* result elements 2 and 3: 2-bit fields [5:4] and [7:6] of C */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1099
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1100	/// \brief Copies the values in a 256-bit vector of [8 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1101	/// specified by the immediate integer operand.
				1102	///
				1103	/// \headerfile <x86intrin.h>
				1104	///
				1105	/// \code
				1106	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1107	/// \endcode
				1108	///
				1109	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1110	///
				1111	/// \param A
				1112	/// A 256-bit vector of [8 x float].
				1113	/// \param C
				1114	/// An immediate integer operand specifying how the values are to be copied.
				1115	/// Bits [1:0]:
				1116	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1117	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1118	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1119	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1120	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1121	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1122	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1123	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1124	/// Bits [3:2]:
				1125	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1126	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1127	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1128	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1129	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1130	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1131	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1132	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1133	/// Bits [5:4]:
				1134	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1135	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1136	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1137	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1138	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1139	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1140	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1141	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1142	/// Bits [7:6]:
				1143	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1144	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1145	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1146	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1147	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1148	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1149	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1150	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1151	/// Bits [1:0]:
				1152	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1153	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1154	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1155	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1156	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1157	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1158	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1159	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1160	/// Bits [3:2]:
				1161	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1162	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1163	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1164	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1165	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1166	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1167	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1168	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1169	/// Bits [5:4]:
				1170	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1171	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1172	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1173	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1174	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1175	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1176	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1177	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1178	/// Bits [7:6]:
				1179	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1180	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1181	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1182	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1183	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1184	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1185	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1186	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1187	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1188	#define _mm256_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1189	(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
				1190	(__v8sf)_mm256_setzero_ps(), /* filler second operand: every shuffle index below is in 0..7, which names elements of A */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1191	(C) & 0x3, ((C) & 0xc) >> 2, /* result elements 0..3: the four 2-bit fields of C index elements 0..3 */ \
				1192	((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
				1193	4 + (((C) & 0x03) >> 0), /* result elements 4..7: the same four 2-bit fields are reused, offset by 4 to index elements 4..7 */ \
				1194	4 + (((C) & 0x0c) >> 2), \
				1195	4 + (((C) & 0x30) >> 4), \
				1196	4 + (((C) & 0xc0) >> 6)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1197
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1198	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1199	/// [4 x double], as specified by the immediate integer operand.
				1200	///
				1201	/// \headerfile <x86intrin.h>
				1202	///
				1203	/// \code
				1204	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1205	/// \endcode
				1206	///
				1207	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1208	///
				1209	/// \param V1
				1210	/// A 256-bit vector of [4 x double].
				1211	/// \param V2
				1212	/// A 256-bit vector of [4 x double].
				1213	/// \param M
				1214	/// An immediate integer operand specifying how the values are to be
				1215	/// permuted.
				1216	/// Bits [1:0]:
				1217	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1218	/// destination.
				1219	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1220	/// destination.
				1221	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1222	/// destination.
				1223	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1224	/// destination.
				1225	/// Bits [5:4]:
				1226	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1227	/// destination.
				1228	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1229	/// destination.
				1230	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1231	/// destination.
				1232	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1233	/// destination.
				1234	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1235	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1236	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), /* each operand is cast to __m256d first, then to the __v4df type handed to the builtin */ \
				1237	(__v4df)(__m256d)(V2), (M)); }) /* M is handed to the builtin unchanged; this macro does no masking of its own */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1238
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1239	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1240	/// [8 x float], as specified by the immediate integer operand.
				1241	///
				1242	/// \headerfile <x86intrin.h>
				1243	///
				1244	/// \code
				1245	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1246	/// \endcode
				1247	///
				1248	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1249	///
				1250	/// \param V1
				1251	/// A 256-bit vector of [8 x float].
				1252	/// \param V2
				1253	/// A 256-bit vector of [8 x float].
				1254	/// \param M
				1255	/// An immediate integer operand specifying how the values are to be
				1256	/// permuted.
				1257	/// Bits [1:0]:
				1258	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1259	/// destination.
				1260	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1261	/// destination.
				1262	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1263	/// destination.
				1264	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1265	/// destination.
				1266	/// Bits [5:4]:
				1267	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1268	/// destination.
				1269	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1270	/// destination.
				1271	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1272	/// destination.
				1273	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1274	/// destination.
				1275	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1276	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1277	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), /* each operand is cast to __m256 first, then to the __v8sf type handed to the builtin */ \
				1278	(__v8sf)(__m256)(V2), (M)); }) /* M is handed to the builtin unchanged; this macro does no masking of its own */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1279
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1280	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
				1281	/// as specified by the immediate integer operand.
				1282	///
				1283	/// \headerfile <x86intrin.h>
				1284	///
				1285	/// \code
				1286	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1287	/// \endcode
				1288	///
				1289	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1290	///
				1291	/// \param V1
				1292	/// A 256-bit integer vector.
				1293	/// \param V2
				1294	/// A 256-bit integer vector.
				1295	/// \param M
				1296	/// An immediate integer operand specifying how the values are to be copied.
				1297	/// Bits [1:0]:
				1298	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1299	/// destination.
				1300	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1301	/// destination.
				1302	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1303	/// destination.
				1304	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1305	/// destination.
				1306	/// Bits [5:4]:
				1307	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1308	/// destination.
				1309	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1310	/// destination.
				1311	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1312	/// destination.
				1313	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1314	/// destination.
				1315	/// \returns A 256-bit integer vector containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1316	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1317	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), /* each operand is cast to __m256i first, then to the __v8si type handed to the builtin */ \
				1318	(__v8si)(__m256i)(V2), (M)); }) /* M is handed to the builtin unchanged; this macro does no masking of its own */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1319
				1320	/* Vector Blend */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1321	/// \brief Merges 64-bit double-precision data values stored in either of the
				1322	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1323	/// integer operand.
				1324	///
				1325	/// \headerfile <x86intrin.h>
				1326	///
				1327	/// \code
				1328	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1329	/// \endcode
				1330	///
				1331	/// This intrinsic corresponds to the \c VBLENDPD instruction.
				1332	///
				1333	/// \param V1
				1334	/// A 256-bit vector of [4 x double].
				1335	/// \param V2
				1336	/// A 256-bit vector of [4 x double].
				1337	/// \param M
				1338	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1339	/// values are to be copied. The position of the mask bit corresponds to the
				1340	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				1341	/// element in operand V1 is copied to the same position in the destination.
				1342	/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is
				1343	/// copied to the same position in the destination.
				1344	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1345	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1346	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				1347	(__v4df)(__m256d)(V2), /* shuffle indices 0..3 name elements of V1, indices 4..7 name elements of V2 */ \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1348	(((M) & 0x01) ? 4 : 0), /* result element i: element i of V2 when bit i of M is set, otherwise element i of V1 */ \
				1349	(((M) & 0x02) ? 5 : 1), \
				1350	(((M) & 0x04) ? 6 : 2), \
				1351	(((M) & 0x08) ? 7 : 3)); }) /* only bits [3:0] of M are tested */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1352
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1353	/// \brief Merges 32-bit single-precision data values stored in either of the
				1354	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1355	/// integer operand.
				1356	///
				1357	/// \headerfile <x86intrin.h>
				1358	///
				1359	/// \code
				1360	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1361	/// \endcode
				1362	///
				1363	/// This intrinsic corresponds to the \c VBLENDPS instruction.
				1364	///
				1365	/// \param V1
				1366	/// A 256-bit vector of [8 x float].
				1367	/// \param V2
				1368	/// A 256-bit vector of [8 x float].
				1369	/// \param M
				1370	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1371	/// values are to be copied. The position of the mask bit corresponds to the
				1372	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				1373	/// element in operand V1 is copied to the same position in the destination.
				1374	/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is
				1375	/// copied to the same position in the destination.
				1376	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1377	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1378	(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				1379	(__v8sf)(__m256)(V2), /* shuffle indices 0..7 name elements of V1, indices 8..15 name elements of V2 */ \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1380	(((M) & 0x01) ? 8 : 0), /* result element i: element i of V2 when bit i of M is set, otherwise element i of V1 */ \
				1381	(((M) & 0x02) ? 9 : 1), \
				1382	(((M) & 0x04) ? 10 : 2), \
				1383	(((M) & 0x08) ? 11 : 3), \
				1384	(((M) & 0x10) ? 12 : 4), \
				1385	(((M) & 0x20) ? 13 : 5), \
				1386	(((M) & 0x40) ? 14 : 6), \
				1387	(((M) & 0x80) ? 15 : 7)); }) /* only bits [7:0] of M are tested */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1388
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1389	/// \brief Merges 64-bit double-precision data values stored in either of the
				1390	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1391	/// operand.
				1392	///
				1393	/// \headerfile <x86intrin.h>
				1394	///
				1395	/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
				1396	///
				1397	/// \param __a
				1398	/// A 256-bit vector of [4 x double].
				1399	/// \param __b
				1400	/// A 256-bit vector of [4 x double].
				1401	/// \param __c
				1402	/// A 256-bit vector of [4 x double] used as a mask. Bits 255, 191, 127, and
				1403	/// 63 (the most significant bit of each 64-bit element) specify how the
				1404	/// values are to be copied. When a mask bit is 0, the
				1405	/// corresponding 64-bit element in operand __a is copied to the same
				1406	/// position in the destination. When a mask bit is 1, the corresponding
				1407	/// 64-bit element in operand __b is copied to the same position in the
				1408	/// destination.
				1409	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1410	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1411	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1412	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1413	return (__m256d)__builtin_ia32_blendvpd256(
				1414	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1415	}
				1416
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1417	/// \brief Merges 32-bit single-precision data values stored in either of the
				1418	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1419	/// operand.
				1420	///
				1421	/// \headerfile <x86intrin.h>
				1422	///
				1423	/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
				1424	///
				1425	/// \param __a
				1426	/// A 256-bit vector of [8 x float].
				1427	/// \param __b
				1428	/// A 256-bit vector of [8 x float].
				1429	/// \param __c
				1430	/// A 256-bit vector of [8 x float] used as a mask. Bits 255, 223, 191, 159,
				1431	/// 127, 95, 63, and 31 (the most significant bit of each 32-bit element)
				1432	/// specify how the values are to be copied. When a mask bit is 0, the
				1433	/// corresponding 32-bit element in operand __a is copied to the same
				1434	/// position in the destination. When a mask bit is 1, the corresponding
				1435	/// 32-bit element in operand __b is copied to the same position in the
				1436	/// destination.
				1437	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1438	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1439	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1440	{
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1441	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1442	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1443	}
				1444
				1445	/* Vector Dot Product */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1446	/// \brief Computes two dot products in parallel, using the lower and upper
				1447	/// halves of two [8 x float] vectors as input to the two computations, and
				1448	/// returning the two dot products in the lower and upper halves of the
				1449	/// [8 x float] result. The immediate integer operand controls which
				1450	/// input elements will contribute to the dot product, and where the final
				1451	/// results are returned. In general, for each dot product, the four
				1452	/// corresponding elements of the input vectors are multiplied; the first
				1453	/// two and second two products are summed, then the two sums are added to
				1454	/// form the final result.
				1455	///
				1456	/// \headerfile <x86intrin.h>
				1457	///
				1458	/// \code
				1459	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1460	/// \endcode
				1461	///
				1462	/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
				1463	///
				1464	/// \param V1
				1465	/// A 256-bit vector of [8 x float], treated as two [4 x float] vectors.
				1466	/// \param V2
				1467	/// A 256-bit vector of [8 x float], treated as two [4 x float] vectors.
				1468	/// \param M
				1469	/// An immediate integer argument. Bits [7:4] determine which elements of
				1470	/// the input vectors are used, with bit [4] corresponding to the lowest
				1471	/// element and bit [7] corresponding to the highest element of each [4 x
				1472	/// float] subvector. If a bit is set, the corresponding elements from the
				1473	/// two input vectors are used as an input for dot product; otherwise that
				1474	/// input is treated as zero. Bits [3:0] determine which elements of the
				1475	/// result will receive a copy of the final dot product, with bit [0]
				1476	/// corresponding to the lowest element and bit [3] corresponding to the
				1477	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1478	/// product is returned in the corresponding element; otherwise that element
				1479	/// is set to zero. The bitmask is applied in the same way to each of the
				1480	/// two parallel dot product computations.
				1481	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1482	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1483	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
				1484	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1485
				1486	/* Vector shuffle */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1487	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
				1488	/// specified by the immediate value operand. The four selected elements in
				1489	/// each operand are copied to the destination according to the bits
				1490	/// specified in the immediate operand. The selected elements from the first
				1491	/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
				1492	/// destination, and the selected elements from the second 256-bit operand
				1493	/// are copied to bits [127:64] and bits [255:192] of the destination. For
				1494	/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
				1495	/// the 256-bit destination vector would contain the following values: b[7],
				1496	/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
				1497	///
				1498	/// \headerfile <x86intrin.h>
				1499	///
				1500	/// \code
				1501	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1502	/// \endcode
				1503	///
				1504	/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
				1505	///
				1506	/// \param a
				1507	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1508	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1509	/// according to the bits specified in the immediate operand.
				1510	/// \param b
				1511	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1512	/// operand are copied to bits [127:64] and bits [255:192] in the
				1513	/// destination, according to the bits specified in the immediate operand.
				1514	/// \param mask
				1515	/// An immediate value containing an 8-bit value specifying which elements to
				1516	/// copy from a and b. Bits [3:0] specify the values copied from operand a.
				1517	/// Bits [7:4] specify the values copied from operand b.
				1518	/// Each 2-bit field selects one element from each 128-bit half of its source
				1519	/// operand, according to the bit value assignments described below:
				1520	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
				1521	/// destination.
				1522	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
				1523	/// destination.
				1524	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
				1525	/// destination.
				1526	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
				1527	/// the destination.
				1528	/// Bit value assignments:
				1529	/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
				1530	/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
				1531	/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
				1532	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1533	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1534	#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1535	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1536	(__v8sf)(__m256)(b), \
				1537	(mask) & 0x3, \
				1538	((mask) & 0xc) >> 2, \
				1539	(((mask) & 0x30) >> 4) + 8, \
				1540	(((mask) & 0xc0) >> 6) + 8, \
				1541	((mask) & 0x3) + 4, \
				1542	(((mask) & 0xc) >> 2) + 4, \
				1543	(((mask) & 0x30) >> 4) + 12, \
				1544	(((mask) & 0xc0) >> 6) + 12); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1545
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1546	/// \brief Selects four double-precision values from the 256-bit operands of
				1547	/// [4 x double], as specified by the immediate value operand. The selected
				1548	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1549	/// bits [191:128] in the destination, and the selected elements from the
				1550	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
				1551	/// the destination. For example, if bits [3:0] of the immediate operand
				1552	/// contain a value of 0xF, the 256-bit destination vector would contain the
				1553	/// following values: b[3], a[3], b[1], a[1].
				1554	///
				1555	/// \headerfile <x86intrin.h>
				1556	///
				1557	/// \code
				1558	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1559	/// \endcode
				1560	///
				1561	/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
				1562	///
				1563	/// \param a
				1564	/// A 256-bit vector of [4 x double].
				1565	/// \param b
				1566	/// A 256-bit vector of [4 x double].
				1567	/// \param mask
				1568	/// An immediate 8-bit value whose bits [3:0] specify which elements to copy
				1569	/// from a and b (bits [7:4] are not examined):
				1570	/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
				1571	/// destination.
				1572	/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
				1573	/// destination.
				1574	/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
				1575	/// destination.
				1576	/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
				1577	/// destination.
				1578	/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
				1579	/// destination.
				1580	/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
				1581	/// destination.
				1582	/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
				1583	/// destination.
				1584	/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
				1585	/// destination.
				1586	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1587	#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1588	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1589	(__v4df)(__m256d)(b), \
				1590	(mask) & 0x1, \
				1591	(((mask) & 0x2) >> 1) + 4, \
				1592	(((mask) & 0x4) >> 2) + 2, \
				1593	(((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1594
				1595	/* Compare: 5-bit predicate encodings for the cmp intrinsics' immediate */
				1596	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1597	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1598	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1599	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1600	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1601	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1602	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
				1603	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
				1604	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
				1605	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
				1606	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1607	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1608	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1609	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1610	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1611	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1612	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1613	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1614	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1615	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1616	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1617	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
				1618	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
				1619	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1620	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
				1621	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
				1622	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1623	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1624	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1625	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1626	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1627	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1628
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1629	/// \brief Compares each of the corresponding double-precision values of two
				1630	/// 128-bit vectors of [2 x double], using the operation specified by the
				1631	/// immediate integer operand. Returns a [2 x double] vector consisting of
				1632	/// two doubles corresponding to the two comparison results: zero if the
				1633	/// comparison is false, and all 1's if the comparison is true.
				1634	///
				1635	/// \headerfile <x86intrin.h>
				1636	///
				1637	/// \code
				1638	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1639	/// \endcode
				1640	///
				1641	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1642	///
				1643	/// \param a
				1644	/// A 128-bit vector of [2 x double].
				1645	/// \param b
				1646	/// A 128-bit vector of [2 x double].
				1647	/// \param c
				1648	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1649	/// operation to use:
				1650	/// 00h, 08h, 10h, 18h: Equal
				1651	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1652	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1653	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1654	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1655	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1656	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1657	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1658	/// The _CMP_* constants name each encoding, including its ordered/unordered
				1659	/// and signaling/non-signaling variant.
				1660	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1661	#define _mm_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1662	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
				1663	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1664
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1665	/// \brief Compares each of the corresponding values of two 128-bit vectors of
				1666	/// [4 x float], using the operation specified by the immediate integer
				1667	/// operand. Returns a [4 x float] vector consisting of four floats
				1668	/// corresponding to the four comparison results: zero if the comparison is
				1669	/// false, and all 1's if the comparison is true.
				1670	///
				1671	/// \headerfile <x86intrin.h>
				1672	///
				1673	/// \code
				1674	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1675	/// \endcode
				1676	///
				1677	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1678	///
				1679	/// \param a
				1680	/// A 128-bit vector of [4 x float].
				1681	/// \param b
				1682	/// A 128-bit vector of [4 x float].
				1683	/// \param c
				1684	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1685	/// operation to use:
				1686	/// 00h, 08h, 10h, 18h: Equal
				1687	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1688	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1689	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1690	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1691	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1692	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1693	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1694	/// The _CMP_* constants name each encoding, including its ordered/unordered
				1695	/// and signaling/non-signaling variant.
				1696	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1697	#define _mm_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1698	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
				1699	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1700
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1701	/// \brief Compares each of the corresponding double-precision values of two
				1702	/// 256-bit vectors of [4 x double], using the operation specified by the
				1703	/// immediate integer operand. Returns a [4 x double] vector consisting of
				1704	/// four doubles corresponding to the four comparison results: zero if the
				1705	/// comparison is false, and all 1's if the comparison is true.
				1706	///
				1707	/// \headerfile <x86intrin.h>
				1708	///
				1709	/// \code
				1710	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1711	/// \endcode
				1712	///
				1713	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1714	///
				1715	/// \param a
				1716	/// A 256-bit vector of [4 x double].
				1717	/// \param b
				1718	/// A 256-bit vector of [4 x double].
				1719	/// \param c
				1720	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1721	/// operation to use:
				1722	/// 00h, 08h, 10h, 18h: Equal
				1723	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1724	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1725	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1726	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1727	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1728	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1729	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1730	/// The _CMP_* constants name each encoding, including its ordered/unordered
				1731	/// and signaling/non-signaling variant.
				1732	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1733	#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1734	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
				1735	(__v4df)(__m256d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1736
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1737	/// \brief Compares each of the corresponding values of two 256-bit vectors of
				1738	/// [8 x float], using the operation specified by the immediate integer
				1739	/// operand. Returns an [8 x float] vector consisting of eight floats
				1740	/// corresponding to the eight comparison results: zero if the comparison is
				1741	/// false, and all 1's if the comparison is true.
				1742	///
				1743	/// \headerfile <x86intrin.h>
				1744	///
				1745	/// \code
				1746	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1747	/// \endcode
				1748	///
				1749	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1750	///
				1751	/// \param a
				1752	/// A 256-bit vector of [8 x float].
				1753	/// \param b
				1754	/// A 256-bit vector of [8 x float].
				1755	/// \param c
				1756	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1757	/// operation to use:
				1758	/// 00h, 08h, 10h, 18h: Equal
				1759	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1760	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1761	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1762	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1763	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1764	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1765	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1766	/// The _CMP_* constants name each encoding, including its ordered/unordered
				1767	/// and signaling/non-signaling variant.
				1768	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1769	#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1770	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
				1771	(__v8sf)(__m256)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1772
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1773	/// \brief Compares each of the corresponding scalar double-precision values of
				1774	/// two 128-bit vectors of [2 x double], using the operation specified by the
				1775	/// immediate integer operand. If the result is true, all 64 bits of the
				1776	/// destination vector are set; otherwise they are cleared.
				1777	///
				1778	/// \headerfile <x86intrin.h>
				1779	///
				1780	/// \code
				1781	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1782	/// \endcode
				1783	///
				1784	/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
				1785	///
				1786	/// \param a
				1787	/// A 128-bit vector of [2 x double].
				1788	/// \param b
				1789	/// A 128-bit vector of [2 x double].
				1790	/// \param c
				1791	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1792	/// operation to use:
				1793	/// 00h, 08h, 10h, 18h: Equal
				1794	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1795	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1796	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1797	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1798	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1799	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1800	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1801	/// The _CMP_* constants name each encoding, including its ordered/unordered
				1802	/// and signaling/non-signaling variant.
				1803	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1804	#define _mm_cmp_sd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1805	(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
				1806	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1807
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1808	/// \brief Compares each of the corresponding scalar values of two 128-bit
				1809	/// vectors of [4 x float], using the operation specified by the immediate
				1810	/// integer operand. If the result is true, all 32 bits of the destination
				1811	/// vector are set; otherwise they are cleared.
				1812	///
				1813	/// \headerfile <x86intrin.h>
				1814	///
				1815	/// \code
				1816	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1817	/// \endcode
				1818	///
				1819	/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
				1820	///
				1821	/// \param a
				1822	/// A 128-bit vector of [4 x float].
				1823	/// \param b
				1824	/// A 128-bit vector of [4 x float].
				1825	/// \param c
				1826	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1827	/// operation to use:
				1828	/// 00h, 08h, 10h, 18h: Equal
				1829	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1830	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1831	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1832	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1833	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1834	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1835	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1836	/// The _CMP_* constants name each encoding, including its ordered/unordered
				1837	/// and signaling/non-signaling variant.
				1838	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1839	#define _mm_cmp_ss(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1840	(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
				1841	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1842
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1843	/// \brief Takes an [8 x i32] vector and returns the vector element value
				1844	/// indexed by the immediate constant operand.
				1845	///
				1846	/// \headerfile <x86intrin.h>
				1847	///
				1848	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1849	/// EXTRACTF128+COMPOSITE instruction.
				1850	///
				1851	/// \param __a
				1852	/// A 256-bit vector of [8 x i32].
				1853	/// \param __imm
				1854	/// An immediate integer operand; bits [2:0] select the vector element that
				1855	/// is extracted and returned. The remaining bits are ignored.
				1856	/// \returns A 32-bit integer containing the extracted 32 bits of extended
				1857	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1858	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1859	_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1860	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1861	__v8si __b = (__v8si)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1862	return __b[__imm & 7];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1863	}
				1864
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1865	/// \brief Takes a [16 x i16] vector and returns the vector element value
				1866	/// indexed by the immediate constant operand.
				1867	///
				1868	/// \headerfile <x86intrin.h>
				1869	///
				1870	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1871	/// EXTRACTF128+COMPOSITE instruction.
				1872	///
				1873	/// \param __a
				1874	/// A 256-bit integer vector of [16 x i16].
				1875	/// \param __imm
				1876	/// An immediate integer operand; bits [3:0] select the vector element that
				1877	/// is extracted and returned. The remaining bits are ignored.
				1878	/// \returns A 32-bit integer containing the extracted 16 bits of
				1879	/// zero-extended packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1880	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1881	_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1882	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1883	__v16hi __b = (__v16hi)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1884	return (unsigned short)__b[__imm & 15]; /* zero-extend; a bare short lane would sign-extend */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1885	}
				1886
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1887	/// \brief Takes a [32 x i8] vector and returns the vector element value
				1888	/// indexed by the immediate constant operand.
				1889	///
				1890	/// \headerfile <x86intrin.h>
				1891	///
				1892	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1893	/// EXTRACTF128+COMPOSITE instruction.
				1894	///
				1895	/// \param __a
				1896	/// A 256-bit integer vector of [32 x i8].
				1897	/// \param __imm
				1898	/// An immediate integer operand; bits [4:0] select the vector element that
				1899	/// is extracted and returned. The remaining bits are ignored.
				1900	/// \returns A 32-bit integer containing the extracted 8 bits of zero-extended
				1901	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1902	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1903	_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1904	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1905	__v32qi __b = (__v32qi)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1906	return (unsigned char)__b[__imm & 31]; /* zero-extend; a bare char lane would sign-extend */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1907	}
				1908
				1909	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1910	/// \brief Takes a [4 x i64] vector and returns the vector element value
				1911	/// indexed by the immediate constant operand.
				1912	///
				1913	/// \headerfile <x86intrin.h>
				1914	///
				1915	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1916	/// EXTRACTF128+COMPOSITE instruction.
				1917	///
				1918	/// \param __a
				1919	/// A 256-bit integer vector of [4 x i64].
				1920	/// \param __imm
				1921	/// An immediate integer operand; bits [1:0] select the vector element that
				1922	/// is extracted and returned. The remaining bits are ignored.
				1923	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				1924	/// packed data.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1925	static __inline long long __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1926	_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1927	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1928	__v4di __b = (__v4di)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1929	return __b[__imm & 3];
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1930	}
				1931	#endif /* __x86_64__ */
				1932
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1933	/// \brief Takes a [8 x i32] vector and replaces the vector element value
				1934	/// indexed by the immediate constant operand by a new value. Returns the
				1935	/// modified vector.
				1936	///
				1937	/// \headerfile <x86intrin.h>
				1938	///
				1939	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1940	/// INSERTF128+COMPOSITE instruction.
				1941	///
				1942	/// \param __a
				1943	/// A vector of [8 x i32] to be used by the insert operation.
				1944	/// \param __b
				1945	/// An integer value. The replacement value for the insert operation.
				1946	/// \param __imm
				1947	/// An immediate integer specifying the index of the vector element to be
				1948	/// replaced.
				1949	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1950	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1951	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1952	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1953	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1954	__v8si __c = (__v8si)__a;
				1955	__c[__imm & 7] = __b;
				1956	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1957	}
				1958
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1959
				1960	/// \brief Takes a [16 x i16] vector and replaces the vector element value
				1961	/// indexed by the immediate constant operand with a new value. Returns the
				1962	/// modified vector.
				1963	///
				1964	/// \headerfile <x86intrin.h>
				1965	///
				1966	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1967	/// INSERTF128+COMPOSITE instruction.
				1968	///
				1969	/// \param __a
				1970	/// A vector of [16 x i16] to be used by the insert operation.
				1971	/// \param __b
				1972	/// An i16 integer value. The replacement value for the insert operation.
				1973	/// \param __imm
				1974	/// An immediate integer specifying the index of the vector element to be
				1975	/// replaced.
				1976	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1977	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1978	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1979	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1980	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1981	__v16hi __c = (__v16hi)__a;
				1982	__c[__imm & 15] = __b;
				1983	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1984	}
				1985
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	1986	/// \brief Takes a [32 x i8] vector and replaces the vector element value
				1987	/// indexed by the immediate constant operand with a new value. Returns the
				1988	/// modified vector.
				1989	///
				1990	/// \headerfile <x86intrin.h>
				1991	///
				1992	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1993	/// INSERTF128+COMPOSITE instruction.
				1994	///
				1995	/// \param __a
				1996	/// A vector of [32 x i8] to be used by the insert operation.
				1997	/// \param __b
				1998	/// An i8 integer value. The replacement value for the insert operation.
				1999	/// \param __imm
				2000	/// An immediate integer specifying the index of the vector element to be
				2001	/// replaced.
				2002	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2003	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2004	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2005	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2006	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2007	__v32qi __c = (__v32qi)__a;
				2008	__c[__imm & 31] = __b;
				2009	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2010	}
				2011
				2012	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	2013	/// \brief Takes a [4 x i64] vector and replaces the vector element value
				2014	/// indexed by the immediate constant operand with a new value. Returns the
				2015	/// modified vector.
				2016	///
				2017	/// \headerfile <x86intrin.h>
				2018	///
				2019	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				2020	/// INSERTF128+COMPOSITE instruction.
				2021	///
				2022	/// \param __a
				2023	/// A vector of [4 x i64] to be used by the insert operation.
				2024	/// \param __b
				2025	/// A 64-bit integer value. The replacement value for the insert operation.
				2026	/// \param __imm
				2027	/// An immediate integer specifying the index of the vector element to be
				2028	/// replaced.
				2029	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2030	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2031	static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhas	d740029	2015-02-19 19:00:33 +0000	[diff] [blame]	2032	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2033	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2034	__v4di __c = (__v4di)__a;
				2035	__c[__imm & 3] = __b;
				2036	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2037	}
				2038	#endif
				2039
				2040	/* Conversion */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	2041	/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
				2042	///
				2043	/// \headerfile <x86intrin.h>
				2044	///
				2045	/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
				2046	///
				2047	/// \param __a
				2048	/// A 128-bit integer vector of [4 x i32].
				2049	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2050	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2051	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2052	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2053	return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2054	}
				2055
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	2056	/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
				2057	///
				2058	/// \headerfile <x86intrin.h>
				2059	///
				2060	/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
				2061	///
				2062	/// \param __a
				2063	/// A 256-bit integer vector.
				2064	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2065	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2066	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2067	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2068	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2069	}
				2070
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	2071	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2072	/// [4 x float].
				2073	///
				2074	/// \headerfile <x86intrin.h>
				2075	///
				2076	/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
				2077	///
				2078	/// \param __a
				2079	/// A 256-bit vector of [4 x double].
				2080	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2081	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2082	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2083	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2084	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2085	}
				2086
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame^]	2087	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
				2088	///
				2089	/// \headerfile <x86intrin.h>
				2090	///
				2091	/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
				2092	///
				2093	/// \param __a
				2094	/// A 256-bit vector of [8 x float].
				2095	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2096	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2097	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2098	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2099	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2100	}
				2101
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2102	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2103	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2104	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2105	return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2106	}
				2107
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2108	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2109	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2110	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2111	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2112	}
				2113
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2114	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2115	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2116	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2117	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2118	}
				2119
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2120	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2121	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2122	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2123	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2124	}
				2125
				2126	/* Vector replicate */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2127	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2128	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2129	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2130	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2131	}
				2132
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2133	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2134	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2135	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2136	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2137	}
				2138
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2139	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2140	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2141	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2142	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2143	}
				2144
				2145	/* Unpack and Interleave */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2146	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2147	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2148	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2149	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2150	}
				2151
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2152	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2153	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2154	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2155	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2156	}
				2157
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2158	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2159	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2160	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2161	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2162	}
				2163
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2164	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2165	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2166	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2167	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2168	}
				2169
				2170	/* Bit Test */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2171	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2172	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2173	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2174	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2175	}
				2176
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2177	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2178	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2179	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2180	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2181	}
				2182
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2183	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2184	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2185	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2186	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2187	}
				2188
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2189	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2190	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2191	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2192	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2193	}
				2194
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2195	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2196	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2197	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2198	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2199	}
				2200
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2201	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2202	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2203	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2204	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2205	}
				2206
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2207	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2208	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2209	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2210	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2211	}
				2212
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2213	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2214	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2215	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2216	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2217	}
				2218
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2219	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2220	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2221	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2222	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2223	}
				2224
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2225	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2226	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2227	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2228	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2229	}
				2230
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2231	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2232	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2233	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2234	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2235	}
				2236
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2237	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2238	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2239	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2240	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2241	}
				2242
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2243	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2244	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2245	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2246	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2247	}
				2248
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2249	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2250	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2251	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2252	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2253	}
				2254
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2255	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2256	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2257	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2258	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2259	}
				2260
				2261	/* Vector extract sign mask */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2262	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2263	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2264	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2265	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2266	}
				2267
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2268	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2269	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2270	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2271	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2272	}
				2273
/* Vector zero */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2275	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2276	_mm256_zeroall(void)
				2277	{
				2278	__builtin_ia32_vzeroall();
				2279	}
				2280
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2281	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2282	_mm256_zeroupper(void)
				2283	{
				2284	__builtin_ia32_vzeroupper();
				2285	}
				2286
				2287	/* Vector load with broadcast */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2288	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2289	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2290	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2291	float __f = *__a;
				2292	return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2293	}
				2294
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2295	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2296	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2297	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2298	double __d = *__a;
				2299	return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2300	}
				2301
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2302	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2303	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2304	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2305	float __f = *__a;
				2306	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2307	}
				2308
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2309	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2310	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2311	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2312	return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2313	}
				2314
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2315	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2316	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2317	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2318	return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2319	}
				2320
				2321	/* SIMD load ops */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2322	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2323	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2324	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2325	return (__m256d )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2326	}
				2327
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2328	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2329	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2330	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2331	return (__m256 )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2332	}
				2333
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2334	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2335	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2336	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2337	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2338	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2339	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2340	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2341	}
				2342
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2343	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2344	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2345	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2346	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2347	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2348	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2349	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2350	}
				2351
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2352	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2353	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2354	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2355	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2356	}
				2357
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2358	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2359	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2360	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2361	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2362	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2363	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2364	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2365	}
				2366
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2367	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2368	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2369	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2370	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2371	}
				2372
				2373	/* SIMD store ops */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2374	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2375	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2376	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2377	(__m256d )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2378	}
				2379
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2380	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2381	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2382	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2383	(__m256 )__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2384	}
				2385
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2386	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2387	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2388	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2389	__builtin_ia32_storeupd256(__p, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2390	}
				2391
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2392	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2393	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2394	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2395	__builtin_ia32_storeups256(__p, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2396	}
				2397
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2398	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2399	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2400	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2401	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2402	}
				2403
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2404	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2405	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2406	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2407	__builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2408	}
				2409
				2410	/* Conditional load ops */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2411	static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2412	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2413	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2414	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2415	}
				2416
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2417	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2418	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2419	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2420	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2421	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2422	}
				2423
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2424	static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2425	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2426	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2427	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2428	}
				2429
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2430	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2431	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2432	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2433	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2434	}
				2435
				2436	/* Conditional store ops */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2437	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2438	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2439	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2440	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2441	}
				2442
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2443	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2444	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2445	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2446	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2447	}
				2448
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2449	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2450	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2451	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2452	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2453	}
				2454
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2455	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2456	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2457	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2458	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2459	}
				2460
				2461	/* Cacheability support ops */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2462	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2463	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2464	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2465	__builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2466	}
				2467
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2468	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2469	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2470	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2471	__builtin_ia32_movntpd256(__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2472	}
				2473
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2474	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2475	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2476	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2477	__builtin_ia32_movntps256(__p, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2478	}
				2479
				2480	/* Create vectors */
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	2481	static __inline__ __m256d __DEFAULT_FN_ATTRS
				2482	_mm256_undefined_pd()
				2483	{
				2484	return (__m256d)__builtin_ia32_undef256();
				2485	}
				2486
				2487	static __inline__ __m256 __DEFAULT_FN_ATTRS
				2488	_mm256_undefined_ps()
				2489	{
				2490	return (__m256)__builtin_ia32_undef256();
				2491	}
				2492
				2493	static __inline__ __m256i __DEFAULT_FN_ATTRS
				2494	_mm256_undefined_si256()
				2495	{
				2496	return (__m256i)__builtin_ia32_undef256();
				2497	}
				2498
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2499	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2500	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2501	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2502	return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2503	}
				2504
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2505	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2506	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2507	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2508	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2509	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2510	}
				2511
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2512	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2513	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2514	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2515	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2516	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2517	}
				2518
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2519	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2520	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2521	short __w11, short __w10, short __w09, short __w08,
				2522	short __w07, short __w06, short __w05, short __w04,
				2523	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2524	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2525	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
				2526	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2527	}
				2528
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2529	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2530	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2531	char __b27, char __b26, char __b25, char __b24,
				2532	char __b23, char __b22, char __b21, char __b20,
				2533	char __b19, char __b18, char __b17, char __b16,
				2534	char __b15, char __b14, char __b13, char __b12,
				2535	char __b11, char __b10, char __b09, char __b08,
				2536	char __b07, char __b06, char __b05, char __b04,
				2537	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2538	{
				2539	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2540	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				2541	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				2542	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				2543	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2544	};
				2545	}
				2546
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2547	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2548	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2549	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2550	return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2551	}
				2552
				2553	/* Create vectors with elements in reverse order */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2554	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2555	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2556	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2557	return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2558	}
				2559
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2560	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2561	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2562	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2563	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2564	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2565	}
				2566
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2567	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2568	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2569	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2570	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2571	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2572	}
				2573
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2574	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2575	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2576	short __w11, short __w10, short __w09, short __w08,
				2577	short __w07, short __w06, short __w05, short __w04,
				2578	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2579	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2580	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
				2581	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2582	}
				2583
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2584	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2585	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2586	char __b27, char __b26, char __b25, char __b24,
				2587	char __b23, char __b22, char __b21, char __b20,
				2588	char __b19, char __b18, char __b17, char __b16,
				2589	char __b15, char __b14, char __b13, char __b12,
				2590	char __b11, char __b10, char __b09, char __b08,
				2591	char __b07, char __b06, char __b05, char __b04,
				2592	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2593	{
				2594	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2595	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2596	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				2597	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				2598	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2599	}
				2600
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2601	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2602	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2603	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2604	return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2605	}
				2606
				2607	/* Create vectors with repeated elements */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2608	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2609	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2610	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2611	return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2612	}
				2613
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2614	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2615	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2616	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2617	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2618	}
				2619
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2620	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2621	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2622	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2623	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2624	}
				2625
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2626	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2627	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2628	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2629	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				2630	__w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2631	}
				2632
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2633	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2634	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2635	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2636	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				2637	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				2638	__b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2639	}
				2640
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2641	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2642	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2643	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2644	return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2645	}
				2646
/* Create zeroed vectors */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2648	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2649	_mm256_setzero_pd(void)
				2650	{
				2651	return (__m256d){ 0, 0, 0, 0 };
				2652	}
				2653
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2654	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2655	_mm256_setzero_ps(void)
				2656	{
				2657	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				2658	}
				2659
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2660	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2661	_mm256_setzero_si256(void)
				2662	{
				2663	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				2664	}
				2665
				2666	/* Cast between vector types */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2667	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2668	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2669	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2670	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2671	}
				2672
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2673	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2674	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2675	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2676	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2677	}
				2678
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2679	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2680	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2681	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2682	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2683	}
				2684
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2685	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2686	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2687	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2688	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2689	}
				2690
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2691	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2692	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2693	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2694	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2695	}
				2696
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2697	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2698	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2699	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2700	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2701	}
				2702
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2703	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2704	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2705	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2706	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2707	}
				2708
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2709	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2710	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2711	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2712	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2713	}
				2714
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2715	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2716	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2717	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2718	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2719	}
				2720
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2721	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2722	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2723	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2724	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2725	}
				2726
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2727	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2728	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2729	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2730	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2731	}
				2732
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2733	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2734	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2735	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2736	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2737	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2738
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	2739	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2740	Vector insert.
				2741	We use macros rather than inlines because we only want to accept
				2742	invocations where the immediate M is a constant expression.
				2743	*/
				2744	#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
				2745	(__m256)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2746	(__v8sf)(__m256)(V1), \
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2747	(__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
				2748	(((M) & 1) ? 0 : 8), \
				2749	(((M) & 1) ? 1 : 9), \
				2750	(((M) & 1) ? 2 : 10), \
				2751	(((M) & 1) ? 3 : 11), \
				2752	(((M) & 1) ? 8 : 4), \
				2753	(((M) & 1) ? 9 : 5), \
				2754	(((M) & 1) ? 10 : 6), \
				2755	(((M) & 1) ? 11 : 7) );})
				2756
				2757	#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
				2758	(__m256d)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2759	(__v4df)(__m256d)(V1), \
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2760	(__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
				2761	(((M) & 1) ? 0 : 4), \
				2762	(((M) & 1) ? 1 : 5), \
				2763	(((M) & 1) ? 4 : 2), \
				2764	(((M) & 1) ? 5 : 3) );})
				2765
				2766	#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
				2767	(__m256i)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2768	(__v4di)(__m256i)(V1), \
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2769	(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
				2770	(((M) & 1) ? 0 : 4), \
				2771	(((M) & 1) ? 1 : 5), \
				2772	(((M) & 1) ? 4 : 2), \
				2773	(((M) & 1) ? 5 : 3) );})
				2774
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	2775	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2776	Vector extract.
				2777	We use macros rather than inlines because we only want to accept
				2778	invocations where the immediate M is a constant expression.
				2779	*/
				2780	#define _mm256_extractf128_ps(V, M) __extension__ ({ \
				2781	(__m128)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2782	(__v8sf)(__m256)(V), \
Sanjay Patel	f204b00	2015-03-12 17:23:46 +0000	[diff] [blame]	2783	(__v8sf)(_mm256_setzero_ps()), \
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2784	(((M) & 1) ? 4 : 0), \
				2785	(((M) & 1) ? 5 : 1), \
				2786	(((M) & 1) ? 6 : 2), \
				2787	(((M) & 1) ? 7 : 3) );})
				2788
				2789	#define _mm256_extractf128_pd(V, M) __extension__ ({ \
				2790	(__m128d)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2791	(__v4df)(__m256d)(V), \
Sanjay Patel	f204b00	2015-03-12 17:23:46 +0000	[diff] [blame]	2792	(__v4df)(_mm256_setzero_pd()), \
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2793	(((M) & 1) ? 2 : 0), \
				2794	(((M) & 1) ? 3 : 1) );})
				2795
				2796	#define _mm256_extractf128_si256(V, M) __extension__ ({ \
				2797	(__m128i)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2798	(__v4di)(__m256i)(V), \
Sanjay Patel	f204b00	2015-03-12 17:23:46 +0000	[diff] [blame]	2799	(__v4di)(_mm256_setzero_si256()), \
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2800	(((M) & 1) ? 2 : 0), \
				2801	(((M) & 1) ? 3 : 1) );})
				2802
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2803	/* SIMD load ops (unaligned) */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2804	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2805	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2806	{
				2807	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2808	__m128 __v;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2809	} __attribute__((__packed__, __may_alias__));
				2810
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2811	__m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
				2812	return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2813	}
				2814
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2815	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2816	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2817	{
				2818	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2819	__m128d __v;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2820	} __attribute__((__packed__, __may_alias__));
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	2821
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2822	__m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
				2823	return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2824	}
				2825
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2826	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2827	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2828	{
				2829	struct __loadu_si128 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2830	__m128i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2831	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2832	__m256i __v256 = _mm256_castsi128_si256(
				2833	((struct __loadu_si128*)__addr_lo)->__v);
				2834	return _mm256_insertf128_si256(__v256,
				2835	((struct __loadu_si128*)__addr_hi)->__v, 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2836	}
				2837
				2838	/* SIMD store ops (unaligned) */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2839	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2840	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2841	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2842	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2843
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2844	__v128 = _mm256_castps256_ps128(__a);
				2845	__builtin_ia32_storeups(__addr_lo, __v128);
				2846	__v128 = _mm256_extractf128_ps(__a, 1);
				2847	__builtin_ia32_storeups(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2848	}
				2849
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2850	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2851	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2852	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2853	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2854
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2855	__v128 = _mm256_castpd256_pd128(__a);
				2856	__builtin_ia32_storeupd(__addr_lo, __v128);
				2857	__v128 = _mm256_extractf128_pd(__a, 1);
				2858	__builtin_ia32_storeupd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2859	}
				2860
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2861	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2862	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2863	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2864	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2865
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2866	__v128 = _mm256_castsi256_si128(__a);
				2867	__builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
				2868	__v128 = _mm256_extractf128_si256(__a, 1);
				2869	__builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2870	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	2871
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2872	static __inline __m256 __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2873	_mm256_set_m128 (__m128 __hi, __m128 __lo) {
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2874	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2875	}
				2876
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2877	static __inline __m256d __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2878	_mm256_set_m128d (__m128d __hi, __m128d __lo) {
				2879	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2880	}
				2881
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2882	static __inline __m256i __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2883	_mm256_set_m128i (__m128i __hi, __m128i __lo) {
				2884	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2885	}
				2886
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2887	static __inline __m256 __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2888	_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
				2889	return _mm256_set_m128(__hi, __lo);
				2890	}
				2891
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2892	static __inline __m256d __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2893	_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
				2894	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2895	}
				2896
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2897	static __inline __m256i __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2898	_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
				2899	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2900	}
				2901
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2902	#undef __DEFAULT_FN_ATTRS
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	2903
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	2904	#endif /* __AVXINTRIN_H */