Blame - clang/lib/Headers/avxintrin.h - toolchain/llvm-project

blob: ce4f4243f2acddcbbadf51954ca84c07074e3980 [file] [log] [blame]

Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
Benjamin Kramer	6f35f3c	2010-08-20 23:00:03 +0000	[diff] [blame]	24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	27
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
/* Internal 256-bit (32-byte) element-typed vectors used for casts inside the
 * intrinsic implementations. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned counterparts of the integer vectors above. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* Plain char carries no signedness guarantee, so an explicitly signed byte
 * vector is provided as well. It is an implementation detail and should not
 * appear in the public interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types: packed float, packed double, and integer. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
				51
/* Default attributes applied to every intrinsic function defined in this
 * file: the function is always inlined, carries no debug info of its own,
 * and is compiled with the "avx" target feature enabled. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	54
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	55	/* Arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	56	/// \brief Adds two 256-bit vectors of [4 x double].
				57	///
				58	/// \headerfile <x86intrin.h>
				59	///
				60	/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
				61	///
				62	/// \param __a
				63	/// A 256-bit vector of [4 x double] containing one of the source operands.
				64	/// \param __b
				65	/// A 256-bit vector of [4 x double] containing one of the source operands.
				66	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				67	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	68	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	69	_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	70	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	71	return (__m256d)((__v4df)__a+(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	72	}
				73
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	74	/// \brief Adds two 256-bit vectors of [8 x float].
				75	///
				76	/// \headerfile <x86intrin.h>
				77	///
				78	/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
				79	///
				80	/// \param __a
				81	/// A 256-bit vector of [8 x float] containing one of the source operands.
				82	/// \param __b
				83	/// A 256-bit vector of [8 x float] containing one of the source operands.
				84	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				85	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	86	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	87	_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	88	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	89	return (__m256)((__v8sf)__a+(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	90	}
				91
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	92	/// \brief Subtracts two 256-bit vectors of [4 x double].
				93	///
				94	/// \headerfile <x86intrin.h>
				95	///
				96	/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
				97	///
				98	/// \param __a
				99	/// A 256-bit vector of [4 x double] containing the minuend.
				100	/// \param __b
				101	/// A 256-bit vector of [4 x double] containing the subtrahend.
				102	/// \returns A 256-bit vector of [4 x double] containing the differences between
				103	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	104	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	105	_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	106	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	107	return (__m256d)((__v4df)__a-(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	108	}
				109
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	110	/// \brief Subtracts two 256-bit vectors of [8 x float].
				111	///
				112	/// \headerfile <x86intrin.h>
				113	///
				114	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
				115	///
				116	/// \param __a
				117	/// A 256-bit vector of [8 x float] containing the minuend.
				118	/// \param __b
				119	/// A 256-bit vector of [8 x float] containing the subtrahend.
				120	/// \returns A 256-bit vector of [8 x float] containing the differences between
				121	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	122	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	123	_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	124	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	125	return (__m256)((__v8sf)__a-(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	126	}
				127
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	128	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				129	/// two 256-bit vectors of [4 x double].
				130	///
				131	/// \headerfile <x86intrin.h>
				132	///
				133	/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
				134	///
				135	/// \param __a
				136	/// A 256-bit vector of [4 x double] containing the left source operand.
				137	/// \param __b
				138	/// A 256-bit vector of [4 x double] containing the right source operand.
				139	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				140	/// and differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	141	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	142	_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	143	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	144	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	145	}
				146
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	147	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				148	/// two 256-bit vectors of [8 x float].
				149	///
				150	/// \headerfile <x86intrin.h>
				151	///
				152	/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
				153	///
				154	/// \param __a
				155	/// A 256-bit vector of [8 x float] containing the left source operand.
				156	/// \param __b
				157	/// A 256-bit vector of [8 x float] containing the right source operand.
				158	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				159	/// differences between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	160	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	161	_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	162	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	163	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	164	}
				165
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	166	/// \brief Divides two 256-bit vectors of [4 x double].
				167	///
				168	/// \headerfile <x86intrin.h>
				169	///
				170	/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
				171	///
				172	/// \param __a
				173	/// A 256-bit vector of [4 x double] containing the dividend.
				174	/// \param __b
				175	/// A 256-bit vector of [4 x double] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	176	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				177	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	178	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	179	_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	180	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	181	return (__m256d)((__v4df)__a/(__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	182	}
				183
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	184	/// \brief Divides two 256-bit vectors of [8 x float].
				185	///
				186	/// \headerfile <x86intrin.h>
				187	///
				188	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
				189	///
				190	/// \param __a
				191	/// A 256-bit vector of [8 x float] containing the dividend.
				192	/// \param __b
				193	/// A 256-bit vector of [8 x float] containing the divisor.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	194	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				195	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	196	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	197	_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	198	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	199	return (__m256)((__v8sf)__a/(__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	200	}
				201
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	202	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				203	/// of each pair of values.
				204	///
				205	/// \headerfile <x86intrin.h>
				206	///
				207	/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
				208	///
				209	/// \param __a
				210	/// A 256-bit vector of [4 x double] containing one of the operands.
				211	/// \param __b
				212	/// A 256-bit vector of [4 x double] containing one of the operands.
				213	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				214	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	215	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	216	_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	217	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	218	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	219	}
				220
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	221	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				222	/// of each pair of values.
				223	///
				224	/// \headerfile <x86intrin.h>
				225	///
				226	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
				227	///
				228	/// \param __a
				229	/// A 256-bit vector of [8 x float] containing one of the operands.
				230	/// \param __b
				231	/// A 256-bit vector of [8 x float] containing one of the operands.
				232	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				233	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	234	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	235	_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	236	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	237	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	238	}
				239
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	240	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				241	/// of each pair of values.
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
				245	/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
				246	///
				247	/// \param __a
				248	/// A 256-bit vector of [4 x double] containing one of the operands.
				249	/// \param __b
				250	/// A 256-bit vector of [4 x double] containing one of the operands.
				251	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				252	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	253	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	254	_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	255	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	256	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	257	}
				258
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	259	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				260	/// of each pair of values.
				261	///
				262	/// \headerfile <x86intrin.h>
				263	///
				264	/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
				265	///
				266	/// \param __a
				267	/// A 256-bit vector of [8 x float] containing one of the operands.
				268	/// \param __b
				269	/// A 256-bit vector of [8 x float] containing one of the operands.
				270	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				271	/// between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	272	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	273	_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	274	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	275	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	276	}
				277
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	278	/// \brief Multiplies two 256-bit vectors of [4 x double].
				279	///
				280	/// \headerfile <x86intrin.h>
				281	///
				282	/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
				283	///
				284	/// \param __a
				285	/// A 256-bit vector of [4 x double] containing one of the operands.
				286	/// \param __b
				287	/// A 256-bit vector of [4 x double] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	288	/// \returns A 256-bit vector of [4 x double] containing the products of both
				289	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	290	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	291	_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	292	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	293	return (__m256d)((__v4df)__a * (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	294	}
				295
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	296	/// \brief Multiplies two 256-bit vectors of [8 x float].
				297	///
				298	/// \headerfile <x86intrin.h>
				299	///
				300	/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
				301	///
				302	/// \param __a
				303	/// A 256-bit vector of [8 x float] containing one of the operands.
				304	/// \param __b
				305	/// A 256-bit vector of [8 x float] containing one of the operands.
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	306	/// \returns A 256-bit vector of [8 x float] containing the products of both
				307	/// operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	308	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	309	_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	310	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	311	return (__m256)((__v8sf)__a * (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	312	}
				313
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	314	/// \brief Calculates the square roots of the values in a 256-bit vector of
				315	/// [4 x double].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	316	///
				317	/// \headerfile <x86intrin.h>
				318	///
				319	/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
				320	///
				321	/// \param __a
				322	/// A 256-bit vector of [4 x double].
				323	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				324	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	325	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	326	_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	327	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	328	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	329	}
				330
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	331	/// \brief Calculates the square roots of the values in a 256-bit vector of
				332	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	333	///
				334	/// \headerfile <x86intrin.h>
				335	///
				336	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
				337	///
				338	/// \param __a
				339	/// A 256-bit vector of [8 x float].
				340	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				341	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	342	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	343	_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	344	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	345	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	346	}
				347
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	348	/// \brief Calculates the reciprocal square roots of the values in a 256-bit
				349	/// vector of [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	350	///
				351	/// \headerfile <x86intrin.h>
				352	///
				353	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
				354	///
				355	/// \param __a
				356	/// A 256-bit vector of [8 x float].
				357	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				358	/// roots of the values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	359	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	360	_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	361	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	362	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	363	}
				364
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	365	/// \brief Calculates the reciprocals of the values in a 256-bit vector of
				366	/// [8 x float].
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	367	///
				368	/// \headerfile <x86intrin.h>
				369	///
				370	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
				371	///
				372	/// \param __a
				373	/// A 256-bit vector of [8 x float].
				374	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				375	/// values in the operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	376	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	377	_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	378	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	379	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	380	}
				381
/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Written as a plain parenthesized expression rather than a GNU statement
 * expression so the macro can be used anywhere an expression is allowed. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	413
/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Written as a plain parenthesized expression rather than a GNU statement
 * expression so the macro can be used anywhere an expression is allowed. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	445
/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
///    Expands to \c _mm256_round_pd with the \c _MM_FROUND_CEIL rounding
///    mode; \c _MM_FROUND_CEIL is not defined in this part of the file.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	462
/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
///    Expands to \c _mm256_round_pd with the \c _MM_FROUND_FLOOR rounding
///    mode; \c _MM_FROUND_FLOOR is not defined in this part of the file.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	480
/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
///    Expands to \c _mm256_round_ps with the \c _MM_FROUND_CEIL rounding
///    mode; \c _MM_FROUND_CEIL is not defined in this part of the file.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	497
/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
///    Expands to \c _mm256_round_ps with the \c _MM_FROUND_FLOOR rounding
///    mode; \c _MM_FROUND_FLOOR is not defined in this part of the file.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
				514
				515	/* Logical */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	516	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				517	///
				518	/// \headerfile <x86intrin.h>
				519	///
				520	/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
				521	///
				522	/// \param __a
				523	/// A 256-bit vector of [4 x double] containing one of the source operands.
				524	/// \param __b
				525	/// A 256-bit vector of [4 x double] containing one of the source operands.
				526	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				527	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	528	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	529	_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	530	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	531	return (__m256d)((__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	532	}
				533
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	534	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				535	///
				536	/// \headerfile <x86intrin.h>
				537	///
				538	/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
				539	///
				540	/// \param __a
				541	/// A 256-bit vector of [8 x float] containing one of the source operands.
				542	/// \param __b
				543	/// A 256-bit vector of [8 x float] containing one of the source operands.
				544	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				545	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	546	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	547	_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	548	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	549	return (__m256)((__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	550	}
				551
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	552	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				553	/// the one's complement of the values contained in the first source operand.
				554	///
				555	/// \headerfile <x86intrin.h>
				556	///
				557	/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
				558	///
				559	/// \param __a
				560	/// A 256-bit vector of [4 x double] containing the left source operand. The
				561	/// one's complement of this value is used in the bitwise AND.
				562	/// \param __b
				563	/// A 256-bit vector of [4 x double] containing the right source operand.
				564	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				565	/// values of the second operand and the one's complement of the first
				566	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	567	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	568	_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	569	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	570	return (__m256d)(~(__v4du)__a & (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	571	}
				572
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	573	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				574	/// the one's complement of the values contained in the first source operand.
				575	///
				576	/// \headerfile <x86intrin.h>
				577	///
				578	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
				579	///
				580	/// \param __a
				581	/// A 256-bit vector of [8 x float] containing the left source operand. The
				582	/// one's complement of this value is used in the bitwise AND.
				583	/// \param __b
				584	/// A 256-bit vector of [8 x float] containing the right source operand.
				585	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				586	/// values of the second operand and the one's complement of the first
				587	/// operand.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	588	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	589	_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	590	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	591	return (__m256)(~(__v8su)__a & (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	592	}
				593
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	594	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				595	///
				596	/// \headerfile <x86intrin.h>
				597	///
				598	/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
				599	///
				600	/// \param __a
				601	/// A 256-bit vector of [4 x double] containing one of the source operands.
				602	/// \param __b
				603	/// A 256-bit vector of [4 x double] containing one of the source operands.
				604	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				605	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	606	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	607	_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	608	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	609	return (__m256d)((__v4du)__a \| (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	610	}
				611
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	612	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				613	///
				614	/// \headerfile <x86intrin.h>
				615	///
				616	/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
				617	///
				618	/// \param __a
				619	/// A 256-bit vector of [8 x float] containing one of the source operands.
				620	/// \param __b
				621	/// A 256-bit vector of [8 x float] containing one of the source operands.
				622	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				623	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	624	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	625	_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	626	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	627	return (__m256)((__v8su)__a \| (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	628	}
				629
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	630	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				631	///
				632	/// \headerfile <x86intrin.h>
				633	///
				634	/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
				635	///
				636	/// \param __a
				637	/// A 256-bit vector of [4 x double] containing one of the source operands.
				638	/// \param __b
				639	/// A 256-bit vector of [4 x double] containing one of the source operands.
				640	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				641	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	642	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	643	_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	644	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	645	return (__m256d)((__v4du)__a ^ (__v4du)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	646	}
				647
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	648	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				649	///
				650	/// \headerfile <x86intrin.h>
				651	///
				652	/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
				653	///
				654	/// \param __a
				655	/// A 256-bit vector of [8 x float] containing one of the source operands.
				656	/// \param __b
				657	/// A 256-bit vector of [8 x float] containing one of the source operands.
				658	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				659	/// values between both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	660	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	661	_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	662	{
Craig Topper	6a77b62	2016-06-04 05:43:41 +0000	[diff] [blame]	663	return (__m256)((__v8su)__a ^ (__v8su)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	664	}
				665
				666	/* Horizontal arithmetic */
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	667	/// \brief Horizontally adds the adjacent pairs of values contained in two
				668	/// 256-bit vectors of [4 x double].
				669	///
				670	/// \headerfile <x86intrin.h>
				671	///
				672	/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
				673	///
				674	/// \param __a
				675	/// A 256-bit vector of [4 x double] containing one of the source operands.
				676	/// The horizontal sums of the values are returned in the even-indexed
				677	/// elements of a vector of [4 x double].
				678	/// \param __b
				679	/// A 256-bit vector of [4 x double] containing one of the source operands.
				680	/// The horizontal sums of the values are returned in the odd-indexed
				681	/// elements of a vector of [4 x double].
				682	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				683	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	684	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	685	_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	686	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	687	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	688	}
				689
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	690	/// \brief Horizontally adds the adjacent pairs of values contained in two
				691	/// 256-bit vectors of [8 x float].
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
				695	/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
				696	///
				697	/// \param __a
				698	/// A 256-bit vector of [8 x float] containing one of the source operands.
				699	/// The horizontal sums of the values are returned in the elements with
				700	/// index 0, 1, 4, 5 of a vector of [8 x float].
				701	/// \param __b
				702	/// A 256-bit vector of [8 x float] containing one of the source operands.
				703	/// The horizontal sums of the values are returned in the elements with
				704	/// index 2, 3, 6, 7 of a vector of [8 x float].
				705	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				706	/// both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	707	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	708	_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	709	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	710	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	711	}
				712
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	713	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				714	/// 256-bit vectors of [4 x double].
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
				718	/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
				719	///
				720	/// \param __a
				721	/// A 256-bit vector of [4 x double] containing one of the source operands.
				722	/// The horizontal differences between the values are returned in the
				723	/// even-indexed elements of a vector of [4 x double].
				724	/// \param __b
				725	/// A 256-bit vector of [4 x double] containing one of the source operands.
				726	/// The horizontal differences between the values are returned in the
				727	/// odd-indexed elements of a vector of [4 x double].
				728	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				729	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	730	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	731	_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	732	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	733	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	734	}
				735
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	736	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				737	/// 256-bit vectors of [8 x float].
				738	///
				739	/// \headerfile <x86intrin.h>
				740	///
				741	/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
				742	///
				743	/// \param __a
				744	/// A 256-bit vector of [8 x float] containing one of the source operands.
				745	/// The horizontal differences between the values are returned in the
				746	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				747	/// \param __b
				748	/// A 256-bit vector of [8 x float] containing one of the source operands.
				749	/// The horizontal differences between the values are returned in the
				750	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				751	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				752	/// differences of both operands.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	753	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	754	_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	755	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	756	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	757	}
				758
				759	/* Vector permutations */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	760	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				761	/// by the 128-bit integer vector operand.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	762	///
				763	/// \headerfile <x86intrin.h>
				764	///
				765	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				766	///
				767	/// \param __a
				768	/// A 128-bit vector of [2 x double].
				769	/// \param __c
				770	/// A 128-bit integer vector operand specifying how the values are to be
				771	/// copied.
				772	/// Bit [1]:
				773	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	774	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	775	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	776	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	777	/// Bit [65]:
				778	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	779	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	780	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	781	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	782	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	783	static __inline __m128d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	784	_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	785	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	786	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	787	}
				788
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	789	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	790	/// specified by the 256-bit integer vector operand.
				791	///
				792	/// \headerfile <x86intrin.h>
				793	///
				794	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				795	///
				796	/// \param __a
				797	/// A 256-bit vector of [4 x double].
				798	/// \param __c
				799	/// A 256-bit integer vector operand specifying how the values are to be
				800	/// copied.
				801	/// Bit [1]:
				802	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	803	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	804	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	805	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	806	/// Bit [65]:
				807	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	808	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	809	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	810	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	811	/// Bit [129]:
				812	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	813	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	814	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	815	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	816	/// Bit [193]:
				817	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	818	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	819	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	820	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	821	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	822	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	823	_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	824	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	825	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	826	}
				827
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	828	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				829	/// specified by the 128-bit integer vector operand.
				830	///
				831	/// \headerfile <x86intrin.h>
				832	///
				833	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				834	///
				835	/// \param __a
				836	/// A 128-bit vector of [4 x float].
				837	/// \param __c
				838	/// A 128-bit integer vector operand specifying how the values are to be
				839	/// copied.
				840	/// Bits [1:0]:
				841	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	842	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	843	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	844	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	845	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	846	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	847	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	848	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	849	/// Bits [33:32]:
				850	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	851	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	852	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	853	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	854	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	855	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	856	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	857	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	858	/// Bits [65:64]:
				859	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	860	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	861	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	862	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	863	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	864	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	865	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	866	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	867	/// Bits [97:96]:
				868	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	869	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	870	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	871	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	872	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	873	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	874	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	875	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	876	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	877	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	878	_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	879	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	880	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	881	}
				882
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	883	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				884	/// specified by the 256-bit integer vector operand.
				885	///
				886	/// \headerfile <x86intrin.h>
				887	///
				888	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				889	///
				890	/// \param __a
				891	/// A 256-bit vector of [8 x float].
				892	/// \param __c
				893	/// A 256-bit integer vector operand specifying how the values are to be
				894	/// copied.
				895	/// Bits [1:0]:
				896	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	897	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	898	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	899	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	900	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	901	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	902	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	903	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	904	/// Bits [33:32]:
				905	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	906	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	907	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	908	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	909	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	910	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	911	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	912	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	913	/// Bits [65:64]:
				914	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	915	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	916	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	917	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	918	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	919	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	920	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	921	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	922	/// Bits [97:96]:
				923	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	924	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	925	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	926	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	927	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	928	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	929	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	930	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	931	/// Bits [129:128]:
				932	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	933	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	934	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	935	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	936	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	937	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	938	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	939	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	940	/// Bits [161:160]:
				941	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	942	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	943	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	944	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	945	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	946	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	947	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	948	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	949	/// Bits [193:192]:
				950	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	951	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	952	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	953	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	954	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	955	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	956	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	957	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	958	/// Bits [225:224]:
				959	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	960	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	961	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	962	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	963	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	964	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	965	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	966	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	967	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	968	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	969	_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	970	{ /* Thin wrapper: casts __a and __c to the builtin's operand types and forwards them unchanged. */
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	971	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	972	}
				973
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	974	/// \brief Copies the values in a 128-bit vector of [2 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	975	/// specified by the immediate integer operand.
				976	///
				977	/// \headerfile <x86intrin.h>
				978	///
				979	/// \code
				980	/// __m128d _mm_permute_pd(__m128d A, const int C);
				981	/// \endcode
				982	///
				983	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				984	///
				985	/// \param A
				986	/// A 128-bit vector of [2 x double].
				987	/// \param C
				988	/// An immediate integer operand specifying how the values are to be copied.
				989	/// Bit [0]:
				990	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	991	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	992	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	993	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	994	/// Bit [1]:
				995	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	996	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	997	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	998	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	999	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1000	#define _mm_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1001	(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
				1002	(__v2df)_mm_setzero_pd(), /* placeholder second operand: both indices below are 0 or 1, so only A is read */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1003	(C) & 0x1, ((C) & 0x2) >> 1); }) /* element 0 index = bit 0 of C, element 1 index = bit 1 of C */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1004
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1005	/// \brief Copies the values in a 256-bit vector of [4 x double] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1006	/// specified by the immediate integer operand.
				1007	///
				1008	/// \headerfile <x86intrin.h>
				1009	///
				1010	/// \code
				1011	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1012	/// \endcode
				1013	///
				1014	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				1015	///
				1016	/// \param A
				1017	/// A 256-bit vector of [4 x double].
				1018	/// \param C
				1019	/// An immediate integer operand specifying how the values are to be copied.
				1020	/// Bit [0]:
				1021	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1022	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1023	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1024	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1025	/// Bit [1]:
				1026	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1027	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1028	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1029	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1030	/// Bit [2]:
				1031	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1032	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1033	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1034	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1035	/// Bit [3]:
				1036	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1037	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1038	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1039	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1040	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	93375d5	2011-12-17 01:39:56 +0000	[diff] [blame]	1041	#define _mm256_permute_pd(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1042	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
				1043	(__v4df)_mm256_setzero_pd(), /* placeholder second operand: all four indices below are < 4, so only A is read */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1044	(C) & 0x1, ((C) & 0x2) >> 1, /* elements 0-1: index 0 or 1, chosen by bits 0 and 1 of C */ \
				1045	2 + (((C) & 0x4) >> 2), /* element 2: index 2 or 3, chosen by bit 2 of C */ \
				1046	2 + (((C) & 0x8) >> 3)); }) /* element 3: index 2 or 3, chosen by bit 3 of C */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1047
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1048	/// \brief Copies the values in a 128-bit vector of [4 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1049	/// specified by the immediate integer operand.
				1050	///
				1051	/// \headerfile <x86intrin.h>
				1052	///
				1053	/// \code
				1054	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1055	/// \endcode
				1056	///
				1057	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1058	///
				1059	/// \param A
				1060	/// A 128-bit vector of [4 x float].
				1061	/// \param C
				1062	/// An immediate integer operand specifying how the values are to be copied.
				1063	/// Bits [1:0]:
				1064	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1065	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1066	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1067	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1068	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1069	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1070	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1071	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1072	/// Bits [3:2]:
				1073	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1074	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1075	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1076	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1077	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1078	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1079	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1080	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1081	/// Bits [5:4]:
				1082	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1083	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1084	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1085	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1086	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1087	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1088	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1089	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1090	/// Bits [7:6]:
				1091	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1092	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1093	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1094	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1095	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1096	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1097	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1098	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1099	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1100	#define _mm_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1101	(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
				1102	(__v4sf)_mm_setzero_ps(), /* placeholder second operand: all four indices below are < 4, so only A is read */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1103	(C) & 0x3, ((C) & 0xc) >> 2, /* elements 0-1: 2-bit fields C[1:0] and C[3:2] */ \
Craig Topper	678a53c	2012-03-30 05:09:18 +0000	[diff] [blame]	1104	((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) /* elements 2-3: 2-bit fields C[5:4] and C[7:6] */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1105
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1106	/// \brief Copies the values in a 256-bit vector of [8 x float] as
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1107	/// specified by the immediate integer operand.
				1108	///
				1109	/// \headerfile <x86intrin.h>
				1110	///
				1111	/// \code
				1112	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1113	/// \endcode
				1114	///
				1115	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1116	///
				1117	/// \param A
				1118	/// A 256-bit vector of [8 x float].
				1119	/// \param C
				1120	/// An immediate integer operand specifying how the values are to be copied.
				1121	/// Bits [1:0]:
				1122	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1123	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1124	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1125	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1126	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1127	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1128	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1129	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1130	/// Bits [3:2]:
				1131	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1132	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1133	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1134	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1135	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1136	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1137	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1138	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1139	/// Bits [5:4]:
				1140	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1141	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1142	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1143	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1144	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1145	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1146	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1147	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1148	/// Bits [7:6]:
				1149	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1150	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1151	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1152	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1153	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1154	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1155	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1156	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1157	/// Bits [1:0] (same field, applied to the upper 128-bit lane):
				1158	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1159	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1160	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1161	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1162	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1163	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1164	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1165	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1166	/// Bits [3:2] (same field, applied to the upper 128-bit lane):
				1167	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1168	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1169	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1170	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1171	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1172	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1173	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1174	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1175	/// Bits [5:4] (same field, applied to the upper 128-bit lane):
				1176	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1177	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1178	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1179	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1180	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1181	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1182	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1183	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1184	/// Bits [7:6] (same field, applied to the upper 128-bit lane):
				1185	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1186	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1187	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1188	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1189	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1190	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1191	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1192	/// returned vector.
Ekaterina Romanova	13f189d	2016-03-11 00:05:54 +0000	[diff] [blame]	1193	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	7caca84	2011-12-17 01:51:05 +0000	[diff] [blame]	1194	#define _mm256_permute_ps(A, C) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1195	(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
				1196	(__v8sf)_mm256_setzero_ps(), /* placeholder second operand: all eight indices below are < 8, so only A is read */ \
Craig Topper	fec9f8e	2012-02-08 05:16:54 +0000	[diff] [blame]	1197	(C) & 0x3, ((C) & 0xc) >> 2, /* elements 0-1: 2-bit fields C[1:0] and C[3:2] */ \
				1198	((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, /* elements 2-3: 2-bit fields C[5:4] and C[7:6] */ \
				1199	4 + (((C) & 0x03) >> 0), /* elements 4-7 reuse the same four fields, offset by 4 into the upper half of A */ \
				1200	4 + (((C) & 0x0c) >> 2), \
				1201	4 + (((C) & 0x30) >> 4), \
				1202	4 + (((C) & 0xc0) >> 6)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1203
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1204	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1205	/// [4 x double], as specified by the immediate integer operand.
				1206	///
				1207	/// \headerfile <x86intrin.h>
				1208	///
				1209	/// \code
				1210	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1211	/// \endcode
				1212	///
				1213	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1214	///
				1215	/// \param V1
				1216	/// A 256-bit vector of [4 x double].
				1217	/// \param V2
				1218	/// A 256-bit vector of [4 x double].
				1219	/// \param M
				1220	/// An immediate integer operand specifying how the values are to be
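				1221	/// permuted. If bit [3] (or bit [7]) is set, bits [127:0] (or bits [255:128]) of the destination are zeroed instead.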
				1222	/// Bits [1:0]:
				1223	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1224	/// destination.
				1225	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1226	/// destination.
				1227	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1228	/// destination.
				1229	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1230	/// destination.
				1231	/// Bits [5:4]:
				1232	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1233	/// destination.
				1234	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1235	/// destination.
				1236	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1237	/// destination.
				1238	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1239	/// destination.
				1240	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1241	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1242	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), /* each vector argument is cast to __m256d, then to the builtin's __v4df operand type */ \
				1243	(__v4df)(__m256d)(V2), (M)); }) /* M is forwarded unmodified as the builtin's control operand */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1244
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1245	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1246	/// [8 x float], as specified by the immediate integer operand.
				1247	///
				1248	/// \headerfile <x86intrin.h>
				1249	///
				1250	/// \code
				1251	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1252	/// \endcode
				1253	///
				1254	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1255	///
				1256	/// \param V1
				1257	/// A 256-bit vector of [8 x float].
				1258	/// \param V2
				1259	/// A 256-bit vector of [8 x float].
				1260	/// \param M
				1261	/// An immediate integer operand specifying how the values are to be
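				1262	/// permuted. If bit [3] (or bit [7]) is set, bits [127:0] (or bits [255:128]) of the destination are zeroed instead.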
				1263	/// Bits [1:0]:
				1264	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1265	/// destination.
				1266	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1267	/// destination.
				1268	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1269	/// destination.
				1270	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1271	/// destination.
				1272	/// Bits [5:4]:
				1273	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1274	/// destination.
				1275	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1276	/// destination.
				1277	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1278	/// destination.
				1279	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1280	/// destination.
				1281	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1282	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1283	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), /* each vector argument is cast to __m256, then to the builtin's __v8sf operand type */ \
				1284	(__v8sf)(__m256)(V2), (M)); }) /* M is forwarded unmodified as the builtin's control operand */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1285
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1286	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
				1287	/// as specified by the immediate integer operand.
				1288	///
				1289	/// \headerfile <x86intrin.h>
				1290	///
				1291	/// \code
				1292	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1293	/// \endcode
				1294	///
				1295	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1296	///
				1297	/// \param V1
				1298	/// A 256-bit integer vector.
				1299	/// \param V2
				1300	/// A 256-bit integer vector.
				1301	/// \param M
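				1302	/// An immediate integer operand specifying how the values are to be permuted. If bit [3] (or bit [7]) is set, bits [127:0] (or bits [255:128]) of the destination are zeroed instead.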
				1303	/// Bits [1:0]:
				1304	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1305	/// destination.
				1306	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1307	/// destination.
				1308	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1309	/// destination.
				1310	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1311	/// destination.
				1312	/// Bits [5:4]:
				1313	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1314	/// destination.
				1315	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1316	/// destination.
				1317	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1318	/// destination.
				1319	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1320	/// destination.
				1321	/// \returns A 256-bit integer vector containing the copied values.
Chad Rosier	9138fea25	2011-12-16 21:07:34 +0000	[diff] [blame]	1322	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1323	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), /* each vector argument is cast to __m256i, then to the builtin's __v8si operand type */ \
				1324	(__v8si)(__m256i)(V2), (M)); }) /* M is forwarded unmodified as the builtin's control operand */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1325
				1326	/* Vector Blend */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1327	/// \brief Merges 64-bit double-precision data values stored in either of the
				1328	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1329	/// integer operand.
				1330	///
				1331	/// \headerfile <x86intrin.h>
				1332	///
				1333	/// \code
				1334	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1335	/// \endcode
				1336	///
				1337	/// This intrinsic corresponds to the \c VBLENDPD instruction.
				1338	///
				1339	/// \param V1
				1340	/// A 256-bit vector of [4 x double].
				1341	/// \param V2
				1342	/// A 256-bit vector of [4 x double].
				1343	/// \param M
				1344	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1345	/// values are to be copied. The position of the mask bit corresponds to the
				1346	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				1347	/// element in operand V1 is copied to the same position in the destination.
				1348	/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is
				1349	/// copied to the same position in the destination.
				1350	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1351	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1352	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				1353	(__v4df)(__m256d)(V2), /* shuffle indices 0-3 name elements of V1, indices 4-7 name elements of V2 */ \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1354	(((M) & 0x01) ? 4 : 0), /* element i is taken from V2 (index i + 4) when bit i of M is set, else from V1 (index i) */ \
				1355	(((M) & 0x02) ? 5 : 1), \
				1356	(((M) & 0x04) ? 6 : 2), \
				1357	(((M) & 0x08) ? 7 : 3)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1358
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1359	/// \brief Merges 32-bit single-precision data values stored in either of the
				1360	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1361	/// integer operand.
				1362	///
				1363	/// \headerfile <x86intrin.h>
				1364	///
				1365	/// \code
				1366	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1367	/// \endcode
				1368	///
				1369	/// This intrinsic corresponds to the \c VBLENDPS instruction.
				1370	///
				1371	/// \param V1
				1372	/// A 256-bit vector of [8 x float].
				1373	/// \param V2
				1374	/// A 256-bit vector of [8 x float].
				1375	/// \param M
				1376	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1377	/// values are to be copied. The position of the mask bit corresponds to the
				1378	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				1379	/// element in operand V1 is copied to the same position in the destination.
				1380	/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is
				1381	/// copied to the same position in the destination.
				1382	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1383	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1384	(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				1385	(__v8sf)(__m256)(V2), \
Filipe Cabecinhas	5d289b4	2014-05-13 02:37:02 +0000	[diff] [blame]	1386	(((M) & 0x01) ? 8 : 0), \
				1387	(((M) & 0x02) ? 9 : 1), \
				1388	(((M) & 0x04) ? 10 : 2), \
				1389	(((M) & 0x08) ? 11 : 3), \
				1390	(((M) & 0x10) ? 12 : 4), \
				1391	(((M) & 0x20) ? 13 : 5), \
				1392	(((M) & 0x40) ? 14 : 6), \
				1393	(((M) & 0x80) ? 15 : 7)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1394
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1395	/// \brief Merges 64-bit double-precision data values stored in either of the
				1396	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1397	/// operand.
				1398	///
				1399	/// \headerfile <x86intrin.h>
				1400	///
				1401	/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
				1402	///
				1403	/// \param __a
				1404	/// A 256-bit vector of [4 x double].
				1405	/// \param __b
				1406	/// A 256-bit vector of [4 x double].
				1407	/// \param __c
				1408	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1409	/// how the values are to be copied. The position of the mask bit corresponds
				1410	/// to the most significant bit of a copied value. When a mask bit is 0, the
				1411	/// corresponding 64-bit element in operand __a is copied to the same
				1412	/// position in the destination. When a mask bit is 1, the corresponding
				1413	/// 64-bit element in operand __b is copied to the same position in the
				1414	/// destination.
					/// Because the mask bit is the most significant bit of each 64-bit element,
					/// a vector whose elements are each all 1's or zero, such as the result of
					/// _mm256_cmp_pd, can be passed as __c.
				1415	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1416	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1417	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1418	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1419	return (__m256d)__builtin_ia32_blendvpd256(
				1420	(__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1421	}
				1422
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1423	/// \brief Merges 32-bit single-precision data values stored in either of the
				1424	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1425	/// operand.
				1426	///
				1427	/// \headerfile <x86intrin.h>
				1428	///
				1429	/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
				1430	///
				1431	/// \param __a
				1432	/// A 256-bit vector of [8 x float].
				1433	/// \param __b
				1434	/// A 256-bit vector of [8 x float].
				1435	/// \param __c
				1436	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1437	/// and 31 specifying how the values are to be copied. The position of the
				1438	/// mask bit corresponds to the most significant bit of a copied value. When
				1439	/// a mask bit is 0, the corresponding 32-bit element in operand __a is
				1440	/// copied to the same position in the destination. When a mask bit is 1, the
				1441	/// corresponding 32-bit element in operand __b is copied to the same
				1442	/// position in the destination.
					/// Because the mask bit is the most significant bit of each 32-bit element,
					/// a vector whose elements are each all 1's or zero, such as the result of
					/// _mm256_cmp_ps, can be passed as __c.
				1443	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1444	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1445	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1446	{
David Blaikie	5bb7003	2013-01-16 23:13:42 +0000	[diff] [blame]	1447	return (__m256)__builtin_ia32_blendvps256(
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1448	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1449	}
				1450
				1451	/* Vector Dot Product */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1452	/// \brief Computes two dot products in parallel, using the lower and upper
				1453	/// halves of two [8 x float] vectors as input to the two computations, and
				1454	/// returning the two dot products in the lower and upper halves of the
				1455	/// [8 x float] result. The immediate integer operand controls which
				1456	/// input elements will contribute to the dot product, and where the final
				1457	/// results are returned. In general, for each dot product, the four
				1458	/// corresponding elements of the input vectors are multiplied; the first
				1459	/// two and second two products are summed, then the two sums are added to
				1460	/// form the final result.
				1461	///
				1462	/// \headerfile <x86intrin.h>
				1463	///
				1464	/// \code
				1465	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1466	/// \endcode
				1467	///
				1468	/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
				1469	///
				1470	/// \param V1
				1471	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1472	/// \param V2
				1473	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1474	/// \param M
				1475	/// An immediate integer argument. Bits [7:4] determine which elements of
				1476	/// the input vectors are used, with bit [4] corresponding to the lowest
				1477	/// element and bit [7] corresponding to the highest element of each [4 x
				1478	/// float] subvector. If a bit is set, the corresponding elements from the
				1479	/// two input vectors are used as an input for dot product; otherwise that
				1480	/// input is treated as zero. Bits [3:0] determine which elements of the
				1481	/// result will receive a copy of the final dot product, with bit [0]
				1482	/// corresponding to the lowest element and bit [3] corresponding to the
				1483	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1484	/// product is returned in the corresponding element; otherwise that element
				1485	/// is set to zero. The bitmask is applied in the same way to each of the
				1486	/// two parallel dot product computations.
					/// For example, M = 0xF1 uses all four element pairs of each [4 x float]
					/// subvector as inputs and returns each dot product only in the lowest
					/// element of its half of the result; the other three elements of each
					/// half are set to zero.
				1487	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
Eli Friedman	f16beb3	2011-11-10 00:11:13 +0000	[diff] [blame]	1488	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1489	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
				1490	(__v8sf)(__m256)(V2), (M)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1491
				1492	/* Vector shuffle */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1493	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
				1494	/// specified by the immediate value operand. The four selected elements in
				1495	/// each operand are copied to the destination according to the bits
				1496	/// specified in the immediate operand. The selected elements from the first
				1497	/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
				1498	/// destination, and the selected elements from the second 256-bit operand
				1499	/// are copied to bits [127:64] and bits [255:192] of the destination. For
				1500	/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
				1501	/// the 256-bit destination vector would contain the following values: b[7],
				1502	/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
				1503	///
				1504	/// \headerfile <x86intrin.h>
				1505	///
				1506	/// \code
				1507	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1508	/// \endcode
				1509	///
				1510	/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
				1511	///
				1512	/// \param a
				1513	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1514	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1515	/// according to the bits specified in the immediate operand.
				1516	/// \param b
				1517	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1518	/// operand are copied to bits [127:64] and bits [255:192] in the
				1519	/// destination, according to the bits specified in the immediate operand.
				1520	/// \param mask
				1521	/// An immediate value containing an 8-bit value specifying which elements to
				1522	/// copy from a and b. Bits [3:0] specify the values copied from operand a.
				1523	/// Bits [7:4] specify the values copied from operand b.
				1524	/// The destinations within the 256-bit destination are assigned values as
				1525	/// follows, according to the bit value assignments described below:
				1526	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
				1527	/// destination.
				1528	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
				1529	/// destination.
				1530	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
				1531	/// destination.
				1532	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
				1533	/// the destination.
				1534	/// Bit value assignments:
				1535	/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
				1536	/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
				1537	/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
				1538	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
					/// In each of these entries the first bit range is the source for the lower
					/// 128 bits of the destination and the second bit range is the source for
					/// the upper 128 bits; the same 2-bit selectors are applied to both halves.
				1539	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1540	#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1541	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1542	(__v8sf)(__m256)(b), /* shuffle indices 0-7 select elements of a, 8-15 select elements of b */ \
				1543	(mask) & 0x3, \
				1544	((mask) & 0xc) >> 2, \
				1545	(((mask) & 0x30) >> 4) + 8, \
				1546	(((mask) & 0xc0) >> 6) + 8, \
				1547	((mask) & 0x3) + 4, /* upper half: same selectors, offset by 4 elements */ \
				1548	(((mask) & 0xc) >> 2) + 4, \
				1549	(((mask) & 0x30) >> 4) + 12, \
				1550	(((mask) & 0xc0) >> 6) + 12); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1551
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1552	/// \brief Selects four double-precision values from the 256-bit operands of
				1553	/// [4 x double], as specified by the immediate value operand. The selected
				1554	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1555	/// bits [191:128] in the destination, and the selected elements from the
				1556	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
				1557	/// the destination. For example, if bits [3:0] of the immediate operand
				1558	/// contain a value of 0xF, the 256-bit destination vector would contain the
				1559	/// following values: b[3], a[3], b[1], a[1].
				1560	///
				1561	/// \headerfile <x86intrin.h>
				1562	///
				1563	/// \code
				1564	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1565	/// \endcode
				1566	///
				1567	/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
				1568	///
				1569	/// \param a
				1570	/// A 256-bit vector of [4 x double].
				1571	/// \param b
				1572	/// A 256-bit vector of [4 x double].
				1573	/// \param mask
				1574	/// An immediate value of which bits [3:0] specify which elements to copy
				1575	/// from a and b:
				1576	/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
				1577	/// destination.
				1578	/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
				1579	/// destination.
				1580	/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
				1581	/// destination.
				1582	/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
				1583	/// destination.
				1584	/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
				1585	/// destination.
				1586	/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
				1587	/// destination.
				1588	/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
				1589	/// destination.
				1590	/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
				1591	/// destination.
				1592	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1593	#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1594	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1595	(__v4df)(__m256d)(b), /* shuffle indices 0-3 select elements of a, 4-7 select elements of b */ \
				1596	(mask) & 0x1, \
				1597	(((mask) & 0x2) >> 1) + 4, \
				1598	(((mask) & 0x4) >> 2) + 2, \
				1599	(((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1600
				1601	/* Compare */
					/* Predicate encodings for the immediate operand of the _mm_cmp_* and
					 * _mm256_cmp_* macros. In the macro names, O = ordered, U = unordered,
					 * S = signaling and Q = non-signaling; in the comments "unord" and
					 * "non-sign" abbreviate "unordered" and "non-signaling". The signaling
					 * and non-signaling encodings of the same predicate differ only in
					 * bit 4 (0x10). */
				1602	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1603	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1604	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1605	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1606	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1607	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1608	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
				1609	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
				1610	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
				1611	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
				1612	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1613	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1614	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1615	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1616	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1617	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1618	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1619	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1620	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1621	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1622	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1623	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
				1624	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
				1625	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1626	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
				1627	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
				1628	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1629	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1630	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1631	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1632	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1633	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1634
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1635	/// \brief Compares each of the corresponding double-precision values of two
				1636	/// 128-bit vectors of [2 x double], using the operation specified by the
				1637	/// immediate integer operand. Returns a [2 x double] vector consisting of
				1638	/// two doubles corresponding to the two comparison results: zero if the
				1639	/// comparison is false, and all 1's if the comparison is true.
				1640	///
				1641	/// \headerfile <x86intrin.h>
				1642	///
				1643	/// \code
				1644	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1645	/// \endcode
				1646	///
				1647	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1648	///
				1649	/// \param a
				1650	/// A 128-bit vector of [2 x double].
				1651	/// \param b
				1652	/// A 128-bit vector of [2 x double].
				1653	/// \param c
				1654	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1655	/// operation to use. The _CMP_* macros name these values; the two values on
				1656	/// each line below differ only in bit [4], which switches between the
				1657	/// signaling and non-signaling form of the same predicate:
				1658	/// 00h, 10h: Equal (ordered)
				1659	/// 01h, 11h: Less than (ordered)
				1660	/// 02h, 12h: Less than or equal (ordered)
				1661	/// 03h, 13h: Unordered
				1662	/// 04h, 14h: Not equal (unordered)
				1663	/// 05h, 15h: Not less than (unordered)
				1664	/// 06h, 16h: Not less than or equal (unordered)
				1665	/// 07h, 17h: Ordered
					/// 08h, 18h: Equal (unordered)
					/// 09h, 19h: Not greater than or equal (unordered)
					/// 0Ah, 1Ah: Not greater than (unordered)
					/// 0Bh, 1Bh: False (ordered)
					/// 0Ch, 1Ch: Not equal (ordered)
					/// 0Dh, 1Dh: Greater than or equal (ordered)
					/// 0Eh, 1Eh: Greater than (ordered)
					/// 0Fh, 1Fh: True (unordered)
				1666	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1667	#define _mm_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1668	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
				1669	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1670
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1671	/// \brief Compares each of the corresponding values of two 128-bit vectors of
				1672	/// [4 x float], using the operation specified by the immediate integer
				1673	/// operand. Returns a [4 x float] vector consisting of four floats
				1674	/// corresponding to the four comparison results: zero if the comparison is
				1675	/// false, and all 1's if the comparison is true.
				1676	///
				1677	/// \headerfile <x86intrin.h>
				1678	///
				1679	/// \code
				1680	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1681	/// \endcode
				1682	///
				1683	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1684	///
				1685	/// \param a
				1686	/// A 128-bit vector of [4 x float].
				1687	/// \param b
				1688	/// A 128-bit vector of [4 x float].
				1689	/// \param c
				1690	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1691	/// operation to use. The _CMP_* macros name these values; the two values on
				1692	/// each line below differ only in bit [4], which switches between the
				1693	/// signaling and non-signaling form of the same predicate:
				1694	/// 00h, 10h: Equal (ordered)
				1695	/// 01h, 11h: Less than (ordered)
				1696	/// 02h, 12h: Less than or equal (ordered)
				1697	/// 03h, 13h: Unordered
				1698	/// 04h, 14h: Not equal (unordered)
				1699	/// 05h, 15h: Not less than (unordered)
				1700	/// 06h, 16h: Not less than or equal (unordered)
				1701	/// 07h, 17h: Ordered
					/// 08h, 18h: Equal (unordered)
					/// 09h, 19h: Not greater than or equal (unordered)
					/// 0Ah, 1Ah: Not greater than (unordered)
					/// 0Bh, 1Bh: False (ordered)
					/// 0Ch, 1Ch: Not equal (ordered)
					/// 0Dh, 1Dh: Greater than or equal (ordered)
					/// 0Eh, 1Eh: Greater than (ordered)
					/// 0Fh, 1Fh: True (unordered)
				1702	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1703	#define _mm_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1704	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
				1705	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1706
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1707	/// \brief Compares each of the corresponding double-precision values of two
				1708	/// 256-bit vectors of [4 x double], using the operation specified by the
				1709	/// immediate integer operand. Returns a [4 x double] vector consisting of
				1710	/// four doubles corresponding to the four comparison results: zero if the
				1711	/// comparison is false, and all 1's if the comparison is true.
				1712	///
				1713	/// \headerfile <x86intrin.h>
				1714	///
				1715	/// \code
				1716	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1717	/// \endcode
				1718	///
				1719	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1720	///
				1721	/// \param a
				1722	/// A 256-bit vector of [4 x double].
				1723	/// \param b
				1724	/// A 256-bit vector of [4 x double].
				1725	/// \param c
				1726	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1727	/// operation to use. The _CMP_* macros name these values; the two values on
				1728	/// each line below differ only in bit [4], which switches between the
				1729	/// signaling and non-signaling form of the same predicate:
				1730	/// 00h, 10h: Equal (ordered)
				1731	/// 01h, 11h: Less than (ordered)
				1732	/// 02h, 12h: Less than or equal (ordered)
				1733	/// 03h, 13h: Unordered
				1734	/// 04h, 14h: Not equal (unordered)
				1735	/// 05h, 15h: Not less than (unordered)
				1736	/// 06h, 16h: Not less than or equal (unordered)
				1737	/// 07h, 17h: Ordered
					/// 08h, 18h: Equal (unordered)
					/// 09h, 19h: Not greater than or equal (unordered)
					/// 0Ah, 1Ah: Not greater than (unordered)
					/// 0Bh, 1Bh: False (ordered)
					/// 0Ch, 1Ch: Not equal (ordered)
					/// 0Dh, 1Dh: Greater than or equal (ordered)
					/// 0Eh, 1Eh: Greater than (ordered)
					/// 0Fh, 1Fh: True (unordered)
				1738	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1739	#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1740	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
				1741	(__v4df)(__m256d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1742
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1743	/// \brief Compares each of the corresponding values of two 256-bit vectors of
				1744	/// [8 x float], using the operation specified by the immediate integer
				1745	/// operand. Returns a [8 x float] vector consisting of eight floats
				1746	/// corresponding to the eight comparison results: zero if the comparison is
				1747	/// false, and all 1's if the comparison is true.
				1748	///
				1749	/// \headerfile <x86intrin.h>
				1750	///
				1751	/// \code
				1752	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1753	/// \endcode
				1754	///
				1755	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1756	///
				1757	/// \param a
				1758	/// A 256-bit vector of [8 x float].
				1759	/// \param b
				1760	/// A 256-bit vector of [8 x float].
				1761	/// \param c
				1762	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1763	/// operation to use. The _CMP_* macros name these values; the two values on
				1764	/// each line below differ only in bit [4], which switches between the
				1765	/// signaling and non-signaling form of the same predicate:
				1766	/// 00h, 10h: Equal (ordered)
				1767	/// 01h, 11h: Less than (ordered)
				1768	/// 02h, 12h: Less than or equal (ordered)
				1769	/// 03h, 13h: Unordered
				1770	/// 04h, 14h: Not equal (unordered)
				1771	/// 05h, 15h: Not less than (unordered)
				1772	/// 06h, 16h: Not less than or equal (unordered)
				1773	/// 07h, 17h: Ordered
					/// 08h, 18h: Equal (unordered)
					/// 09h, 19h: Not greater than or equal (unordered)
					/// 0Ah, 1Ah: Not greater than (unordered)
					/// 0Bh, 1Bh: False (ordered)
					/// 0Ch, 1Ch: Not equal (ordered)
					/// 0Dh, 1Dh: Greater than or equal (ordered)
					/// 0Eh, 1Eh: Greater than (ordered)
					/// 0Fh, 1Fh: True (unordered)
				1774	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1775	#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1776	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
				1777	(__v8sf)(__m256)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1778
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1779	/// \brief Compares each of the corresponding scalar double-precision values of
				1780	/// two 128-bit vectors of [2 x double], using the operation specified by the
				1781	/// immediate integer operand. If the result is true, all 64 bits of the
				1782	/// destination vector are set; otherwise they are cleared.
				1783	///
				1784	/// \headerfile <x86intrin.h>
				1785	///
				1786	/// \code
				1787	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1788	/// \endcode
				1789	///
				1790	/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
				1791	///
				1792	/// \param a
				1793	/// A 128-bit vector of [2 x double].
				1794	/// \param b
				1795	/// A 128-bit vector of [2 x double].
				1796	/// \param c
				1797	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1798	/// operation to use. The _CMP_* macros name these values; the two values on
				1799	/// each line below differ only in bit [4], which switches between the
				1800	/// signaling and non-signaling form of the same predicate:
				1801	/// 00h, 10h: Equal (ordered)
				1802	/// 01h, 11h: Less than (ordered)
				1803	/// 02h, 12h: Less than or equal (ordered)
				1804	/// 03h, 13h: Unordered
				1805	/// 04h, 14h: Not equal (unordered)
				1806	/// 05h, 15h: Not less than (unordered)
				1807	/// 06h, 16h: Not less than or equal (unordered)
				1808	/// 07h, 17h: Ordered
					/// 08h, 18h: Equal (unordered)
					/// 09h, 19h: Not greater than or equal (unordered)
					/// 0Ah, 1Ah: Not greater than (unordered)
					/// 0Bh, 1Bh: False (ordered)
					/// 0Ch, 1Ch: Not equal (ordered)
					/// 0Dh, 1Dh: Greater than or equal (ordered)
					/// 0Eh, 1Eh: Greater than (ordered)
					/// 0Fh, 1Fh: True (unordered)
				1809	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1810	#define _mm_cmp_sd(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1811	(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
				1812	(__v2df)(__m128d)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1813
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1814	/// \brief Compares each of the corresponding scalar values of two 128-bit
				1815	/// vectors of [4 x float], using the operation specified by the immediate
				1816	/// integer operand. If the result is true, all 32 bits of the destination
				1817	/// vector are set; otherwise they are cleared.
				1818	///
				1819	/// \headerfile <x86intrin.h>
				1820	///
				1821	/// \code
				1822	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1823	/// \endcode
				1824	///
				1825	/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
				1826	///
				1827	/// \param a
				1828	/// A 128-bit vector of [4 x float].
				1829	/// \param b
				1830	/// A 128-bit vector of [4 x float].
				1831	/// \param c
				1832	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1833	/// operation to use. The _CMP_* macros name these values; the two values on
				1834	/// each line below differ only in bit [4], which switches between the
				1835	/// signaling and non-signaling form of the same predicate:
				1836	/// 00h, 10h: Equal (ordered)
				1837	/// 01h, 11h: Less than (ordered)
				1838	/// 02h, 12h: Less than or equal (ordered)
				1839	/// 03h, 13h: Unordered
				1840	/// 04h, 14h: Not equal (unordered)
				1841	/// 05h, 15h: Not less than (unordered)
				1842	/// 06h, 16h: Not less than or equal (unordered)
				1843	/// 07h, 17h: Ordered
					/// 08h, 18h: Equal (unordered)
					/// 09h, 19h: Not greater than or equal (unordered)
					/// 0Ah, 1Ah: Not greater than (unordered)
					/// 0Bh, 1Bh: False (ordered)
					/// 0Ch, 1Ch: Not equal (ordered)
					/// 0Dh, 1Dh: Greater than or equal (ordered)
					/// 0Eh, 1Eh: Greater than (ordered)
					/// 0Fh, 1Fh: True (unordered)
				1844	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Bob Wilson	c9b97cc	2011-11-05 06:08:06 +0000	[diff] [blame]	1845	#define _mm_cmp_ss(a, b, c) __extension__ ({ \
Craig Topper	7148166	2015-11-10 05:08:05 +0000	[diff] [blame]	1846	(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
				1847	(__v4sf)(__m128)(b), (c)); })
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1848
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1849	/// \brief Takes a [8 x i32] vector and returns the vector element value
				1850	/// indexed by the immediate constant operand.
				1851	///
				1852	/// \headerfile <x86intrin.h>
				1853	///
				1854	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1855	/// EXTRACTF128+COMPOSITE instruction.
				1856	///
				1857	/// \param __a
				1858	/// A 256-bit vector of [8 x i32].
				1859	/// \param __imm
				1860	/// An immediate integer operand with bits [2:0] determining which vector
				1861	/// element is extracted and returned.
					/// The remaining bits of __imm are ignored.
				1862	/// \returns A 32-bit integer containing the value of the selected 32-bit
				1863	/// element.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1864	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1865	_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1866	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1867	__v8si __b = (__v8si)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1868	return __b[__imm & 7]; /* the mask keeps the index in the range 0-7 */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1869	}
				1870
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1871	/// \brief Takes a [16 x i16] vector and returns the vector element value
				1872	/// indexed by the immediate constant operand.
				1873	///
				1874	/// \headerfile <x86intrin.h>
				1875	///
				1876	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1877	/// EXTRACTF128+COMPOSITE instruction.
				1878	///
				1879	/// \param __a
				1880	/// A 256-bit integer vector of [16 x i16].
				1881	/// \param __imm
				1882	/// An immediate integer operand with bits [3:0] determining which vector
				1883	/// element is extracted and returned.
					/// The remaining bits of __imm are ignored.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1884	/// \returns A 32-bit integer containing the selected 16-bit element, zero
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1885	/// extended to 32 bits.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1886	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1887	_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1888	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1889	__v16hi __b = (__v16hi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1890	return (unsigned short)__b[__imm & 15]; /* index masked to 0-15; the unsigned cast makes the widening to int a zero extension */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1891	}
				1892
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1893	/// \brief Takes a [32 x i8] vector and returns the vector element value
				1894	/// indexed by the immediate constant operand.
				1895	///
				1896	/// \headerfile <x86intrin.h>
				1897	///
				1898	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1899	/// EXTRACTF128+COMPOSITE instruction.
				1900	///
				1901	/// \param __a
				1902	/// A 256-bit integer vector of [32 x i8].
				1903	/// \param __imm
				1904	/// An immediate integer operand with bits [4:0] determining which vector
				1905	/// element is extracted and returned.
					/// The remaining bits of __imm are ignored.
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1906	/// \returns A 32-bit integer containing the selected 8-bit element, zero
				1907	/// extended to 32 bits.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1908	static __inline int __DEFAULT_FN_ATTRS
Craig Topper	459554f	2015-01-31 06:31:30 +0000	[diff] [blame]	1909	_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1910	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1911	__v32qi __b = (__v32qi)__a;
Simon Pilgrim	28666ce	2016-05-21 21:14:35 +0000	[diff] [blame]	1912	return (unsigned char)__b[__imm & 31]; /* index masked to 0-31; the unsigned cast makes the widening to int a zero extension */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1913	}
				1914
				1915	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1916	/// \brief Takes a [4 x i64] vector and returns the vector element value
				1917	/// indexed by the immediate constant operand.
					/// This function is only defined when __x86_64__ is defined.
				1918	///
				1919	/// \headerfile <x86intrin.h>
				1920	///
				1921	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1922	/// EXTRACTF128+COMPOSITE instruction.
				1923	///
				1924	/// \param __a
				1925	/// A 256-bit integer vector of [4 x i64].
				1926	/// \param __imm
				1927	/// An immediate integer operand with bits [1:0] determining which vector
				1928	/// element is extracted and returned.
					/// The remaining bits of __imm are ignored.
				1929	/// \returns A 64-bit integer containing the value of the selected 64-bit
				1930	/// element.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1931	static __inline long long __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1932	_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1933	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1934	__v4di __b = (__v4di)__a;
Manman Ren	c94122e	2013-10-23 20:33:14 +0000	[diff] [blame]	1935	return __b[__imm & 3]; /* the mask keeps the index in the range 0-3 */
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1936	}
				1937	#endif /* __x86_64__ */
				1938
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1939	/// \brief Takes a [8 x i32] vector and replaces the vector element value
				1940	/// indexed by the immediate constant operand by a new value. Returns the
				1941	/// modified vector.
				1942	///
				1943	/// \headerfile <x86intrin.h>
				1944	///
				1945	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1946	/// INSERTF128+COMPOSITE instruction.
				1947	///
				1948	/// \param __a
				1949	/// A vector of [8 x i32] to be used by the insert operation.
				1950	/// \param __b
				1951	/// An integer value. The replacement value for the insert operation.
				1952	/// \param __imm
				1953	/// An immediate integer specifying the index of the vector element to be
				1954	/// replaced.
				1955	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1956	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1957	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1958	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1959	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1960	__v8si __c = (__v8si)__a;
				1961	__c[__imm & 7] = __b;
				1962	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1963	}
				1964
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1965
				1966	/// \brief Takes a [16 x i16] vector and replaces the vector element value
				1967	/// indexed by the immediate constant operand with a new value. Returns the
				1968	/// modified vector.
				1969	///
				1970	/// \headerfile <x86intrin.h>
				1971	///
				1972	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1973	/// INSERTF128+COMPOSITE instruction.
				1974	///
				1975	/// \param __a
				1976	/// A vector of [16 x i16] to be used by the insert operation.
				1977	/// \param __b
				1978	/// An i16 integer value. The replacement value for the insert operation.
				1979	/// \param __imm
				1980	/// An immediate integer specifying the index of the vector element to be
				1981	/// replaced.
				1982	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1983	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	1984	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1985	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1986	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	1987	__v16hi __c = (__v16hi)__a;
				1988	__c[__imm & 15] = __b;
				1989	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	1990	}
				1991
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	1992	/// \brief Takes a [32 x i8] vector and replaces the vector element value
				1993	/// indexed by the immediate constant operand with a new value. Returns the
				1994	/// modified vector.
				1995	///
				1996	/// \headerfile <x86intrin.h>
				1997	///
				1998	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1999	/// INSERTF128+COMPOSITE instruction.
				2000	///
				2001	/// \param __a
				2002	/// A vector of [32 x i8] to be used by the insert operation.
				2003	/// \param __b
				2004	/// An i8 integer value. The replacement value for the insert operation.
				2005	/// \param __imm
				2006	/// An immediate integer specifying the index of the vector element to be
				2007	/// replaced.
				2008	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2009	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2010	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2011	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2012	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2013	__v32qi __c = (__v32qi)__a;
				2014	__c[__imm & 31] = __b;
				2015	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2016	}
				2017
				2018	#ifdef __x86_64__
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2019	/// \brief Takes a [4 x i64] vector and replaces the vector element value
				2020	/// indexed by the immediate constant operand with a new value. Returns the
				2021	/// modified vector.
				2022	///
				2023	/// \headerfile <x86intrin.h>
				2024	///
				2025	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				2026	/// INSERTF128+COMPOSITE instruction.
				2027	///
				2028	/// \param __a
				2029	/// A vector of [4 x i64] to be used by the insert operation.
				2030	/// \param __b
				2031	/// A 64-bit integer value. The replacement value for the insert operation.
				2032	/// \param __imm
				2033	/// An immediate integer specifying the index of the vector element to be
				2034	/// replaced.
				2035	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2036	/// with __b.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2037	static __inline __m256i __DEFAULT_FN_ATTRS
Filipe Cabecinhas	d740029	2015-02-19 19:00:33 +0000	[diff] [blame]	2038	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2039	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2040	__v4di __c = (__v4di)__a;
				2041	__c[__imm & 3] = __b;
				2042	return (__m256i)__c;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2043	}
				2044	#endif
				2045
				2046	/* Conversion */
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2047	/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
				2048	///
				2049	/// \headerfile <x86intrin.h>
				2050	///
				2051	/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
				2052	///
				2053	/// \param __a
				2054	/// A 128-bit integer vector of [4 x i32].
				2055	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2056	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2057	_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2058	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2059	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2060	}
				2061
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2062	/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
				2063	///
				2064	/// \headerfile <x86intrin.h>
				2065	///
				2066	/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
				2067	///
				2068	/// \param __a
				2069	/// A 256-bit integer vector.
				2070	/// \returns A 256-bit vector of [8 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2071	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2072	_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2073	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2074	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2075	}
				2076
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2077	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2078	/// [4 x float].
				2079	///
				2080	/// \headerfile <x86intrin.h>
				2081	///
				2082	/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
				2083	///
				2084	/// \param __a
				2085	/// A 256-bit vector of [4 x double].
				2086	/// \returns A 128-bit vector of [4 x float] containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2087	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2088	_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2089	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2090	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2091	}
				2092
Ekaterina Romanova	1168fdc	2016-05-16 22:54:45 +0000	[diff] [blame]	2093	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
				2094	///
				2095	/// \headerfile <x86intrin.h>
				2096	///
				2097	/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
				2098	///
				2099	/// \param __a
				2100	/// A 256-bit vector of [8 x float].
				2101	/// \returns A 256-bit integer vector containing the converted values.
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2102	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2103	_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2104	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2105	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2106	}
				2107
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2108	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2109	_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2110	{
Simon Pilgrim	90770c7	2016-05-23 22:13:02 +0000	[diff] [blame]	2111	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2112	}
				2113
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2114	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2115	_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2116	{
Simon Pilgrim	0088051	2016-06-01 21:46:51 +0000	[diff] [blame]	2117	return (__m128i)__builtin_convertvector((__v4df) __a, __v4si);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2118	}
				2119
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2120	static __inline __m128i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2121	_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2122	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2123	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2124	}
				2125
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2126	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2127	_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2128	{
Simon Pilgrim	0088051	2016-06-01 21:46:51 +0000	[diff] [blame]	2129	return (__m256i)__builtin_convertvector((__v8sf) __a, __v8si);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2130	}
				2131
Michael Zuckerman	e54093f	2016-06-01 12:21:00 +0000	[diff] [blame]	2132	static __inline double __DEFAULT_FN_ATTRS
				2133	_mm256_cvtsd_f64(__m256d __a)
				2134	{
				2135	return __a[0];
				2136	}
				2137
				2138	static __inline int __DEFAULT_FN_ATTRS
				2139	_mm256_cvtsi256_si32(__m256i __a)
				2140	{
				2141	__v8si __b = (__v8si)__a;
				2142	return __b[0];
				2143	}
				2144
				2145	static __inline float __DEFAULT_FN_ATTRS
				2146	_mm256_cvtss_f32(__m256 __a)
				2147	{
				2148	return __a[0];
				2149	}
				2150
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2151	/* Vector replicate */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2152	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2153	_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2154	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2155	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2156	}
				2157
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2158	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2159	_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2160	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2161	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2162	}
				2163
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2164	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2165	_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2166	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2167	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2168	}
				2169
				2170	/* Unpack and Interleave */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2171	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2172	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2173	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2174	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2175	}
				2176
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2177	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2178	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2179	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2180	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2181	}
				2182
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2183	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2184	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2185	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2186	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2187	}
				2188
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2189	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2190	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2191	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2192	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2193	}
				2194
				2195	/* Bit Test */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2196	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2197	_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2198	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2199	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2200	}
				2201
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2202	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2203	_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2204	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2205	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2206	}
				2207
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2208	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2209	_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2210	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2211	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2212	}
				2213
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2214	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2215	_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2216	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2217	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2218	}
				2219
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2220	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2221	_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2222	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2223	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2224	}
				2225
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2226	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2227	_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2228	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2229	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2230	}
				2231
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2232	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2233	_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2234	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2235	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2236	}
				2237
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2238	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2239	_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2240	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2241	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2242	}
				2243
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2244	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2245	_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2246	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2247	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2248	}
				2249
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2250	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2251	_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2252	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2253	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2254	}
				2255
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2256	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2257	_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2258	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2259	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2260	}
				2261
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2262	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2263	_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2264	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2265	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2266	}
				2267
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2268	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2269	_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2270	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2271	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2272	}
				2273
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2274	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2275	_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2276	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2277	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2278	}
				2279
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2280	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2281	_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2282	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2283	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2284	}
				2285
				2286	/* Vector extract sign mask */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2287	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2288	_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2289	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2290	return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2291	}
				2292
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2293	static __inline int __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2294	_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2295	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2296	return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2297	}
				2298
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2299	/* Vector zero */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2300	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2301	_mm256_zeroall(void)
				2302	{
				2303	__builtin_ia32_vzeroall();
				2304	}
				2305
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2306	static __inline void __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2307	_mm256_zeroupper(void)
				2308	{
				2309	__builtin_ia32_vzeroupper();
				2310	}
				2311
				2312	/* Vector load with broadcast */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2313	static __inline __m128 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2314	_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2315	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2316	float __f = *__a;
				2317	return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2318	}
				2319
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2320	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2321	_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2322	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2323	double __d = *__a;
				2324	return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2325	}
				2326
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2327	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2328	_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2329	{
Adam Nemet	286ae08	2014-05-29 20:47:29 +0000	[diff] [blame]	2330	float __f = *__a;
				2331	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2332	}
				2333
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2334	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2335	_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2336	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2337	return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2338	}
				2339
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2340	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2341	_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2342	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2343	return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2344	}
				2345
				2346	/* SIMD load ops */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2347	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2348	_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2349	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2350	return (__m256d )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2351	}
				2352
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2353	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2354	_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2355	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2356	return (__m256 )__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2357	}
				2358
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2359	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2360	_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2361	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2362	struct __loadu_pd {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2363	__m256d __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2364	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2365	return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2366	}
				2367
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2368	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2369	_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2370	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2371	struct __loadu_ps {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2372	__m256 __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2373	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2374	return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2375	}
				2376
/* Returns the __m256i obtained by dereferencing __p directly.
   NOTE(review): unlike _mm256_loadu_si256 there is no packed wrapper here, so
   __p presumably has to satisfy __m256i's alignment -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2377	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2378	_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2379	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2380	return *__p;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2381	}
				2382
/* Reads a __m256i from __p through a local struct that wraps a single __m256i
   and is declared __packed__ and __may_alias__: __p is cast to a pointer to
   that struct and its __v member is returned.
   NOTE(review): the packed wrapper presumably exists so the read does not
   assume __m256i alignment for __p -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2383	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2384	_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2385	{
Craig Topper	9e9301a	2012-01-25 04:26:17 +0000	[diff] [blame]	2386	struct __loadu_si256 {
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2387	__m256i __v;
David Majnemer	1cf22e6	2015-02-04 00:26:10 +0000	[diff] [blame]	2388	} __attribute__((__packed__, __may_alias__));
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2389	return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2390	}
				2391
/* Returns, as a __m256i, the result of __builtin_ia32_lddqu256 applied to __p
   viewed as a char const pointer. The load itself is performed by the
   builtin. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2392	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2393	_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2394	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2395	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2396	}
				2397
				2398	/* SIMD store ops */
/* Stores __a to the memory at __p by assigning through __p reinterpreted as a
   pointer to __m256d. Returns nothing.
   NOTE(review): the plain __m256d dereference presumably requires __p to meet
   __m256d's alignment (contrast _mm256_storeu_pd, which goes through a packed
   struct) -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2399	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2400	_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2401	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2402	*(__m256d *)__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2403	}
				2404
/* Stores __a to the memory at __p by assigning through __p reinterpreted as a
   pointer to __m256. Returns nothing.
   NOTE(review): the plain __m256 dereference presumably requires __p to meet
   __m256's alignment (contrast _mm256_storeu_ps, which goes through a packed
   struct) -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2405	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2406	_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2407	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2408	*(__m256 *)__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2409	}
				2410
/* Writes __a to __p through a local struct that wraps a single __m256d and is
   declared __packed__ and __may_alias__: __p is cast to a pointer to that
   struct and __a is assigned to its __v member.
   NOTE(review): the packed wrapper presumably exists so the write does not
   assume __m256d alignment for __p -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2411	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2412	_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2413	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2414	struct __storeu_pd {
				2415	__m256d __v;
				2416	} __attribute__((__packed__, __may_alias__));
				2417	((struct __storeu_pd*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2418	}
				2419
/* Writes __a to __p through a local struct that wraps a single __m256 and is
   declared __packed__ and __may_alias__: __p is cast to a pointer to that
   struct and __a is assigned to its __v member.
   NOTE(review): the packed wrapper presumably exists so the write does not
   assume __m256 alignment for __p -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2420	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2421	_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2422	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2423	struct __storeu_ps {
				2424	__m256 __v;
				2425	} __attribute__((__packed__, __may_alias__));
				2426	((struct __storeu_ps*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2427	}
				2428
/* Stores __a to the __m256i object that __p points to by plain assignment.
   NOTE(review): unlike _mm256_storeu_si256 there is no packed wrapper here, so
   __p presumably has to satisfy __m256i's alignment -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2429	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2430	_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2431	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2432	*__p = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2433	}
				2434
/* Writes __a to __p through a local struct that wraps a single __m256i and is
   declared __packed__ and __may_alias__: __p is cast to a pointer to that
   struct and __a is assigned to its __v member.
   NOTE(review): the packed wrapper presumably exists so the write does not
   assume __m256i alignment for __p -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2435	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2436	_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2437	{
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2438	struct __storeu_si256 {
				2439	__m256i __v;
				2440	} __attribute__((__packed__, __may_alias__));
				2441	((struct __storeu_si256*)__p)->__v = __a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2442	}
				2443
				2444	/* Conditional load ops */
/* Forwards __p (viewed as const __v2df *) and the mask __m (viewed as __v2di)
   to __builtin_ia32_maskloadpd and returns its result as a __m128d.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2445	static __inline __m128d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2446	_mm_maskload_pd(double const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2447	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2448	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2449	}
				2450
/* Forwards __p (viewed as const __v4df *) and the mask __m (viewed as __v4di)
   to __builtin_ia32_maskloadpd256 and returns its result as a __m256d.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2451	static __inline __m256d __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2452	_mm256_maskload_pd(double const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2453	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2454	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2455	(__v4di)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2456	}
				2457
/* Forwards __p (viewed as const __v4sf *) and the mask __m (viewed as __v4si)
   to __builtin_ia32_maskloadps and returns its result as a __m128.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2458	static __inline __m128 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2459	_mm_maskload_ps(float const *__p, __m128i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2460	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2461	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2462	}
				2463
/* Forwards __p (viewed as const __v8sf *) and the mask __m (viewed as __v8si)
   to __builtin_ia32_maskloadps256 and returns its result as a __m256.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2464	static __inline __m256 __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2465	_mm256_maskload_ps(float const *__p, __m256i __m)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2466	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2467	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2468	}
				2469
				2470	/* Conditional store ops */
/* Forwards the destination __p (viewed as __v8sf *), the mask __m (viewed as
   __v8si) and the data __a (viewed as __v8sf) to
   __builtin_ia32_maskstoreps256. Returns nothing.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2471	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2472	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2473	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2474	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2475	}
				2476
/* Forwards the destination __p (viewed as __v2df *), the mask __m (viewed as
   __v2di) and the data __a (viewed as __v2df) to __builtin_ia32_maskstorepd.
   Returns nothing.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2477	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2478	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2479	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2480	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2481	}
				2482
/* Forwards the destination __p (viewed as __v4df *), the mask __m (viewed as
   __v4di) and the data __a (viewed as __v4df) to
   __builtin_ia32_maskstorepd256. Returns nothing.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2483	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2484	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2485	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2486	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2487	}
				2488
/* Forwards the destination __p (viewed as __v4sf *), the mask __m (viewed as
   __v4si) and the data __a (viewed as __v4sf) to __builtin_ia32_maskstoreps.
   Returns nothing.
   NOTE(review): how the mask selects elements is defined by the builtin and is
   not visible here. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2489	static __inline void __DEFAULT_FN_ATTRS
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2490	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2491	{
Andrea Di Biagio	8bb12d0	2015-10-20 11:19:54 +0000	[diff] [blame]	2492	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2493	}
				2494
				2495	/* Cacheability support ops */
/* Passes the destination __a (viewed as __v4di *) and the value __b (viewed
   as __v4di) to __builtin_ia32_movntdq256. Returns nothing.
   NOTE(review): presumably a non-temporal (cache-bypassing) store, going by
   the builtin's name and the section heading -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2496	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2497	_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2498	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2499	__builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2500	}
				2501
/* Passes the destination __a and the value __b (viewed as __v4df) to
   __builtin_ia32_movntpd256. Returns nothing.
   NOTE(review): presumably a non-temporal (cache-bypassing) store, going by
   the builtin's name and the section heading -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2502	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2503	_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2504	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2505	__builtin_ia32_movntpd256(__a, (__v4df)__b);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2506	}
				2507
/* Passes the destination __p and the value __a (viewed as __v8sf) to
   __builtin_ia32_movntps256. Returns nothing.
   NOTE(review): presumably a non-temporal (cache-bypassing) store, going by
   the builtin's name and the section heading -- confirm. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2508	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2509	_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2510	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2511	__builtin_ia32_movntps256(__p, (__v8sf)__a);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2512	}
				2513
				2514	/* Create vectors */
/* Returns the result of __builtin_ia32_undef256() cast to __m256d.
   NOTE(review): the contents are presumably unspecified, as the name
   suggests; callers should not rely on any particular value -- confirm. */
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	2515	static __inline__ __m256d __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame^]	2516	_mm256_undefined_pd(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	2517	{
				2518	return (__m256d)__builtin_ia32_undef256();
				2519	}
				2520
				/* Returns the result of __builtin_ia32_undef256() cast to __m256.
				   NOTE(review): the contents are presumably unspecified, as the name
				   suggests; callers should not rely on any particular value -- confirm. */
				2521	static __inline__ __m256 __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame^]	2522	_mm256_undefined_ps(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	2523	{
				2524	return (__m256)__builtin_ia32_undef256();
				2525	}
				2526
				/* Returns the result of __builtin_ia32_undef256() cast to __m256i.
				   NOTE(review): the contents are presumably unspecified, as the name
				   suggests; callers should not rely on any particular value -- confirm. */
				2527	static __inline__ __m256i __DEFAULT_FN_ATTRS
Craig Topper	3a0c726	2016-06-09 05:14:28 +0000	[diff] [blame^]	2528	_mm256_undefined_si256(void)
Simon Pilgrim	5aba992	2015-08-26 21:17:12 +0000	[diff] [blame]	2529	{
				2530	return (__m256i)__builtin_ia32_undef256();
				2531	}
				2532
/* Builds a __m256d from four doubles. The initializer lists the arguments in
   reverse, so element 0 is __d (the last argument) and element 3 is __a (the
   first argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2533	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2534	_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2535	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2536	return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2537	}
				2538
/* Builds a __m256 from eight floats. The initializer lists the arguments in
   reverse, so element 0 is __h (the last argument) and element 7 is __a (the
   first argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2539	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2540	_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2541	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2542	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2543	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2544	}
				2545
/* Builds a __m256i from eight ints via a __v8si literal. The initializer
   lists the arguments in reverse, so element 0 is __i7 (the last argument)
   and element 7 is __i0 (the first argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2546	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2547	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2548	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2549	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2550	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2551	}
				2552
/* Builds a __m256i from sixteen shorts via a __v16hi literal. Each parameter
   is named after the element it fills: element 0 is __w00 (the last
   argument) and element 15 is __w15 (the first argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2553	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2554	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2555	short __w11, short __w10, short __w09, short __w08,
				2556	short __w07, short __w06, short __w05, short __w04,
				2557	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2558	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2559	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
				2560	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2561	}
				2562
/* Builds a __m256i from thirty-two chars via a __v32qi literal. Each
   parameter is named after the element it fills: element 0 is __b00 (the
   last argument) and element 31 is __b31 (the first argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2563	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2564	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2565	char __b27, char __b26, char __b25, char __b24,
				2566	char __b23, char __b22, char __b21, char __b20,
				2567	char __b19, char __b18, char __b17, char __b16,
				2568	char __b15, char __b14, char __b13, char __b12,
				2569	char __b11, char __b10, char __b09, char __b08,
				2570	char __b07, char __b06, char __b05, char __b04,
				2571	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2572	{
				2573	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2574	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				2575	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				2576	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				2577	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2578	};
				2579	}
				2580
/* Builds a __m256i from four long longs via a __v4di literal. The initializer
   lists the arguments in reverse, so element 0 is __d (the last argument)
   and element 3 is __a (the first argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2581	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2582	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2583	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2584	return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2585	}
				2586
				2587	/* Create vectors with elements in reverse order */
/* Builds a __m256d from four doubles in argument order: element 0 is __a (the
   first argument) and element 3 is __d (the last argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2588	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2589	_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2590	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2591	return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2592	}
				2593
/* Builds a __m256 from eight floats in argument order: element 0 is __a (the
   first argument) and element 7 is __h (the last argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2594	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2595	_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2596	float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2597	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2598	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2599	}
				2600
/* Builds a __m256i from eight ints via a __v8si literal, in argument order:
   element 0 is __i0 (the first argument) and element 7 is __i7 (the last
   argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2601	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2602	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2603	int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2604	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2605	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2606	}
				2607
/* Builds a __m256i from sixteen shorts via a __v16hi literal, in argument
   order: the first argument becomes element 0 and the last becomes element
   15. Note that the parameter names are numbered the other way round: the
   first argument is called __w15 yet fills element 0, and __w00 fills
   element 15. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2608	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2609	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2610	short __w11, short __w10, short __w09, short __w08,
				2611	short __w07, short __w06, short __w05, short __w04,
				2612	short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2613	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2614	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
				2615	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2616	}
				2617
/* Builds a __m256i from thirty-two chars via a __v32qi literal, in argument
   order: the first argument becomes element 0 and the last becomes element
   31. Note that the parameter names are numbered the other way round: the
   first argument is called __b31 yet fills element 0, and __b00 fills
   element 31. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2618	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2619	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2620	char __b27, char __b26, char __b25, char __b24,
				2621	char __b23, char __b22, char __b21, char __b20,
				2622	char __b19, char __b18, char __b17, char __b16,
				2623	char __b15, char __b14, char __b13, char __b12,
				2624	char __b11, char __b10, char __b09, char __b08,
				2625	char __b07, char __b06, char __b05, char __b04,
				2626	char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2627	{
				2628	return (__m256i)(__v32qi){
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2629	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper	9fee8ab	2015-01-31 06:33:59 +0000	[diff] [blame]	2630	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				2631	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				2632	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2633	}
				2634
/* Builds a __m256i from four long longs via a __v4di literal, in argument
   order: element 0 is __a (the first argument) and element 3 is __d (the
   last argument). */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2635	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2636	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2637	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2638	return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2639	}
				2640
				2641	/* Create vectors with repeated elements */
/* Returns a __m256d whose four elements are all __w. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2642	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2643	_mm256_set1_pd(double __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2644	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2645	return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2646	}
				2647
/* Returns a __m256 whose eight elements are all __w. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2648	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2649	_mm256_set1_ps(float __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2650	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2651	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2652	}
				2653
/* Returns a __m256i built from a __v8si literal whose eight elements are all
   __i. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2654	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2655	_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2656	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2657	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2658	}
				2659
/* Returns a __m256i built from a __v16hi literal whose sixteen elements are
   all __w. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2660	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2661	_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2662	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2663	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				2664	__w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2665	}
				2666
/* Returns a __m256i built from a __v32qi literal whose thirty-two elements
   are all __b. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2667	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2668	_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2669	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2670	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				2671	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				2672	__b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2673	}
				2674
/* Returns a __m256i built from a __v4di literal whose four elements are all
   __q. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2675	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2676	_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2677	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2678	return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2679	}
				2680
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2681	/* Create zeroed vectors */
/* Returns a __m256d whose four elements are all 0. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2682	static __inline __m256d __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2683	_mm256_setzero_pd(void)
				2684	{
				2685	return (__m256d){ 0, 0, 0, 0 };
				2686	}
				2687
/* Returns a __m256 whose eight elements are all 0. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2688	static __inline __m256 __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2689	_mm256_setzero_ps(void)
				2690	{
				2691	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				2692	}
				2693
/* Returns a __m256i initialized from four 0LL values, i.e. with every bit
   clear. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2694	static __inline __m256i __DEFAULT_FN_ATTRS
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2695	_mm256_setzero_si256(void)
				2696	{
				2697	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				2698	}
				2699
				2700	/* Cast between vector types */
/* Returns __a cast from __m256d to __m256.
   NOTE(review): a C-style cast between same-size vector types presumably
   reinterprets the bits without converting element values -- confirm against
   the compiler's vector-extension documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2701	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2702	_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2703	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2704	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2705	}
				2706
/* Returns __a cast from __m256d to __m256i.
   NOTE(review): a C-style cast between same-size vector types presumably
   reinterprets the bits without converting element values -- confirm against
   the compiler's vector-extension documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2707	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2708	_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2709	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2710	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2711	}
				2712
/* Returns __a cast from __m256 to __m256d.
   NOTE(review): a C-style cast between same-size vector types presumably
   reinterprets the bits without converting element values -- confirm against
   the compiler's vector-extension documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2713	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2714	_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2715	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2716	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2717	}
				2718
/* Returns __a cast from __m256 to __m256i.
   NOTE(review): a C-style cast between same-size vector types presumably
   reinterprets the bits without converting element values -- confirm against
   the compiler's vector-extension documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2719	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2720	_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2721	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2722	return (__m256i)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2723	}
				2724
/* Returns __a cast from __m256i to __m256.
   NOTE(review): a C-style cast between same-size vector types presumably
   reinterprets the bits without converting element values -- confirm against
   the compiler's vector-extension documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2725	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2726	_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2727	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2728	return (__m256)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2729	}
				2730
/* Returns __a cast from __m256i to __m256d.
   NOTE(review): a C-style cast between same-size vector types presumably
   reinterprets the bits without converting element values -- confirm against
   the compiler's vector-extension documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2731	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2732	_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2733	{
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2734	return (__m256d)__a;
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2735	}
				2736
/* Returns a two-element vector holding elements 0 and 1 of __a, selected with
   __builtin_shufflevector. __a is passed as both shuffle operands; only
   indices into the first are used. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2737	static __inline __m128d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2738	_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2739	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2740	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2741	}
				2742
/* Returns a four-element vector holding elements 0 through 3 of __a, selected
   with __builtin_shufflevector. __a is passed as both shuffle operands; only
   indices into the first are used. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2743	static __inline __m128 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2744	_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2745	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2746	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2747	}
				2748
/* Returns a two-element vector holding elements 0 and 1 of __a viewed as
   __v4di, selected with __builtin_shufflevector. __a is passed as both
   shuffle operands; only indices into the first are used. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2749	static __inline __m128i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2750	_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2751	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2752	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2753	}
				2754
/* Widens __a to four elements with __builtin_shufflevector: result elements
   0 and 1 are elements 0 and 1 of __a; elements 2 and 3 use shuffle index -1.
   NOTE(review): index -1 presumably marks those elements as don't-care, so
   the upper half should not be relied on -- confirm against the
   __builtin_shufflevector documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2755	static __inline __m256d __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2756	_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2757	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2758	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2759	}
				2760
/* Widens __a to eight elements with __builtin_shufflevector: result elements
   0 through 3 are elements 0 through 3 of __a; elements 4 through 7 use
   shuffle index -1.
   NOTE(review): index -1 presumably marks those elements as don't-care, so
   the upper half should not be relied on -- confirm against the
   __builtin_shufflevector documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2761	static __inline __m256 __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2762	_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2763	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2764	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2765	}
				2766
/* Widens __a (viewed as __v2di) to four elements with
   __builtin_shufflevector: result elements 0 and 1 are elements 0 and 1 of
   __a; elements 2 and 3 use shuffle index -1.
   NOTE(review): index -1 presumably marks those elements as don't-care, so
   the upper half should not be relied on -- confirm against the
   __builtin_shufflevector documentation. */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2767	static __inline __m256i __DEFAULT_FN_ATTRS
Reid Kleckner	7ab75b3	2013-04-19 17:00:14 +0000	[diff] [blame]	2768	_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2769	{
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2770	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
Bruno Cardoso Lopes	7c4b513	2010-08-04 22:03:36 +0000	[diff] [blame]	2771	}
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2772
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	2773	/*
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2774	Vector insert.
				2775	We use macros rather than inlines because we only want to accept
				2776	invocations where the immediate M is a constant expression.
				2777	*/
				/* Shuffles V1 (indices 0-7) with V2 widened by _mm256_castps128_ps256
				   (indices 8-15). Only bit 0 of M is examined:
				     bit 0 set:   result = V1[0..3] followed by V2[0..3]
				     bit 0 clear: result = V2[0..3] followed by V1[4..7]
				   V1 and V2 are each expanded once; M is expanded once per result
				   element. */
				2778	#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
				2779	(__m256)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2780	(__v8sf)(__m256)(V1), \
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2781	(__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
				2782	(((M) & 1) ? 0 : 8), \
				2783	(((M) & 1) ? 1 : 9), \
				2784	(((M) & 1) ? 2 : 10), \
				2785	(((M) & 1) ? 3 : 11), \
				2786	(((M) & 1) ? 8 : 4), \
				2787	(((M) & 1) ? 9 : 5), \
				2788	(((M) & 1) ? 10 : 6), \
				2789	(((M) & 1) ? 11 : 7) );})
				2790
				/* Shuffles V1 (indices 0-3) with V2 widened by _mm256_castpd128_pd256
				   (indices 4-7). Only bit 0 of M is examined:
				     bit 0 set:   result = V1[0..1] followed by V2[0..1]
				     bit 0 clear: result = V2[0..1] followed by V1[2..3]
				   V1 and V2 are each expanded once; M is expanded once per result
				   element. */
				2791	#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
				2792	(__m256d)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2793	(__v4df)(__m256d)(V1), \
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2794	(__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
				2795	(((M) & 1) ? 0 : 4), \
				2796	(((M) & 1) ? 1 : 5), \
				2797	(((M) & 1) ? 4 : 2), \
				2798	(((M) & 1) ? 5 : 3) );})
				2799
				2800	#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
				2801	(__m256i)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2802	(__v4di)(__m256i)(V1), \
Sanjay Patel	7f6aa52	2015-03-10 15:19:26 +0000	[diff] [blame]	2803	(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
				2804	(((M) & 1) ? 0 : 4), \
				2805	(((M) & 1) ? 1 : 5), \
				2806	(((M) & 1) ? 4 : 2), \
				2807	(((M) & 1) ? 5 : 3) );})
				2808
Sean Silva	e4c3760	2015-09-12 02:55:19 +0000	[diff] [blame]	2809	/*
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2810	Vector extract.
				2811	We use macros rather than inlines because we only want to accept
				2812	invocations where the immediate M is a constant expression.
				2813	*/
				2814	#define _mm256_extractf128_ps(V, M) __extension__ ({ \
				2815	(__m128)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2816	(__v8sf)(__m256)(V), \
Sanjay Patel	f204b00	2015-03-12 17:23:46 +0000	[diff] [blame]	2817	(__v8sf)(_mm256_setzero_ps()), \
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2818	(((M) & 1) ? 4 : 0), \
				2819	(((M) & 1) ? 5 : 1), \
				2820	(((M) & 1) ? 6 : 2), \
				2821	(((M) & 1) ? 7 : 3) );})
				2822
				2823	#define _mm256_extractf128_pd(V, M) __extension__ ({ \
				2824	(__m128d)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2825	(__v4df)(__m256d)(V), \
Sanjay Patel	f204b00	2015-03-12 17:23:46 +0000	[diff] [blame]	2826	(__v4df)(_mm256_setzero_pd()), \
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2827	(((M) & 1) ? 2 : 0), \
				2828	(((M) & 1) ? 3 : 1) );})
				2829
				2830	#define _mm256_extractf128_si256(V, M) __extension__ ({ \
				2831	(__m128i)__builtin_shufflevector( \
Craig Topper	d619eaaa	2015-11-11 03:47:10 +0000	[diff] [blame]	2832	(__v4di)(__m256i)(V), \
Sanjay Patel	f204b00	2015-03-12 17:23:46 +0000	[diff] [blame]	2833	(__v4di)(_mm256_setzero_si256()), \
Sanjay Patel	0c351ab	2015-03-12 15:50:36 +0000	[diff] [blame]	2834	(((M) & 1) ? 2 : 0), \
				2835	(((M) & 1) ? 3 : 1) );})
				2836
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2837	/* SIMD load ops (unaligned) */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2838	static __inline __m256 __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2839	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2840	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	2841	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				2842	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2843	}
				2844
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2845	static __inline __m256d __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2846	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2847	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	2848	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				2849	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2850	}
				2851
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2852	static __inline __m256i __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2853	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2854	{
Craig Topper	74b5948	2016-05-31 05:49:13 +0000	[diff] [blame]	2855	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				2856	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2857	}
				2858
				2859	/* SIMD store ops (unaligned) */
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2860	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2861	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2862	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2863	__m128 __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2864
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2865	__v128 = _mm256_castps256_ps128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2866	_mm_storeu_ps(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2867	__v128 = _mm256_extractf128_ps(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2868	_mm_storeu_ps(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2869	}
				2870
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2871	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2872	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2873	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2874	__m128d __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2875
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2876	__v128 = _mm256_castpd256_pd128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2877	_mm_storeu_pd(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2878	__v128 = _mm256_extractf128_pd(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2879	_mm_storeu_pd(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2880	}
				2881
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2882	static __inline void __DEFAULT_FN_ATTRS
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2883	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2884	{
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2885	__m128i __v128;
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2886
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2887	__v128 = _mm256_castsi256_si128(__a);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2888	_mm_storeu_si128(__addr_lo, __v128);
David Blaikie	3302f2b	2013-01-16 23:08:36 +0000	[diff] [blame]	2889	__v128 = _mm256_extractf128_si256(__a, 1);
Craig Topper	09175da	2016-05-30 17:10:30 +0000	[diff] [blame]	2890	_mm_storeu_si128(__addr_hi, __v128);
Chad Rosier	f8df4f4	2012-03-20 16:40:00 +0000	[diff] [blame]	2891	}
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	2892
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2893	static __inline __m256 __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2894	_mm256_set_m128 (__m128 __hi, __m128 __lo) {
Craig Topper	1aa231e	2016-05-16 06:38:42 +0000	[diff] [blame]	2895	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2896	}
				2897
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2898	static __inline __m256d __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2899	_mm256_set_m128d (__m128d __hi, __m128d __lo) {
				2900	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2901	}
				2902
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2903	static __inline __m256i __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2904	_mm256_set_m128i (__m128i __hi, __m128i __lo) {
				2905	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2906	}
				2907
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2908	static __inline __m256 __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2909	_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
				2910	return _mm256_set_m128(__hi, __lo);
				2911	}
				2912
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2913	static __inline __m256d __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2914	_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
				2915	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2916	}
				2917
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2918	static __inline __m256i __DEFAULT_FN_ATTRS
Michael Kuperstein	7619004	2015-05-20 07:46:52 +0000	[diff] [blame]	2919	_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
				2920	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2921	}
				2922
Michael Kuperstein	e45af54	2015-06-30 13:36:19 +0000	[diff] [blame]	2923	#undef __DEFAULT_FN_ATTRS
Eric Christopher	4d185168	2015-06-17 07:09:20 +0000	[diff] [blame]	2924
Richard Smith	49e5644	2013-07-14 05:41:45 +0000	[diff] [blame]	2925	#endif /* __AVXINTRIN_H */