Blame - third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/avxintrin.h - fp2-dev/platform/external/v8

blob: 2a733230c6019017a5f8ffd49dd86da3500f9213 [file] [log] [blame]

Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
				24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
				27
				28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
				/* Internal element-typed views of a 32-byte vector, one per lane type. */
				typedef double __v4df __attribute__((__vector_size__(32)));
				typedef float __v8sf __attribute__((__vector_size__(32)));
				typedef long long __v4di __attribute__((__vector_size__(32)));
				typedef int __v8si __attribute__((__vector_size__(32)));
				typedef short __v16hi __attribute__((__vector_size__(32)));
				typedef char __v32qi __attribute__((__vector_size__(32)));

				/* Plain char has no fixed signedness, so an explicitly signed byte vector
				 * is provided as well. It is internal only and is not meant to show up in
				 * the public interface. */
				typedef signed char __v32qs __attribute__((__vector_size__(32)));

				/* 256-bit vector types exposed to callers: float, double and integer
				 * (long long) lanes. */
				typedef float __m256 __attribute__((__vector_size__(32)));
				typedef double __m256d __attribute__((__vector_size__(32)));
				typedef long long __m256i __attribute__((__vector_size__(32)));
				45
				/* Attribute set applied to every intrinsic function defined in this file:
				 * forced inlining, no debug info, and the "avx" target feature. */
				#define __DEFAULT_FN_ATTRS \
				  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
				48
				49	/* Arithmetic */
				50	/// \brief Adds two 256-bit vectors of [4 x double].
				51	///
				52	/// \headerfile <x86intrin.h>
				53	///
				54	/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
				55	///
				56	/// \param __a
				57	/// A 256-bit vector of [4 x double] containing one of the source operands.
				58	/// \param __b
				59	/// A 256-bit vector of [4 x double] containing one of the source operands.
				60	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				61	/// operands.
				62	static __inline __m256d __DEFAULT_FN_ATTRS
				63	_mm256_add_pd(__m256d __a, __m256d __b)
				64	{
				65	return __a+__b;
				66	}
				67
				68	/// \brief Adds two 256-bit vectors of [8 x float].
				69	///
				70	/// \headerfile <x86intrin.h>
				71	///
				72	/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
				73	///
				74	/// \param __a
				75	/// A 256-bit vector of [8 x float] containing one of the source operands.
				76	/// \param __b
				77	/// A 256-bit vector of [8 x float] containing one of the source operands.
				78	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				79	/// operands.
				80	static __inline __m256 __DEFAULT_FN_ATTRS
				81	_mm256_add_ps(__m256 __a, __m256 __b)
				82	{
				83	return __a+__b;
				84	}
				85
				86	/// \brief Subtracts two 256-bit vectors of [4 x double].
				87	///
				88	/// \headerfile <x86intrin.h>
				89	///
				90	/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
				91	///
				92	/// \param __a
				93	/// A 256-bit vector of [4 x double] containing the minuend.
				94	/// \param __b
				95	/// A 256-bit vector of [4 x double] containing the subtrahend.
				96	/// \returns A 256-bit vector of [4 x double] containing the differences between
				97	/// both operands.
				98	static __inline __m256d __DEFAULT_FN_ATTRS
				99	_mm256_sub_pd(__m256d __a, __m256d __b)
				100	{
				101	return __a-__b;
				102	}
				103
				104	/// \brief Subtracts two 256-bit vectors of [8 x float].
				105	///
				106	/// \headerfile <x86intrin.h>
				107	///
				108	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
				109	///
				110	/// \param __a
				111	/// A 256-bit vector of [8 x float] containing the minuend.
				112	/// \param __b
				113	/// A 256-bit vector of [8 x float] containing the subtrahend.
				114	/// \returns A 256-bit vector of [8 x float] containing the differences between
				115	/// both operands.
				116	static __inline __m256 __DEFAULT_FN_ATTRS
				117	_mm256_sub_ps(__m256 __a, __m256 __b)
				118	{
				119	return __a-__b;
				120	}
				121
				122	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				123	/// two 256-bit vectors of [4 x double].
				124	///
				125	/// \headerfile <x86intrin.h>
				126	///
				127	/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
				128	///
				129	/// \param __a
				130	/// A 256-bit vector of [4 x double] containing the left source operand.
				131	/// \param __b
				132	/// A 256-bit vector of [4 x double] containing the right source operand.
				133	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				134	/// and differences between both operands.
				135	static __inline __m256d __DEFAULT_FN_ATTRS
				136	_mm256_addsub_pd(__m256d __a, __m256d __b)
				137	{
				138	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
				139	}
				140
				141	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				142	/// two 256-bit vectors of [8 x float].
				143	///
				144	/// \headerfile <x86intrin.h>
				145	///
				146	/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
				147	///
				148	/// \param __a
				149	/// A 256-bit vector of [8 x float] containing the left source operand.
				150	/// \param __b
				151	/// A 256-bit vector of [8 x float] containing the right source operand.
				152	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				153	/// differences between both operands.
				154	static __inline __m256 __DEFAULT_FN_ATTRS
				155	_mm256_addsub_ps(__m256 __a, __m256 __b)
				156	{
				157	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
				158	}
				159
				160	/// \brief Divides two 256-bit vectors of [4 x double].
				161	///
				162	/// \headerfile <x86intrin.h>
				163	///
				164	/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
				165	///
				166	/// \param __a
				167	/// A 256-bit vector of [4 x double] containing the dividend.
				168	/// \param __b
				169	/// A 256-bit vector of [4 x double] containing the divisor.
				170	/// \returns A 256-bit vector of [4 x double] containing the quotients between
				171	/// both operands.
				172	static __inline __m256d __DEFAULT_FN_ATTRS
				173	_mm256_div_pd(__m256d __a, __m256d __b)
				174	{
				175	return __a / __b;
				176	}
				177
				178	/// \brief Divides two 256-bit vectors of [8 x float].
				179	///
				180	/// \headerfile <x86intrin.h>
				181	///
				182	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
				183	///
				184	/// \param __a
				185	/// A 256-bit vector of [8 x float] containing the dividend.
				186	/// \param __b
				187	/// A 256-bit vector of [8 x float] containing the divisor.
				188	/// \returns A 256-bit vector of [8 x float] containing the quotients between
				189	/// both operands.
				190	static __inline __m256 __DEFAULT_FN_ATTRS
				191	_mm256_div_ps(__m256 __a, __m256 __b)
				192	{
				193	return __a / __b;
				194	}
				195
				196	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				197	/// of each pair of values.
				198	///
				199	/// \headerfile <x86intrin.h>
				200	///
				201	/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
				202	///
				203	/// \param __a
				204	/// A 256-bit vector of [4 x double] containing one of the operands.
				205	/// \param __b
				206	/// A 256-bit vector of [4 x double] containing one of the operands.
				207	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				208	/// between both operands.
				209	static __inline __m256d __DEFAULT_FN_ATTRS
				210	_mm256_max_pd(__m256d __a, __m256d __b)
				211	{
				212	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
				213	}
				214
				215	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				216	/// of each pair of values.
				217	///
				218	/// \headerfile <x86intrin.h>
				219	///
				220	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
				221	///
				222	/// \param __a
				223	/// A 256-bit vector of [8 x float] containing one of the operands.
				224	/// \param __b
				225	/// A 256-bit vector of [8 x float] containing one of the operands.
				226	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				227	/// between both operands.
				228	static __inline __m256 __DEFAULT_FN_ATTRS
				229	_mm256_max_ps(__m256 __a, __m256 __b)
				230	{
				231	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
				232	}
				233
				234	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				235	/// of each pair of values.
				236	///
				237	/// \headerfile <x86intrin.h>
				238	///
				239	/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
				240	///
				241	/// \param __a
				242	/// A 256-bit vector of [4 x double] containing one of the operands.
				243	/// \param __b
				244	/// A 256-bit vector of [4 x double] containing one of the operands.
				245	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				246	/// between both operands.
				247	static __inline __m256d __DEFAULT_FN_ATTRS
				248	_mm256_min_pd(__m256d __a, __m256d __b)
				249	{
				250	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
				251	}
				252
				253	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				254	/// of each pair of values.
				255	///
				256	/// \headerfile <x86intrin.h>
				257	///
				258	/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
				259	///
				260	/// \param __a
				261	/// A 256-bit vector of [8 x float] containing one of the operands.
				262	/// \param __b
				263	/// A 256-bit vector of [8 x float] containing one of the operands.
				264	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				265	/// between both operands.
				266	static __inline __m256 __DEFAULT_FN_ATTRS
				267	_mm256_min_ps(__m256 __a, __m256 __b)
				268	{
				269	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
				270	}
				271
				272	/// \brief Multiplies two 256-bit vectors of [4 x double].
				273	///
				274	/// \headerfile <x86intrin.h>
				275	///
				276	/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
				277	///
				278	/// \param __a
				279	/// A 256-bit vector of [4 x double] containing one of the operands.
				280	/// \param __b
				281	/// A 256-bit vector of [4 x double] containing one of the operands.
				282	/// \returns A 256-bit vector of [4 x double] containing the products between
				283	/// both operands.
				284	static __inline __m256d __DEFAULT_FN_ATTRS
				285	_mm256_mul_pd(__m256d __a, __m256d __b)
				286	{
				287	return __a * __b;
				288	}
				289
				290	/// \brief Multiplies two 256-bit vectors of [8 x float].
				291	///
				292	/// \headerfile <x86intrin.h>
				293	///
				294	/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
				295	///
				296	/// \param __a
				297	/// A 256-bit vector of [8 x float] containing one of the operands.
				298	/// \param __b
				299	/// A 256-bit vector of [8 x float] containing one of the operands.
				300	/// \returns A 256-bit vector of [8 x float] containing the products between
				301	/// both operands.
				302	static __inline __m256 __DEFAULT_FN_ATTRS
				303	_mm256_mul_ps(__m256 __a, __m256 __b)
				304	{
				305	return __a * __b;
				306	}
				307
				308	/// \brief Calculates the square roots of the values stored in a 256-bit vector
				309	/// of [4 x double].
				310	///
				311	/// \headerfile <x86intrin.h>
				312	///
				313	/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
				314	///
				315	/// \param __a
				316	/// A 256-bit vector of [4 x double].
				317	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				318	/// values in the operand.
				319	static __inline __m256d __DEFAULT_FN_ATTRS
				320	_mm256_sqrt_pd(__m256d __a)
				321	{
				322	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
				323	}
				324
				325	/// \brief Calculates the square roots of the values stored in a 256-bit vector
				326	/// of [8 x float].
				327	///
				328	/// \headerfile <x86intrin.h>
				329	///
				330	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
				331	///
				332	/// \param __a
				333	/// A 256-bit vector of [8 x float].
				334	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				335	/// values in the operand.
				336	static __inline __m256 __DEFAULT_FN_ATTRS
				337	_mm256_sqrt_ps(__m256 __a)
				338	{
				339	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
				340	}
				341
				342	/// \brief Calculates the reciprocal square roots of the values stored in a
				343	/// 256-bit vector of [8 x float].
				344	///
				345	/// \headerfile <x86intrin.h>
				346	///
				347	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
				348	///
				349	/// \param __a
				350	/// A 256-bit vector of [8 x float].
				351	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				352	/// roots of the values in the operand.
				353	static __inline __m256 __DEFAULT_FN_ATTRS
				354	_mm256_rsqrt_ps(__m256 __a)
				355	{
				356	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
				357	}
				358
				359	/// \brief Calculates the reciprocals of the values stored in a 256-bit vector
				360	/// of [8 x float].
				361	///
				362	/// \headerfile <x86intrin.h>
				363	///
				364	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
				365	///
				366	/// \param __a
				367	/// A 256-bit vector of [8 x float].
				368	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				369	/// values in the operand.
				370	static __inline __m256 __DEFAULT_FN_ATTRS
				371	_mm256_rcp_ps(__m256 __a)
				372	{
				373	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
				374	}
				375
				/// \brief Rounds every element of a 256-bit vector of [4 x double] to an
				///    integer value, using the rounding behaviour selected by the byte
				///    operand. The results are returned as 64-bit double-precision
				///    floating-point values.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256d _mm256_round_pd(__m256d V, const int M);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
				///
				/// \param V
				///    The source operand, a 256-bit vector of [4 x double].
				/// \param M
				///    An integer value selecting the rounding operation.
				///    Bits [7:4] are reserved.
				///    Bit [3] is a precision exception value:
				///      0: A normal PE exception is used
				///      1: The PE field is not updated
				///    Bit [2] is the rounding control source:
				///      0: Use bits [1:0] of M
				///      1: Use the current MXCSR setting
				///    Bits [1:0] contain the rounding control definition:
				///      00: Nearest
				///      01: Downward (toward negative infinity)
				///      10: Upward (toward positive infinity)
				///      11: Truncated
				/// \returns A 256-bit vector of [4 x double] holding the rounded values.
				#define _mm256_round_pd(V, M) \
				  __extension__ ({ \
				    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); \
				  })
				407
				/// \brief Rounds every element of a 256-bit vector of [8 x float] to an
				///    integer value, using the rounding behaviour selected by the byte
				///    operand. The results are returned as floating-point values.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256 _mm256_round_ps(__m256 V, const int M);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
				///
				/// \param V
				///    The source operand, a 256-bit vector of [8 x float].
				/// \param M
				///    An integer value selecting the rounding operation.
				///    Bits [7:4] are reserved.
				///    Bit [3] is a precision exception value:
				///      0: A normal PE exception is used
				///      1: The PE field is not updated
				///    Bit [2] is the rounding control source:
				///      0: Use bits [1:0] of M
				///      1: Use the current MXCSR setting
				///    Bits [1:0] contain the rounding control definition:
				///      00: Nearest
				///      01: Downward (toward negative infinity)
				///      10: Upward (toward positive infinity)
				///      11: Truncated
				/// \returns A 256-bit vector of [8 x float] holding the rounded values.
				#define _mm256_round_ps(V, M) \
				  __extension__ ({ \
				    (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); \
				  })
				439
				/// \brief Rounds every element of a 256-bit vector of [4 x double] up to an
				///    integer value; the results are returned as 64-bit double-precision
				///    floating-point values. Expands to \c _mm256_round_pd with the
				///    \c _MM_FROUND_CEIL rounding constant.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256d _mm256_ceil_pd(__m256d V);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
				///
				/// \param V
				///    The source operand, a 256-bit vector of [4 x double].
				/// \returns A 256-bit vector of [4 x double] holding the rounded-up values.
				#define _mm256_ceil_pd(V) \
				  _mm256_round_pd((V), _MM_FROUND_CEIL)
				456
				/// \brief Rounds every element of a 256-bit vector of [4 x double] down to
				///    an integer value; the results are returned as 64-bit double-precision
				///    floating-point values. Expands to \c _mm256_round_pd with the
				///    \c _MM_FROUND_FLOOR rounding constant.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256d _mm256_floor_pd(__m256d V);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
				///
				/// \param V
				///    The source operand, a 256-bit vector of [4 x double].
				/// \returns A 256-bit vector of [4 x double] holding the rounded-down
				///    values.
				#define _mm256_floor_pd(V) \
				  _mm256_round_pd((V), _MM_FROUND_FLOOR)
				474
				/// \brief Rounds every element of a 256-bit vector of [8 x float] up to an
				///    integer value; the results are returned as floating-point values.
				///    Expands to \c _mm256_round_ps with the \c _MM_FROUND_CEIL rounding
				///    constant.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256 _mm256_ceil_ps(__m256 V);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
				///
				/// \param V
				///    The source operand, a 256-bit vector of [8 x float].
				/// \returns A 256-bit vector of [8 x float] holding the rounded-up values.
				#define _mm256_ceil_ps(V) \
				  _mm256_round_ps((V), _MM_FROUND_CEIL)
				491
				/// \brief Rounds every element of a 256-bit vector of [8 x float] down to an
				///    integer value; the results are returned as floating-point values.
				///    Expands to \c _mm256_round_ps with the \c _MM_FROUND_FLOOR rounding
				///    constant.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256 _mm256_floor_ps(__m256 V);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
				///
				/// \param V
				///    The source operand, a 256-bit vector of [8 x float].
				/// \returns A 256-bit vector of [8 x float] holding the rounded-down values.
				#define _mm256_floor_ps(V) \
				  _mm256_round_ps((V), _MM_FROUND_FLOOR)
				508
				509	/* Logical */
				510	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				511	///
				512	/// \headerfile <x86intrin.h>
				513	///
				514	/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
				515	///
				516	/// \param __a
				517	/// A 256-bit vector of [4 x double] containing one of the source operands.
				518	/// \param __b
				519	/// A 256-bit vector of [4 x double] containing one of the source operands.
				520	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				521	/// values between both operands.
				522	static __inline __m256d __DEFAULT_FN_ATTRS
				523	_mm256_and_pd(__m256d __a, __m256d __b)
				524	{
				525	return (__m256d)((__v4di)__a & (__v4di)__b);
				526	}
				527
				528	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				529	///
				530	/// \headerfile <x86intrin.h>
				531	///
				532	/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
				533	///
				534	/// \param __a
				535	/// A 256-bit vector of [8 x float] containing one of the source operands.
				536	/// \param __b
				537	/// A 256-bit vector of [8 x float] containing one of the source operands.
				538	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				539	/// values between both operands.
				540	static __inline __m256 __DEFAULT_FN_ATTRS
				541	_mm256_and_ps(__m256 __a, __m256 __b)
				542	{
				543	return (__m256)((__v8si)__a & (__v8si)__b);
				544	}
				545
				546	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				547	/// the one's complement of the values contained in the first source operand.
				548	///
				549	/// \headerfile <x86intrin.h>
				550	///
				551	/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
				552	///
				553	/// \param __a
				554	/// A 256-bit vector of [4 x double] containing the left source operand. The
				555	/// one's complement of this value is used in the bitwise AND.
				556	/// \param __b
				557	/// A 256-bit vector of [4 x double] containing the right source operand.
				558	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				559	/// values of the second operand and the one's complement of the first
				560	/// operand.
				561	static __inline __m256d __DEFAULT_FN_ATTRS
				562	_mm256_andnot_pd(__m256d __a, __m256d __b)
				563	{
				564	return (__m256d)(~(__v4di)__a & (__v4di)__b);
				565	}
				566
				567	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				568	/// the one's complement of the values contained in the first source operand.
				569	///
				570	/// \headerfile <x86intrin.h>
				571	///
				572	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
				573	///
				574	/// \param __a
				575	/// A 256-bit vector of [8 x float] containing the left source operand. The
				576	/// one's complement of this value is used in the bitwise AND.
				577	/// \param __b
				578	/// A 256-bit vector of [8 x float] containing the right source operand.
				579	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				580	/// values of the second operand and the one's complement of the first
				581	/// operand.
				582	static __inline __m256 __DEFAULT_FN_ATTRS
				583	_mm256_andnot_ps(__m256 __a, __m256 __b)
				584	{
				585	return (__m256)(~(__v8si)__a & (__v8si)__b);
				586	}
				587
				588	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				589	///
				590	/// \headerfile <x86intrin.h>
				591	///
				592	/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
				593	///
				594	/// \param __a
				595	/// A 256-bit vector of [4 x double] containing one of the source operands.
				596	/// \param __b
				597	/// A 256-bit vector of [4 x double] containing one of the source operands.
				598	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				599	/// values between both operands.
				600	static __inline __m256d __DEFAULT_FN_ATTRS
				601	_mm256_or_pd(__m256d __a, __m256d __b)
				602	{
				603	return (__m256d)((__v4di)__a \| (__v4di)__b);
				604	}
				605
				606	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				607	///
				608	/// \headerfile <x86intrin.h>
				609	///
				610	/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
				611	///
				612	/// \param __a
				613	/// A 256-bit vector of [8 x float] containing one of the source operands.
				614	/// \param __b
				615	/// A 256-bit vector of [8 x float] containing one of the source operands.
				616	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				617	/// values between both operands.
				618	static __inline __m256 __DEFAULT_FN_ATTRS
				619	_mm256_or_ps(__m256 __a, __m256 __b)
				620	{
				621	return (__m256)((__v8si)__a \| (__v8si)__b);
				622	}
				623
				624	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				625	///
				626	/// \headerfile <x86intrin.h>
				627	///
				628	/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
				629	///
				630	/// \param __a
				631	/// A 256-bit vector of [4 x double] containing one of the source operands.
				632	/// \param __b
				633	/// A 256-bit vector of [4 x double] containing one of the source operands.
				634	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				635	/// values between both operands.
				636	static __inline __m256d __DEFAULT_FN_ATTRS
				637	_mm256_xor_pd(__m256d __a, __m256d __b)
				638	{
				639	return (__m256d)((__v4di)__a ^ (__v4di)__b);
				640	}
				641
				642	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				643	///
				644	/// \headerfile <x86intrin.h>
				645	///
				646	/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
				647	///
				648	/// \param __a
				649	/// A 256-bit vector of [8 x float] containing one of the source operands.
				650	/// \param __b
				651	/// A 256-bit vector of [8 x float] containing one of the source operands.
				652	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				653	/// values between both operands.
				654	static __inline __m256 __DEFAULT_FN_ATTRS
				655	_mm256_xor_ps(__m256 __a, __m256 __b)
				656	{
				657	return (__m256)((__v8si)__a ^ (__v8si)__b);
				658	}
				659
				660	/* Horizontal arithmetic */
				661	/// \brief Horizontally adds the adjacent pairs of values contained in two
				662	/// 256-bit vectors of [4 x double].
				663	///
				664	/// \headerfile <x86intrin.h>
				665	///
				666	/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
				667	///
				668	/// \param __a
				669	/// A 256-bit vector of [4 x double] containing one of the source operands.
				670	/// The horizontal sums of the values are returned in the even-indexed
				671	/// elements of a vector of [4 x double].
				672	/// \param __b
				673	/// A 256-bit vector of [4 x double] containing one of the source operands.
				674	/// The horizontal sums of the values are returned in the odd-indexed
				675	/// elements of a vector of [4 x double].
				676	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				677	/// both operands.
				678	static __inline __m256d __DEFAULT_FN_ATTRS
				679	_mm256_hadd_pd(__m256d __a, __m256d __b)
				680	{
				681	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
				682	}
				683
				684	/// \brief Horizontally adds the adjacent pairs of values contained in two
				685	/// 256-bit vectors of [8 x float].
				686	///
				687	/// \headerfile <x86intrin.h>
				688	///
				689	/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
				690	///
				691	/// \param __a
				692	/// A 256-bit vector of [8 x float] containing one of the source operands.
				693	/// The horizontal sums of the values are returned in the elements with
				694	/// index 0, 1, 4, 5 of a vector of [8 x float].
				695	/// \param __b
				696	/// A 256-bit vector of [8 x float] containing one of the source operands.
				697	/// The horizontal sums of the values are returned in the elements with
				698	/// index 2, 3, 6, 7 of a vector of [8 x float].
				699	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				700	/// both operands.
				701	static __inline __m256 __DEFAULT_FN_ATTRS
				702	_mm256_hadd_ps(__m256 __a, __m256 __b)
				703	{
				704	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
				705	}
				706
				707	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				708	/// 256-bit vectors of [4 x double].
				709	///
				710	/// \headerfile <x86intrin.h>
				711	///
				712	/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
				713	///
				714	/// \param __a
				715	/// A 256-bit vector of [4 x double] containing one of the source operands.
				716	/// The horizontal differences between the values are returned in the
				717	/// even-indexed elements of a vector of [4 x double].
				718	/// \param __b
				719	/// A 256-bit vector of [4 x double] containing one of the source operands.
				720	/// The horizontal differences between the values are returned in the
				721	/// odd-indexed elements of a vector of [4 x double].
				722	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				723	/// differences of both operands.
				724	static __inline __m256d __DEFAULT_FN_ATTRS
				725	_mm256_hsub_pd(__m256d __a, __m256d __b)
				726	{
				727	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
				728	}
				729
				730	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				731	/// 256-bit vectors of [8 x float].
				732	///
				733	/// \headerfile <x86intrin.h>
				734	///
				735	/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
				736	///
				737	/// \param __a
				738	/// A 256-bit vector of [8 x float] containing one of the source operands.
				739	/// The horizontal differences between the values are returned in the
				740	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				741	/// \param __b
				742	/// A 256-bit vector of [8 x float] containing one of the source operands.
				743	/// The horizontal differences between the values are returned in the
				744	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				745	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				746	/// differences of both operands.
				747	static __inline __m256 __DEFAULT_FN_ATTRS
				748	_mm256_hsub_ps(__m256 __a, __m256 __b)
				749	{
				750	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
				751	}
				752
				753	/* Vector permutations */
				754	/// \brief Copies the values stored in a 128-bit vector of [2 x double] as
				755	/// specified by the 128-bit integer vector operand.
				756	///
				757	/// \headerfile <x86intrin.h>
				758	///
				759	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				760	///
				761	/// \param __a
				762	/// A 128-bit vector of [2 x double].
				763	/// \param __c
				764	/// A 128-bit integer vector operand specifying how the values are to be
				765	/// copied.
				766	/// Bit [1]:
				767	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
				768	/// returned vector
				769	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				770	/// returned vector
				771	/// Bit [65]:
				772	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				773	/// returned vector
				774	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				775	/// returned vector
				776	/// \returns A 128-bit vector of [2 x double] containing the copied values.
				777	static __inline __m128d __DEFAULT_FN_ATTRS
				778	_mm_permutevar_pd(__m128d __a, __m128i __c)
				779	{
				780	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
				781	}
				782
				783	/// \brief Copies the values stored in a 256-bit vector of [4 x double] as
				784	/// specified by the 256-bit integer vector operand.
				785	///
				786	/// \headerfile <x86intrin.h>
				787	///
				788	/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
				789	///
				790	/// \param __a
				791	/// A 256-bit vector of [4 x double].
				792	/// \param __c
				793	/// A 256-bit integer vector operand specifying how the values are to be
				794	/// copied.
				795	/// Bit [1]:
				796	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
				797	/// returned vector
				798	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				799	/// returned vector
				800	/// Bit [65]:
				801	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				802	/// returned vector
				803	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				804	/// returned vector
				805	/// Bit [129]:
				806	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				807	/// returned vector
				808	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				809	/// returned vector
				810	/// Bit [193]:
				811	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				812	/// returned vector
				813	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				814	/// returned vector
				815	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				816	static __inline __m256d __DEFAULT_FN_ATTRS
				817	_mm256_permutevar_pd(__m256d __a, __m256i __c)
				818	{
				819	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
				820	}
				821
				822	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				823	/// specified by the 128-bit integer vector operand.
				824	///
				825	/// \headerfile <x86intrin.h>
				826	///
				827	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				828	///
				829	/// \param __a
				830	/// A 128-bit vector of [4 x float].
				831	/// \param __c
				832	/// A 128-bit integer vector operand specifying how the values are to be
				833	/// copied.
				834	/// Bits [1:0]:
				835	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				836	/// returned vector
				837	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				838	/// returned vector
				839	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				840	/// returned vector
				841	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				842	/// returned vector
				843	/// Bits [33:32]:
				844	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				845	/// returned vector
				846	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				847	/// returned vector
				848	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				849	/// returned vector
				850	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				851	/// returned vector
				852	/// Bits [65:64]:
				853	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				854	/// returned vector
				855	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				856	/// returned vector
				857	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				858	/// returned vector
				859	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				860	/// returned vector
				861	/// Bits [97:96]:
				862	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				863	/// returned vector
				864	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				865	/// returned vector
				866	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				867	/// returned vector
				868	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				869	/// returned vector
				870	/// \returns A 128-bit vector of [4 x float] containing the copied values.
				871	static __inline __m128 __DEFAULT_FN_ATTRS
				872	_mm_permutevar_ps(__m128 __a, __m128i __c)
				873	{
				874	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
				875	}
				876
				877	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				878	/// specified by the 256-bit integer vector operand.
				879	///
				880	/// \headerfile <x86intrin.h>
				881	///
				882	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				883	///
				884	/// \param __a
				885	/// A 256-bit vector of [8 x float].
				886	/// \param __c
				887	/// A 256-bit integer vector operand specifying how the values are to be
				888	/// copied.
				889	/// Bits [1:0]:
				890	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				891	/// returned vector
				892	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				893	/// returned vector
				894	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				895	/// returned vector
				896	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				897	/// returned vector
				898	/// Bits [33:32]:
				899	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				900	/// returned vector
				901	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				902	/// returned vector
				903	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				904	/// returned vector
				905	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				906	/// returned vector
				907	/// Bits [65:64]:
				908	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				909	/// returned vector
				910	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				911	/// returned vector
				912	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				913	/// returned vector
				914	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				915	/// returned vector
				916	/// Bits [97:96]:
				917	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				918	/// returned vector
				919	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				920	/// returned vector
				921	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				922	/// returned vector
				923	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				924	/// returned vector
				925	/// Bits [129:128]:
				926	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				927	/// returned vector
				928	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				929	/// returned vector
				930	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				931	/// returned vector
				932	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				933	/// returned vector
				934	/// Bits [161:160]:
				935	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				936	/// returned vector
				937	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				938	/// returned vector
				939	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				940	/// returned vector
				941	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				942	/// returned vector
				943	/// Bits [193:192]:
				944	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				945	/// returned vector
				946	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				947	/// returned vector
				948	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				949	/// returned vector
				950	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				951	/// returned vector
				952	/// Bits [225:224]:
				953	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				954	/// returned vector
				955	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				956	/// returned vector
				957	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				958	/// returned vector
				959	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				960	/// returned vector
				961	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				962	static __inline __m256 __DEFAULT_FN_ATTRS
				963	_mm256_permutevar_ps(__m256 __a, __m256i __c)
				964	{
				965	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
				966	}
				967
				/// \brief Permutes the two values of a 128-bit vector of [2 x double] as
				///    directed by an immediate integer operand.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m128d _mm_permute_pd(__m128d A, const int C);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VPERMILPD instruction.
				///
				/// \param A
				///    A 128-bit vector of [2 x double] providing the source values.
				/// \param C
				///    An immediate integer operand holding one selector bit per result
				///    element: \n
				///    Bit [0] -> result bits [63:0]: 0 copies source bits [63:0],
				///    1 copies source bits [127:64]. \n
				///    Bit [1] -> result bits [127:64]: 0 copies source bits [63:0],
				///    1 copies source bits [127:64].
				/// \returns A 128-bit vector of [2 x double] containing the copied values.
				///
				/// The zero vector passed as the second shuffle operand is never selected:
				/// both computed indices are 0 or 1 and therefore refer to \a A.
				#define _mm_permute_pd(A, C) __extension__ ({ \
				  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
				                                   (__v2df)_mm_setzero_pd(), \
				                                   ((C) >> 0) & 0x1, \
				                                   ((C) >> 1) & 0x1); })
				998
				/// \brief Permutes the values of a 256-bit vector of [4 x double] as
				///    directed by an immediate integer operand.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256d _mm256_permute_pd(__m256d A, const int C);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VPERMILPD instruction.
				///
				/// \param A
				///    A 256-bit vector of [4 x double] providing the source values.
				/// \param C
				///    An immediate integer operand holding one selector bit per result
				///    element. Each selector picks between the two source values that
				///    share the result element's 128-bit half: \n
				///    Bit [0] -> result bits [63:0]: 0 copies source bits [63:0],
				///    1 copies source bits [127:64]. \n
				///    Bit [1] -> result bits [127:64]: 0 copies source bits [63:0],
				///    1 copies source bits [127:64]. \n
				///    Bit [2] -> result bits [191:128]: 0 copies source bits [191:128],
				///    1 copies source bits [255:192]. \n
				///    Bit [3] -> result bits [255:192]: 0 copies source bits [191:128],
				///    1 copies source bits [255:192].
				/// \returns A 256-bit vector of [4 x double] containing the copied values.
				///
				/// The zero vector passed as the second shuffle operand is never selected:
				/// every computed index lies in 0..3 and therefore refers to \a A.
				#define _mm256_permute_pd(A, C) __extension__ ({ \
				  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
				                                   (__v4df)_mm256_setzero_pd(), \
				                                   ((C) >> 0) & 0x1, \
				                                   ((C) >> 1) & 0x1, \
				                                   (((C) >> 2) & 0x1) + 2, \
				                                   (((C) >> 3) & 0x1) + 2); })
				1041
				/// \brief Permutes the values of a 128-bit vector of [4 x float] as directed
				///    by an immediate integer operand.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m128 _mm_permute_ps(__m128 A, const int C);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VPERMILPS instruction.
				///
				/// \param A
				///    A 128-bit vector of [4 x float] providing the source values.
				/// \param C
				///    An immediate integer operand holding one 2-bit selector per result
				///    element: \n
				///    Bits [1:0] select the value copied to bits [31:0] of the result. \n
				///    Bits [3:2] select the value copied to bits [63:32] of the result. \n
				///    Bits [5:4] select the value copied to bits [95:64] of the result. \n
				///    Bits [7:6] select the value copied to bits [127:96] of the result. \n
				///    Every selector uses the same encoding: \n
				///    00: source bits [31:0] \n
				///    01: source bits [63:32] \n
				///    10: source bits [95:64] \n
				///    11: source bits [127:96]
				/// \returns A 128-bit vector of [4 x float] containing the copied values.
				///
				/// The zero vector passed as the second shuffle operand is never selected:
				/// every computed index lies in 0..3 and therefore refers to \a A.
				#define _mm_permute_ps(A, C) __extension__ ({ \
				  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
				                                  (__v4sf)_mm_setzero_ps(), \
				                                  ((C) >> 0) & 0x3, \
				                                  ((C) >> 2) & 0x3, \
				                                  ((C) >> 4) & 0x3, \
				                                  ((C) >> 6) & 0x3); })
				1099
				/// \brief Permutes the values of a 256-bit vector of [8 x float] as directed
				///    by an immediate integer operand.
				///
				/// \headerfile <x86intrin.h>
				///
				/// \code
				/// __m256 _mm256_permute_ps(__m256 A, const int C);
				/// \endcode
				///
				/// This intrinsic corresponds to the \c VPERMILPS instruction.
				///
				/// \param A
				///    A 256-bit vector of [8 x float] providing the source values.
				/// \param C
				///    An immediate integer operand holding four 2-bit selectors. The same
				///    four selectors are applied to both 128-bit halves of the vector. \n
				///    Bits [1:0] select the value copied to result bits [31:0] and, in the
				///    upper half, to result bits [159:128]. \n
				///    Bits [3:2] select the value copied to result bits [63:32] and, in the
				///    upper half, to result bits [191:160]. \n
				///    Bits [5:4] select the value copied to result bits [95:64] and, in the
				///    upper half, to result bits [223:192]. \n
				///    Bits [7:6] select the value copied to result bits [127:96] and, in
				///    the upper half, to result bits [255:224]. \n
				///    Selector encoding for the lower half: \n
				///    00: source bits [31:0] \n
				///    01: source bits [63:32] \n
				///    10: source bits [95:64] \n
				///    11: source bits [127:96] \n
				///    Selector encoding for the upper half: \n
				///    00: source bits [159:128] \n
				///    01: source bits [191:160] \n
				///    10: source bits [223:192] \n
				///    11: source bits [255:224]
				/// \returns A 256-bit vector of [8 x float] containing the copied values.
				///
				/// The zero vector passed as the second shuffle operand is never selected:
				/// every computed index lies in 0..7 and therefore refers to \a A.
				#define _mm256_permute_ps(A, C) __extension__ ({ \
				  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
				                                  (__v8sf)_mm256_setzero_ps(), \
				                                  ((C) >> 0) & 0x3, \
				                                  ((C) >> 2) & 0x3, \
				                                  ((C) >> 4) & 0x3, \
				                                  ((C) >> 6) & 0x3, \
				                                  (((C) >> 0) & 0x3) + 4, \
				                                  (((C) >> 2) & 0x3) + 4, \
				                                  (((C) >> 4) & 0x3) + 4, \
				                                  (((C) >> 6) & 0x3) + 4); })
				1197
				/// \brief Forwards two 256-bit vectors of [4 x double] and a control value
				///    to the \c __builtin_ia32_vperm2f128_pd256 builtin.
				///
				/// \param V1
				///    First operand; converted to a 256-bit vector of [4 x double].
				/// \param V2
				///    Second operand; converted to a 256-bit vector of [4 x double].
				/// \param M
				///    Control value handed to the builtin unchanged.
				///    NOTE(review): presumably the VPERM2F128 imm8 and therefore required
				///    to be a compile-time constant - confirm.
				/// \returns The builtin's result as a 256-bit vector of [4 x double].
				#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
				  (__m256d)__builtin_ia32_vperm2f128_pd256( \
				      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), (M)); })
				1201
				/// \brief Forwards two 256-bit vectors of [8 x float] and a control value
				///    to the \c __builtin_ia32_vperm2f128_ps256 builtin.
				///
				/// \param V1
				///    First operand; converted to a 256-bit vector of [8 x float].
				/// \param V2
				///    Second operand; converted to a 256-bit vector of [8 x float].
				/// \param M
				///    Control value handed to the builtin unchanged.
				///    NOTE(review): presumably the VPERM2F128 imm8 and therefore required
				///    to be a compile-time constant - confirm.
				/// \returns The builtin's result as a 256-bit vector of [8 x float].
				#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
				  (__m256)__builtin_ia32_vperm2f128_ps256( \
				      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (M)); })
				1205
				/// \brief Forwards two 256-bit integer vectors and a control value to the
				///    \c __builtin_ia32_vperm2f128_si256 builtin.
				///
				/// \param V1
				///    First operand; converted to a 256-bit integer vector and viewed as
				///    [8 x i32] for the builtin call.
				/// \param V2
				///    Second operand; converted the same way as \a V1.
				/// \param M
				///    Control value handed to the builtin unchanged.
				///    NOTE(review): presumably the VPERM2F128 imm8 and therefore required
				///    to be a compile-time constant - confirm.
				/// \returns The builtin's result as a 256-bit integer vector.
				#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
				  (__m256i)__builtin_ia32_vperm2f128_si256( \
				      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (M)); })
				1209
				1210	/* Vector Blend */
				/// \brief Builds a 256-bit vector of [4 x double] by choosing each element
				///    from one of two source vectors according to an integer mask.
				///
				/// \param V1
				///    A 256-bit vector of [4 x double]; supplies element i when bit i of
				///    \a M is clear.
				/// \param V2
				///    A 256-bit vector of [4 x double]; supplies element i when bit i of
				///    \a M is set.
				/// \param M
				///    Integer mask; only bits [3:0] are examined. It feeds the index list
				///    of \c __builtin_shufflevector, so it has to be a constant expression.
				/// \returns A 256-bit vector of [4 x double] containing the chosen elements.
				///
				/// Shuffle indices 0..3 name the elements of \a V1 and 4..7 those of
				/// \a V2, so each index is "element number + 4 * mask bit".
				#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
				  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				                                   (__v4df)(__m256d)(V2), \
				                                   0 + 4 * (((M) >> 0) & 0x1), \
				                                   1 + 4 * (((M) >> 1) & 0x1), \
				                                   2 + 4 * (((M) >> 2) & 0x1), \
				                                   3 + 4 * (((M) >> 3) & 0x1)); })
				1218
				/// \brief Builds a 256-bit vector of [8 x float] by choosing each element
				///    from one of two source vectors according to an integer mask.
				///
				/// \param V1
				///    A 256-bit vector of [8 x float]; supplies element i when bit i of
				///    \a M is clear.
				/// \param V2
				///    A 256-bit vector of [8 x float]; supplies element i when bit i of
				///    \a M is set.
				/// \param M
				///    Integer mask; only bits [7:0] are examined. It feeds the index list
				///    of \c __builtin_shufflevector, so it has to be a constant expression.
				/// \returns A 256-bit vector of [8 x float] containing the chosen elements.
				///
				/// Shuffle indices 0..7 name the elements of \a V1 and 8..15 those of
				/// \a V2, so each index is "element number + 8 * mask bit".
				#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
				  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				                                  (__v8sf)(__m256)(V2), \
				                                  0 + 8 * (((M) >> 0) & 0x1), \
				                                  1 + 8 * (((M) >> 1) & 0x1), \
				                                  2 + 8 * (((M) >> 2) & 0x1), \
				                                  3 + 8 * (((M) >> 3) & 0x1), \
				                                  4 + 8 * (((M) >> 4) & 0x1), \
				                                  5 + 8 * (((M) >> 5) & 0x1), \
				                                  6 + 8 * (((M) >> 6) & 0x1), \
				                                  7 + 8 * (((M) >> 7) & 0x1)); })
				1230
				1231	static __inline __m256d __DEFAULT_FN_ATTRS
				1232	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
				1233	{
				1234	return (__m256d)__builtin_ia32_blendvpd256(
				1235	(__v4df)__a, (__v4df)__b, (__v4df)__c);
				1236	}
				1237
				1238	static __inline __m256 __DEFAULT_FN_ATTRS
				1239	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
				1240	{
				1241	return (__m256)__builtin_ia32_blendvps256(
				1242	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
				1243	}
				1244
				1245	/* Vector Dot Product */
				/// \brief Forwards two 256-bit vectors of [8 x float] and a control value
				///    to the \c __builtin_ia32_dpps256 builtin.
				///
				/// \param V1
				///    First operand; converted to a 256-bit vector of [8 x float].
				/// \param V2
				///    Second operand; converted to a 256-bit vector of [8 x float].
				/// \param M
				///    Control value handed to the builtin unchanged.
				///    NOTE(review): presumably the VDPPS imm8 and therefore required to be
				///    a compile-time constant - confirm.
				/// \returns The builtin's result as a 256-bit vector of [8 x float].
				#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
				  (__m256)__builtin_ia32_dpps256( \
				      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (M)); })
				1249
				1250	/* Vector shuffle */
				/// \brief Selects four floats per 128-bit half from two 256-bit vectors of
				///    [8 x float], as directed by an integer mask.
				///
				/// \param a
				///    A 256-bit vector of [8 x float]; supplies result elements 0, 1 (from
				///    its lower half) and 4, 5 (from its upper half).
				/// \param b
				///    A 256-bit vector of [8 x float]; supplies result elements 2, 3 (from
				///    its lower half) and 6, 7 (from its upper half).
				/// \param mask
				///    Integer mask holding four 2-bit selectors, applied identically to
				///    both halves: bits [1:0] and [3:2] pick the two elements taken from
				///    \a a, bits [5:4] and [7:6] pick the two elements taken from \a b. It
				///    feeds the index list of \c __builtin_shufflevector, so it has to be
				///    a constant expression.
				/// \returns A 256-bit vector of [8 x float] containing the selected values.
				///
				/// Shuffle indices 0..7 name the elements of \a a and 8..15 those of \a b;
				/// the +4 and +12 offsets address the upper half of each.
				#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
				  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				                                  (__v8sf)(__m256)(b), \
				                                  ((mask) >> 0) & 0x3, \
				                                  ((mask) >> 2) & 0x3, \
				                                  8 + (((mask) >> 4) & 0x3), \
				                                  8 + (((mask) >> 6) & 0x3), \
				                                  4 + (((mask) >> 0) & 0x3), \
				                                  4 + (((mask) >> 2) & 0x3), \
				                                  12 + (((mask) >> 4) & 0x3), \
				                                  12 + (((mask) >> 6) & 0x3)); })
				1262
				/// \brief Selects two doubles per 128-bit half from two 256-bit vectors of
				///    [4 x double], as directed by an integer mask.
				///
				/// \param a
				///    A 256-bit vector of [4 x double]; supplies result elements 0 and 2.
				/// \param b
				///    A 256-bit vector of [4 x double]; supplies result elements 1 and 3.
				/// \param mask
				///    Integer mask; only bits [3:0] are examined. Bit i chooses, for result
				///    element i, between the low (0) and high (1) double of the matching
				///    128-bit half of its source vector. It feeds the index list of
				///    \c __builtin_shufflevector, so it has to be a constant expression.
				/// \returns A 256-bit vector of [4 x double] containing the selected values.
				///
				/// Shuffle indices 0..3 name the elements of \a a and 4..7 those of \a b.
				#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
				  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				                                   (__v4df)(__m256d)(b), \
				                                   0 + (((mask) >> 0) & 0x1), \
				                                   4 + (((mask) >> 1) & 0x1), \
				                                   2 + (((mask) >> 2) & 0x1), \
				                                   6 + (((mask) >> 3) & 0x1)); })
				1270
				1271	/* Compare */
				/* Comparison predicate codes (0x00 - 0x1f). Each comment gives the relation
				   followed by the ordered/unordered and signaling/non-signaling qualifiers
				   encoded in the name suffix (O/U and S/Q). */
				#define _CMP_EQ_OQ    0x00 /* Equal; ordered, non-signaling */
				#define _CMP_LT_OS    0x01 /* Less-than; ordered, signaling */
				#define _CMP_LE_OS    0x02 /* Less-than-or-equal; ordered, signaling */
				#define _CMP_UNORD_Q  0x03 /* Unordered; non-signaling */
				#define _CMP_NEQ_UQ   0x04 /* Not-equal; unordered, non-signaling */
				#define _CMP_NLT_US   0x05 /* Not-less-than; unordered, signaling */
				#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal; unordered, signaling */
				#define _CMP_ORD_Q    0x07 /* Ordered; non-signaling */
				#define _CMP_EQ_UQ    0x08 /* Equal; unordered, non-signaling */
				#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal; unordered, signaling */
				#define _CMP_NGT_US   0x0a /* Not-greater-than; unordered, signaling */
				#define _CMP_FALSE_OQ 0x0b /* False; ordered, non-signaling */
				#define _CMP_NEQ_OQ   0x0c /* Not-equal; ordered, non-signaling */
				#define _CMP_GE_OS    0x0d /* Greater-than-or-equal; ordered, signaling */
				#define _CMP_GT_OS    0x0e /* Greater-than; ordered, signaling */
				#define _CMP_TRUE_UQ  0x0f /* True; unordered, non-signaling */
				#define _CMP_EQ_OS    0x10 /* Equal; ordered, signaling */
				#define _CMP_LT_OQ    0x11 /* Less-than; ordered, non-signaling */
				#define _CMP_LE_OQ    0x12 /* Less-than-or-equal; ordered, non-signaling */
				#define _CMP_UNORD_S  0x13 /* Unordered; signaling */
				#define _CMP_NEQ_US   0x14 /* Not-equal; unordered, signaling */
				#define _CMP_NLT_UQ   0x15 /* Not-less-than; unordered, non-signaling */
				#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal; unordered, non-signaling */
				#define _CMP_ORD_S    0x17 /* Ordered; signaling */
				#define _CMP_EQ_US    0x18 /* Equal; unordered, signaling */
				#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal; unordered, non-signaling */
				#define _CMP_NGT_UQ   0x1a /* Not-greater-than; unordered, non-signaling */
				#define _CMP_FALSE_OS 0x1b /* False; ordered, signaling */
				#define _CMP_NEQ_OS   0x1c /* Not-equal; ordered, signaling */
				#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal; ordered, non-signaling */
				#define _CMP_GT_OQ    0x1e /* Greater-than; ordered, non-signaling */
				#define _CMP_TRUE_US  0x1f /* True; unordered, signaling */
				1304
				/// \brief Forwards two 128-bit vectors of [2 x double] and a predicate code
				///    to the \c __builtin_ia32_cmppd builtin.
				///
				/// \param a
				///    First operand; converted to a 128-bit vector of [2 x double].
				/// \param b
				///    Second operand; converted to a 128-bit vector of [2 x double].
				/// \param c
				///    Predicate code handed to the builtin unchanged.
				///    NOTE(review): presumably one of the _CMP_* constants - confirm.
				/// \returns The builtin's result as a 128-bit vector of [2 x double].
				#define _mm_cmp_pd(a, b, c) __extension__ ({ \
				  (__m128d)__builtin_ia32_cmppd( \
				      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)); })
				1308
				/// \brief Forwards two 128-bit vectors of [4 x float] and a predicate code
				///    to the \c __builtin_ia32_cmpps builtin.
				///
				/// \param a
				///    First operand; converted to a 128-bit vector of [4 x float].
				/// \param b
				///    Second operand; converted to a 128-bit vector of [4 x float].
				/// \param c
				///    Predicate code handed to the builtin unchanged.
				///    NOTE(review): presumably one of the _CMP_* constants - confirm.
				/// \returns The builtin's result as a 128-bit vector of [4 x float].
				#define _mm_cmp_ps(a, b, c) __extension__ ({ \
				  (__m128)__builtin_ia32_cmpps( \
				      (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)); })
				1312
				/// \brief Forwards two 256-bit vectors of [4 x double] and a predicate code
				///    to the \c __builtin_ia32_cmppd256 builtin.
				///
				/// \param a
				///    First operand; converted to a 256-bit vector of [4 x double].
				/// \param b
				///    Second operand; converted to a 256-bit vector of [4 x double].
				/// \param c
				///    Predicate code handed to the builtin unchanged.
				///    NOTE(review): presumably one of the _CMP_* constants - confirm.
				/// \returns The builtin's result as a 256-bit vector of [4 x double].
				#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
				  (__m256d)__builtin_ia32_cmppd256( \
				      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), (c)); })
				1316
				/// \brief Forwards two 256-bit vectors of [8 x float] and a predicate code
				///    to the \c __builtin_ia32_cmpps256 builtin.
				///
				/// \param a
				///    First operand; converted to a 256-bit vector of [8 x float].
				/// \param b
				///    Second operand; converted to a 256-bit vector of [8 x float].
				/// \param c
				///    Predicate code handed to the builtin unchanged.
				///    NOTE(review): presumably one of the _CMP_* constants - confirm.
				/// \returns The builtin's result as a 256-bit vector of [8 x float].
				#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
				  (__m256)__builtin_ia32_cmpps256( \
				      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), (c)); })
				1320
				/// \brief Forwards two 128-bit vectors of [2 x double] and a predicate code
				///    to the \c __builtin_ia32_cmpsd builtin.
				///
				/// \param a
				///    First operand; converted to a 128-bit vector of [2 x double].
				/// \param b
				///    Second operand; converted to a 128-bit vector of [2 x double].
				/// \param c
				///    Predicate code handed to the builtin unchanged.
				///    NOTE(review): presumably one of the _CMP_* constants - confirm.
				/// \returns The builtin's result as a 128-bit vector of [2 x double].
				///    NOTE(review): presumably only the lowest element is compared
				///    (scalar CMPSD form) - confirm.
				#define _mm_cmp_sd(a, b, c) __extension__ ({ \
				  (__m128d)__builtin_ia32_cmpsd( \
				      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)); })
				1324
				/// \brief Forwards two 128-bit vectors of [4 x float] and a predicate code
				///    to the \c __builtin_ia32_cmpss builtin.
				///
				/// \param a
				///    First operand; converted to a 128-bit vector of [4 x float].
				/// \param b
				///    Second operand; converted to a 128-bit vector of [4 x float].
				/// \param c
				///    Predicate code handed to the builtin unchanged.
				///    NOTE(review): presumably one of the _CMP_* constants - confirm.
				/// \returns The builtin's result as a 128-bit vector of [4 x float].
				///    NOTE(review): presumably only the lowest element is compared
				///    (scalar CMPSS form) - confirm.
				#define _mm_cmp_ss(a, b, c) __extension__ ({ \
				  (__m128)__builtin_ia32_cmpss( \
				      (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)); })
				1328
				1329	static __inline int __DEFAULT_FN_ATTRS
				1330	_mm256_extract_epi32(__m256i __a, const int __imm)
				1331	{
				1332	__v8si __b = (__v8si)__a;
				1333	return __b[__imm & 7];
				1334	}
				1335
				1336	static __inline int __DEFAULT_FN_ATTRS
				1337	_mm256_extract_epi16(__m256i __a, const int __imm)
				1338	{
				1339	__v16hi __b = (__v16hi)__a;
				1340	return __b[__imm & 15];
				1341	}
				1342
				1343	static __inline int __DEFAULT_FN_ATTRS
				1344	_mm256_extract_epi8(__m256i __a, const int __imm)
				1345	{
				1346	__v32qi __b = (__v32qi)__a;
				1347	return __b[__imm & 31];
				1348	}
				1349
				1350	#ifdef __x86_64__
				1351	static __inline long long __DEFAULT_FN_ATTRS
				1352	_mm256_extract_epi64(__m256i __a, const int __imm)
				1353	{
				1354	__v4di __b = (__v4di)__a;
				1355	return __b[__imm & 3];
				1356	}
				1357	#endif
				1358
				1359	static __inline __m256i __DEFAULT_FN_ATTRS
				1360	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
				1361	{
				1362	__v8si __c = (__v8si)__a;
				1363	__c[__imm & 7] = __b;
				1364	return (__m256i)__c;
				1365	}
				1366
				1367	static __inline __m256i __DEFAULT_FN_ATTRS
				1368	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
				1369	{
				1370	__v16hi __c = (__v16hi)__a;
				1371	__c[__imm & 15] = __b;
				1372	return (__m256i)__c;
				1373	}
				1374
				1375	static __inline __m256i __DEFAULT_FN_ATTRS
				1376	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
				1377	{
				1378	__v32qi __c = (__v32qi)__a;
				1379	__c[__imm & 31] = __b;
				1380	return (__m256i)__c;
				1381	}
				1382
				1383	#ifdef __x86_64__
				1384	static __inline __m256i __DEFAULT_FN_ATTRS
				1385	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
				1386	{
				1387	__v4di __c = (__v4di)__a;
				1388	__c[__imm & 3] = __b;
				1389	return (__m256i)__c;
				1390	}
				1391	#endif
				1392
				1393	/* Conversion */
				1394	static __inline __m256d __DEFAULT_FN_ATTRS
				1395	_mm256_cvtepi32_pd(__m128i __a)
				1396	{
				1397	return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
				1398	}
				1399
				1400	static __inline __m256 __DEFAULT_FN_ATTRS
				1401	_mm256_cvtepi32_ps(__m256i __a)
				1402	{
				1403	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
				1404	}
				1405
				1406	static __inline __m128 __DEFAULT_FN_ATTRS
				1407	_mm256_cvtpd_ps(__m256d __a)
				1408	{
				1409	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
				1410	}
				1411
				1412	static __inline __m256i __DEFAULT_FN_ATTRS
				1413	_mm256_cvtps_epi32(__m256 __a)
				1414	{
				1415	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
				1416	}
				1417
				1418	static __inline __m256d __DEFAULT_FN_ATTRS
				1419	_mm256_cvtps_pd(__m128 __a)
				1420	{
				1421	return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
				1422	}
				1423
				1424	static __inline __m128i __DEFAULT_FN_ATTRS
				1425	_mm256_cvttpd_epi32(__m256d __a)
				1426	{
				1427	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
				1428	}
				1429
				1430	static __inline __m128i __DEFAULT_FN_ATTRS
				1431	_mm256_cvtpd_epi32(__m256d __a)
				1432	{
				1433	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
				1434	}
				1435
				1436	static __inline __m256i __DEFAULT_FN_ATTRS
				1437	_mm256_cvttps_epi32(__m256 __a)
				1438	{
				1439	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
				1440	}
				1441
				1442	/* Vector replicate */
				1443	static __inline __m256 __DEFAULT_FN_ATTRS
				1444	_mm256_movehdup_ps(__m256 __a)
				1445	{
				1446	return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
				1447	}
				1448
				1449	static __inline __m256 __DEFAULT_FN_ATTRS
				1450	_mm256_moveldup_ps(__m256 __a)
				1451	{
				1452	return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
				1453	}
				1454
				1455	static __inline __m256d __DEFAULT_FN_ATTRS
				1456	_mm256_movedup_pd(__m256d __a)
				1457	{
				1458	return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
				1459	}
				1460
				1461	/* Unpack and Interleave */
				1462	static __inline __m256d __DEFAULT_FN_ATTRS
				1463	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
				1464	{
				1465	return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
				1466	}
				1467
				1468	static __inline __m256d __DEFAULT_FN_ATTRS
				1469	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
				1470	{
				1471	return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
				1472	}
				1473
				1474	static __inline __m256 __DEFAULT_FN_ATTRS
				1475	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
				1476	{
				1477	return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
				1478	}
				1479
				1480	static __inline __m256 __DEFAULT_FN_ATTRS
				1481	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
				1482	{
				1483	return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
				1484	}
				1485
				1486	/* Bit Test */
				1487	static __inline int __DEFAULT_FN_ATTRS
				1488	_mm_testz_pd(__m128d __a, __m128d __b)
				1489	{
				1490	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
				1491	}
				1492
				1493	static __inline int __DEFAULT_FN_ATTRS
				1494	_mm_testc_pd(__m128d __a, __m128d __b)
				1495	{
				1496	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
				1497	}
				1498
				1499	static __inline int __DEFAULT_FN_ATTRS
				1500	_mm_testnzc_pd(__m128d __a, __m128d __b)
				1501	{
				1502	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
				1503	}
				1504
				1505	static __inline int __DEFAULT_FN_ATTRS
				1506	_mm_testz_ps(__m128 __a, __m128 __b)
				1507	{
				1508	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
				1509	}
				1510
				1511	static __inline int __DEFAULT_FN_ATTRS
				1512	_mm_testc_ps(__m128 __a, __m128 __b)
				1513	{
				1514	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
				1515	}
				1516
				1517	static __inline int __DEFAULT_FN_ATTRS
				1518	_mm_testnzc_ps(__m128 __a, __m128 __b)
				1519	{
				1520	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
				1521	}
				1522
				1523	static __inline int __DEFAULT_FN_ATTRS
				1524	_mm256_testz_pd(__m256d __a, __m256d __b)
				1525	{
				1526	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
				1527	}
				1528
				1529	static __inline int __DEFAULT_FN_ATTRS
				1530	_mm256_testc_pd(__m256d __a, __m256d __b)
				1531	{
				1532	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
				1533	}
				1534
				1535	static __inline int __DEFAULT_FN_ATTRS
				1536	_mm256_testnzc_pd(__m256d __a, __m256d __b)
				1537	{
				1538	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
				1539	}
				1540
				1541	static __inline int __DEFAULT_FN_ATTRS
				1542	_mm256_testz_ps(__m256 __a, __m256 __b)
				1543	{
				1544	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
				1545	}
				1546
				1547	static __inline int __DEFAULT_FN_ATTRS
				1548	_mm256_testc_ps(__m256 __a, __m256 __b)
				1549	{
				1550	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
				1551	}
				1552
				1553	static __inline int __DEFAULT_FN_ATTRS
				1554	_mm256_testnzc_ps(__m256 __a, __m256 __b)
				1555	{
				1556	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
				1557	}
				1558
				1559	static __inline int __DEFAULT_FN_ATTRS
				1560	_mm256_testz_si256(__m256i __a, __m256i __b)
				1561	{
				1562	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
				1563	}
				1564
				1565	static __inline int __DEFAULT_FN_ATTRS
				1566	_mm256_testc_si256(__m256i __a, __m256i __b)
				1567	{
				1568	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
				1569	}
				1570
				1571	static __inline int __DEFAULT_FN_ATTRS
				1572	_mm256_testnzc_si256(__m256i __a, __m256i __b)
				1573	{
				1574	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
				1575	}
				1576
				1577	/* Vector extract sign mask */
				1578	static __inline int __DEFAULT_FN_ATTRS
				1579	_mm256_movemask_pd(__m256d __a)
				1580	{
				1581	return __builtin_ia32_movmskpd256((__v4df)__a);
				1582	}
				1583
				1584	static __inline int __DEFAULT_FN_ATTRS
				1585	_mm256_movemask_ps(__m256 __a)
				1586	{
				1587	return __builtin_ia32_movmskps256((__v8sf)__a);
				1588	}
				1589
				1590	/* Vector zero */
				1591	static __inline void __DEFAULT_FN_ATTRS
				1592	_mm256_zeroall(void)
				1593	{
				1594	__builtin_ia32_vzeroall();
				1595	}
				1596
				1597	static __inline void __DEFAULT_FN_ATTRS
				1598	_mm256_zeroupper(void)
				1599	{
				1600	__builtin_ia32_vzeroupper();
				1601	}
				1602
				1603	/* Vector load with broadcast */
				1604	static __inline __m128 __DEFAULT_FN_ATTRS
				1605	_mm_broadcast_ss(float const *__a)
				1606	{
				1607	float __f = *__a;
				1608	return (__m128)(__v4sf){ __f, __f, __f, __f };
				1609	}
				1610
				1611	static __inline __m256d __DEFAULT_FN_ATTRS
				1612	_mm256_broadcast_sd(double const *__a)
				1613	{
				1614	double __d = *__a;
				1615	return (__m256d)(__v4df){ __d, __d, __d, __d };
				1616	}
				1617
				1618	static __inline __m256 __DEFAULT_FN_ATTRS
				1619	_mm256_broadcast_ss(float const *__a)
				1620	{
				1621	float __f = *__a;
				1622	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
				1623	}
				1624
				1625	static __inline __m256d __DEFAULT_FN_ATTRS
				1626	_mm256_broadcast_pd(__m128d const *__a)
				1627	{
				1628	return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
				1629	}
				1630
				1631	static __inline __m256 __DEFAULT_FN_ATTRS
				1632	_mm256_broadcast_ps(__m128 const *__a)
				1633	{
				1634	return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
				1635	}
				1636
				1637	/* SIMD load ops */
				1638	static __inline __m256d __DEFAULT_FN_ATTRS
				1639	_mm256_load_pd(double const *__p)
				1640	{
				1641	return (__m256d )__p;
				1642	}
				1643
				1644	static __inline __m256 __DEFAULT_FN_ATTRS
				1645	_mm256_load_ps(float const *__p)
				1646	{
				1647	return (__m256 )__p;
				1648	}
				1649
				1650	static __inline __m256d __DEFAULT_FN_ATTRS
				1651	_mm256_loadu_pd(double const *__p)
				1652	{
				1653	struct __loadu_pd {
				1654	__m256d __v;
				1655	} __attribute__((__packed__, __may_alias__));
				1656	return ((struct __loadu_pd*)__p)->__v;
				1657	}
				1658
				1659	static __inline __m256 __DEFAULT_FN_ATTRS
				1660	_mm256_loadu_ps(float const *__p)
				1661	{
				1662	struct __loadu_ps {
				1663	__m256 __v;
				1664	} __attribute__((__packed__, __may_alias__));
				1665	return ((struct __loadu_ps*)__p)->__v;
				1666	}
				1667
				1668	static __inline __m256i __DEFAULT_FN_ATTRS
				1669	_mm256_load_si256(__m256i const *__p)
				1670	{
				1671	return *__p;
				1672	}
				1673
				1674	static __inline __m256i __DEFAULT_FN_ATTRS
				1675	_mm256_loadu_si256(__m256i const *__p)
				1676	{
				1677	struct __loadu_si256 {
				1678	__m256i __v;
				1679	} __attribute__((__packed__, __may_alias__));
				1680	return ((struct __loadu_si256*)__p)->__v;
				1681	}
				1682
				1683	static __inline __m256i __DEFAULT_FN_ATTRS
				1684	_mm256_lddqu_si256(__m256i const *__p)
				1685	{
				1686	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
				1687	}
				1688
				1689	/* SIMD store ops */
				1690	static __inline void __DEFAULT_FN_ATTRS
				1691	_mm256_store_pd(double *__p, __m256d __a)
				1692	{
				1693	(__m256d )__p = __a;
				1694	}
				1695
				1696	static __inline void __DEFAULT_FN_ATTRS
				1697	_mm256_store_ps(float *__p, __m256 __a)
				1698	{
				1699	(__m256 )__p = __a;
				1700	}
				1701
				1702	static __inline void __DEFAULT_FN_ATTRS
				1703	_mm256_storeu_pd(double *__p, __m256d __a)
				1704	{
				1705	__builtin_ia32_storeupd256(__p, (__v4df)__a);
				1706	}
				1707
				1708	static __inline void __DEFAULT_FN_ATTRS
				1709	_mm256_storeu_ps(float *__p, __m256 __a)
				1710	{
				1711	__builtin_ia32_storeups256(__p, (__v8sf)__a);
				1712	}
				1713
				1714	static __inline void __DEFAULT_FN_ATTRS
				1715	_mm256_store_si256(__m256i *__p, __m256i __a)
				1716	{
				1717	*__p = __a;
				1718	}
				1719
				1720	static __inline void __DEFAULT_FN_ATTRS
				1721	_mm256_storeu_si256(__m256i *__p, __m256i __a)
				1722	{
				1723	__builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
				1724	}
				1725
				1726	/* Conditional load ops */
				1727	static __inline __m128d __DEFAULT_FN_ATTRS
				1728	_mm_maskload_pd(double const *__p, __m128i __m)
				1729	{
				1730	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
				1731	}
				1732
				1733	static __inline __m256d __DEFAULT_FN_ATTRS
				1734	_mm256_maskload_pd(double const *__p, __m256i __m)
				1735	{
				1736	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
				1737	(__v4di)__m);
				1738	}
				1739
				1740	static __inline __m128 __DEFAULT_FN_ATTRS
				1741	_mm_maskload_ps(float const *__p, __m128i __m)
				1742	{
				1743	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
				1744	}
				1745
				1746	static __inline __m256 __DEFAULT_FN_ATTRS
				1747	_mm256_maskload_ps(float const *__p, __m256i __m)
				1748	{
				1749	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
				1750	}
				1751
				1752	/* Conditional store ops */
				1753	static __inline void __DEFAULT_FN_ATTRS
				1754	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
				1755	{
				1756	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
				1757	}
				1758
				1759	static __inline void __DEFAULT_FN_ATTRS
				1760	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
				1761	{
				1762	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
				1763	}
				1764
				1765	static __inline void __DEFAULT_FN_ATTRS
				1766	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
				1767	{
				1768	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
				1769	}
				1770
				1771	static __inline void __DEFAULT_FN_ATTRS
				1772	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
				1773	{
				1774	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
				1775	}
				1776
				1777	/* Cacheability support ops */
				1778	static __inline void __DEFAULT_FN_ATTRS
				1779	_mm256_stream_si256(__m256i *__a, __m256i __b)
				1780	{
				1781	__builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
				1782	}
				1783
				1784	static __inline void __DEFAULT_FN_ATTRS
				1785	_mm256_stream_pd(double *__a, __m256d __b)
				1786	{
				1787	__builtin_ia32_movntpd256(__a, (__v4df)__b);
				1788	}
				1789
				1790	static __inline void __DEFAULT_FN_ATTRS
				1791	_mm256_stream_ps(float *__p, __m256 __a)
				1792	{
				1793	__builtin_ia32_movntps256(__p, (__v8sf)__a);
				1794	}
				1795
				1796	/* Create vectors */
				1797	static __inline__ __m256d __DEFAULT_FN_ATTRS
				1798	_mm256_undefined_pd()
				1799	{
				1800	return (__m256d)__builtin_ia32_undef256();
				1801	}
				1802
				1803	static __inline__ __m256 __DEFAULT_FN_ATTRS
				1804	_mm256_undefined_ps()
				1805	{
				1806	return (__m256)__builtin_ia32_undef256();
				1807	}
				1808
				1809	static __inline__ __m256i __DEFAULT_FN_ATTRS
				1810	_mm256_undefined_si256()
				1811	{
				1812	return (__m256i)__builtin_ia32_undef256();
				1813	}
				1814
				1815	static __inline __m256d __DEFAULT_FN_ATTRS
				1816	_mm256_set_pd(double __a, double __b, double __c, double __d)
				1817	{
				1818	return (__m256d){ __d, __c, __b, __a };
				1819	}
				1820
				1821	static __inline __m256 __DEFAULT_FN_ATTRS
				1822	_mm256_set_ps(float __a, float __b, float __c, float __d,
				1823	float __e, float __f, float __g, float __h)
				1824	{
				1825	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
				1826	}
				1827
				1828	static __inline __m256i __DEFAULT_FN_ATTRS
				1829	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
				1830	int __i4, int __i5, int __i6, int __i7)
				1831	{
				1832	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
				1833	}
				1834
				1835	static __inline __m256i __DEFAULT_FN_ATTRS
				1836	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
				1837	short __w11, short __w10, short __w09, short __w08,
				1838	short __w07, short __w06, short __w05, short __w04,
				1839	short __w03, short __w02, short __w01, short __w00)
				1840	{
				1841	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
				1842	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
				1843	}
				1844
				1845	static __inline __m256i __DEFAULT_FN_ATTRS
				1846	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
				1847	char __b27, char __b26, char __b25, char __b24,
				1848	char __b23, char __b22, char __b21, char __b20,
				1849	char __b19, char __b18, char __b17, char __b16,
				1850	char __b15, char __b14, char __b13, char __b12,
				1851	char __b11, char __b10, char __b09, char __b08,
				1852	char __b07, char __b06, char __b05, char __b04,
				1853	char __b03, char __b02, char __b01, char __b00)
				1854	{
				1855	return (__m256i)(__v32qi){
				1856	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				1857	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				1858	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				1859	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
				1860	};
				1861	}
				1862
				1863	static __inline __m256i __DEFAULT_FN_ATTRS
				1864	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
				1865	{
				1866	return (__m256i)(__v4di){ __d, __c, __b, __a };
				1867	}
				1868
				1869	/* Create vectors with elements in reverse order */
				1870	static __inline __m256d __DEFAULT_FN_ATTRS
				1871	_mm256_setr_pd(double __a, double __b, double __c, double __d)
				1872	{
				1873	return (__m256d){ __a, __b, __c, __d };
				1874	}
				1875
				1876	static __inline __m256 __DEFAULT_FN_ATTRS
				1877	_mm256_setr_ps(float __a, float __b, float __c, float __d,
				1878	float __e, float __f, float __g, float __h)
				1879	{
				1880	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
				1881	}
				1882
				1883	static __inline __m256i __DEFAULT_FN_ATTRS
				1884	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
				1885	int __i4, int __i5, int __i6, int __i7)
				1886	{
				1887	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
				1888	}
				1889
				1890	static __inline __m256i __DEFAULT_FN_ATTRS
				1891	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
				1892	short __w11, short __w10, short __w09, short __w08,
				1893	short __w07, short __w06, short __w05, short __w04,
				1894	short __w03, short __w02, short __w01, short __w00)
				1895	{
				1896	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
				1897	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
				1898	}
				1899
				1900	static __inline __m256i __DEFAULT_FN_ATTRS
				1901	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
				1902	char __b27, char __b26, char __b25, char __b24,
				1903	char __b23, char __b22, char __b21, char __b20,
				1904	char __b19, char __b18, char __b17, char __b16,
				1905	char __b15, char __b14, char __b13, char __b12,
				1906	char __b11, char __b10, char __b09, char __b08,
				1907	char __b07, char __b06, char __b05, char __b04,
				1908	char __b03, char __b02, char __b01, char __b00)
				1909	{
				1910	return (__m256i)(__v32qi){
				1911	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
				1912	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				1913	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				1914	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
				1915	}
				1916
				1917	static __inline __m256i __DEFAULT_FN_ATTRS
				1918	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
				1919	{
				1920	return (__m256i)(__v4di){ __a, __b, __c, __d };
				1921	}
				1922
				1923	/* Create vectors with repeated elements */
				1924	static __inline __m256d __DEFAULT_FN_ATTRS
				1925	_mm256_set1_pd(double __w)
				1926	{
				1927	return (__m256d){ __w, __w, __w, __w };
				1928	}
				1929
				1930	static __inline __m256 __DEFAULT_FN_ATTRS
				1931	_mm256_set1_ps(float __w)
				1932	{
				1933	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
				1934	}
				1935
				1936	static __inline __m256i __DEFAULT_FN_ATTRS
				1937	_mm256_set1_epi32(int __i)
				1938	{
				1939	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
				1940	}
				1941
				1942	static __inline __m256i __DEFAULT_FN_ATTRS
				1943	_mm256_set1_epi16(short __w)
				1944	{
				1945	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				1946	__w, __w, __w, __w, __w, __w };
				1947	}
				1948
				1949	static __inline __m256i __DEFAULT_FN_ATTRS
				1950	_mm256_set1_epi8(char __b)
				1951	{
				1952	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				1953	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				1954	__b, __b, __b, __b, __b, __b, __b };
				1955	}
				1956
				1957	static __inline __m256i __DEFAULT_FN_ATTRS
				1958	_mm256_set1_epi64x(long long __q)
				1959	{
				1960	return (__m256i)(__v4di){ __q, __q, __q, __q };
				1961	}
				1962
				1963	/* Create zeroed vectors */
				1964	static __inline __m256d __DEFAULT_FN_ATTRS
				1965	_mm256_setzero_pd(void)
				1966	{
				1967	return (__m256d){ 0, 0, 0, 0 };
				1968	}
				1969
				1970	static __inline __m256 __DEFAULT_FN_ATTRS
				1971	_mm256_setzero_ps(void)
				1972	{
				1973	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				1974	}
				1975
				1976	static __inline __m256i __DEFAULT_FN_ATTRS
				1977	_mm256_setzero_si256(void)
				1978	{
				1979	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				1980	}
				1981
				1982	/* Cast between vector types */
				1983	static __inline __m256 __DEFAULT_FN_ATTRS
				1984	_mm256_castpd_ps(__m256d __a)
				1985	{
				1986	return (__m256)__a;
				1987	}
				1988
				1989	static __inline __m256i __DEFAULT_FN_ATTRS
				1990	_mm256_castpd_si256(__m256d __a)
				1991	{
				1992	return (__m256i)__a;
				1993	}
				1994
				1995	static __inline __m256d __DEFAULT_FN_ATTRS
				1996	_mm256_castps_pd(__m256 __a)
				1997	{
				1998	return (__m256d)__a;
				1999	}
				2000
				2001	static __inline __m256i __DEFAULT_FN_ATTRS
				2002	_mm256_castps_si256(__m256 __a)
				2003	{
				2004	return (__m256i)__a;
				2005	}
				2006
				2007	static __inline __m256 __DEFAULT_FN_ATTRS
				2008	_mm256_castsi256_ps(__m256i __a)
				2009	{
				2010	return (__m256)__a;
				2011	}
				2012
				2013	static __inline __m256d __DEFAULT_FN_ATTRS
				2014	_mm256_castsi256_pd(__m256i __a)
				2015	{
				2016	return (__m256d)__a;
				2017	}
				2018
				2019	static __inline __m128d __DEFAULT_FN_ATTRS
				2020	_mm256_castpd256_pd128(__m256d __a)
				2021	{
				2022	return __builtin_shufflevector(__a, __a, 0, 1);
				2023	}
				2024
				2025	static __inline __m128 __DEFAULT_FN_ATTRS
				2026	_mm256_castps256_ps128(__m256 __a)
				2027	{
				2028	return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
				2029	}
				2030
				2031	static __inline __m128i __DEFAULT_FN_ATTRS
				2032	_mm256_castsi256_si128(__m256i __a)
				2033	{
				2034	return __builtin_shufflevector(__a, __a, 0, 1);
				2035	}
				2036
				2037	static __inline __m256d __DEFAULT_FN_ATTRS
				2038	_mm256_castpd128_pd256(__m128d __a)
				2039	{
				2040	return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
				2041	}
				2042
				2043	static __inline __m256 __DEFAULT_FN_ATTRS
				2044	_mm256_castps128_ps256(__m128 __a)
				2045	{
				2046	return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
				2047	}
				2048
				2049	static __inline __m256i __DEFAULT_FN_ATTRS
				2050	_mm256_castsi128_si256(__m128i __a)
				2051	{
				2052	return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
				2053	}
				2054
				2055	/*
				2056	Vector insert.
				2057	We use macros rather than inlines because we only want to accept
				2058	invocations where the immediate M is a constant expression.
				2059	*/
				/* Replaces one 128-bit half of V1 with V2. Only bit 0 of M is used:
				 * 0 overwrites the low half (elements 0-3), 1 the high half (elements 4-7).
				 * Shuffle indices 8-11 select the elements of the widened V2. */
				#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
				  (__m256)__builtin_shufflevector( \
				      (__v8sf)(__m256)(V1), \
				      (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
				      ((((M) & 1) == 0) ?  8 :  0), \
				      ((((M) & 1) == 0) ?  9 :  1), \
				      ((((M) & 1) == 0) ? 10 :  2), \
				      ((((M) & 1) == 0) ? 11 :  3), \
				      ((((M) & 1) == 0) ?  4 :  8), \
				      ((((M) & 1) == 0) ?  5 :  9), \
				      ((((M) & 1) == 0) ?  6 : 10), \
				      ((((M) & 1) == 0) ?  7 : 11) );})
				2072
				/* Replaces one 128-bit half of V1 with V2. Only bit 0 of M is used:
				 * 0 overwrites the low half (elements 0-1), 1 the high half (elements 2-3).
				 * Shuffle indices 4-5 select the elements of the widened V2. */
				#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
				  (__m256d)__builtin_shufflevector( \
				      (__v4df)(__m256d)(V1), \
				      (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
				      ((((M) & 1) == 0) ? 4 : 0), \
				      ((((M) & 1) == 0) ? 5 : 1), \
				      ((((M) & 1) == 0) ? 2 : 4), \
				      ((((M) & 1) == 0) ? 3 : 5) );})
				2081
				/* Replaces one 128-bit half of V1 with V2. Only bit 0 of M is used:
				 * 0 overwrites the low half (64-bit elements 0-1), 1 the high half
				 * (elements 2-3). Shuffle indices 4-5 select the elements of the widened V2. */
				#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
				  (__m256i)__builtin_shufflevector( \
				      (__v4di)(__m256i)(V1), \
				      (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
				      ((((M) & 1) == 0) ? 4 : 0), \
				      ((((M) & 1) == 0) ? 5 : 1), \
				      ((((M) & 1) == 0) ? 2 : 4), \
				      ((((M) & 1) == 0) ? 3 : 5) );})
				2090
				2091	/*
				2092	Vector extract.
				2093	We use macros rather than inlines because we only want to accept
				2094	invocations where the immediate M is a constant expression.
				2095	*/
				/* Extracts one 128-bit half of V. Only bit 0 of M is used: 0 yields
				 * elements 0-3, 1 yields elements 4-7. The zero vector is only a
				 * placeholder second operand; no index selects from it. */
				#define _mm256_extractf128_ps(V, M) __extension__ ({ \
				  (__m128)__builtin_shufflevector( \
				      (__v8sf)(__m256)(V), \
				      (__v8sf)(_mm256_setzero_ps()), \
				      ((((M) & 1) == 0) ? 0 : 4), \
				      ((((M) & 1) == 0) ? 1 : 5), \
				      ((((M) & 1) == 0) ? 2 : 6), \
				      ((((M) & 1) == 0) ? 3 : 7) );})
				2104
				/* Extracts one 128-bit half of V. Only bit 0 of M is used: 0 yields
				 * elements 0-1, 1 yields elements 2-3. The zero vector is only a
				 * placeholder second operand; no index selects from it. */
				#define _mm256_extractf128_pd(V, M) __extension__ ({ \
				  (__m128d)__builtin_shufflevector( \
				      (__v4df)(__m256d)(V), \
				      (__v4df)(_mm256_setzero_pd()), \
				      ((((M) & 1) == 0) ? 0 : 2), \
				      ((((M) & 1) == 0) ? 1 : 3) );})
				2111
				/* Extracts one 128-bit half of V. Only bit 0 of M is used: 0 yields
				 * 64-bit elements 0-1, 1 yields elements 2-3. The zero vector is only a
				 * placeholder second operand; no index selects from it. */
				#define _mm256_extractf128_si256(V, M) __extension__ ({ \
				  (__m128i)__builtin_shufflevector( \
				      (__v4di)(__m256i)(V), \
				      (__v4di)(_mm256_setzero_si256()), \
				      ((((M) & 1) == 0) ? 0 : 2), \
				      ((((M) & 1) == 0) ? 1 : 3) );})
				2118
				2119	/* SIMD load ops (unaligned) */
				2120	static __inline __m256 __DEFAULT_FN_ATTRS
				2121	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
				2122	{
				2123	struct __loadu_ps {
				2124	__m128 __v;
				2125	} __attribute__((__packed__, __may_alias__));
				2126
				2127	__m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
				2128	return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
				2129	}
				2130
				2131	static __inline __m256d __DEFAULT_FN_ATTRS
				2132	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
				2133	{
				2134	struct __loadu_pd {
				2135	__m128d __v;
				2136	} __attribute__((__packed__, __may_alias__));
				2137
				2138	__m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
				2139	return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
				2140	}
				2141
				2142	static __inline __m256i __DEFAULT_FN_ATTRS
				2143	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
				2144	{
				2145	struct __loadu_si128 {
				2146	__m128i __v;
				2147	} __attribute__((__packed__, __may_alias__));
				2148	__m256i __v256 = _mm256_castsi128_si256(
				2149	((struct __loadu_si128*)__addr_lo)->__v);
				2150	return _mm256_insertf128_si256(__v256,
				2151	((struct __loadu_si128*)__addr_hi)->__v, 1);
				2152	}
				2153
				2154	/* SIMD store ops (unaligned) */
				2155	static __inline void __DEFAULT_FN_ATTRS
				2156	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
				2157	{
				2158	__m128 __v128;
				2159
				2160	__v128 = _mm256_castps256_ps128(__a);
				2161	__builtin_ia32_storeups(__addr_lo, __v128);
				2162	__v128 = _mm256_extractf128_ps(__a, 1);
				2163	__builtin_ia32_storeups(__addr_hi, __v128);
				2164	}
				2165
				2166	static __inline void __DEFAULT_FN_ATTRS
				2167	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
				2168	{
				2169	__m128d __v128;
				2170
				2171	__v128 = _mm256_castpd256_pd128(__a);
				2172	__builtin_ia32_storeupd(__addr_lo, __v128);
				2173	__v128 = _mm256_extractf128_pd(__a, 1);
				2174	__builtin_ia32_storeupd(__addr_hi, __v128);
				2175	}
				2176
				2177	static __inline void __DEFAULT_FN_ATTRS
				2178	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
				2179	{
				2180	__m128i __v128;
				2181
				2182	__v128 = _mm256_castsi256_si128(__a);
				2183	__builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
				2184	__v128 = _mm256_extractf128_si256(__a, 1);
				2185	__builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
				2186	}
				2187
				2188	static __inline __m256 __DEFAULT_FN_ATTRS
				2189	_mm256_set_m128 (__m128 __hi, __m128 __lo) {
				2190	return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
				2191	}
				2192
				2193	static __inline __m256d __DEFAULT_FN_ATTRS
				2194	_mm256_set_m128d (__m128d __hi, __m128d __lo) {
				2195	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2196	}
				2197
				2198	static __inline __m256i __DEFAULT_FN_ATTRS
				2199	_mm256_set_m128i (__m128i __hi, __m128i __lo) {
				2200	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2201	}
				2202
				2203	static __inline __m256 __DEFAULT_FN_ATTRS
				2204	_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
				2205	return _mm256_set_m128(__hi, __lo);
				2206	}
				2207
				2208	static __inline __m256d __DEFAULT_FN_ATTRS
				2209	_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
				2210	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2211	}
				2212
				2213	static __inline __m256i __DEFAULT_FN_ATTRS
				2214	_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
				2215	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2216	}
				2217
				2218	#undef __DEFAULT_FN_ATTRS
				2219
				2220	#endif /* __AVXINTRIN_H */