Blame - third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/xmmintrin.h - fp2-dev/platform/external/v8

blob: 43f94229f164fa7ee4473a4bd48f9c1ce9e48c8d [file] [log] [blame]

Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
				24	#ifndef __XMMINTRIN_H
				25	#define __XMMINTRIN_H
				26
				27	#include <mmintrin.h>
				28
				29	typedef int __v4si __attribute__((__vector_size__(16))); /* 16-byte vector of int; the bitwise intrinsics below cast __m128 to this type */
				30	typedef float __v4sf __attribute__((__vector_size__(16))); /* 16-byte vector of float */
				31	typedef float __m128 __attribute__((__vector_size__(16))); /* 16-byte vector of float; parameter and return type of the _mm_* intrinsics in this file */
				32
				33	/* This header should only be included in a hosted environment as it depends on
				34	* a standard library to provide allocation routines. */
				35	#if __STDC_HOSTED__
				36	#include <mm_malloc.h>
				37	#endif
				38
				39	/* Define the default attributes for the functions in this file. */
				40	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
				41
				42	/// \brief Adds the 32-bit float values in the low-order bits of the operands.
				43	///
				44	/// \headerfile <x86intrin.h>
				45	///
				46	/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
				47	///
				48	/// \param __a
				49	/// A 128-bit vector of [4 x float] containing one of the source operands.
				50	/// The lower 32 bits of this operand are used in the calculation.
				51	/// \param __b
				52	/// A 128-bit vector of [4 x float] containing one of the source operands.
				53	/// The lower 32 bits of this operand are used in the calculation.
				54	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
				55	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
				56	/// the upper 96 bits of the first source operand.
				57	static __inline__ __m128 __DEFAULT_FN_ATTRS
				58	_mm_add_ss(__m128 __a, __m128 __b)
				59	{
				60	__a[0] += __b[0];
				61	return __a;
				62	}
				63
				64	/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
				65	/// the addition.
				66	///
				67	/// \headerfile <x86intrin.h>
				68	///
				69	/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
				70	///
				71	/// \param __a
				72	/// A 128-bit vector of [4 x float] containing one of the source operands.
				73	/// \param __b
				74	/// A 128-bit vector of [4 x float] containing one of the source operands.
				75	/// \returns A 128-bit vector of [4 x float] containing the sums of both
				76	/// operands.
				77	static __inline__ __m128 __DEFAULT_FN_ATTRS
				78	_mm_add_ps(__m128 __a, __m128 __b)
				79	{
				80	return __a + __b;
				81	}
				82
				83	/// \brief Subtracts the 32-bit float value in the low-order bits of the second
				84	/// operand from the corresponding value in the first operand.
				85	///
				86	/// \headerfile <x86intrin.h>
				87	///
				88	/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
				89	///
				90	/// \param __a
				91	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
				92	/// of this operand are used in the calculation.
				93	/// \param __b
				94	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
				95	/// bits of this operand are used in the calculation.
				96	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				97	/// difference of the lower 32 bits of both operands. The upper 96 bits are
				98	/// copied from the upper 96 bits of the first source operand.
				99	static __inline__ __m128 __DEFAULT_FN_ATTRS
				100	_mm_sub_ss(__m128 __a, __m128 __b)
				101	{
				102	__a[0] -= __b[0];
				103	return __a;
				104	}
				105
				106	/// \brief Subtracts each of the values of the second operand from the first
				107	/// operand, both of which are 128-bit vectors of [4 x float] and returns
				108	/// the results of the subtraction.
				109	///
				110	/// \headerfile <x86intrin.h>
				111	///
				112	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
				113	///
				114	/// \param __a
				115	/// A 128-bit vector of [4 x float] containing the minuend.
				116	/// \param __b
				117	/// A 128-bit vector of [4 x float] containing the subtrahend.
				118	/// \returns A 128-bit vector of [4 x float] containing the differences between
				119	/// both operands.
				120	static __inline__ __m128 __DEFAULT_FN_ATTRS
				121	_mm_sub_ps(__m128 __a, __m128 __b)
				122	{
				123	return __a - __b;
				124	}
				125
				126	/// \brief Multiplies two 32-bit float values in the low-order bits of the
				127	/// operands.
				128	///
				129	/// \headerfile <x86intrin.h>
				130	///
				131	/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
				132	///
				133	/// \param __a
				134	/// A 128-bit vector of [4 x float] containing one of the source operands.
				135	/// The lower 32 bits of this operand are used in the calculation.
				136	/// \param __b
				137	/// A 128-bit vector of [4 x float] containing one of the source operands.
				138	/// The lower 32 bits of this operand are used in the calculation.
				139	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
				140	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
				141	/// bits of the first source operand.
				142	static __inline__ __m128 __DEFAULT_FN_ATTRS
				143	_mm_mul_ss(__m128 __a, __m128 __b)
				144	{
				145	__a[0] *= __b[0];
				146	return __a;
				147	}
				148
				149	/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
				150	/// results of the multiplication.
				151	///
				152	/// \headerfile <x86intrin.h>
				153	///
				154	/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
				155	///
				156	/// \param __a
				157	/// A 128-bit vector of [4 x float] containing one of the source operands.
				158	/// \param __b
				159	/// A 128-bit vector of [4 x float] containing one of the source operands.
				160	/// \returns A 128-bit vector of [4 x float] containing the products of both
				161	/// operands.
				162	static __inline__ __m128 __DEFAULT_FN_ATTRS
				163	_mm_mul_ps(__m128 __a, __m128 __b)
				164	{
				165	return __a * __b;
				166	}
				167
				168	/// \brief Divides the value in the low-order 32 bits of the first operand by
				169	/// the corresponding value in the second operand.
				170	///
				171	/// \headerfile <x86intrin.h>
				172	///
				173	/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
				174	///
				175	/// \param __a
				176	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
				177	/// bits of this operand are used in the calculation.
				178	/// \param __b
				179	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
				180	/// of this operand are used in the calculation.
				181	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
				182	/// lower 32 bits of both operands. The upper 96 bits are copied from the
				183	/// upper 96 bits of the first source operand.
				184	static __inline__ __m128 __DEFAULT_FN_ATTRS
				185	_mm_div_ss(__m128 __a, __m128 __b)
				186	{
				187	__a[0] /= __b[0];
				188	return __a;
				189	}
				190
				191	/// \brief Divides two 128-bit vectors of [4 x float].
				192	///
				193	/// \headerfile <x86intrin.h>
				194	///
				195	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
				196	///
				197	/// \param __a
				198	/// A 128-bit vector of [4 x float] containing the dividend.
				199	/// \param __b
				200	/// A 128-bit vector of [4 x float] containing the divisor.
				201	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
				202	/// operands.
				203	static __inline__ __m128 __DEFAULT_FN_ATTRS
				204	_mm_div_ps(__m128 __a, __m128 __b)
				205	{
				206	return __a / __b;
				207	}
				208
				209	/// \brief Calculates the square root of the value stored in the low-order bits
				210	/// of a 128-bit vector of [4 x float].
				211	///
				212	/// \headerfile <x86intrin.h>
				213	///
				214	/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
				215	///
				216	/// \param __a
				217	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				218	/// used in the calculation.
				219	/// \returns A 128-bit vector of [4 x float] containing the square root of the
				220	/// value in the low-order bits of the operand.
				221	static __inline__ __m128 __DEFAULT_FN_ATTRS
				222	_mm_sqrt_ss(__m128 __a)
				223	{
				224	__m128 __c = __builtin_ia32_sqrtss(__a);
				225	return (__m128) { __c[0], __a[1], __a[2], __a[3] };
				226	}
				227
				228	/// \brief Calculates the square roots of the values stored in a 128-bit vector
				229	/// of [4 x float].
				230	///
				231	/// \headerfile <x86intrin.h>
				232	///
				233	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
				234	///
				235	/// \param __a
				236	/// A 128-bit vector of [4 x float].
				237	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
				238	/// values in the operand.
				239	static __inline__ __m128 __DEFAULT_FN_ATTRS
				240	_mm_sqrt_ps(__m128 __a)
				241	{
				242	return __builtin_ia32_sqrtps(__a);
				243	}
				244
				245	/// \brief Calculates the approximate reciprocal of the value stored in the
				246	/// low-order bits of a 128-bit vector of [4 x float].
				247	///
				248	/// \headerfile <x86intrin.h>
				249	///
				250	/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
				251	///
				252	/// \param __a
				253	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				254	/// used in the calculation.
				255	/// \returns A 128-bit vector of [4 x float] containing the approximate
				256	/// reciprocal of the value in the low-order bits of the operand.
				257	static __inline__ __m128 __DEFAULT_FN_ATTRS
				258	_mm_rcp_ss(__m128 __a)
				259	{
				260	__m128 __c = __builtin_ia32_rcpss(__a);
				261	return (__m128) { __c[0], __a[1], __a[2], __a[3] };
				262	}
				263
				264	/// \brief Calculates the approximate reciprocals of the values stored in a
				265	/// 128-bit vector of [4 x float].
				266	///
				267	/// \headerfile <x86intrin.h>
				268	///
				269	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
				270	///
				271	/// \param __a
				272	/// A 128-bit vector of [4 x float].
				273	/// \returns A 128-bit vector of [4 x float] containing the approximate
				274	/// reciprocals of the values in the operand.
				275	static __inline__ __m128 __DEFAULT_FN_ATTRS
				276	_mm_rcp_ps(__m128 __a)
				277	{
				278	return __builtin_ia32_rcpps(__a);
				279	}
				280
				281	/// \brief Calculates the approximate reciprocal of the square root of the value
				282	/// stored in the low-order bits of a 128-bit vector of [4 x float].
				283	///
				284	/// \headerfile <x86intrin.h>
				285	///
				286	/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
				287	///
				288	/// \param __a
				289	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				290	/// used in the calculation.
				291	/// \returns A 128-bit vector of [4 x float] containing the approximate
				292	/// reciprocal of the square root of the value in the low-order bits of the
				293	/// operand.
				294	static __inline__ __m128 __DEFAULT_FN_ATTRS
				295	_mm_rsqrt_ss(__m128 __a)
				296	{
				297	__m128 __c = __builtin_ia32_rsqrtss(__a);
				298	return (__m128) { __c[0], __a[1], __a[2], __a[3] };
				299	}
				300
				301	/// \brief Calculates the approximate reciprocals of the square roots of the
				302	/// values stored in a 128-bit vector of [4 x float].
				303	///
				304	/// \headerfile <x86intrin.h>
				305	///
				306	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
				307	///
				308	/// \param __a
				309	/// A 128-bit vector of [4 x float].
				310	/// \returns A 128-bit vector of [4 x float] containing the approximate
				311	/// reciprocals of the square roots of the values in the operand.
				312	static __inline__ __m128 __DEFAULT_FN_ATTRS
				313	_mm_rsqrt_ps(__m128 __a)
				314	{
				315	return __builtin_ia32_rsqrtps(__a);
				316	}
				317
				318	/// \brief Compares two 32-bit float values in the low-order bits of both
				319	/// operands and returns the lesser value in the low-order bits of the
				320	/// vector of [4 x float].
				321	///
				322	/// \headerfile <x86intrin.h>
				323	///
				324	/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
				325	///
				326	/// \param __a
				327	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				328	/// 32 bits of this operand are used in the comparison.
				329	/// \param __b
				330	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				331	/// 32 bits of this operand are used in the comparison.
				332	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				333	/// minimum value between both operands. The upper 96 bits are copied from
				334	/// the upper 96 bits of the first source operand.
				335	static __inline__ __m128 __DEFAULT_FN_ATTRS
				336	_mm_min_ss(__m128 __a, __m128 __b)
				337	{
				338	return __builtin_ia32_minss(__a, __b);
				339	}
				340
				341	/// \brief Compares two 128-bit vectors of [4 x float] and returns the
				342	/// lesser of each pair of values.
				343	///
				344	/// \headerfile <x86intrin.h>
				345	///
				346	/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
				347	///
				348	/// \param __a
				349	/// A 128-bit vector of [4 x float] containing one of the operands.
				350	/// \param __b
				351	/// A 128-bit vector of [4 x float] containing one of the operands.
				352	/// \returns A 128-bit vector of [4 x float] containing the minimum values
				353	/// between both operands.
				354	static __inline__ __m128 __DEFAULT_FN_ATTRS
				355	_mm_min_ps(__m128 __a, __m128 __b)
				356	{
				357	return __builtin_ia32_minps(__a, __b);
				358	}
				359
				360	/// \brief Compares two 32-bit float values in the low-order bits of both
				361	/// operands and returns the greater value in the low-order bits of
				362	/// a vector [4 x float].
				363	///
				364	/// \headerfile <x86intrin.h>
				365	///
				366	/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
				367	///
				368	/// \param __a
				369	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				370	/// 32 bits of this operand are used in the comparison.
				371	/// \param __b
				372	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				373	/// 32 bits of this operand are used in the comparison.
				374	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				375	/// maximum value between both operands. The upper 96 bits are copied from
				376	/// the upper 96 bits of the first source operand.
				377	static __inline__ __m128 __DEFAULT_FN_ATTRS
				378	_mm_max_ss(__m128 __a, __m128 __b)
				379	{
				380	return __builtin_ia32_maxss(__a, __b);
				381	}
				382
				383	/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
				384	/// of each pair of values.
				385	///
				386	/// \headerfile <x86intrin.h>
				387	///
				388	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
				389	///
				390	/// \param __a
				391	/// A 128-bit vector of [4 x float] containing one of the operands.
				392	/// \param __b
				393	/// A 128-bit vector of [4 x float] containing one of the operands.
				394	/// \returns A 128-bit vector of [4 x float] containing the maximum values
				395	/// between both operands.
				396	static __inline__ __m128 __DEFAULT_FN_ATTRS
				397	_mm_max_ps(__m128 __a, __m128 __b)
				398	{
				399	return __builtin_ia32_maxps(__a, __b);
				400	}
				401
				402	/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
				403	///
				404	/// \headerfile <x86intrin.h>
				405	///
				406	/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
				407	///
				408	/// \param __a
				409	/// A 128-bit vector containing one of the source operands.
				410	/// \param __b
				411	/// A 128-bit vector containing one of the source operands.
				412	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				413	/// values between both operands.
				414	static __inline__ __m128 __DEFAULT_FN_ATTRS
				415	_mm_and_ps(__m128 __a, __m128 __b)
				416	{
				417	return (__m128)((__v4si)__a & (__v4si)__b);
				418	}
				419
				420	/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
				421	/// the one's complement of the values contained in the first source
				422	/// operand.
				423	///
				424	/// \headerfile <x86intrin.h>
				425	///
				426	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
				427	///
				428	/// \param __a
				429	/// A 128-bit vector of [4 x float] containing the first source operand. The
				430	/// one's complement of this value is used in the bitwise AND.
				431	/// \param __b
				432	/// A 128-bit vector of [4 x float] containing the second source operand.
				433	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				434	/// one's complement of the first operand and the values in the second
				435	/// operand.
				436	static __inline__ __m128 __DEFAULT_FN_ATTRS
				437	_mm_andnot_ps(__m128 __a, __m128 __b)
				438	{
				439	return (__m128)(~(__v4si)__a & (__v4si)__b);
				440	}
				441
				442	/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
				443	///
				444	/// \headerfile <x86intrin.h>
				445	///
				446	/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
				447	///
				448	/// \param __a
				449	/// A 128-bit vector of [4 x float] containing one of the source operands.
				450	/// \param __b
				451	/// A 128-bit vector of [4 x float] containing one of the source operands.
				452	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
				453	/// values between both operands.
				454	static __inline__ __m128 __DEFAULT_FN_ATTRS
				455	_mm_or_ps(__m128 __a, __m128 __b)
				456	{
				457	return (__m128)((__v4si)__a \| (__v4si)__b);
				458	}
				459
				460	/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
				461	/// [4 x float].
				462	///
				463	/// \headerfile <x86intrin.h>
				464	///
				465	/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
				466	///
				467	/// \param __a
				468	/// A 128-bit vector of [4 x float] containing one of the source operands.
				469	/// \param __b
				470	/// A 128-bit vector of [4 x float] containing one of the source operands.
				471	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
				472	/// of the values between both operands.
				473	static __inline__ __m128 __DEFAULT_FN_ATTRS
				474	_mm_xor_ps(__m128 __a, __m128 __b)
				475	{
				476	return (__m128)((__v4si)__a ^ (__v4si)__b);
				477	}
				478
				479	/// \brief Compares two 32-bit float values in the low-order bits of both
				480	/// operands for equality and returns the result of the comparison in the
				481	/// low-order bits of a vector [4 x float].
				482	///
				483	/// \headerfile <x86intrin.h>
				484	///
				485	/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
				486	///
				487	/// \param __a
				488	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				489	/// 32 bits of this operand are used in the comparison.
				490	/// \param __b
				491	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				492	/// 32 bits of this operand are used in the comparison.
				493	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				494	/// in the low-order bits.
				495	static __inline__ __m128 __DEFAULT_FN_ATTRS
				496	_mm_cmpeq_ss(__m128 __a, __m128 __b)
				497	{
				498	return (__m128)__builtin_ia32_cmpeqss(__a, __b);
				499	}
				500
				501	/// \brief Compares each of the corresponding 32-bit float values of the
				502	/// 128-bit vectors of [4 x float] for equality.
				503	///
				504	/// \headerfile <x86intrin.h>
				505	///
				506	/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
				507	///
				508	/// \param __a
				509	/// A 128-bit vector of [4 x float].
				510	/// \param __b
				511	/// A 128-bit vector of [4 x float].
				512	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				513	static __inline__ __m128 __DEFAULT_FN_ATTRS
				514	_mm_cmpeq_ps(__m128 __a, __m128 __b)
				515	{
				516	return (__m128)__builtin_ia32_cmpeqps(__a, __b);
				517	}
				518
				519	/// \brief Compares two 32-bit float values in the low-order bits of both
				520	/// operands to determine if the value in the first operand is less than the
				521	/// corresponding value in the second operand and returns the result of the
				522	/// comparison in the low-order bits of a vector of [4 x float].
				523	///
				524	/// \headerfile <x86intrin.h>
				525	///
				526	/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
				527	///
				528	/// \param __a
				529	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				530	/// 32 bits of this operand are used in the comparison.
				531	/// \param __b
				532	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				533	/// 32 bits of this operand are used in the comparison.
				534	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				535	/// in the low-order bits.
				536	static __inline__ __m128 __DEFAULT_FN_ATTRS
				537	_mm_cmplt_ss(__m128 __a, __m128 __b)
				538	{
				539	return (__m128)__builtin_ia32_cmpltss(__a, __b);
				540	}
				541
				542	/// \brief Compares each of the corresponding 32-bit float values of the
				543	/// 128-bit vectors of [4 x float] to determine if the values in the first
				544	/// operand are less than those in the second operand.
				545	///
				546	/// \headerfile <x86intrin.h>
				547	///
				548	/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
				549	///
				550	/// \param __a
				551	/// A 128-bit vector of [4 x float].
				552	/// \param __b
				553	/// A 128-bit vector of [4 x float].
				554	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				555	static __inline__ __m128 __DEFAULT_FN_ATTRS
				556	_mm_cmplt_ps(__m128 __a, __m128 __b)
				557	{
				558	return (__m128)__builtin_ia32_cmpltps(__a, __b);
				559	}
				560
				561	/// \brief Compares two 32-bit float values in the low-order bits of both
				562	/// operands to determine if the value in the first operand is less than or
				563	/// equal to the corresponding value in the second operand and returns the
				564	/// result of the comparison in the low-order bits of a vector of
				565	/// [4 x float].
				566	///
				567	/// \headerfile <x86intrin.h>
				568	///
				569	/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
				570	///
				571	/// \param __a
				572	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				573	/// 32 bits of this operand are used in the comparison.
				574	/// \param __b
				575	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				576	/// 32 bits of this operand are used in the comparison.
				577	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				578	/// in the low-order bits.
				579	static __inline__ __m128 __DEFAULT_FN_ATTRS
				580	_mm_cmple_ss(__m128 __a, __m128 __b)
				581	{
				582	return (__m128)__builtin_ia32_cmpless(__a, __b);
				583	}
				584
				585	/// \brief Compares each of the corresponding 32-bit float values of the
				586	/// 128-bit vectors of [4 x float] to determine if the values in the first
				587	/// operand are less than or equal to those in the second operand.
				588	///
				589	/// \headerfile <x86intrin.h>
				590	///
				591	/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
				592	///
				593	/// \param __a
				594	/// A 128-bit vector of [4 x float].
				595	/// \param __b
				596	/// A 128-bit vector of [4 x float].
				597	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				598	static __inline__ __m128 __DEFAULT_FN_ATTRS
				599	_mm_cmple_ps(__m128 __a, __m128 __b)
				600	{
				601	return (__m128)__builtin_ia32_cmpleps(__a, __b);
				602	}
				603
				604	/// \brief Compares two 32-bit float values in the low-order bits of both
				605	/// operands to determine if the value in the first operand is greater than
				606	/// the corresponding value in the second operand and returns the result of
				607	/// the comparison in the low-order bits of a vector of [4 x float].
				608	///
				609	/// \headerfile <x86intrin.h>
				610	///
				611	/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
				612	///
				613	/// \param __a
				614	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				615	/// 32 bits of this operand are used in the comparison.
				616	/// \param __b
				617	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				618	/// 32 bits of this operand are used in the comparison.
				619	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				620	/// in the low-order bits.
				621	static __inline__ __m128 __DEFAULT_FN_ATTRS
				622	_mm_cmpgt_ss(__m128 __a, __m128 __b)
				623	{
				624	return (__m128)__builtin_shufflevector(__a,
				625	__builtin_ia32_cmpltss(__b, __a),
				626	4, 1, 2, 3);
				627	}
				628
				629	/// \brief Compares each of the corresponding 32-bit float values of the
				630	/// 128-bit vectors of [4 x float] to determine if the values in the first
				631	/// operand are greater than those in the second operand.
				632	///
				633	/// \headerfile <x86intrin.h>
				634	///
				635	/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
				636	///
				637	/// \param __a
				638	/// A 128-bit vector of [4 x float].
				639	/// \param __b
				640	/// A 128-bit vector of [4 x float].
				641	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				642	static __inline__ __m128 __DEFAULT_FN_ATTRS
				643	_mm_cmpgt_ps(__m128 __a, __m128 __b)
				644	{
				645	return (__m128)__builtin_ia32_cmpltps(__b, __a);
				646	}
				647
				648	/// \brief Compares two 32-bit float values in the low-order bits of both
				649	/// operands to determine if the value in the first operand is greater than
				650	/// or equal to the corresponding value in the second operand and returns
				651	/// the result of the comparison in the low-order bits of a vector of
				652	/// [4 x float].
				653	///
				654	/// \headerfile <x86intrin.h>
				655	///
				656	/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
				657	///
				658	/// \param __a
				659	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				660	/// 32 bits of this operand are used in the comparison.
				661	/// \param __b
				662	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				663	/// 32 bits of this operand are used in the comparison.
				664	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				665	/// in the low-order bits.
				666	static __inline__ __m128 __DEFAULT_FN_ATTRS
				667	_mm_cmpge_ss(__m128 __a, __m128 __b)
				668	{
				669	return (__m128)__builtin_shufflevector(__a,
				670	__builtin_ia32_cmpless(__b, __a),
				671	4, 1, 2, 3);
				672	}
				673
				674	/// \brief Compares each of the corresponding 32-bit float values of the
				675	/// 128-bit vectors of [4 x float] to determine if the values in the first
				676	/// operand are greater than or equal to those in the second operand.
				677	///
				678	/// \headerfile <x86intrin.h>
				679	///
				680	/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
				681	///
				682	/// \param __a
				683	/// A 128-bit vector of [4 x float].
				684	/// \param __b
				685	/// A 128-bit vector of [4 x float].
				686	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				687	static __inline__ __m128 __DEFAULT_FN_ATTRS
				688	_mm_cmpge_ps(__m128 __a, __m128 __b)
				689	{
				690	return (__m128)__builtin_ia32_cmpleps(__b, __a);
				691	}
				692
				693	/// \brief Compares two 32-bit float values in the low-order bits of both
				694	/// operands for inequality and returns the result of the comparison in the
				695	/// low-order bits of a vector of [4 x float].
				696	///
				697	/// \headerfile <x86intrin.h>
				698	///
				699	/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
				700	///
				701	/// \param __a
				702	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				703	/// 32 bits of this operand are used in the comparison.
				704	/// \param __b
				705	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				706	/// 32 bits of this operand are used in the comparison.
				707	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				708	/// in the low-order bits.
				709	static __inline__ __m128 __DEFAULT_FN_ATTRS
				710	_mm_cmpneq_ss(__m128 __a, __m128 __b)
				711	{
				712	return (__m128)__builtin_ia32_cmpneqss(__a, __b);
				713	}
				714
				715	/// \brief Compares each of the corresponding 32-bit float values of the
				716	/// 128-bit vectors of [4 x float] for inequality.
				717	///
				718	/// \headerfile <x86intrin.h>
				719	///
				720	/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
				721	///
				722	/// \param __a
				723	/// A 128-bit vector of [4 x float].
				724	/// \param __b
				725	/// A 128-bit vector of [4 x float].
				726	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				727	static __inline__ __m128 __DEFAULT_FN_ATTRS
				728	_mm_cmpneq_ps(__m128 __a, __m128 __b)
				729	{
				730	return (__m128)__builtin_ia32_cmpneqps(__a, __b);
				731	}
				732
				733	/// \brief Compares two 32-bit float values in the low-order bits of both
				734	/// operands to determine if the value in the first operand is not less than
				735	/// the corresponding value in the second operand and returns the result of
				736	/// the comparison in the low-order bits of a vector of [4 x float].
				737	///
				738	/// \headerfile <x86intrin.h>
				739	///
				740	/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
				741	///
				742	/// \param __a
				743	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				744	/// 32 bits of this operand are used in the comparison.
				745	/// \param __b
				746	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				747	/// 32 bits of this operand are used in the comparison.
				748	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				749	/// in the low-order bits.
				750	static __inline__ __m128 __DEFAULT_FN_ATTRS
				751	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
				752	{
				753	return (__m128)__builtin_ia32_cmpnltss(__a, __b);
				754	}
				755
				756	/// \brief Compares each of the corresponding 32-bit float values of the
				757	/// 128-bit vectors of [4 x float] to determine if the values in the first
				758	/// operand are not less than those in the second operand.
				759	///
				760	/// \headerfile <x86intrin.h>
				761	///
				762	/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
				763	///
				764	/// \param __a
				765	/// A 128-bit vector of [4 x float].
				766	/// \param __b
				767	/// A 128-bit vector of [4 x float].
				768	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				769	static __inline__ __m128 __DEFAULT_FN_ATTRS
				770	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
				771	{
				772	return (__m128)__builtin_ia32_cmpnltps(__a, __b);
				773	}
				774
				775	/// \brief Compares two 32-bit float values in the low-order bits of both
				776	/// operands to determine if the value in the first operand is not less than
				777	/// or equal to the corresponding value in the second operand and returns
				778	/// the result of the comparison in the low-order bits of a vector of
				779	/// [4 x float].
				780	///
				781	/// \headerfile <x86intrin.h>
				782	///
				783	/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
				784	///
				785	/// \param __a
				786	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				787	/// 32 bits of this operand are used in the comparison.
				788	/// \param __b
				789	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				790	/// 32 bits of this operand are used in the comparison.
				791	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				792	/// in the low-order bits.
				793	static __inline__ __m128 __DEFAULT_FN_ATTRS
				794	_mm_cmpnle_ss(__m128 __a, __m128 __b)
				795	{
				796	return (__m128)__builtin_ia32_cmpnless(__a, __b);
				797	}
				798
				799	/// \brief Compares each of the corresponding 32-bit float values of the
				800	/// 128-bit vectors of [4 x float] to determine if the values in the first
				801	/// operand are not less than or equal to those in the second operand.
				802	///
				803	/// \headerfile <x86intrin.h>
				804	///
				805	/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
				806	///
				807	/// \param __a
				808	/// A 128-bit vector of [4 x float].
				809	/// \param __b
				810	/// A 128-bit vector of [4 x float].
				811	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				812	static __inline__ __m128 __DEFAULT_FN_ATTRS
				813	_mm_cmpnle_ps(__m128 __a, __m128 __b)
				814	{
				815	return (__m128)__builtin_ia32_cmpnleps(__a, __b);
				816	}
				817
				818	/// \brief Compares two 32-bit float values in the low-order bits of both
				819	/// operands to determine if the value in the first operand is not greater
				820	/// than the corresponding value in the second operand and returns the
				821	/// result of the comparison in the low-order bits of a vector of
				822	/// [4 x float].
				823	///
				824	/// \headerfile <x86intrin.h>
				825	///
				826	/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
				827	///
				828	/// \param __a
				829	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				830	/// 32 bits of this operand are used in the comparison.
				831	/// \param __b
				832	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				833	/// 32 bits of this operand are used in the comparison.
				834	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				835	/// in the low-order bits.
				836	static __inline__ __m128 __DEFAULT_FN_ATTRS
				837	_mm_cmpngt_ss(__m128 __a, __m128 __b)
				838	{
				839	return (__m128)__builtin_shufflevector(__a,
				840	__builtin_ia32_cmpnltss(__b, __a),
				841	4, 1, 2, 3);
				842	}
				843
				844	/// \brief Compares each of the corresponding 32-bit float values of the
				845	/// 128-bit vectors of [4 x float] to determine if the values in the first
				846	/// operand are not greater than those in the second operand.
				847	///
				848	/// \headerfile <x86intrin.h>
				849	///
				850	/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
				851	///
				852	/// \param __a
				853	/// A 128-bit vector of [4 x float].
				854	/// \param __b
				855	/// A 128-bit vector of [4 x float].
				856	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				857	static __inline__ __m128 __DEFAULT_FN_ATTRS
				858	_mm_cmpngt_ps(__m128 __a, __m128 __b)
				859	{
				860	return (__m128)__builtin_ia32_cmpnltps(__b, __a);
				861	}
				862
				863	/// \brief Compares two 32-bit float values in the low-order bits of both
				864	/// operands to determine if the value in the first operand is not greater
				865	/// than or equal to the corresponding value in the second operand and
				866	/// returns the result of the comparison in the low-order bits of a vector
				867	/// of [4 x float].
				868	///
				869	/// \headerfile <x86intrin.h>
				870	///
				871	/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
				872	///
				873	/// \param __a
				874	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				875	/// 32 bits of this operand are used in the comparison.
				876	/// \param __b
				877	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				878	/// 32 bits of this operand are used in the comparison.
				879	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				880	/// in the low-order bits.
				881	static __inline__ __m128 __DEFAULT_FN_ATTRS
				882	_mm_cmpnge_ss(__m128 __a, __m128 __b)
				883	{
				884	return (__m128)__builtin_shufflevector(__a,
				885	__builtin_ia32_cmpnless(__b, __a),
				886	4, 1, 2, 3);
				887	}
				888
				889	/// \brief Compares each of the corresponding 32-bit float values of the
				890	/// 128-bit vectors of [4 x float] to determine if the values in the first
				891	/// operand are not greater than or equal to those in the second operand.
				892	///
				893	/// \headerfile <x86intrin.h>
				894	///
				895	/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
				896	///
				897	/// \param __a
				898	/// A 128-bit vector of [4 x float].
				899	/// \param __b
				900	/// A 128-bit vector of [4 x float].
				901	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				902	static __inline__ __m128 __DEFAULT_FN_ATTRS
				903	_mm_cmpnge_ps(__m128 __a, __m128 __b)
				904	{
				905	return (__m128)__builtin_ia32_cmpnleps(__b, __a);
				906	}
				907
				908	/// \brief Compares two 32-bit float values in the low-order bits of both
				909	/// operands to determine if the value in the first operand is ordered with
				910	/// respect to the corresponding value in the second operand and returns the
				911	/// result of the comparison in the low-order bits of a vector of
				912	/// [4 x float].
				913	///
				914	/// \headerfile <x86intrin.h>
				915	///
				916	/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
				917	///
				918	/// \param __a
				919	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				920	/// 32 bits of this operand are used in the comparison.
				921	/// \param __b
				922	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				923	/// 32 bits of this operand are used in the comparison.
				924	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				925	/// in the low-order bits.
				926	static __inline__ __m128 __DEFAULT_FN_ATTRS
				927	_mm_cmpord_ss(__m128 __a, __m128 __b)
				928	{
				929	return (__m128)__builtin_ia32_cmpordss(__a, __b);
				930	}
				931
				932	/// \brief Compares each of the corresponding 32-bit float values of the
				933	/// 128-bit vectors of [4 x float] to determine if the values in the first
				934	/// operand are ordered with respect to those in the second operand.
				935	///
				936	/// \headerfile <x86intrin.h>
				937	///
				938	/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
				939	///
				940	/// \param __a
				941	/// A 128-bit vector of [4 x float].
				942	/// \param __b
				943	/// A 128-bit vector of [4 x float].
				944	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				945	static __inline__ __m128 __DEFAULT_FN_ATTRS
				946	_mm_cmpord_ps(__m128 __a, __m128 __b)
				947	{
				948	return (__m128)__builtin_ia32_cmpordps(__a, __b);
				949	}
				950
				951	/// \brief Compares two 32-bit float values in the low-order bits of both
				952	/// operands to determine if the value in the first operand is unordered
				953	/// with respect to the corresponding value in the second operand and
				954	/// returns the result of the comparison in the low-order bits of a vector
				955	/// of [4 x float].
				956	///
				957	/// \headerfile <x86intrin.h>
				958	///
				959	/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
				960	///
				961	/// \param __a
				962	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				963	/// 32 bits of this operand are used in the comparison.
				964	/// \param __b
				965	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				966	/// 32 bits of this operand are used in the comparison.
				967	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				968	/// in the low-order bits.
				969	static __inline__ __m128 __DEFAULT_FN_ATTRS
				970	_mm_cmpunord_ss(__m128 __a, __m128 __b)
				971	{
				972	return (__m128)__builtin_ia32_cmpunordss(__a, __b);
				973	}
				974
				975	/// \brief Compares each of the corresponding 32-bit float values of the
				976	/// 128-bit vectors of [4 x float] to determine if the values in the first
				977	/// operand are unordered with respect to those in the second operand.
				978	///
				979	/// \headerfile <x86intrin.h>
				980	///
				981	/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
				982	///
				983	/// \param __a
				984	/// A 128-bit vector of [4 x float].
				985	/// \param __b
				986	/// A 128-bit vector of [4 x float].
				987	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				988	static __inline__ __m128 __DEFAULT_FN_ATTRS
				989	_mm_cmpunord_ps(__m128 __a, __m128 __b)
				990	{
				991	return (__m128)__builtin_ia32_cmpunordps(__a, __b);
				992	}
				993
				994	/// \brief Compares two 32-bit float values in the low-order bits of both
				995	/// operands for equality and returns the result of the comparison.
				996	///
				997	/// \headerfile <x86intrin.h>
				998	///
				999	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1000	///
				1001	/// \param __a
				1002	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1003	/// used in the comparison.
				1004	/// \param __b
				1005	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1006	/// used in the comparison.
				1007	/// \returns An integer containing the comparison results.
				1008	static __inline__ int __DEFAULT_FN_ATTRS
				1009	_mm_comieq_ss(__m128 __a, __m128 __b)
				1010	{
				1011	return __builtin_ia32_comieq(__a, __b);
				1012	}
				1013
				1014	/// \brief Compares two 32-bit float values in the low-order bits of both
				1015	/// operands to determine if the first operand is less than the second
				1016	/// operand and returns the result of the comparison.
				1017	///
				1018	/// \headerfile <x86intrin.h>
				1019	///
				1020	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1021	///
				1022	/// \param __a
				1023	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1024	/// used in the comparison.
				1025	/// \param __b
				1026	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1027	/// used in the comparison.
				1028	/// \returns An integer containing the comparison results.
				1029	static __inline__ int __DEFAULT_FN_ATTRS
				1030	_mm_comilt_ss(__m128 __a, __m128 __b)
				1031	{
				1032	return __builtin_ia32_comilt(__a, __b);
				1033	}
				1034
				1035	/// \brief Compares two 32-bit float values in the low-order bits of both
				1036	/// operands to determine if the first operand is less than or equal to the
				1037	/// second operand and returns the result of the comparison.
				1038	///
				1039	/// \headerfile <x86intrin.h>
				1040	///
				1041	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1042	///
				1043	/// \param __a
				1044	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1045	/// used in the comparison.
				1046	/// \param __b
				1047	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1048	/// used in the comparison.
				1049	/// \returns An integer containing the comparison results.
				1050	static __inline__ int __DEFAULT_FN_ATTRS
				1051	_mm_comile_ss(__m128 __a, __m128 __b)
				1052	{
				1053	return __builtin_ia32_comile(__a, __b);
				1054	}
				1055
				1056	/// \brief Compares two 32-bit float values in the low-order bits of both
				1057	/// operands to determine if the first operand is greater than the second
				1058	/// operand and returns the result of the comparison.
				1059	///
				1060	/// \headerfile <x86intrin.h>
				1061	///
				1062	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1063	///
				1064	/// \param __a
				1065	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1066	/// used in the comparison.
				1067	/// \param __b
				1068	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1069	/// used in the comparison.
				1070	/// \returns An integer containing the comparison results.
				1071	static __inline__ int __DEFAULT_FN_ATTRS
				1072	_mm_comigt_ss(__m128 __a, __m128 __b)
				1073	{
				1074	return __builtin_ia32_comigt(__a, __b);
				1075	}
				1076
				1077	/// \brief Compares two 32-bit float values in the low-order bits of both
				1078	/// operands to determine if the first operand is greater than or equal to
				1079	/// the second operand and returns the result of the comparison.
				1080	///
				1081	/// \headerfile <x86intrin.h>
				1082	///
				1083	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1084	///
				1085	/// \param __a
				1086	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1087	/// used in the comparison.
				1088	/// \param __b
				1089	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1090	/// used in the comparison.
				1091	/// \returns An integer containing the comparison results.
				1092	static __inline__ int __DEFAULT_FN_ATTRS
				1093	_mm_comige_ss(__m128 __a, __m128 __b)
				1094	{
				1095	return __builtin_ia32_comige(__a, __b);
				1096	}
				1097
				1098	/// \brief Compares two 32-bit float values in the low-order bits of both
				1099	/// operands to determine if the first operand is not equal to the second
				1100	/// operand and returns the result of the comparison.
				1101	///
				1102	/// \headerfile <x86intrin.h>
				1103	///
				1104	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1105	///
				1106	/// \param __a
				1107	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1108	/// used in the comparison.
				1109	/// \param __b
				1110	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1111	/// used in the comparison.
				1112	/// \returns An integer containing the comparison results.
				1113	static __inline__ int __DEFAULT_FN_ATTRS
				1114	_mm_comineq_ss(__m128 __a, __m128 __b)
				1115	{
				1116	return __builtin_ia32_comineq(__a, __b);
				1117	}
				1118
				1119	/// \brief Performs an unordered comparison of two 32-bit float values using
				1120	/// the low-order bits of both operands to determine equality and returns
				1121	/// the result of the comparison.
				1122	///
				1123	/// \headerfile <x86intrin.h>
				1124	///
				1125	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1126	///
				1127	/// \param __a
				1128	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1129	/// used in the comparison.
				1130	/// \param __b
				1131	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1132	/// used in the comparison.
				1133	/// \returns An integer containing the comparison results.
				1134	static __inline__ int __DEFAULT_FN_ATTRS
				1135	_mm_ucomieq_ss(__m128 __a, __m128 __b)
				1136	{
				1137	return __builtin_ia32_ucomieq(__a, __b);
				1138	}
				1139
				1140	/// \brief Performs an unordered comparison of two 32-bit float values using
				1141	/// the low-order bits of both operands to determine if the first operand is
				1142	/// less than the second operand and returns the result of the comparison.
				1143	///
				1144	/// \headerfile <x86intrin.h>
				1145	///
				1146	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1147	///
				1148	/// \param __a
				1149	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1150	/// used in the comparison.
				1151	/// \param __b
				1152	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1153	/// used in the comparison.
				1154	/// \returns An integer containing the comparison results.
				1155	static __inline__ int __DEFAULT_FN_ATTRS
				1156	_mm_ucomilt_ss(__m128 __a, __m128 __b)
				1157	{
				1158	return __builtin_ia32_ucomilt(__a, __b);
				1159	}
				1160
				1161	/// \brief Performs an unordered comparison of two 32-bit float values using
				1162	/// the low-order bits of both operands to determine if the first operand
				1163	/// is less than or equal to the second operand and returns the result of
				1164	/// the comparison.
				1165	///
				1166	/// \headerfile <x86intrin.h>
				1167	///
				1168	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1169	///
				1170	/// \param __a
				1171	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1172	/// used in the comparison.
				1173	/// \param __b
				1174	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1175	/// used in the comparison.
				1176	/// \returns An integer containing the comparison results.
				1177	static __inline__ int __DEFAULT_FN_ATTRS
				1178	_mm_ucomile_ss(__m128 __a, __m128 __b)
				1179	{
				1180	return __builtin_ia32_ucomile(__a, __b);
				1181	}
				1182
				1183	/// \brief Performs an unordered comparison of two 32-bit float values using
				1184	/// the low-order bits of both operands to determine if the first operand
				1185	/// is greater than the second operand and returns the result of the
				1186	/// comparison.
				1187	///
				1188	/// \headerfile <x86intrin.h>
				1189	///
				1190	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1191	///
				1192	/// \param __a
				1193	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1194	/// used in the comparison.
				1195	/// \param __b
				1196	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1197	/// used in the comparison.
				1198	/// \returns An integer containing the comparison results.
				1199	static __inline__ int __DEFAULT_FN_ATTRS
				1200	_mm_ucomigt_ss(__m128 __a, __m128 __b)
				1201	{
				1202	return __builtin_ia32_ucomigt(__a, __b);
				1203	}
				1204
				1205	/// \brief Performs an unordered comparison of two 32-bit float values using
				1206	/// the low-order bits of both operands to determine if the first operand is
				1207	/// greater than or equal to the second operand and returns the result of
				1208	/// the comparison.
				1209	///
				1210	/// \headerfile <x86intrin.h>
				1211	///
				1212	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1213	///
				1214	/// \param __a
				1215	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1216	/// used in the comparison.
				1217	/// \param __b
				1218	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1219	/// used in the comparison.
				1220	/// \returns An integer containing the comparison results.
				1221	static __inline__ int __DEFAULT_FN_ATTRS
				1222	_mm_ucomige_ss(__m128 __a, __m128 __b)
				1223	{
				1224	return __builtin_ia32_ucomige(__a, __b);
				1225	}
				1226
				1227	/// \brief Performs an unordered comparison of two 32-bit float values using
				1228	/// the low-order bits of both operands to determine inequality and returns
				1229	/// the result of the comparison.
				1230	///
				1231	/// \headerfile <x86intrin.h>
				1232	///
				1233	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1234	///
				1235	/// \param __a
				1236	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1237	/// used in the comparison.
				1238	/// \param __b
				1239	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1240	/// used in the comparison.
				1241	/// \returns An integer containing the comparison results.
				1242	static __inline__ int __DEFAULT_FN_ATTRS
				1243	_mm_ucomineq_ss(__m128 __a, __m128 __b)
				1244	{
				1245	return __builtin_ia32_ucomineq(__a, __b);
				1246	}
				1247
				1248	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1249	/// [4 x float] into a 32-bit integer.
				1250	///
				1251	/// \headerfile <x86intrin.h>
				1252	///
				1253	/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
				1254	///
				1255	/// \param __a
				1256	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1257	/// used in the conversion.
				1258	/// \returns A 32-bit integer containing the converted value.
				1259	static __inline__ int __DEFAULT_FN_ATTRS
				1260	_mm_cvtss_si32(__m128 __a)
				1261	{
				1262	return __builtin_ia32_cvtss2si(__a);
				1263	}
				1264
				1265	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1266	/// [4 x float] into a 32-bit integer.
				1267	///
				1268	/// \headerfile <x86intrin.h>
				1269	///
				1270	/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
				1271	///
				1272	/// \param __a
				1273	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1274	/// used in the conversion.
				1275	/// \returns A 32-bit integer containing the converted value.
				1276	static __inline__ int __DEFAULT_FN_ATTRS
				1277	_mm_cvt_ss2si(__m128 __a)
				1278	{
				1279	return _mm_cvtss_si32(__a);
				1280	}
				1281
				1282	#ifdef __x86_64__
				1283
				1284	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1285	/// [4 x float] into a 64-bit integer.
				1286	///
				1287	/// \headerfile <x86intrin.h>
				1288	///
				1289	/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
				1290	///
				1291	/// \param __a
				1292	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1293	/// used in the conversion.
				1294	/// \returns A 64-bit integer containing the converted value.
				1295	static __inline__ long long __DEFAULT_FN_ATTRS
				1296	_mm_cvtss_si64(__m128 __a)
				1297	{
				1298	return __builtin_ia32_cvtss2si64(__a);
				1299	}
				1300
				1301	#endif
				1302
				1303	/// \brief Converts two low-order float values in a 128-bit vector of
				1304	/// [4 x float] into a 64-bit vector of [2 x i32].
				1305	///
				1306	/// \headerfile <x86intrin.h>
				1307	///
				1308	/// This intrinsic corresponds to the \c CVTPS2PI instruction.
				1309	///
				1310	/// \param __a
				1311	/// A 128-bit vector of [4 x float].
				1312	/// \returns A 64-bit integer vector containing the converted values.
				1313	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1314	_mm_cvtps_pi32(__m128 __a)
				1315	{
				1316	return (__m64)__builtin_ia32_cvtps2pi(__a);
				1317	}
				1318
				1319	/// \brief Converts two low-order float values in a 128-bit vector of
				1320	/// [4 x float] into a 64-bit vector of [2 x i32].
				1321	///
				1322	/// \headerfile <x86intrin.h>
				1323	///
				1324	/// This intrinsic corresponds to the \c CVTPS2PI instruction.
				1325	///
				1326	/// \param __a
				1327	/// A 128-bit vector of [4 x float].
				1328	/// \returns A 64-bit integer vector containing the converted values.
				1329	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1330	_mm_cvt_ps2pi(__m128 __a)
				1331	{
				1332	return _mm_cvtps_pi32(__a);
				1333	}
				1334
				1335	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1336	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1337	/// inexact.
				1338	///
				1339	/// \headerfile <x86intrin.h>
				1340	///
				1341	/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
				1342	///
				1343	/// \param __a
				1344	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1345	/// used in the conversion.
				1346	/// \returns A 32-bit integer containing the converted value.
				1347	static __inline__ int __DEFAULT_FN_ATTRS
				1348	_mm_cvttss_si32(__m128 __a)
				1349	{
				1350	return __a[0];
				1351	}
				1352
				1353	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1354	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1355	/// inexact.
				1356	///
				1357	/// \headerfile <x86intrin.h>
				1358	///
				1359	/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
				1360	///
				1361	/// \param __a
				1362	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1363	/// used in the conversion.
				1364	/// \returns A 32-bit integer containing the converted value.
				1365	static __inline__ int __DEFAULT_FN_ATTRS
				1366	_mm_cvtt_ss2si(__m128 __a)
				1367	{
				1368	return _mm_cvttss_si32(__a);
				1369	}
				1370
				1371	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1372	/// [4 x float] into a 64-bit integer, truncating the result when it is
				1373	/// inexact.
				1374	///
				1375	/// \headerfile <x86intrin.h>
				1376	///
				1377	/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
				1378	///
				1379	/// \param __a
				1380	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1381	/// used in the conversion.
				1382	/// \returns A 64-bit integer containing the converted value.
				1383	static __inline__ long long __DEFAULT_FN_ATTRS
				1384	_mm_cvttss_si64(__m128 __a)
				1385	{
				1386	return __a[0];
				1387	}
				1388
				1389	/// \brief Converts two low-order float values in a 128-bit vector of
				1390	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
				1391	/// when it is inexact.
				1392	///
				1393	/// \headerfile <x86intrin.h>
				1394	///
				1395	/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
				1396	///
				1397	/// \param __a
				1398	/// A 128-bit vector of [4 x float].
				1399	/// \returns A 64-bit integer vector containing the converted values.
				1400	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1401	_mm_cvttps_pi32(__m128 __a)
				1402	{
				1403	return (__m64)__builtin_ia32_cvttps2pi(__a);
				1404	}
				1405
				1406	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1407	_mm_cvtt_ps2pi(__m128 __a)
				1408	{
				1409	return _mm_cvttps_pi32(__a);
				1410	}
				1411
				1412	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1413	_mm_cvtsi32_ss(__m128 __a, int __b)
				1414	{
				1415	__a[0] = __b;
				1416	return __a;
				1417	}
				1418
				1419	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1420	_mm_cvt_si2ss(__m128 __a, int __b)
				1421	{
				1422	return _mm_cvtsi32_ss(__a, __b);
				1423	}
				1424
				1425	#ifdef __x86_64__
				1426
				1427	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1428	_mm_cvtsi64_ss(__m128 __a, long long __b)
				1429	{
				1430	__a[0] = __b;
				1431	return __a;
				1432	}
				1433
				1434	#endif
				1435
				1436	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1437	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
				1438	{
				1439	return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
				1440	}
				1441
				1442	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1443	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
				1444	{
				1445	return _mm_cvtpi32_ps(__a, __b);
				1446	}
				1447
				1448	static __inline__ float __DEFAULT_FN_ATTRS
				1449	_mm_cvtss_f32(__m128 __a)
				1450	{
				1451	return __a[0];
				1452	}
				1453
				1454	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1455	_mm_loadh_pi(__m128 __a, const __m64 *__p)
				1456	{
				1457	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
				1458	struct __mm_loadh_pi_struct {
				1459	__mm_loadh_pi_v2f32 __u;
				1460	} __attribute__((__packed__, __may_alias__));
				1461	__mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
				1462	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1463	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
				1464	}
				1465
				1466	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1467	_mm_loadl_pi(__m128 __a, const __m64 *__p)
				1468	{
				1469	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
				1470	struct __mm_loadl_pi_struct {
				1471	__mm_loadl_pi_v2f32 __u;
				1472	} __attribute__((__packed__, __may_alias__));
				1473	__mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
				1474	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1475	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
				1476	}
				1477
				1478	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1479	_mm_load_ss(const float *__p)
				1480	{
				1481	struct __mm_load_ss_struct {
				1482	float __u;
				1483	} __attribute__((__packed__, __may_alias__));
				1484	float __u = ((struct __mm_load_ss_struct*)__p)->__u;
				1485	return (__m128){ __u, 0, 0, 0 };
				1486	}
				1487
				1488	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1489	_mm_load1_ps(const float *__p)
				1490	{
				1491	struct __mm_load1_ps_struct {
				1492	float __u;
				1493	} __attribute__((__packed__, __may_alias__));
				1494	float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
				1495	return (__m128){ __u, __u, __u, __u };
				1496	}
				1497
				1498	#define _mm_load_ps1(p) _mm_load1_ps(p)
				1499
				1500	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1501	_mm_load_ps(const float *__p)
				1502	{
				1503	return (__m128)__p;
				1504	}
				1505
				1506	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1507	_mm_loadu_ps(const float *__p)
				1508	{
				1509	struct __loadu_ps {
				1510	__m128 __v;
				1511	} __attribute__((__packed__, __may_alias__));
				1512	return ((struct __loadu_ps*)__p)->__v;
				1513	}
				1514
				1515	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1516	_mm_loadr_ps(const float *__p)
				1517	{
				1518	__m128 __a = _mm_load_ps(__p);
				1519	return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
				1520	}
				1521
				1522	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1523	_mm_undefined_ps()
				1524	{
				1525	return (__m128)__builtin_ia32_undef128();
				1526	}
				1527
				1528	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1529	_mm_set_ss(float __w)
				1530	{
				1531	return (__m128){ __w, 0, 0, 0 };
				1532	}
				1533
				1534	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1535	_mm_set1_ps(float __w)
				1536	{
				1537	return (__m128){ __w, __w, __w, __w };
				1538	}
				1539
				1540	/* Microsoft specific. */
				1541	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1542	_mm_set_ps1(float __w)
				1543	{
				1544	return _mm_set1_ps(__w);
				1545	}
				1546
				1547	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1548	_mm_set_ps(float __z, float __y, float __x, float __w)
				1549	{
				1550	return (__m128){ __w, __x, __y, __z };
				1551	}
				1552
				1553	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1554	_mm_setr_ps(float __z, float __y, float __x, float __w)
				1555	{
				1556	return (__m128){ __z, __y, __x, __w };
				1557	}
				1558
				1559	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1560	_mm_setzero_ps(void)
				1561	{
				1562	return (__m128){ 0, 0, 0, 0 };
				1563	}
				1564
				1565	static __inline__ void __DEFAULT_FN_ATTRS
				1566	_mm_storeh_pi(__m64 *__p, __m128 __a)
				1567	{
				1568	__builtin_ia32_storehps((__v2si *)__p, __a);
				1569	}
				1570
				1571	static __inline__ void __DEFAULT_FN_ATTRS
				1572	_mm_storel_pi(__m64 *__p, __m128 __a)
				1573	{
				1574	__builtin_ia32_storelps((__v2si *)__p, __a);
				1575	}
				1576
				1577	static __inline__ void __DEFAULT_FN_ATTRS
				1578	_mm_store_ss(float *__p, __m128 __a)
				1579	{
				1580	struct __mm_store_ss_struct {
				1581	float __u;
				1582	} __attribute__((__packed__, __may_alias__));
				1583	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
				1584	}
				1585
				1586	static __inline__ void __DEFAULT_FN_ATTRS
				1587	_mm_storeu_ps(float *__p, __m128 __a)
				1588	{
				1589	__builtin_ia32_storeups(__p, __a);
				1590	}
				1591
				1592	static __inline__ void __DEFAULT_FN_ATTRS
				1593	_mm_store1_ps(float *__p, __m128 __a)
				1594	{
				1595	__a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
				1596	_mm_storeu_ps(__p, __a);
				1597	}
				1598
				1599	static __inline__ void __DEFAULT_FN_ATTRS
				1600	_mm_store_ps1(float *__p, __m128 __a)
				1601	{
				1602	return _mm_store1_ps(__p, __a);
				1603	}
				1604
				1605	static __inline__ void __DEFAULT_FN_ATTRS
				1606	_mm_store_ps(float *__p, __m128 __a)
				1607	{
				1608	(__m128 )__p = __a;
				1609	}
				1610
				1611	static __inline__ void __DEFAULT_FN_ATTRS
				1612	_mm_storer_ps(float *__p, __m128 __a)
				1613	{
				1614	__a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
				1615	_mm_store_ps(__p, __a);
				1616	}
				1617
				1618	#define _MM_HINT_T0 3
				1619	#define _MM_HINT_T1 2
				1620	#define _MM_HINT_T2 1
				1621	#define _MM_HINT_NTA 0
				1622
				1623	#ifndef _MSC_VER
				1624	/* FIXME: We have to #define this because "sel" must be a constant integer, and
				1625	Sema doesn't do any form of constant propagation yet. */
				1626
				1627	#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
				1628	#endif
				1629
				1630	static __inline__ void __DEFAULT_FN_ATTRS
				1631	_mm_stream_pi(__m64 *__p, __m64 __a)
				1632	{
				1633	__builtin_ia32_movntq(__p, __a);
				1634	}
				1635
				1636	static __inline__ void __DEFAULT_FN_ATTRS
				1637	_mm_stream_ps(float *__p, __m128 __a)
				1638	{
				1639	__builtin_ia32_movntps(__p, __a);
				1640	}
				1641
				1642	static __inline__ void __DEFAULT_FN_ATTRS
				1643	_mm_sfence(void)
				1644	{
				1645	__builtin_ia32_sfence();
				1646	}
				1647
				1648	static __inline__ int __DEFAULT_FN_ATTRS
				1649	_mm_extract_pi16(__m64 __a, int __n)
				1650	{
				1651	__v4hi __b = (__v4hi)__a;
				1652	return (unsigned short)__b[__n & 3];
				1653	}
				1654
				1655	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1656	_mm_insert_pi16(__m64 __a, int __d, int __n)
				1657	{
				1658	__v4hi __b = (__v4hi)__a;
				1659	__b[__n & 3] = __d;
				1660	return (__m64)__b;
				1661	}
				1662
				1663	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1664	_mm_max_pi16(__m64 __a, __m64 __b)
				1665	{
				1666	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
				1667	}
				1668
				1669	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1670	_mm_max_pu8(__m64 __a, __m64 __b)
				1671	{
				1672	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
				1673	}
				1674
				1675	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1676	_mm_min_pi16(__m64 __a, __m64 __b)
				1677	{
				1678	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
				1679	}
				1680
				1681	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1682	_mm_min_pu8(__m64 __a, __m64 __b)
				1683	{
				1684	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
				1685	}
				1686
				1687	static __inline__ int __DEFAULT_FN_ATTRS
				1688	_mm_movemask_pi8(__m64 __a)
				1689	{
				1690	return __builtin_ia32_pmovmskb((__v8qi)__a);
				1691	}
				1692
				1693	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1694	_mm_mulhi_pu16(__m64 __a, __m64 __b)
				1695	{
				1696	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
				1697	}
				1698
				1699	#define _mm_shuffle_pi16(a, n) __extension__ ({ \
				1700	(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
				1701
				1702	static __inline__ void __DEFAULT_FN_ATTRS
				1703	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
				1704	{
				1705	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
				1706	}
				1707
				1708	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1709	_mm_avg_pu8(__m64 __a, __m64 __b)
				1710	{
				1711	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
				1712	}
				1713
				1714	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1715	_mm_avg_pu16(__m64 __a, __m64 __b)
				1716	{
				1717	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
				1718	}
				1719
				1720	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1721	_mm_sad_pu8(__m64 __a, __m64 __b)
				1722	{
				1723	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
				1724	}
				1725
				1726	static __inline__ unsigned int __DEFAULT_FN_ATTRS
				1727	_mm_getcsr(void)
				1728	{
				1729	return __builtin_ia32_stmxcsr();
				1730	}
				1731
				1732	static __inline__ void __DEFAULT_FN_ATTRS
				1733	_mm_setcsr(unsigned int __i)
				1734	{
				1735	__builtin_ia32_ldmxcsr(__i);
				1736	}
				1737
				1738	#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
				1739	(__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
				1740	(mask) & 0x3, ((mask) & 0xc) >> 2, \
				1741	(((mask) & 0x30) >> 4) + 4, \
				1742	(((mask) & 0xc0) >> 6) + 4); })
				1743
				1744	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1745	_mm_unpackhi_ps(__m128 __a, __m128 __b)
				1746	{
				1747	return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
				1748	}
				1749
				1750	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1751	_mm_unpacklo_ps(__m128 __a, __m128 __b)
				1752	{
				1753	return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
				1754	}
				1755
				1756	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1757	_mm_move_ss(__m128 __a, __m128 __b)
				1758	{
				1759	return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
				1760	}
				1761
				1762	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1763	_mm_movehl_ps(__m128 __a, __m128 __b)
				1764	{
				1765	return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
				1766	}
				1767
				1768	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1769	_mm_movelh_ps(__m128 __a, __m128 __b)
				1770	{
				1771	return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
				1772	}
				1773
				1774	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1775	_mm_cvtpi16_ps(__m64 __a)
				1776	{
				1777	__m64 __b, __c;
				1778	__m128 __r;
				1779
				1780	__b = _mm_setzero_si64();
				1781	__b = _mm_cmpgt_pi16(__b, __a);
				1782	__c = _mm_unpackhi_pi16(__a, __b);
				1783	__r = _mm_setzero_ps();
				1784	__r = _mm_cvtpi32_ps(__r, __c);
				1785	__r = _mm_movelh_ps(__r, __r);
				1786	__c = _mm_unpacklo_pi16(__a, __b);
				1787	__r = _mm_cvtpi32_ps(__r, __c);
				1788
				1789	return __r;
				1790	}
				1791
				1792	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1793	_mm_cvtpu16_ps(__m64 __a)
				1794	{
				1795	__m64 __b, __c;
				1796	__m128 __r;
				1797
				1798	__b = _mm_setzero_si64();
				1799	__c = _mm_unpackhi_pi16(__a, __b);
				1800	__r = _mm_setzero_ps();
				1801	__r = _mm_cvtpi32_ps(__r, __c);
				1802	__r = _mm_movelh_ps(__r, __r);
				1803	__c = _mm_unpacklo_pi16(__a, __b);
				1804	__r = _mm_cvtpi32_ps(__r, __c);
				1805
				1806	return __r;
				1807	}
				1808
				1809	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1810	_mm_cvtpi8_ps(__m64 __a)
				1811	{
				1812	__m64 __b;
				1813
				1814	__b = _mm_setzero_si64();
				1815	__b = _mm_cmpgt_pi8(__b, __a);
				1816	__b = _mm_unpacklo_pi8(__a, __b);
				1817
				1818	return _mm_cvtpi16_ps(__b);
				1819	}
				1820
				1821	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1822	_mm_cvtpu8_ps(__m64 __a)
				1823	{
				1824	__m64 __b;
				1825
				1826	__b = _mm_setzero_si64();
				1827	__b = _mm_unpacklo_pi8(__a, __b);
				1828
				1829	return _mm_cvtpi16_ps(__b);
				1830	}
				1831
				1832	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1833	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
				1834	{
				1835	__m128 __c;
				1836
				1837	__c = _mm_setzero_ps();
				1838	__c = _mm_cvtpi32_ps(__c, __b);
				1839	__c = _mm_movelh_ps(__c, __c);
				1840
				1841	return _mm_cvtpi32_ps(__c, __a);
				1842	}
				1843
				1844	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1845	_mm_cvtps_pi16(__m128 __a)
				1846	{
				1847	__m64 __b, __c;
				1848
				1849	__b = _mm_cvtps_pi32(__a);
				1850	__a = _mm_movehl_ps(__a, __a);
				1851	__c = _mm_cvtps_pi32(__a);
				1852
				1853	return _mm_packs_pi32(__b, __c);
				1854	}
				1855
				1856	static __inline__ __m64 __DEFAULT_FN_ATTRS
				1857	_mm_cvtps_pi8(__m128 __a)
				1858	{
				1859	__m64 __b, __c;
				1860
				1861	__b = _mm_cvtps_pi16(__a);
				1862	__c = _mm_setzero_si64();
				1863
				1864	return _mm_packs_pi16(__b, __c);
				1865	}
				1866
				1867	static __inline__ int __DEFAULT_FN_ATTRS
				1868	_mm_movemask_ps(__m128 __a)
				1869	{
				1870	return __builtin_ia32_movmskps(__a);
				1871	}
				1872
				1873
				1874	#ifdef _MSC_VER
				1875	#define _MM_ALIGN16 __declspec(align(16))
				1876	#endif
				1877
				1878	#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) \| ((y) << 4) \| ((x) << 2) \| (w))
				1879
				1880	#define _MM_EXCEPT_INVALID (0x0001)
				1881	#define _MM_EXCEPT_DENORM (0x0002)
				1882	#define _MM_EXCEPT_DIV_ZERO (0x0004)
				1883	#define _MM_EXCEPT_OVERFLOW (0x0008)
				1884	#define _MM_EXCEPT_UNDERFLOW (0x0010)
				1885	#define _MM_EXCEPT_INEXACT (0x0020)
				1886	#define _MM_EXCEPT_MASK (0x003f)
				1887
				1888	#define _MM_MASK_INVALID (0x0080)
				1889	#define _MM_MASK_DENORM (0x0100)
				1890	#define _MM_MASK_DIV_ZERO (0x0200)
				1891	#define _MM_MASK_OVERFLOW (0x0400)
				1892	#define _MM_MASK_UNDERFLOW (0x0800)
				1893	#define _MM_MASK_INEXACT (0x1000)
				1894	#define _MM_MASK_MASK (0x1f80)
				1895
				1896	#define _MM_ROUND_NEAREST (0x0000)
				1897	#define _MM_ROUND_DOWN (0x2000)
				1898	#define _MM_ROUND_UP (0x4000)
				1899	#define _MM_ROUND_TOWARD_ZERO (0x6000)
				1900	#define _MM_ROUND_MASK (0x6000)
				1901
				1902	#define _MM_FLUSH_ZERO_MASK (0x8000)
				1903	#define _MM_FLUSH_ZERO_ON (0x8000)
				1904	#define _MM_FLUSH_ZERO_OFF (0x0000)
				1905
				1906	#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
				1907	#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
				1908	#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
				1909	#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
				1910
				1911	#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) \| (x)))
				1912	#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) \| (x)))
				1913	#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) \| (x)))
				1914	#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) \| (x)))
				1915
				1916	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
				1917	do { \
				1918	__m128 tmp3, tmp2, tmp1, tmp0; \
				1919	tmp0 = _mm_unpacklo_ps((row0), (row1)); \
				1920	tmp2 = _mm_unpacklo_ps((row2), (row3)); \
				1921	tmp1 = _mm_unpackhi_ps((row0), (row1)); \
				1922	tmp3 = _mm_unpackhi_ps((row2), (row3)); \
				1923	(row0) = _mm_movelh_ps(tmp0, tmp2); \
				1924	(row1) = _mm_movehl_ps(tmp2, tmp0); \
				1925	(row2) = _mm_movelh_ps(tmp1, tmp3); \
				1926	(row3) = _mm_movehl_ps(tmp3, tmp1); \
				1927	} while (0)
				1928
				1929	/* Aliases for compatibility. */
				1930	#define _m_pextrw _mm_extract_pi16
				1931	#define _m_pinsrw _mm_insert_pi16
				1932	#define _m_pmaxsw _mm_max_pi16
				1933	#define _m_pmaxub _mm_max_pu8
				1934	#define _m_pminsw _mm_min_pi16
				1935	#define _m_pminub _mm_min_pu8
				1936	#define _m_pmovmskb _mm_movemask_pi8
				1937	#define _m_pmulhuw _mm_mulhi_pu16
				1938	#define _m_pshufw _mm_shuffle_pi16
				1939	#define _m_maskmovq _mm_maskmove_si64
				1940	#define _m_pavgb _mm_avg_pu8
				1941	#define _m_pavgw _mm_avg_pu16
				1942	#define _m_psadbw _mm_sad_pu8
				1943	#define _m_ _mm_
				1944	#define _m_ _mm_
				1945
				1946	#undef __DEFAULT_FN_ATTRS
				1947
				1948	/* Ugly hack for backwards-compatibility (compatible with gcc) */
Ben Murdoch	61f157c	2016-09-16 13:49:30 +0100	[diff] [blame]	1949	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1950	#include <emmintrin.h>
				1951	#endif
				1952
				1953	#endif /* __XMMINTRIN_H */