Blame - linux-x86/lib64/clang/9.0.2/include/xmmintrin.h - platform/prebuilts/clang-tools

blob: ff21a570e9c21ce84c310a18b341dc7a5ea0e011 [file] [log] [blame]

Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
				24	#ifndef __XMMINTRIN_H
				25	#define __XMMINTRIN_H
				26
				27	#include <mmintrin.h>
				28
				29	typedef int __v4si __attribute__((__vector_size__(16)));
				30	typedef float __v4sf __attribute__((__vector_size__(16)));
/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* 16-byte vector of float elements, aligned on a 16-byte boundary. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same 16-byte float vector as __m128, but with an alignment of only 1 byte. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
				37
				38	/* This header should only be included in a hosted environment as it depends on
				39	* a standard library to provide allocation routines. */
				40	#if __STDC_HOSTED__
				41	#include <mm_malloc.h>
				42	#endif
				43
				44	/* Define the default attributes for the functions in this file. */
/* Attribute set targeting "sse" with a minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
/* Attribute set targeting "mmx,sse" with a minimum vector width of 64 bits. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"),        \
                 __min_vector_width__(64)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	47
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	48	/// Adds the 32-bit float values in the low-order bits of the operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	49	///
				50	/// \headerfile <x86intrin.h>
				51	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	52	/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	53	///
				54	/// \param __a
				55	/// A 128-bit vector of [4 x float] containing one of the source operands.
				56	/// The lower 32 bits of this operand are used in the calculation.
				57	/// \param __b
				58	/// A 128-bit vector of [4 x float] containing one of the source operands.
				59	/// The lower 32 bits of this operand are used in the calculation.
				60	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
				61	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
				62	/// the upper 96 bits of the first source operand.
				63	static __inline__ __m128 __DEFAULT_FN_ATTRS
				64	_mm_add_ss(__m128 __a, __m128 __b)
				65	{
				66	__a[0] += __b[0];
				67	return __a;
				68	}
				69
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	70	/// Adds two 128-bit vectors of [4 x float], and returns the results of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	71	/// the addition.
				72	///
				73	/// \headerfile <x86intrin.h>
				74	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	75	/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	76	///
				77	/// \param __a
				78	/// A 128-bit vector of [4 x float] containing one of the source operands.
				79	/// \param __b
				80	/// A 128-bit vector of [4 x float] containing one of the source operands.
				81	/// \returns A 128-bit vector of [4 x float] containing the sums of both
				82	/// operands.
				83	static __inline__ __m128 __DEFAULT_FN_ATTRS
				84	_mm_add_ps(__m128 __a, __m128 __b)
				85	{
				86	return (__m128)((__v4sf)__a + (__v4sf)__b);
				87	}
				88
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	89	/// Subtracts the 32-bit float value in the low-order bits of the second
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	90	/// operand from the corresponding value in the first operand.
				91	///
				92	/// \headerfile <x86intrin.h>
				93	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	94	/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	95	///
				96	/// \param __a
				97	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
				98	/// of this operand are used in the calculation.
				99	/// \param __b
				100	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
				101	/// bits of this operand are used in the calculation.
				102	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				103	/// difference of the lower 32 bits of both operands. The upper 96 bits are
				104	/// copied from the upper 96 bits of the first source operand.
				105	static __inline__ __m128 __DEFAULT_FN_ATTRS
				106	_mm_sub_ss(__m128 __a, __m128 __b)
				107	{
				108	__a[0] -= __b[0];
				109	return __a;
				110	}
				111
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	112	/// Subtracts each of the values of the second operand from the first
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	113	/// operand, both of which are 128-bit vectors of [4 x float] and returns
				114	/// the results of the subtraction.
				115	///
				116	/// \headerfile <x86intrin.h>
				117	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	118	/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	119	///
				120	/// \param __a
				121	/// A 128-bit vector of [4 x float] containing the minuend.
				122	/// \param __b
				123	/// A 128-bit vector of [4 x float] containing the subtrahend.
				124	/// \returns A 128-bit vector of [4 x float] containing the differences between
				125	/// both operands.
				126	static __inline__ __m128 __DEFAULT_FN_ATTRS
				127	_mm_sub_ps(__m128 __a, __m128 __b)
				128	{
				129	return (__m128)((__v4sf)__a - (__v4sf)__b);
				130	}
				131
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	132	/// Multiplies two 32-bit float values in the low-order bits of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	133	/// operands.
				134	///
				135	/// \headerfile <x86intrin.h>
				136	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	137	/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	138	///
				139	/// \param __a
				140	/// A 128-bit vector of [4 x float] containing one of the source operands.
				141	/// The lower 32 bits of this operand are used in the calculation.
				142	/// \param __b
				143	/// A 128-bit vector of [4 x float] containing one of the source operands.
				144	/// The lower 32 bits of this operand are used in the calculation.
				145	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
				146	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
				147	/// bits of the first source operand.
				148	static __inline__ __m128 __DEFAULT_FN_ATTRS
				149	_mm_mul_ss(__m128 __a, __m128 __b)
				150	{
				151	__a[0] *= __b[0];
				152	return __a;
				153	}
				154
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	155	/// Multiplies two 128-bit vectors of [4 x float] and returns the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	156	/// results of the multiplication.
				157	///
				158	/// \headerfile <x86intrin.h>
				159	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	160	/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	161	///
				162	/// \param __a
				163	/// A 128-bit vector of [4 x float] containing one of the source operands.
				164	/// \param __b
				165	/// A 128-bit vector of [4 x float] containing one of the source operands.
				166	/// \returns A 128-bit vector of [4 x float] containing the products of both
				167	/// operands.
				168	static __inline__ __m128 __DEFAULT_FN_ATTRS
				169	_mm_mul_ps(__m128 __a, __m128 __b)
				170	{
				171	return (__m128)((__v4sf)__a * (__v4sf)__b);
				172	}
				173
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	174	/// Divides the value in the low-order 32 bits of the first operand by
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	175	/// the corresponding value in the second operand.
				176	///
				177	/// \headerfile <x86intrin.h>
				178	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	179	/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	180	///
				181	/// \param __a
				182	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
				183	/// bits of this operand are used in the calculation.
				184	/// \param __b
				185	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
				186	/// of this operand are used in the calculation.
				187	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
				188	/// lower 32 bits of both operands. The upper 96 bits are copied from the
				189	/// upper 96 bits of the first source operand.
				190	static __inline__ __m128 __DEFAULT_FN_ATTRS
				191	_mm_div_ss(__m128 __a, __m128 __b)
				192	{
				193	__a[0] /= __b[0];
				194	return __a;
				195	}
				196
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	197	/// Divides two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	198	///
				199	/// \headerfile <x86intrin.h>
				200	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	201	/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	202	///
				203	/// \param __a
				204	/// A 128-bit vector of [4 x float] containing the dividend.
				205	/// \param __b
				206	/// A 128-bit vector of [4 x float] containing the divisor.
				207	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
				208	/// operands.
				209	static __inline__ __m128 __DEFAULT_FN_ATTRS
				210	_mm_div_ps(__m128 __a, __m128 __b)
				211	{
				212	return (__m128)((__v4sf)__a / (__v4sf)__b);
				213	}
				214
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	215	/// Calculates the square root of the value stored in the low-order bits
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	216	/// of a 128-bit vector of [4 x float].
				217	///
				218	/// \headerfile <x86intrin.h>
				219	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	220	/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	221	///
				222	/// \param __a
				223	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				224	/// used in the calculation.
				225	/// \returns A 128-bit vector of [4 x float] containing the square root of the
				226	/// value in the low-order bits of the operand.
				227	static __inline__ __m128 __DEFAULT_FN_ATTRS
				228	_mm_sqrt_ss(__m128 __a)
				229	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	230	return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	231	}
				232
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	233	/// Calculates the square roots of the values stored in a 128-bit vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	234	/// of [4 x float].
				235	///
				236	/// \headerfile <x86intrin.h>
				237	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	238	/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	239	///
				240	/// \param __a
				241	/// A 128-bit vector of [4 x float].
				242	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
				243	/// values in the operand.
				244	static __inline__ __m128 __DEFAULT_FN_ATTRS
				245	_mm_sqrt_ps(__m128 __a)
				246	{
				247	return __builtin_ia32_sqrtps((__v4sf)__a);
				248	}
				249
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	250	/// Calculates the approximate reciprocal of the value stored in the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	251	/// low-order bits of a 128-bit vector of [4 x float].
				252	///
				253	/// \headerfile <x86intrin.h>
				254	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	255	/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	256	///
				257	/// \param __a
				258	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				259	/// used in the calculation.
				260	/// \returns A 128-bit vector of [4 x float] containing the approximate
				261	/// reciprocal of the value in the low-order bits of the operand.
				262	static __inline__ __m128 __DEFAULT_FN_ATTRS
				263	_mm_rcp_ss(__m128 __a)
				264	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	265	return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	266	}
				267
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	268	/// Calculates the approximate reciprocals of the values stored in a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	269	/// 128-bit vector of [4 x float].
				270	///
				271	/// \headerfile <x86intrin.h>
				272	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	273	/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	274	///
				275	/// \param __a
				276	/// A 128-bit vector of [4 x float].
				277	/// \returns A 128-bit vector of [4 x float] containing the approximate
				278	/// reciprocals of the values in the operand.
				279	static __inline__ __m128 __DEFAULT_FN_ATTRS
				280	_mm_rcp_ps(__m128 __a)
				281	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	282	return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	283	}
				284
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	285	/// Calculates the approximate reciprocal of the square root of the value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	286	/// stored in the low-order bits of a 128-bit vector of [4 x float].
				287	///
				288	/// \headerfile <x86intrin.h>
				289	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	290	/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	291	///
				292	/// \param __a
				293	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				294	/// used in the calculation.
				295	/// \returns A 128-bit vector of [4 x float] containing the approximate
				296	/// reciprocal of the square root of the value in the low-order bits of the
				297	/// operand.
				298	static __inline__ __m128 __DEFAULT_FN_ATTRS
				299	_mm_rsqrt_ss(__m128 __a)
				300	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	301	return __builtin_ia32_rsqrtss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	302	}
				303
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	304	/// Calculates the approximate reciprocals of the square roots of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	305	/// values stored in a 128-bit vector of [4 x float].
				306	///
				307	/// \headerfile <x86intrin.h>
				308	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	309	/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	310	///
				311	/// \param __a
				312	/// A 128-bit vector of [4 x float].
				313	/// \returns A 128-bit vector of [4 x float] containing the approximate
				314	/// reciprocals of the square roots of the values in the operand.
				315	static __inline__ __m128 __DEFAULT_FN_ATTRS
				316	_mm_rsqrt_ps(__m128 __a)
				317	{
				318	return __builtin_ia32_rsqrtps((__v4sf)__a);
				319	}
				320
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	321	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	322	/// operands and returns the lesser value in the low-order bits of the
				323	/// vector of [4 x float].
				324	///
				325	/// \headerfile <x86intrin.h>
				326	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	327	/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	328	///
				329	/// \param __a
				330	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				331	/// 32 bits of this operand are used in the comparison.
				332	/// \param __b
				333	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				334	/// 32 bits of this operand are used in the comparison.
				335	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				336	/// minimum value between both operands. The upper 96 bits are copied from
				337	/// the upper 96 bits of the first source operand.
				338	static __inline__ __m128 __DEFAULT_FN_ATTRS
				339	_mm_min_ss(__m128 __a, __m128 __b)
				340	{
				341	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
				342	}
				343
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	344	/// Compares two 128-bit vectors of [4 x float] and returns the lesser
				345	/// of each pair of values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	346	///
				347	/// \headerfile <x86intrin.h>
				348	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	349	/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	350	///
				351	/// \param __a
				352	/// A 128-bit vector of [4 x float] containing one of the operands.
				353	/// \param __b
				354	/// A 128-bit vector of [4 x float] containing one of the operands.
				355	/// \returns A 128-bit vector of [4 x float] containing the minimum values
				356	/// between both operands.
				357	static __inline__ __m128 __DEFAULT_FN_ATTRS
				358	_mm_min_ps(__m128 __a, __m128 __b)
				359	{
				360	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
				361	}
				362
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	363	/// Compares two 32-bit float values in the low-order bits of both
				364	/// operands and returns the greater value in the low-order bits of a 128-bit
				365	/// vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	366	///
				367	/// \headerfile <x86intrin.h>
				368	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	369	/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	370	///
				371	/// \param __a
				372	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				373	/// 32 bits of this operand are used in the comparison.
				374	/// \param __b
				375	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				376	/// 32 bits of this operand are used in the comparison.
				377	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				378	/// maximum value between both operands. The upper 96 bits are copied from
				379	/// the upper 96 bits of the first source operand.
				380	static __inline__ __m128 __DEFAULT_FN_ATTRS
				381	_mm_max_ss(__m128 __a, __m128 __b)
				382	{
				383	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
				384	}
				385
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	386	/// Compares two 128-bit vectors of [4 x float] and returns the greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	387	/// of each pair of values.
				388	///
				389	/// \headerfile <x86intrin.h>
				390	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	391	/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	392	///
				393	/// \param __a
				394	/// A 128-bit vector of [4 x float] containing one of the operands.
				395	/// \param __b
				396	/// A 128-bit vector of [4 x float] containing one of the operands.
				397	/// \returns A 128-bit vector of [4 x float] containing the maximum values
				398	/// between both operands.
				399	static __inline__ __m128 __DEFAULT_FN_ATTRS
				400	_mm_max_ps(__m128 __a, __m128 __b)
				401	{
				402	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
				403	}
				404
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	405	/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	406	///
				407	/// \headerfile <x86intrin.h>
				408	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	409	/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	410	///
				411	/// \param __a
				412	/// A 128-bit vector containing one of the source operands.
				413	/// \param __b
				414	/// A 128-bit vector containing one of the source operands.
				415	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				416	/// values between both operands.
				417	static __inline__ __m128 __DEFAULT_FN_ATTRS
				418	_mm_and_ps(__m128 __a, __m128 __b)
				419	{
				420	return (__m128)((__v4su)__a & (__v4su)__b);
				421	}
				422
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	423	/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	424	/// the one's complement of the values contained in the first source
				425	/// operand.
				426	///
				427	/// \headerfile <x86intrin.h>
				428	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	429	/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	430	///
				431	/// \param __a
				432	/// A 128-bit vector of [4 x float] containing the first source operand. The
				433	/// one's complement of this value is used in the bitwise AND.
				434	/// \param __b
				435	/// A 128-bit vector of [4 x float] containing the second source operand.
				436	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				437	/// one's complement of the first operand and the values in the second
				438	/// operand.
				439	static __inline__ __m128 __DEFAULT_FN_ATTRS
				440	_mm_andnot_ps(__m128 __a, __m128 __b)
				441	{
				442	return (__m128)(~(__v4su)__a & (__v4su)__b);
				443	}
				444
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	445	/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	446	///
				447	/// \headerfile <x86intrin.h>
				448	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	449	/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	450	///
				451	/// \param __a
				452	/// A 128-bit vector of [4 x float] containing one of the source operands.
				453	/// \param __b
				454	/// A 128-bit vector of [4 x float] containing one of the source operands.
				455	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
				456	/// values between both operands.
				457	static __inline__ __m128 __DEFAULT_FN_ATTRS
				458	_mm_or_ps(__m128 __a, __m128 __b)
				459	{
				460	return (__m128)((__v4su)__a \| (__v4su)__b);
				461	}
				462
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	463	/// Performs a bitwise exclusive OR of two 128-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	464	/// [4 x float].
				465	///
				466	/// \headerfile <x86intrin.h>
				467	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	468	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	469	///
				470	/// \param __a
				471	/// A 128-bit vector of [4 x float] containing one of the source operands.
				472	/// \param __b
				473	/// A 128-bit vector of [4 x float] containing one of the source operands.
				474	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
				475	/// of the values between both operands.
				476	static __inline__ __m128 __DEFAULT_FN_ATTRS
				477	_mm_xor_ps(__m128 __a, __m128 __b)
				478	{
				479	return (__m128)((__v4su)__a ^ (__v4su)__b);
				480	}
				481
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	482	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	483	/// operands for equality and returns the result of the comparison in the
				484	/// low-order bits of a vector [4 x float].
				485	///
				486	/// \headerfile <x86intrin.h>
				487	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	488	/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	489	///
				490	/// \param __a
				491	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				492	/// 32 bits of this operand are used in the comparison.
				493	/// \param __b
				494	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				495	/// 32 bits of this operand are used in the comparison.
				496	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				497	/// in the low-order bits.
				498	static __inline__ __m128 __DEFAULT_FN_ATTRS
				499	_mm_cmpeq_ss(__m128 __a, __m128 __b)
				500	{
				501	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
				502	}
				503
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	504	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	505	/// 128-bit vectors of [4 x float] for equality.
				506	///
				507	/// \headerfile <x86intrin.h>
				508	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	509	/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	510	///
				511	/// \param __a
				512	/// A 128-bit vector of [4 x float].
				513	/// \param __b
				514	/// A 128-bit vector of [4 x float].
				515	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				516	static __inline__ __m128 __DEFAULT_FN_ATTRS
				517	_mm_cmpeq_ps(__m128 __a, __m128 __b)
				518	{
				519	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
				520	}
				521
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	522	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	523	/// operands to determine if the value in the first operand is less than the
				524	/// corresponding value in the second operand and returns the result of the
				525	/// comparison in the low-order bits of a vector of [4 x float].
				526	///
				527	/// \headerfile <x86intrin.h>
				528	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	529	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	530	///
				531	/// \param __a
				532	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				533	/// 32 bits of this operand are used in the comparison.
				534	/// \param __b
				535	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				536	/// 32 bits of this operand are used in the comparison.
				537	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				538	/// in the low-order bits.
				539	static __inline__ __m128 __DEFAULT_FN_ATTRS
				540	_mm_cmplt_ss(__m128 __a, __m128 __b)
				541	{
				542	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
				543	}
				544
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	545	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	546	/// 128-bit vectors of [4 x float] to determine if the values in the first
				547	/// operand are less than those in the second operand.
				548	///
				549	/// \headerfile <x86intrin.h>
				550	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	551	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	552	///
				553	/// \param __a
				554	/// A 128-bit vector of [4 x float].
				555	/// \param __b
				556	/// A 128-bit vector of [4 x float].
				557	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				558	static __inline__ __m128 __DEFAULT_FN_ATTRS
				559	_mm_cmplt_ps(__m128 __a, __m128 __b)
				560	{
				561	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
				562	}
				563
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	564	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	565	/// operands to determine if the value in the first operand is less than or
				566	/// equal to the corresponding value in the second operand and returns the
				567	/// result of the comparison in the low-order bits of a vector of
				568	/// [4 x float].
				569	///
				570	/// \headerfile <x86intrin.h>
				571	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	572	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	573	///
				574	/// \param __a
				575	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				576	/// 32 bits of this operand are used in the comparison.
				577	/// \param __b
				578	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				579	/// 32 bits of this operand are used in the comparison.
				580	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				581	/// in the low-order bits.
				582	static __inline__ __m128 __DEFAULT_FN_ATTRS
				583	_mm_cmple_ss(__m128 __a, __m128 __b)
				584	{
				585	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
				586	}
				587
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	588	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	589	/// 128-bit vectors of [4 x float] to determine if the values in the first
				590	/// operand are less than or equal to those in the second operand.
				591	///
				592	/// \headerfile <x86intrin.h>
				593	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	594	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	595	///
				596	/// \param __a
				597	/// A 128-bit vector of [4 x float].
				598	/// \param __b
				599	/// A 128-bit vector of [4 x float].
				600	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				601	static __inline__ __m128 __DEFAULT_FN_ATTRS
				602	_mm_cmple_ps(__m128 __a, __m128 __b)
				603	{
				604	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
				605	}
				606
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	607	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	608	/// operands to determine if the value in the first operand is greater than
				609	/// the corresponding value in the second operand and returns the result of
				610	/// the comparison in the low-order bits of a vector of [4 x float].
				611	///
				612	/// \headerfile <x86intrin.h>
				613	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	614	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	615	///
				616	/// \param __a
				617	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				618	/// 32 bits of this operand are used in the comparison.
				619	/// \param __b
				620	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				621	/// 32 bits of this operand are used in the comparison.
				622	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				623	/// in the low-order bits.
				624	static __inline__ __m128 __DEFAULT_FN_ATTRS
				625	_mm_cmpgt_ss(__m128 __a, __m128 __b)
				626	{
				627	return (__m128)__builtin_shufflevector((__v4sf)__a,
				628	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
				629	4, 1, 2, 3);
				630	}
				631
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	632	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	633	/// 128-bit vectors of [4 x float] to determine if the values in the first
				634	/// operand are greater than those in the second operand.
				635	///
				636	/// \headerfile <x86intrin.h>
				637	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	638	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	639	///
				640	/// \param __a
				641	/// A 128-bit vector of [4 x float].
				642	/// \param __b
				643	/// A 128-bit vector of [4 x float].
				644	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				645	static __inline__ __m128 __DEFAULT_FN_ATTRS
				646	_mm_cmpgt_ps(__m128 __a, __m128 __b)
				647	{
				648	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
				649	}
				650
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	651	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	652	/// operands to determine if the value in the first operand is greater than
				653	/// or equal to the corresponding value in the second operand and returns
				654	/// the result of the comparison in the low-order bits of a vector of
				655	/// [4 x float].
				656	///
				657	/// \headerfile <x86intrin.h>
				658	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	659	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	660	///
				661	/// \param __a
				662	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				663	/// 32 bits of this operand are used in the comparison.
				664	/// \param __b
				665	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				666	/// 32 bits of this operand are used in the comparison.
				667	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				668	/// in the low-order bits.
				669	static __inline__ __m128 __DEFAULT_FN_ATTRS
				670	_mm_cmpge_ss(__m128 __a, __m128 __b)
				671	{
				672	return (__m128)__builtin_shufflevector((__v4sf)__a,
				673	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
				674	4, 1, 2, 3);
				675	}
				676
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	677	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	678	/// 128-bit vectors of [4 x float] to determine if the values in the first
				679	/// operand are greater than or equal to those in the second operand.
				680	///
				681	/// \headerfile <x86intrin.h>
				682	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	683	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	684	///
				685	/// \param __a
				686	/// A 128-bit vector of [4 x float].
				687	/// \param __b
				688	/// A 128-bit vector of [4 x float].
				689	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				690	static __inline__ __m128 __DEFAULT_FN_ATTRS
				691	_mm_cmpge_ps(__m128 __a, __m128 __b)
				692	{
				693	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
				694	}
				695
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	696	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	697	/// operands for inequality and returns the result of the comparison in the
				698	/// low-order bits of a vector of [4 x float].
				699	///
				700	/// \headerfile <x86intrin.h>
				701	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	702	/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
				703	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	704	///
				705	/// \param __a
				706	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				707	/// 32 bits of this operand are used in the comparison.
				708	/// \param __b
				709	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				710	/// 32 bits of this operand are used in the comparison.
				711	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				712	/// in the low-order bits.
				713	static __inline__ __m128 __DEFAULT_FN_ATTRS
				714	_mm_cmpneq_ss(__m128 __a, __m128 __b)
				715	{
				716	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
				717	}
				718
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	719	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	720	/// 128-bit vectors of [4 x float] for inequality.
				721	///
				722	/// \headerfile <x86intrin.h>
				723	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	724	/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
				725	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	726	///
				727	/// \param __a
				728	/// A 128-bit vector of [4 x float].
				729	/// \param __b
				730	/// A 128-bit vector of [4 x float].
				731	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				732	static __inline__ __m128 __DEFAULT_FN_ATTRS
				733	_mm_cmpneq_ps(__m128 __a, __m128 __b)
				734	{
				735	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
				736	}
				737
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	738	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	739	/// operands to determine if the value in the first operand is not less than
				740	/// the corresponding value in the second operand and returns the result of
				741	/// the comparison in the low-order bits of a vector of [4 x float].
				742	///
				743	/// \headerfile <x86intrin.h>
				744	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	745	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
				746	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	747	///
				748	/// \param __a
				749	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				750	/// 32 bits of this operand are used in the comparison.
				751	/// \param __b
				752	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				753	/// 32 bits of this operand are used in the comparison.
				754	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				755	/// in the low-order bits.
				756	static __inline__ __m128 __DEFAULT_FN_ATTRS
				757	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
				758	{
				759	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
				760	}
				761
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	762	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	763	/// 128-bit vectors of [4 x float] to determine if the values in the first
				764	/// operand are not less than those in the second operand.
				765	///
				766	/// \headerfile <x86intrin.h>
				767	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	768	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
				769	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	770	///
				771	/// \param __a
				772	/// A 128-bit vector of [4 x float].
				773	/// \param __b
				774	/// A 128-bit vector of [4 x float].
				775	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				776	static __inline__ __m128 __DEFAULT_FN_ATTRS
				777	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
				778	{
				779	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
				780	}
				781
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	782	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	783	/// operands to determine if the value in the first operand is not less than
				784	/// or equal to the corresponding value in the second operand and returns
				785	/// the result of the comparison in the low-order bits of a vector of
				786	/// [4 x float].
				787	///
				788	/// \headerfile <x86intrin.h>
				789	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	790	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
				791	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	792	///
				793	/// \param __a
				794	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				795	/// 32 bits of this operand are used in the comparison.
				796	/// \param __b
				797	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				798	/// 32 bits of this operand are used in the comparison.
				799	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				800	/// in the low-order bits.
				801	static __inline__ __m128 __DEFAULT_FN_ATTRS
				802	_mm_cmpnle_ss(__m128 __a, __m128 __b)
				803	{
				804	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
				805	}
				806
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	807	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	808	/// 128-bit vectors of [4 x float] to determine if the values in the first
				809	/// operand are not less than or equal to those in the second operand.
				810	///
				811	/// \headerfile <x86intrin.h>
				812	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	813	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
				814	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	815	///
				816	/// \param __a
				817	/// A 128-bit vector of [4 x float].
				818	/// \param __b
				819	/// A 128-bit vector of [4 x float].
				820	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				821	static __inline__ __m128 __DEFAULT_FN_ATTRS
				822	_mm_cmpnle_ps(__m128 __a, __m128 __b)
				823	{
				824	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
				825	}
				826
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	827	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	828	/// operands to determine if the value in the first operand is not greater
				829	/// than the corresponding value in the second operand and returns the
				830	/// result of the comparison in the low-order bits of a vector of
				831	/// [4 x float].
				832	///
				833	/// \headerfile <x86intrin.h>
				834	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	835	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
				836	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	837	///
				838	/// \param __a
				839	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				840	/// 32 bits of this operand are used in the comparison.
				841	/// \param __b
				842	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				843	/// 32 bits of this operand are used in the comparison.
				844	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				845	/// in the low-order bits.
				846	static __inline__ __m128 __DEFAULT_FN_ATTRS
				847	_mm_cmpngt_ss(__m128 __a, __m128 __b)
				848	{
				849	return (__m128)__builtin_shufflevector((__v4sf)__a,
				850	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
				851	4, 1, 2, 3);
				852	}
				853
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	854	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	855	/// 128-bit vectors of [4 x float] to determine if the values in the first
				856	/// operand are not greater than those in the second operand.
				857	///
				858	/// \headerfile <x86intrin.h>
				859	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	860	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
				861	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	862	///
				863	/// \param __a
				864	/// A 128-bit vector of [4 x float].
				865	/// \param __b
				866	/// A 128-bit vector of [4 x float].
				867	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				868	static __inline__ __m128 __DEFAULT_FN_ATTRS
				869	_mm_cmpngt_ps(__m128 __a, __m128 __b)
				870	{
				871	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
				872	}
				873
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	874	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	875	/// operands to determine if the value in the first operand is not greater
				876	/// than or equal to the corresponding value in the second operand and
				877	/// returns the result of the comparison in the low-order bits of a vector
				878	/// of [4 x float].
				879	///
				880	/// \headerfile <x86intrin.h>
				881	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	882	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
				883	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	884	///
				885	/// \param __a
				886	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				887	/// 32 bits of this operand are used in the comparison.
				888	/// \param __b
				889	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				890	/// 32 bits of this operand are used in the comparison.
				891	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				892	/// in the low-order bits.
				893	static __inline__ __m128 __DEFAULT_FN_ATTRS
				894	_mm_cmpnge_ss(__m128 __a, __m128 __b)
				895	{
				896	return (__m128)__builtin_shufflevector((__v4sf)__a,
				897	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
				898	4, 1, 2, 3);
				899	}
				900
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	901	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	902	/// 128-bit vectors of [4 x float] to determine if the values in the first
				903	/// operand are not greater than or equal to those in the second operand.
				904	///
				905	/// \headerfile <x86intrin.h>
				906	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	907	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
				908	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	909	///
				910	/// \param __a
				911	/// A 128-bit vector of [4 x float].
				912	/// \param __b
				913	/// A 128-bit vector of [4 x float].
				914	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				915	static __inline__ __m128 __DEFAULT_FN_ATTRS
				916	_mm_cmpnge_ps(__m128 __a, __m128 __b)
				917	{
				918	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
				919	}
				920
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	921	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	922	/// operands to determine if the value in the first operand is ordered with
				923	/// respect to the corresponding value in the second operand and returns the
				924	/// result of the comparison in the low-order bits of a vector of
				925	/// [4 x float].
				926	///
				927	/// \headerfile <x86intrin.h>
				928	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	929	/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
				930	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	931	///
				932	/// \param __a
				933	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				934	/// 32 bits of this operand are used in the comparison.
				935	/// \param __b
				936	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				937	/// 32 bits of this operand are used in the comparison.
				938	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				939	/// in the low-order bits.
				940	static __inline__ __m128 __DEFAULT_FN_ATTRS
				941	_mm_cmpord_ss(__m128 __a, __m128 __b)
				942	{
				943	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
				944	}
				945
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	946	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	947	/// 128-bit vectors of [4 x float] to determine if the values in the first
				948	/// operand are ordered with respect to those in the second operand.
				949	///
				950	/// \headerfile <x86intrin.h>
				951	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	952	/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
				953	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	954	///
				955	/// \param __a
				956	/// A 128-bit vector of [4 x float].
				957	/// \param __b
				958	/// A 128-bit vector of [4 x float].
				959	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				960	static __inline__ __m128 __DEFAULT_FN_ATTRS
				961	_mm_cmpord_ps(__m128 __a, __m128 __b)
				962	{
				963	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
				964	}
				965
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	966	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	967	/// operands to determine if the value in the first operand is unordered
				968	/// with respect to the corresponding value in the second operand and
				969	/// returns the result of the comparison in the low-order bits of a vector
				970	/// of [4 x float].
				971	///
				972	/// \headerfile <x86intrin.h>
				973	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	974	/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
				975	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	976	///
				977	/// \param __a
				978	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				979	/// 32 bits of this operand are used in the comparison.
				980	/// \param __b
				981	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				982	/// 32 bits of this operand are used in the comparison.
				983	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				984	/// in the low-order bits.
				985	static __inline__ __m128 __DEFAULT_FN_ATTRS
				986	_mm_cmpunord_ss(__m128 __a, __m128 __b)
				987	{
				988	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
				989	}
				990
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	991	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	992	/// 128-bit vectors of [4 x float] to determine if the values in the first
				993	/// operand are unordered with respect to those in the second operand.
				994	///
				995	/// \headerfile <x86intrin.h>
				996	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	997	/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
				998	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	999	///
				1000	/// \param __a
				1001	/// A 128-bit vector of [4 x float].
				1002	/// \param __b
				1003	/// A 128-bit vector of [4 x float].
				1004	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				1005	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1006	_mm_cmpunord_ps(__m128 __a, __m128 __b)
				1007	{
				1008	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
				1009	}
				1010
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1011	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1012	/// operands for equality and returns the result of the comparison.
				1013	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1014	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1015	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1016	/// \headerfile <x86intrin.h>
				1017	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1018	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
				1019	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1020	///
				1021	/// \param __a
				1022	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1023	/// used in the comparison.
				1024	/// \param __b
				1025	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1026	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1027	/// \returns An integer containing the comparison results. If either of the
				1028	/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1029	static __inline__ int __DEFAULT_FN_ATTRS
				1030	_mm_comieq_ss(__m128 __a, __m128 __b)
				1031	{
				1032	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
				1033	}
				1034
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1035	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1036	/// operands to determine if the first operand is less than the second
				1037	/// operand and returns the result of the comparison.
				1038	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1039	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1040	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1041	/// \headerfile <x86intrin.h>
				1042	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1043	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
				1044	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1045	///
				1046	/// \param __a
				1047	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1048	/// used in the comparison.
				1049	/// \param __b
				1050	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1051	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1052	/// \returns An integer containing the comparison results. If either of the two
				1053	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1054	static __inline__ int __DEFAULT_FN_ATTRS
				1055	_mm_comilt_ss(__m128 __a, __m128 __b)
				1056	{
				1057	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
				1058	}
				1059
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1060	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1061	/// operands to determine if the first operand is less than or equal to the
				1062	/// second operand and returns the result of the comparison.
				1063	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1064	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1065	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1066	/// \headerfile <x86intrin.h>
				1067	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1068	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1069	///
				1070	/// \param __a
				1071	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1072	/// used in the comparison.
				1073	/// \param __b
				1074	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1075	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1076	/// \returns An integer containing the comparison results. If either of the two
				1077	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1078	static __inline__ int __DEFAULT_FN_ATTRS
				1079	_mm_comile_ss(__m128 __a, __m128 __b)
				1080	{
				1081	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
				1082	}
				1083
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1084	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1085	/// operands to determine if the first operand is greater than the second
				1086	/// operand and returns the result of the comparison.
				1087	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1088	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1089	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1090	/// \headerfile <x86intrin.h>
				1091	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1092	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1093	///
				1094	/// \param __a
				1095	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1096	/// used in the comparison.
				1097	/// \param __b
				1098	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1099	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1100	/// \returns An integer containing the comparison results. If either of the
				1101	/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1102	static __inline__ int __DEFAULT_FN_ATTRS
				1103	_mm_comigt_ss(__m128 __a, __m128 __b)
				1104	{
				1105	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
				1106	}
				1107
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1108	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1109	/// operands to determine if the first operand is greater than or equal to
				1110	/// the second operand and returns the result of the comparison.
				1111	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1112	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1113	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1114	/// \headerfile <x86intrin.h>
				1115	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1116	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1117	///
				1118	/// \param __a
				1119	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1120	/// used in the comparison.
				1121	/// \param __b
				1122	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1123	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1124	/// \returns An integer containing the comparison results. If either of the two
				1125	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1126	static __inline__ int __DEFAULT_FN_ATTRS
				1127	_mm_comige_ss(__m128 __a, __m128 __b)
				1128	{
				1129	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
				1130	}
				1131
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1132	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1133	/// operands to determine if the first operand is not equal to the second
				1134	/// operand and returns the result of the comparison.
				1135	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1136	/// If either of the two lower 32-bit values is NaN, 1 is returned.
				1137	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1138	/// \headerfile <x86intrin.h>
				1139	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1140	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1141	///
				1142	/// \param __a
				1143	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1144	/// used in the comparison.
				1145	/// \param __b
				1146	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1147	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1148	/// \returns An integer containing the comparison results. If either of the
				1149	/// two lower 32-bit values is NaN, 1 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1150	static __inline__ int __DEFAULT_FN_ATTRS
				1151	_mm_comineq_ss(__m128 __a, __m128 __b)
				1152	{
				1153	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
				1154	}
				1155
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1156	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1157	/// the low-order bits of both operands to determine equality and returns
				1158	/// the result of the comparison.
				1159	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1160	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1161	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1162	/// \headerfile <x86intrin.h>
				1163	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1164	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1165	///
				1166	/// \param __a
				1167	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1168	/// used in the comparison.
				1169	/// \param __b
				1170	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1171	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1172	/// \returns An integer containing the comparison results. If either of the two
				1173	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1174	static __inline__ int __DEFAULT_FN_ATTRS
				1175	_mm_ucomieq_ss(__m128 __a, __m128 __b)
				1176	{
				1177	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
				1178	}
				1179
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1180	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1181	/// the low-order bits of both operands to determine if the first operand is
				1182	/// less than the second operand and returns the result of the comparison.
				1183	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1184	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1185	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1186	/// \headerfile <x86intrin.h>
				1187	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1188	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1189	///
				1190	/// \param __a
				1191	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1192	/// used in the comparison.
				1193	/// \param __b
				1194	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1195	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1196	/// \returns An integer containing the comparison results. If either of the two
				1197	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1198	static __inline__ int __DEFAULT_FN_ATTRS
				1199	_mm_ucomilt_ss(__m128 __a, __m128 __b)
				1200	{
				1201	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
				1202	}
				1203
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1204	/// Performs an unordered comparison of two 32-bit float values using
				1205	/// the low-order bits of both operands to determine if the first operand is
				1206	/// less than or equal to the second operand and returns the result of the
				1207	/// comparison.
				1208	///
				1209	/// If either of the two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1210	///
				1211	/// \headerfile <x86intrin.h>
				1212	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1213	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1214	///
				1215	/// \param __a
				1216	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1217	/// used in the comparison.
				1218	/// \param __b
				1219	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1220	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1221	/// \returns An integer containing the comparison results. If either of the two
				1222	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1223	static __inline__ int __DEFAULT_FN_ATTRS
				1224	_mm_ucomile_ss(__m128 __a, __m128 __b)
				1225	{
				1226	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
				1227	}
				1228
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1229	/// Performs an unordered comparison of two 32-bit float values using
				1230	/// the low-order bits of both operands to determine if the first operand is
				1231	/// greater than the second operand and returns the result of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1232	/// comparison.
				1233	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1234	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1235	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1236	/// \headerfile <x86intrin.h>
				1237	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1238	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1239	///
				1240	/// \param __a
				1241	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1242	/// used in the comparison.
				1243	/// \param __b
				1244	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1245	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1246	/// \returns An integer containing the comparison results. If either of the two
				1247	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1248	static __inline__ int __DEFAULT_FN_ATTRS
				1249	_mm_ucomigt_ss(__m128 __a, __m128 __b)
				1250	{
				1251	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
				1252	}
				1253
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1254	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1255	/// the low-order bits of both operands to determine if the first operand is
				1256	/// greater than or equal to the second operand and returns the result of
				1257	/// the comparison.
				1258	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1259	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1260	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1261	/// \headerfile <x86intrin.h>
				1262	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1263	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1264	///
				1265	/// \param __a
				1266	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1267	/// used in the comparison.
				1268	/// \param __b
				1269	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1270	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1271	/// \returns An integer containing the comparison results. If either of the two
				1272	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1273	static __inline__ int __DEFAULT_FN_ATTRS
				1274	_mm_ucomige_ss(__m128 __a, __m128 __b)
				1275	{
				1276	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
				1277	}
				1278
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1279	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1280	/// the low-order bits of both operands to determine inequality and returns
				1281	/// the result of the comparison.
				1282	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1283	/// If either of the two lower 32-bit values is NaN, 1 is returned.
				1284	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1285	/// \headerfile <x86intrin.h>
				1286	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1287	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1288	///
				1289	/// \param __a
				1290	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1291	/// used in the comparison.
				1292	/// \param __b
				1293	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1294	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1295	/// \returns An integer containing the comparison results. If either of the two
				1296	/// lower 32-bit values is NaN, 1 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1297	static __inline__ int __DEFAULT_FN_ATTRS
				1298	_mm_ucomineq_ss(__m128 __a, __m128 __b)
				1299	{
				1300	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
				1301	}
				1302
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1303	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1304	/// [4 x float] into a 32-bit integer.
				1305	///
				1306	/// \headerfile <x86intrin.h>
				1307	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1308	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1309	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1310	///
				1311	/// \param __a
				1312	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1313	/// used in the conversion.
				1314	/// \returns A 32-bit integer containing the converted value.
				1315	static __inline__ int __DEFAULT_FN_ATTRS
				1316	_mm_cvtss_si32(__m128 __a)
				1317	{
				1318	return __builtin_ia32_cvtss2si((__v4sf)__a);
				1319	}
				1320
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1321	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1322	/// [4 x float] into a 32-bit integer.
				1323	///
				1324	/// \headerfile <x86intrin.h>
				1325	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1326	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1327	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1328	///
				1329	/// \param __a
				1330	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1331	/// used in the conversion.
				1332	/// \returns A 32-bit integer containing the converted value.
				1333	static __inline__ int __DEFAULT_FN_ATTRS
				1334	_mm_cvt_ss2si(__m128 __a)
				1335	{
				1336	return _mm_cvtss_si32(__a);
				1337	}
				1338
				1339	#ifdef __x86_64__
				1340
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1341	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1342	/// [4 x float] into a 64-bit integer.
				1343	///
				1344	/// \headerfile <x86intrin.h>
				1345	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1346	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1347	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1348	///
				1349	/// \param __a
				1350	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1351	/// used in the conversion.
				1352	/// \returns A 64-bit integer containing the converted value.
				1353	static __inline__ long long __DEFAULT_FN_ATTRS
				1354	_mm_cvtss_si64(__m128 __a)
				1355	{
				1356	return __builtin_ia32_cvtss2si64((__v4sf)__a);
				1357	}
				1358
				1359	#endif
				1360
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1361	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1362	/// [4 x float] into a 64-bit vector of [2 x i32].
				1363	///
				1364	/// \headerfile <x86intrin.h>
				1365	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1366	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1367	///
				1368	/// \param __a
				1369	/// A 128-bit vector of [4 x float].
				1370	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1371	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1372	_mm_cvtps_pi32(__m128 __a)
				1373	{
				1374	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
				1375	}
				1376
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1377	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1378	/// [4 x float] into a 64-bit vector of [2 x i32].
				1379	///
				1380	/// \headerfile <x86intrin.h>
				1381	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1382	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1383	///
				1384	/// \param __a
				1385	/// A 128-bit vector of [4 x float].
				1386	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1387	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1388	_mm_cvt_ps2pi(__m128 __a)
				1389	{
				1390	return _mm_cvtps_pi32(__a);
				1391	}
				1392
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1393	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1394	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1395	/// inexact.
				1396	///
				1397	/// \headerfile <x86intrin.h>
				1398	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1399	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1400	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1401	///
				1402	/// \param __a
				1403	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1404	/// used in the conversion.
				1405	/// \returns A 32-bit integer containing the converted value.
				1406	static __inline__ int __DEFAULT_FN_ATTRS
				1407	_mm_cvttss_si32(__m128 __a)
				1408	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1409	return __builtin_ia32_cvttss2si((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1410	}
				1411
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1412	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1413	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1414	/// inexact.
				1415	///
				1416	/// \headerfile <x86intrin.h>
				1417	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1418	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1419	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1420	///
				1421	/// \param __a
				1422	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1423	/// used in the conversion.
				1424	/// \returns A 32-bit integer containing the converted value.
				1425	static __inline__ int __DEFAULT_FN_ATTRS
				1426	_mm_cvtt_ss2si(__m128 __a)
				1427	{
				1428	return _mm_cvttss_si32(__a);
				1429	}
				1430
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1431	#ifdef __x86_64__
				1432	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1433	/// [4 x float] into a 64-bit integer, truncating the result when it is
				1434	/// inexact.
				1435	///
				1436	/// \headerfile <x86intrin.h>
				1437	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1438	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1439	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1440	///
				1441	/// \param __a
				1442	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1443	/// used in the conversion.
				1444	/// \returns A 64-bit integer containing the converted value.
				1445	static __inline__ long long __DEFAULT_FN_ATTRS
				1446	_mm_cvttss_si64(__m128 __a)
				1447	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1448	return __builtin_ia32_cvttss2si64((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1449	}
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1450	#endif
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1451
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1452	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1453	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
				1454	/// when it is inexact.
				1455	///
				1456	/// \headerfile <x86intrin.h>
				1457	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1458	/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
				1459	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1460	///
				1461	/// \param __a
				1462	/// A 128-bit vector of [4 x float].
				1463	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1464	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1465	_mm_cvttps_pi32(__m128 __a)
				1466	{
				1467	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
				1468	}
				1469
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1470	/// Converts two low-order float values in a 128-bit vector of [4 x
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1471	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
				1472	/// is inexact.
				1473	///
				1474	/// \headerfile <x86intrin.h>
				1475	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1476	/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1477	///
				1478	/// \param __a
				1479	/// A 128-bit vector of [4 x float].
				1480	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1481	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1482	_mm_cvtt_ps2pi(__m128 __a)
				1483	{
				1484	return _mm_cvttps_pi32(__a);
				1485	}
				1486
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1487	/// Converts a 32-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1488	/// and writes it to the lower 32 bits of the destination. The remaining
				1489	/// higher order elements of the destination vector are copied from the
				1490	/// corresponding elements in the first operand.
				1491	///
				1492	/// \headerfile <x86intrin.h>
				1493	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1494	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1495	///
				1496	/// \param __a
				1497	/// A 128-bit vector of [4 x float].
				1498	/// \param __b
				1499	/// A 32-bit signed integer operand containing the value to be converted.
				1500	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1501	/// converted value of the second operand. The upper 96 bits are copied from
				1502	/// the upper 96 bits of the first operand.
				1503	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1504	_mm_cvtsi32_ss(__m128 __a, int __b)
				1505	{
				1506	__a[0] = __b;
				1507	return __a;
				1508	}
				1509
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1510	/// Converts a 32-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1511	/// and writes it to the lower 32 bits of the destination. The remaining
				1512	/// higher order elements of the destination are copied from the
				1513	/// corresponding elements in the first operand.
				1514	///
				1515	/// \headerfile <x86intrin.h>
				1516	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1517	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1518	///
				1519	/// \param __a
				1520	/// A 128-bit vector of [4 x float].
				1521	/// \param __b
				1522	/// A 32-bit signed integer operand containing the value to be converted.
				1523	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1524	/// converted value of the second operand. The upper 96 bits are copied from
				1525	/// the upper 96 bits of the first operand.
				1526	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1527	_mm_cvt_si2ss(__m128 __a, int __b)
				1528	{
				1529	return _mm_cvtsi32_ss(__a, __b);
				1530	}
				1531
				1532	#ifdef __x86_64__
				1533
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1534	/// Converts a 64-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1535	/// and writes it to the lower 32 bits of the destination. The remaining
				1536	/// higher order elements of the destination are copied from the
				1537	/// corresponding elements in the first operand.
				1538	///
				1539	/// \headerfile <x86intrin.h>
				1540	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1541	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1542	///
				1543	/// \param __a
				1544	/// A 128-bit vector of [4 x float].
				1545	/// \param __b
				1546	/// A 64-bit signed integer operand containing the value to be converted.
				1547	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1548	/// converted value of the second operand. The upper 96 bits are copied from
				1549	/// the upper 96 bits of the first operand.
				1550	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1551	_mm_cvtsi64_ss(__m128 __a, long long __b)
				1552	{
				1553	__a[0] = __b;
				1554	return __a;
				1555	}
				1556
				1557	#endif
				1558
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1559	/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1560	/// floating point values and writes them to the lower 64-bits of the
				1561	/// destination. The remaining higher order elements of the destination are
				1562	/// copied from the corresponding elements in the first operand.
				1563	///
				1564	/// \headerfile <x86intrin.h>
				1565	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1566	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1567	///
				1568	/// \param __a
				1569	/// A 128-bit vector of [4 x float].
				1570	/// \param __b
				1571	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1572	/// and written to the corresponding low-order elements in the destination.
				1573	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1574	/// converted value of the second operand. The upper 64 bits are copied from
				1575	/// the upper 64 bits of the first operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1576	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1577	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
				1578	{
				1579	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
				1580	}
				1581
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1582	/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1583	/// floating point values and writes them to the lower 64-bits of the
				1584	/// destination. The remaining higher order elements of the destination are
				1585	/// copied from the corresponding elements in the first operand.
				1586	///
				1587	/// \headerfile <x86intrin.h>
				1588	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1589	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1590	///
				1591	/// \param __a
				1592	/// A 128-bit vector of [4 x float].
				1593	/// \param __b
				1594	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1595	/// and written to the corresponding low-order elements in the destination.
				1596	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1597	/// converted value from the second operand. The upper 64 bits are copied
				1598	/// from the upper 64 bits of the first operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1599	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1600	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
				1601	{
				1602	return _mm_cvtpi32_ps(__a, __b);
				1603	}
				1604
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1605	/// Extracts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1606	/// [4 x float].
				1607	///
				1608	/// \headerfile <x86intrin.h>
				1609	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1610	/// This intrinsic has no corresponding instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1611	///
				1612	/// \param __a
				1613	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1614	/// used in the extraction.
				1615	/// \returns A 32-bit float containing the extracted value.
				1616	static __inline__ float __DEFAULT_FN_ATTRS
				1617	_mm_cvtss_f32(__m128 __a)
				1618	{
				1619	return __a[0];
				1620	}
				1621
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1622	/// Loads two packed float values from the address \a __p into the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1623	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
				1624	/// are copied from the low-order bits of the first operand.
				1625	///
				1626	/// \headerfile <x86intrin.h>
				1627	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1628	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1629	///
				1630	/// \param __a
				1631	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
				1632	/// of the destination.
				1633	/// \param __p
				1634	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1635	/// [127:64] of the destination.
				1636	/// \returns A 128-bit vector of [4 x float] containing the moved values.
				1637	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1638	_mm_loadh_pi(__m128 __a, const __m64 *__p)
				1639	{
				1640	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
				1641	struct __mm_loadh_pi_struct {
				1642	__mm_loadh_pi_v2f32 __u;
				1643	} __attribute__((__packed__, __may_alias__));
				1644	__mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
				1645	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1646	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
				1647	}
				1648
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1649	/// Loads two packed float values from the address \a __p into the
				1650	/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
				1651	/// are copied from the high-order bits of the first operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1652	///
				1653	/// \headerfile <x86intrin.h>
				1654	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1655	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1656	///
				1657	/// \param __a
				1658	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
				1659	/// [127:64] of the destination.
				1660	/// \param __p
				1661	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1662	/// [63:0] of the destination.
				1663	/// \returns A 128-bit vector of [4 x float] containing the moved values.
				1664	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1665	_mm_loadl_pi(__m128 __a, const __m64 *__p)
				1666	{
				1667	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
				1668	struct __mm_loadl_pi_struct {
				1669	__mm_loadl_pi_v2f32 __u;
				1670	} __attribute__((__packed__, __may_alias__));
				1671	__mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
				1672	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1673	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
				1674	}
				1675
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1676	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1677	/// 32 bits of the vector are initialized with the single-precision
				1678	/// floating-point value loaded from a specified memory location. The upper
				1679	/// 96 bits are set to zero.
				1680	///
				1681	/// \headerfile <x86intrin.h>
				1682	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1683	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1684	///
				1685	/// \param __p
				1686	/// A pointer to a 32-bit memory location containing a single-precision
				1687	/// floating-point value.
				1688	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1689	/// lower 32 bits contain the value loaded from the memory location. The
				1690	/// upper 96 bits are set to zero.
				1691	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1692	_mm_load_ss(const float *__p)
				1693	{
				1694	struct __mm_load_ss_struct {
				1695	float __u;
				1696	} __attribute__((__packed__, __may_alias__));
				1697	float __u = ((struct __mm_load_ss_struct*)__p)->__u;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1698	return __extension__ (__m128){ __u, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1699	}
				1700
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1701	/// Loads a 32-bit float value and duplicates it to all four vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1702	/// elements of a 128-bit vector of [4 x float].
				1703	///
				1704	/// \headerfile <x86intrin.h>
				1705	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1706	/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1707	/// instruction.
				1708	///
				1709	/// \param __p
				1710	/// A pointer to a float value to be loaded and duplicated.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1711	/// \returns A 128-bit vector of [4 x float] containing the loaded and
				1712	/// duplicated values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1713	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1714	_mm_load1_ps(const float *__p)
				1715	{
				1716	struct __mm_load1_ps_struct {
				1717	float __u;
				1718	} __attribute__((__packed__, __may_alias__));
				1719	float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1720	return __extension__ (__m128){ __u, __u, __u, __u };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1721	}
				1722
				1723	#define _mm_load_ps1(p) _mm_load1_ps(p)
				1724
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1725	/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1726	/// memory location.
				1727	///
				1728	/// \headerfile <x86intrin.h>
				1729	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1730	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1731	///
				1732	/// \param __p
				1733	/// A pointer to a 128-bit memory location. The address of the memory
				1734	/// location has to be 128-bit aligned.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1735	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1736	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1737	_mm_load_ps(const float *__p)
				1738	{
				1739	return (__m128)__p;
				1740	}
				1741
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1742	/// Loads a 128-bit floating-point vector of [4 x float] from an
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1743	/// unaligned memory location.
				1744	///
				1745	/// \headerfile <x86intrin.h>
				1746	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1747	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1748	///
				1749	/// \param __p
				1750	/// A pointer to a 128-bit memory location. The address of the memory
				1751	/// location does not have to be aligned.
				1752	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
				1753	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1754	_mm_loadu_ps(const float *__p)
				1755	{
				1756	struct __loadu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	1757	__m128_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1758	} __attribute__((__packed__, __may_alias__));
				1759	return ((struct __loadu_ps*)__p)->__v;
				1760	}
				1761
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1762	/// Loads four packed float values, in reverse order, from an aligned
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1763	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
				1764	///
				1765	/// \headerfile <x86intrin.h>
				1766	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1767	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1768	/// instruction.
				1769	///
				1770	/// \param __p
				1771	/// A pointer to a 128-bit memory location. The address of the memory
				1772	/// location has to be 128-bit aligned.
				1773	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
				1774	/// in reverse order.
				1775	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1776	_mm_loadr_ps(const float *__p)
				1777	{
				1778	__m128 __a = _mm_load_ps(__p);
				1779	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
				1780	}
				1781
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1782	/// Create a 128-bit vector of [4 x float] with undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1783	///
				1784	/// \headerfile <x86intrin.h>
				1785	///
				1786	/// This intrinsic has no corresponding instruction.
				1787	///
				1788	/// \returns A 128-bit vector of [4 x float] containing undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1789	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1790	_mm_undefined_ps(void)
				1791	{
				1792	return (__m128)__builtin_ia32_undef128();
				1793	}
				1794
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1795	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1796	/// 32 bits of the vector are initialized with the specified single-precision
				1797	/// floating-point value. The upper 96 bits are set to zero.
				1798	///
				1799	/// \headerfile <x86intrin.h>
				1800	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1801	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1802	///
				1803	/// \param __w
				1804	/// A single-precision floating-point value used to initialize the lower 32
				1805	/// bits of the result.
				1806	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1807	/// lower 32 bits contain the value provided in the source operand. The
				1808	/// upper 96 bits are set to zero.
				1809	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1810	_mm_set_ss(float __w)
				1811	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1812	return __extension__ (__m128){ __w, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1813	}
				1814
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1815	/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1816	/// of the four single-precision floating-point vector elements set to the
				1817	/// specified single-precision floating-point value.
				1818	///
				1819	/// \headerfile <x86intrin.h>
				1820	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1821	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1822	///
				1823	/// \param __w
				1824	/// A single-precision floating-point value used to initialize each vector
				1825	/// element of the result.
				1826	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1827	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1828	_mm_set1_ps(float __w)
				1829	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1830	return __extension__ (__m128){ __w, __w, __w, __w };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1831	}
				1832
				1833	/* Microsoft specific. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1834	/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1835	/// of the four single-precision floating-point vector elements set to the
				1836	/// specified single-precision floating-point value.
				1837	///
				1838	/// \headerfile <x86intrin.h>
				1839	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1840	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1841	///
				1842	/// \param __w
				1843	/// A single-precision floating-point value used to initialize each vector
				1844	/// element of the result.
				1845	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1846	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1847	_mm_set_ps1(float __w)
				1848	{
				1849	return _mm_set1_ps(__w);
				1850	}
				1851
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1852	/// Constructs a 128-bit floating-point vector of [4 x float]
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1853	/// initialized with the specified single-precision floating-point values.
				1854	///
				1855	/// \headerfile <x86intrin.h>
				1856	///
				1857	/// This intrinsic is a utility function and does not correspond to a specific
				1858	/// instruction.
				1859	///
				1860	/// \param __z
				1861	/// A single-precision floating-point value used to initialize bits [127:96]
				1862	/// of the result.
				1863	/// \param __y
				1864	/// A single-precision floating-point value used to initialize bits [95:64]
				1865	/// of the result.
				1866	/// \param __x
				1867	/// A single-precision floating-point value used to initialize bits [63:32]
				1868	/// of the result.
				1869	/// \param __w
				1870	/// A single-precision floating-point value used to initialize bits [31:0]
				1871	/// of the result.
				1872	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1873	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1874	_mm_set_ps(float __z, float __y, float __x, float __w)
				1875	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1876	return __extension__ (__m128){ __w, __x, __y, __z };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1877	}
				1878
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1879	/// Constructs a 128-bit floating-point vector of [4 x float],
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1880	/// initialized in reverse order with the specified 32-bit single-precision
				1881	/// float-point values.
				1882	///
				1883	/// \headerfile <x86intrin.h>
				1884	///
				1885	/// This intrinsic is a utility function and does not correspond to a specific
				1886	/// instruction.
				1887	///
				1888	/// \param __z
				1889	/// A single-precision floating-point value used to initialize bits [31:0]
				1890	/// of the result.
				1891	/// \param __y
				1892	/// A single-precision floating-point value used to initialize bits [63:32]
				1893	/// of the result.
				1894	/// \param __x
				1895	/// A single-precision floating-point value used to initialize bits [95:64]
				1896	/// of the result.
				1897	/// \param __w
				1898	/// A single-precision floating-point value used to initialize bits [127:96]
				1899	/// of the result.
				1900	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1901	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1902	_mm_setr_ps(float __z, float __y, float __x, float __w)
				1903	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1904	return __extension__ (__m128){ __z, __y, __x, __w };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1905	}
				1906
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1907	/// Constructs a 128-bit floating-point vector of [4 x float] initialized
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1908	/// to zero.
				1909	///
				1910	/// \headerfile <x86intrin.h>
				1911	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1912	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1913	///
				1914	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
				1915	/// all elements set to zero.
				1916	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1917	_mm_setzero_ps(void)
				1918	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1919	return __extension__ (__m128){ 0, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1920	}
				1921
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1922	/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1923	/// memory location.
				1924	///
				1925	/// \headerfile <x86intrin.h>
				1926	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1927	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1928	///
				1929	/// \param __p
				1930	/// A pointer to a 64-bit memory location.
				1931	/// \param __a
				1932	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1933	static __inline__ void __DEFAULT_FN_ATTRS
				1934	_mm_storeh_pi(__m64 *__p, __m128 __a)
				1935	{
				1936	__builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
				1937	}
				1938
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1939	/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1940	/// memory location.
				1941	///
				1942	/// \headerfile <x86intrin.h>
				1943	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1944	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1945	///
				1946	/// \param __p
				1947	/// A pointer to a memory location that will receive the float values.
				1948	/// \param __a
				1949	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1950	static __inline__ void __DEFAULT_FN_ATTRS
				1951	_mm_storel_pi(__m64 *__p, __m128 __a)
				1952	{
				1953	__builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
				1954	}
				1955
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1956	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1957	/// memory location.
				1958	///
				1959	/// \headerfile <x86intrin.h>
				1960	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1961	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1962	///
				1963	/// \param __p
				1964	/// A pointer to a 32-bit memory location.
				1965	/// \param __a
				1966	/// A 128-bit vector of [4 x float] containing the value to be stored.
				1967	static __inline__ void __DEFAULT_FN_ATTRS
				1968	_mm_store_ss(float *__p, __m128 __a)
				1969	{
				1970	struct __mm_store_ss_struct {
				1971	float __u;
				1972	} __attribute__((__packed__, __may_alias__));
				1973	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
				1974	}
				1975
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1976	/// Stores a 128-bit vector of [4 x float] to an unaligned memory
				1977	/// location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1978	///
				1979	/// \headerfile <x86intrin.h>
				1980	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1981	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1982	///
				1983	/// \param __p
				1984	/// A pointer to a 128-bit memory location. The address of the memory
				1985	/// location does not have to be aligned.
				1986	/// \param __a
				1987	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1988	static __inline__ void __DEFAULT_FN_ATTRS
				1989	_mm_storeu_ps(float *__p, __m128 __a)
				1990	{
				1991	struct __storeu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	1992	__m128_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1993	} __attribute__((__packed__, __may_alias__));
				1994	((struct __storeu_ps*)__p)->__v = __a;
				1995	}
				1996
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1997	/// Stores a 128-bit vector of [4 x float] into an aligned memory
				1998	/// location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1999	///
				2000	/// \headerfile <x86intrin.h>
				2001	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2002	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2003	///
				2004	/// \param __p
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2005	/// A pointer to a 128-bit memory location. The address of the memory
				2006	/// location has to be 16-byte aligned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2007	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2008	/// A 128-bit vector of [4 x float] containing the values to be stored.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2009	static __inline__ void __DEFAULT_FN_ATTRS
				2010	_mm_store_ps(float *__p, __m128 __a)
				2011	{
				2012	(__m128)__p = __a;
				2013	}
				2014
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2015	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2016	/// four contiguous elements in an aligned memory location.
				2017	///
				2018	/// \headerfile <x86intrin.h>
				2019	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2020	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2021	/// instruction.
				2022	///
				2023	/// \param __p
				2024	/// A pointer to a 128-bit memory location.
				2025	/// \param __a
				2026	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2027	/// of the four contiguous elements pointed by \a __p.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2028	static __inline__ void __DEFAULT_FN_ATTRS
				2029	_mm_store1_ps(float *__p, __m128 __a)
				2030	{
				2031	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
				2032	_mm_store_ps(__p, __a);
				2033	}
				2034
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2035	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
				2036	/// four contiguous elements in an aligned memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2037	///
				2038	/// \headerfile <x86intrin.h>
				2039	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2040	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
				2041	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2042	///
				2043	/// \param __p
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2044	/// A pointer to a 128-bit memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2045	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2046	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
				2047	/// of the four contiguous elements pointed by \a __p.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2048	static __inline__ void __DEFAULT_FN_ATTRS
				2049	_mm_store_ps1(float *__p, __m128 __a)
				2050	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2051	_mm_store1_ps(__p, __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2052	}
				2053
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2054	/// Stores float values from a 128-bit vector of [4 x float] to an
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2055	/// aligned memory location in reverse order.
				2056	///
				2057	/// \headerfile <x86intrin.h>
				2058	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2059	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2060	/// instruction.
				2061	///
				2062	/// \param __p
				2063	/// A pointer to a 128-bit memory location. The address of the memory
				2064	/// location has to be 128-bit aligned.
				2065	/// \param __a
				2066	/// A 128-bit vector of [4 x float] containing the values to be stored.
				2067	static __inline__ void __DEFAULT_FN_ATTRS
				2068	_mm_storer_ps(float *__p, __m128 __a)
				2069	{
				2070	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
				2071	_mm_store_ps(__p, __a);
				2072	}
				2073
/* Prefetch hint selectors for _mm_prefetch. That macro passes bit 2 of the
   selector as the second argument of __builtin_prefetch and the low two bits
   as the third argument. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
				2080
				2081	#ifndef _MSC_VER
				2082	/* FIXME: We have to #define this because "sel" must be a constant integer, and
				2083	Sema doesn't do any form of constant propagation yet. */
				2084
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2085	/// Loads one cache line of data from the specified address to a location
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2086	/// closer to the processor.
				2087	///
				2088	/// \headerfile <x86intrin.h>
				2089	///
				2090	/// \code
				2091	/// void _mm_prefetch(const void * a, const int sel);
				2092	/// \endcode
				2093	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2094	/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2095	///
				2096	/// \param a
				2097	/// A pointer to a memory location containing a cache line of data.
				2098	/// \param sel
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2099	/// A predefined integer constant specifying the type of prefetch
				2100	/// operation: \n
				2101	/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
				2102	/// PREFETCHNTA instruction will be generated. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2103	/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2104	/// be generated. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2105	/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2106	/// be generated. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2107	/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
				2108	/// be generated.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2109	#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
				2110	((sel) >> 2) & 1, (sel) & 0x3))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2111	#endif
				2112
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2113	/// Stores a 64-bit integer in the specified aligned memory location. To
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2114	/// minimize caching, the data is flagged as non-temporal (unlikely to be
				2115	/// used again soon).
				2116	///
				2117	/// \headerfile <x86intrin.h>
				2118	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2119	/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2120	///
				2121	/// \param __p
				2122	/// A pointer to an aligned memory location used to store the register value.
				2123	/// \param __a
				2124	/// A 64-bit integer containing the value to be stored.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2125	static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2126	_mm_stream_pi(__m64 *__p, __m64 __a)
				2127	{
				2128	__builtin_ia32_movntq(__p, __a);
				2129	}
				2130
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2131	/// Moves packed float values from a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2132	/// 128-bit aligned memory location. To minimize caching, the data is flagged
				2133	/// as non-temporal (unlikely to be used again soon).
				2134	///
				2135	/// \headerfile <x86intrin.h>
				2136	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2137	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2138	///
				2139	/// \param __p
				2140	/// A pointer to a 128-bit aligned memory location that will receive the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2141	/// single-precision floating-point values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2142	/// \param __a
				2143	/// A 128-bit vector of [4 x float] containing the values to be moved.
				2144	static __inline__ void __DEFAULT_FN_ATTRS
				2145	_mm_stream_ps(float *__p, __m128 __a)
				2146	{
				2147	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
				2148	}
				2149
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// Only the prototype appears here; it is given C linkage when this header
///    is compiled as C++.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
				2168
				2169	/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2170	/// returns it, as specified by the immediate integer operand.
				2171	///
				2172	/// \headerfile <x86intrin.h>
				2173	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2174	/// \code
				2175	/// int _mm_extract_pi16(__m64 a, int n);
				2176	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2177	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2178	/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
				2179	///
				2180	/// \param a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2181	/// A 64-bit vector of [4 x i16].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2182	/// \param n
				2183	/// An immediate integer operand that determines which bits are extracted: \n
				2184	/// 0: Bits [15:0] are copied to the destination. \n
				2185	/// 1: Bits [31:16] are copied to the destination. \n
				2186	/// 2: Bits [47:32] are copied to the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2187	/// 3: Bits [63:48] are copied to the destination.
				2188	/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2189	#define _mm_extract_pi16(a, n) \
				2190	(int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2191
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the lower 16 bits of \a d: \n
///    0: Bits [15:0] of the destination are written. \n
///    1: Bits [31:16] of the destination are written. \n
///    2: Bits [47:32] of the destination are written. \n
///    3: Bits [63:48] of the destination are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Each argument and the whole expansion are parenthesized so that compound
   argument expressions and surrounding operators cannot re-associate with
   the casts. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2222
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2223	/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2224	/// the 64-bit integer vectors, and writes the greater value to the
				2225	/// corresponding bits in the destination.
				2226	///
				2227	/// \headerfile <x86intrin.h>
				2228	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2229	/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2230	///
				2231	/// \param __a
				2232	/// A 64-bit integer vector containing one of the source operands.
				2233	/// \param __b
				2234	/// A 64-bit integer vector containing one of the source operands.
				2235	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2236	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2237	_mm_max_pi16(__m64 __a, __m64 __b)
				2238	{
				2239	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); // operands retyped as four 16-bit lanes; PMAXSW compares them as signed
				2240	}
				2241
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2242	/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2243	/// values of the 64-bit integer vectors, and writes the greater value to the
				2244	/// corresponding bits in the destination.
				2245	///
				2246	/// \headerfile <x86intrin.h>
				2247	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2248	/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2249	///
				2250	/// \param __a
				2251	/// A 64-bit integer vector containing one of the source operands.
				2252	/// \param __b
				2253	/// A 64-bit integer vector containing one of the source operands.
				2254	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2255	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2256	_mm_max_pu8(__m64 __a, __m64 __b)
				2257	{
				2258	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); // operands retyped as eight 8-bit lanes; PMAXUB compares them as unsigned
				2259	}
				2260
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2261	/// Compares each of the corresponding packed 16-bit signed integer values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2262	/// the 64-bit integer vectors, and writes the lesser value to the
				2263	/// corresponding bits in the destination.
				2264	///
				2265	/// \headerfile <x86intrin.h>
				2266	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2267	/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2268	///
				2269	/// \param __a
				2270	/// A 64-bit integer vector containing one of the source operands.
				2271	/// \param __b
				2272	/// A 64-bit integer vector containing one of the source operands.
				2273	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2274	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2275	_mm_min_pi16(__m64 __a, __m64 __b)
				2276	{
				2277	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); // operands retyped as four 16-bit lanes; PMINSW compares them as signed
				2278	}
				2279
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2280	/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2281	/// values of the 64-bit integer vectors, and writes the lesser value to the
				2282	/// corresponding bits in the destination.
				2283	///
				2284	/// \headerfile <x86intrin.h>
				2285	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2286	/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2287	///
				2288	/// \param __a
				2289	/// A 64-bit integer vector containing one of the source operands.
				2290	/// \param __b
				2291	/// A 64-bit integer vector containing one of the source operands.
				2292	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2293	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2294	_mm_min_pu8(__m64 __a, __m64 __b)
				2295	{
				2296	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); // operands retyped as eight 8-bit lanes; PMINUB compares them as unsigned
				2297	}
				2298
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2299	/// Takes the most significant bit from each 8-bit element in a 64-bit
				2300	/// integer vector to create an 8-bit mask value. Zero-extends the value to
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2301	/// 32-bit integer and writes it to the destination.
				2302	///
				2303	/// \headerfile <x86intrin.h>
				2304	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2305	/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2306	///
				2307	/// \param __a
				2308	/// A 64-bit integer vector containing the values with bits to be extracted.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2309	/// \returns The most significant bit from each 8-bit element in \a __a,
				2310	/// written to bits [7:0].
				2311	static __inline__ int __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2312	_mm_movemask_pi8(__m64 __a)
				2313	{
				2314	return __builtin_ia32_pmovmskb((__v8qi)__a); // one mask bit per byte lane (that lane's MSB), packed into bits [7:0]
				2315	}
				2316
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2317	/// Multiplies packed 16-bit unsigned integer values and writes the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2318	/// high-order 16 bits of each 32-bit product to the corresponding bits in
				2319	/// the destination.
				2320	///
				2321	/// \headerfile <x86intrin.h>
				2322	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2323	/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2324	///
				2325	/// \param __a
				2326	/// A 64-bit integer vector containing one of the source operands.
				2327	/// \param __b
				2328	/// A 64-bit integer vector containing one of the source operands.
				2329	/// \returns A 64-bit integer vector containing the high-order 16 bits of each product.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2330	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2331	_mm_mulhi_pu16(__m64 __a, __m64 __b)
				2332	{
				2333	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); // keeps only the upper 16 bits of each 32-bit unsigned product (PMULHUW)
				2334	}
				2335
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2336	/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2337	/// destination, as specified by the immediate value operand.
				2338	///
				2339	/// \headerfile <x86intrin.h>
				2340	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2341	/// \code
				2342	/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
				2343	/// \endcode
				2344	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2345	/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
				2346	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2347	/// \param a
				2348	/// A 64-bit integer vector containing the values to be shuffled.
				2349	/// \param n
				2350	/// An immediate value containing an 8-bit value specifying which elements to
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2351	/// copy from \a a. The destinations within the 64-bit destination are
				2352	/// assigned values as follows: \n
				2353	/// Bits [1:0] are used to assign values to bits [15:0] in the
				2354	/// destination. \n
				2355	/// Bits [3:2] are used to assign values to bits [31:16] in the
				2356	/// destination. \n
				2357	/// Bits [5:4] are used to assign values to bits [47:32] in the
				2358	/// destination. \n
				2359	/// Bits [7:6] are used to assign values to bits [63:48] in the
				2360	/// destination. \n
				2361	/// Bit value assignments: \n
				2362	/// 00: assigned from bits [15:0] of \a a. \n
				2363	/// 01: assigned from bits [31:16] of \a a. \n
				2364	/// 10: assigned from bits [47:32] of \a a. \n
				2365	/// 11: assigned from bits [63:48] of \a a.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2366	/// \returns A 64-bit integer vector containing the shuffled values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2367	#define _mm_shuffle_pi16(a, n) \
				2368	((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2369
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2370	/// Conditionally copies the values from each 8-bit element in the first
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2371	/// 64-bit integer vector operand to the specified memory location, as
				2372	/// specified by the most significant bit in the corresponding element in the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2373	/// second 64-bit integer vector operand.
				2374	///
				2375	/// To minimize caching, the data is flagged as non-temporal
				2376	/// (unlikely to be used again soon).
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2377	///
				2378	/// \headerfile <x86intrin.h>
				2379	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2380	/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2381	///
				2382	/// \param __d
				2383	/// A 64-bit integer vector containing the values with elements to be copied.
				2384	/// \param __n
				2385	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2386	/// element determines whether the corresponding element in operand \a __d
				2387	/// is copied. If the most significant bit of a given element is 1, the
				2388	/// corresponding element in operand \a __d is copied.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2389	/// \param __p
				2390	/// A pointer to a 64-bit memory location that will receive the conditionally
				2391	/// copied integer values. The address of the memory location does not have
				2392	/// to be aligned.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2393	static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2394	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
				2395	{
				2396	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); // stores only the bytes of __d whose matching byte in __n has its MSB set
				2397	}
				2398
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2399	/// Computes the rounded averages of the packed unsigned 8-bit integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2400	/// values and writes the averages to the corresponding bits in the
				2401	/// destination.
				2402	///
				2403	/// \headerfile <x86intrin.h>
				2404	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2405	/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2406	///
				2407	/// \param __a
				2408	/// A 64-bit integer vector containing one of the source operands.
				2409	/// \param __b
				2410	/// A 64-bit integer vector containing one of the source operands.
				2411	/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2412	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2413	_mm_avg_pu8(__m64 __a, __m64 __b)
				2414	{
				2415	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); // rounded average per 8-bit unsigned lane (PAVGB)
				2416	}
				2417
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2418	/// Computes the rounded averages of the packed unsigned 16-bit integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2419	/// values and writes the averages to the corresponding bits in the
				2420	/// destination.
				2421	///
				2422	/// \headerfile <x86intrin.h>
				2423	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2424	/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2425	///
				2426	/// \param __a
				2427	/// A 64-bit integer vector containing one of the source operands.
				2428	/// \param __b
				2429	/// A 64-bit integer vector containing one of the source operands.
				2430	/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2431	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2432	_mm_avg_pu16(__m64 __a, __m64 __b)
				2433	{
				2434	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); // rounded average per 16-bit unsigned lane (PAVGW)
				2435	}
				2436
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2437	/// Subtracts the corresponding 8-bit unsigned integer values of the two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2438	/// 64-bit vector operands and computes the absolute value of each
				2439	/// difference. The sum of the 8 absolute differences is then written to
				2440	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
				2441	///
				2442	/// \headerfile <x86intrin.h>
				2443	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2444	/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2445	///
				2446	/// \param __a
				2447	/// A 64-bit integer vector containing one of the source operands.
				2448	/// \param __b
				2449	/// A 64-bit integer vector containing one of the source operands.
				2450	/// \returns A 64-bit integer vector whose lower 16 bits contain the sum of the
				2451	/// absolute differences between both operands. The upper bits are
				2452	/// cleared.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2453	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2454	_mm_sad_pu8(__m64 __a, __m64 __b)
				2455	{
				2456	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); // sum lands in bits [15:0] of the result; bits [63:16] are zero
				2457	}
				2458
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2459	#if defined(__cplusplus)
				2460	extern "C" {
				2461	#endif
				2462
				2463	/// Returns the contents of the MXCSR register as a 32-bit unsigned
				2464	/// integer value.
				2465	///
				2466	/// There are several groups of macros associated with this
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2467	/// intrinsic, including:
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2468	/// <ul>
				2469	/// <li>
				2470	/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2471	/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
				2472	/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
				2473	/// _MM_GET_EXCEPTION_STATE().
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2474	/// </li>
				2475	/// <li>
				2476	/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2477	/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
				2478	/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2479	/// </li>
				2480	/// <li>
				2481	/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2482	/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2483	/// _MM_GET_ROUNDING_MODE().
				2484	/// </li>
				2485	/// <li>
				2486	/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2487	/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2488	/// </li>
				2489	/// <li>
				2490	/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2491	/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
				2492	/// _MM_GET_DENORMALS_ZERO_MODE().
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2493	/// </li>
				2494	/// </ul>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2495	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2496	/// For example, the following expression checks if an overflow exception has
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2497	/// occurred:
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2498	/// \code
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2499	/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2500	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2501	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2502	/// The following expression gets the current rounding mode:
				2503	/// \code
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2504	/// _MM_GET_ROUNDING_MODE()
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2505	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2506	///
				2507	/// \headerfile <x86intrin.h>
				2508	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2509	/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2510	///
				2511	/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
				2512	/// register.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2513	unsigned int _mm_getcsr(void);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2514
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2515	/// Sets the MXCSR register with the 32-bit unsigned integer value.
				2516	///
				2517	/// There are several groups of macros associated with this intrinsic,
				2518	/// including:
				2519	/// <ul>
				2520	/// <li>
				2521	/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2522	/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
				2523	/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
				2524	/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2525	/// </li>
				2526	/// <li>
				2527	/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2528	/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
				2529	/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
				2530	/// of these macros.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2531	/// </li>
				2532	/// <li>
				2533	/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2534	/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
				2535	/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2536	/// </li>
				2537	/// <li>
				2538	/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2539	/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
				2540	/// one of these macros.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2541	/// </li>
				2542	/// <li>
				2543	/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2544	/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
				2545	/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2546	/// </li>
				2547	/// </ul>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2548	///
				2549	/// For example, the following expression causes subsequent floating-point
				2550	/// operations to round up:
				2551	/// _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) \| _MM_ROUND_UP)
				2552	///
				2553	/// The following example sets the DAZ and FTZ flags:
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2554	/// \code
				2555	/// void setFlags() {
				2556	/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
				2557	/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
				2558	/// }
				2559	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2560	///
				2561	/// \headerfile <x86intrin.h>
				2562	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2563	/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2564	///
				2565	/// \param __i
				2566	/// A 32-bit unsigned integer value to be written to the MXCSR register.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2567	void _mm_setcsr(unsigned int __i);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2568
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2569	#if defined(__cplusplus)
				2570	} // extern "C"
				2571	#endif
				2572
				2573	/// Selects 4 float values from the 128-bit operands of [4 x float], as
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2574	/// specified by the immediate value operand.
				2575	///
				2576	/// \headerfile <x86intrin.h>
				2577	///
				2578	/// \code
				2579	/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
				2580	/// \endcode
				2581	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2582	/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2583	///
				2584	/// \param a
				2585	/// A 128-bit vector of [4 x float].
				2586	/// \param b
				2587	/// A 128-bit vector of [4 x float].
				2588	/// \param mask
				2589	/// An immediate value containing an 8-bit value specifying which elements to
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2590	/// copy from \a a and \a b. \n
				2591	/// Bits [3:0] specify the values copied from operand \a a. \n
				2592	/// Bits [7:4] specify the values copied from operand \a b. \n
				2593	/// The destinations within the 128-bit destination are assigned values as
				2594	/// follows: \n
				2595	/// Bits [1:0] are used to assign values to bits [31:0] in the
				2596	/// destination. \n
				2597	/// Bits [3:2] are used to assign values to bits [63:32] in the
				2598	/// destination. \n
				2599	/// Bits [5:4] are used to assign values to bits [95:64] in the
				2600	/// destination. \n
				2601	/// Bits [7:6] are used to assign values to bits [127:96] in the
				2602	/// destination. \n
				2603	/// Bit value assignments: \n
				2604	/// 00: Bits [31:0] copied from the specified operand. \n
				2605	/// 01: Bits [63:32] copied from the specified operand. \n
				2606	/// 10: Bits [95:64] copied from the specified operand. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2607	/// 11: Bits [127:96] copied from the specified operand.
				2608	/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2609	#define _mm_shuffle_ps(a, b, mask) \
				2610	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
				2611	(int)(mask)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2612
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2613	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
				2614	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2615	///
				2616	/// \headerfile <x86intrin.h>
				2617	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2618	/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2619	///
				2620	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2621	/// A 128-bit vector of [4 x float]. \n
				2622	/// Bits [95:64] are written to bits [31:0] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2623	/// Bits [127:96] are written to bits [95:64] of the destination.
				2624	/// \param __b
				2625	/// A 128-bit vector of [4 x float]. \n
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2626	/// Bits [95:64] are written to bits [63:32] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2627	/// Bits [127:96] are written to bits [127:96] of the destination.
				2628	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
				2629	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2630	_mm_unpackhi_ps(__m128 __a, __m128 __b)
				2631	{
				2632	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); // indices 0-3 select lanes of __a, 4-7 lanes of __b: result is {a2, b2, a3, b3}
				2633	}
				2634
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2635	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
				2636	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2637	///
				2638	/// \headerfile <x86intrin.h>
				2639	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2640	/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2641	///
				2642	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2643	/// A 128-bit vector of [4 x float]. \n
				2644	/// Bits [31:0] are written to bits [31:0] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2645	/// Bits [63:32] are written to bits [95:64] of the destination.
				2646	/// \param __b
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2647	/// A 128-bit vector of [4 x float]. \n
				2648	/// Bits [31:0] are written to bits [63:32] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2649	/// Bits [63:32] are written to bits [127:96] of the destination.
				2650	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
				2651	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2652	_mm_unpacklo_ps(__m128 __a, __m128 __b)
				2653	{
				2654	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); // indices 0-3 select lanes of __a, 4-7 lanes of __b: result is {a0, b0, a1, b1}
				2655	}
				2656
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2657	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2658	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
				2659	/// 96 bits are set to the upper 96 bits of the first parameter.
				2660	///
				2661	/// \headerfile <x86intrin.h>
				2662	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2663	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
				2664	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2665	///
				2666	/// \param __a
				2667	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
				2668	/// written to the upper 96 bits of the result.
				2669	/// \param __b
				2670	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
				2671	/// written to the lower 32 bits of the result.
				2672	/// \returns A 128-bit floating-point vector of [4 x float].
				2673	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2674	_mm_move_ss(__m128 __a, __m128 __b)
				2675	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2676	__a[0] = __b[0]; // __a is a by-value copy, so only this local vector's lowest lane is replaced
				2677	return __a; // lanes 1-3 are still those of the original __a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2678	}
				2679
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2680	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2681	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
				2682	/// 64 bits are set to the upper 64 bits of the first parameter.
				2683	///
				2684	/// \headerfile <x86intrin.h>
				2685	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2686	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2687	///
				2688	/// \param __a
				2689	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2690	/// written to the upper 64 bits of the result.
				2691	/// \param __b
				2692	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2693	/// written to the lower 64 bits of the result.
				2694	/// \returns A 128-bit floating-point vector of [4 x float].
				2695	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2696	_mm_movehl_ps(__m128 __a, __m128 __b)
				2697	{
				2698	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); // indices 0-3 select lanes of __a, 4-7 lanes of __b: result is {b2, b3, a2, a3}
				2699	}
				2700
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2701	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2702	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
				2703	/// 64 bits are set to the lower 64 bits of the second parameter.
				2704	///
				2705	/// \headerfile <x86intrin.h>
				2706	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2707	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2708	///
				2709	/// \param __a
				2710	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2711	/// written to the lower 64 bits of the result.
				2712	/// \param __b
				2713	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2714	/// written to the upper 64 bits of the result.
				2715	/// \returns A 128-bit floating-point vector of [4 x float].
				2716	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2717	_mm_movelh_ps(__m128 __a, __m128 __b)
				2718	{
				2719	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); // indices 0-3 select lanes of __a, 4-7 lanes of __b: result is {a0, a1, b0, b1}
				2720	}
				2721
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2722	/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2723	/// float].
				2724	///
				2725	/// \headerfile <x86intrin.h>
				2726	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2727	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2728	///
				2729	/// \param __a
				2730	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
				2731	/// from the corresponding elements in this operand.
				2732	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2733	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2734	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2735	_mm_cvtpi16_ps(__m64 __a)
				2736	{
				2737	__m64 __b, __c;
				2738	__m128 __r;
				2739
				2740	__b = _mm_setzero_si64();
				2741	__b = _mm_cmpgt_pi16(__b, __a);
				2742	__c = _mm_unpackhi_pi16(__a, __b);
				2743	__r = _mm_setzero_ps();
				2744	__r = _mm_cvtpi32_ps(__r, __c);
				2745	__r = _mm_movelh_ps(__r, __r);
				2746	__c = _mm_unpacklo_pi16(__a, __b);
				2747	__r = _mm_cvtpi32_ps(__r, __c);
				2748
				2749	return __r;
				2750	}
				2751
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2752	/// Converts a 64-bit vector of 16-bit unsigned integer values into a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2753	/// 128-bit vector of [4 x float].
				2754	///
				2755	/// \headerfile <x86intrin.h>
				2756	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2757	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2758	///
				2759	/// \param __a
				2760	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
				2761	/// destination are copied from the corresponding elements in this operand.
				2762	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2763	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2764	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2765	_mm_cvtpu16_ps(__m64 __a)
				2766	{
				2767	__m64 __b, __c;
				2768	__m128 __r;
				2769
				2770	__b = _mm_setzero_si64();
				2771	__c = _mm_unpackhi_pi16(__a, __b);
				2772	__r = _mm_setzero_ps();
				2773	__r = _mm_cvtpi32_ps(__r, __c);
				2774	__r = _mm_movelh_ps(__r, __r);
				2775	__c = _mm_unpacklo_pi16(__a, __b);
				2776	__r = _mm_cvtpi32_ps(__r, __c);
				2777
				2778	return __r;
				2779	}
				2780
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2781	/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2782	/// into a 128-bit vector of [4 x float].
				2783	///
				2784	/// \headerfile <x86intrin.h>
				2785	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2786	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2787	///
				2788	/// \param __a
				2789	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
				2790	/// from the corresponding lower 4 elements in this operand.
				2791	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2792	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2793	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2794	_mm_cvtpi8_ps(__m64 __a)
				2795	{
				2796	__m64 __b;
				2797
				2798	__b = _mm_setzero_si64();
				2799	__b = _mm_cmpgt_pi8(__b, __a);
				2800	__b = _mm_unpacklo_pi8(__a, __b);
				2801
				2802	return _mm_cvtpi16_ps(__b);
				2803	}
				2804
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2805	/// Converts the lower four unsigned 8-bit integer values from a 64-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2806	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
				2807	///
				2808	/// \headerfile <x86intrin.h>
				2809	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2810	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2811	///
				2812	/// \param __a
				2813	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
				2814	/// destination are copied from the corresponding lower 4 elements in this
				2815	/// operand.
				2816	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2817	/// values from the source operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2818	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2819	_mm_cvtpu8_ps(__m64 __a)
				2820	{
				2821	__m64 __b;
				2822
				2823	__b = _mm_setzero_si64();
				2824	__b = _mm_unpacklo_pi8(__a, __b);
				2825
				2826	return _mm_cvtpi16_ps(__b);
				2827	}
				2828
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2829	/// Converts the two 32-bit signed integer values from each 64-bit vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2830	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
				2831	///
				2832	/// \headerfile <x86intrin.h>
				2833	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2834	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2835	///
				2836	/// \param __a
				2837	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
				2838	/// copied from the elements in this operand.
				2839	/// \param __b
				2840	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
				2841	/// copied from the elements in this operand.
				2842	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				2843	/// copied and converted values from the first operand. The upper 64 bits
				2844	/// contain the copied and converted values from the second operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2845	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2846	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
				2847	{
				2848	__m128 __c;
				2849
				2850	__c = _mm_setzero_ps();
				2851	__c = _mm_cvtpi32_ps(__c, __b);
				2852	__c = _mm_movelh_ps(__c, __c);
				2853
				2854	return _mm_cvtpi32_ps(__c, __a);
				2855	}
				2856
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2857	/// Converts each single-precision floating-point element of a 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2858	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2859	/// packs the results into a 64-bit integer vector of [4 x i16].
				2860	///
				2861	/// If the floating-point element is NaN or infinity, or if the
				2862	/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
				2863	/// it is converted to 0x8000. Otherwise if the floating-point element is
				2864	/// greater than 0x7FFF, it is converted to 0x7FFF.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2865	///
				2866	/// \headerfile <x86intrin.h>
				2867	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2868	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2869	///
				2870	/// \param __a
				2871	/// A 128-bit floating-point vector of [4 x float].
				2872	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
				2873	/// values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2874	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2875	_mm_cvtps_pi16(__m128 __a)
				2876	{
				2877	__m64 __b, __c;
				2878
				2879	__b = _mm_cvtps_pi32(__a);
				2880	__a = _mm_movehl_ps(__a, __a);
				2881	__c = _mm_cvtps_pi32(__a);
				2882
				2883	return _mm_packs_pi32(__b, __c);
				2884	}
				2885
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2886	/// Converts each single-precision floating-point element of a 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2887	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
				2888	/// packs the results into the lower 32 bits of a 64-bit integer vector of
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2889	/// [8 x i8]. The upper 32 bits of the vector are set to 0.
				2890	///
				2891	/// If the floating-point element is NaN or infinity, or if the
				2892	/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
				2893	/// is converted to 0x80. Otherwise if the floating-point element is greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2894	/// than 0x7F, it is converted to 0x7F.
				2895	///
				2896	/// \headerfile <x86intrin.h>
				2897	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2898	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2899	///
				2900	/// \param __a
				2901	/// 128-bit floating-point vector of [4 x float].
				2902	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
				2903	/// converted values and the uppper 32 bits are set to zero.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2904	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2905	_mm_cvtps_pi8(__m128 __a)
				2906	{
				2907	__m64 __b, __c;
				2908
				2909	__b = _mm_cvtps_pi16(__a);
				2910	__c = _mm_setzero_si64();
				2911
				2912	return _mm_packs_pi16(__b, __c);
				2913	}
				2914
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2915	/// Extracts the sign bits from each single-precision floating-point
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2916	/// element of a 128-bit floating-point vector of [4 x float] and returns the
				2917	/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
				2918	/// to zero.
				2919	///
				2920	/// \headerfile <x86intrin.h>
				2921	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2922	/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2923	///
				2924	/// \param __a
				2925	/// A 128-bit floating-point vector of [4 x float].
				2926	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
				2927	/// single-precision floating-point element of the parameter. Bits [31:4] are
				2928	/// set to zero.
				2929	static __inline__ int __DEFAULT_FN_ATTRS
				2930	_mm_movemask_ps(__m128 __a)
				2931	{
				2932	return __builtin_ia32_movmskps((__v4sf)__a);
				2933	}
				2934
				2935
				2936	#define _MM_ALIGN16 __attribute__((aligned(16)))
				2937
				2938	#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) \| ((y) << 4) \| ((x) << 2) \| (w))
				2939
				2940	#define _MM_EXCEPT_INVALID (0x0001)
				2941	#define _MM_EXCEPT_DENORM (0x0002)
				2942	#define _MM_EXCEPT_DIV_ZERO (0x0004)
				2943	#define _MM_EXCEPT_OVERFLOW (0x0008)
				2944	#define _MM_EXCEPT_UNDERFLOW (0x0010)
				2945	#define _MM_EXCEPT_INEXACT (0x0020)
				2946	#define _MM_EXCEPT_MASK (0x003f)
				2947
				2948	#define _MM_MASK_INVALID (0x0080)
				2949	#define _MM_MASK_DENORM (0x0100)
				2950	#define _MM_MASK_DIV_ZERO (0x0200)
				2951	#define _MM_MASK_OVERFLOW (0x0400)
				2952	#define _MM_MASK_UNDERFLOW (0x0800)
				2953	#define _MM_MASK_INEXACT (0x1000)
				2954	#define _MM_MASK_MASK (0x1f80)
				2955
				2956	#define _MM_ROUND_NEAREST (0x0000)
				2957	#define _MM_ROUND_DOWN (0x2000)
				2958	#define _MM_ROUND_UP (0x4000)
				2959	#define _MM_ROUND_TOWARD_ZERO (0x6000)
				2960	#define _MM_ROUND_MASK (0x6000)
				2961
				2962	#define _MM_FLUSH_ZERO_MASK (0x8000)
				2963	#define _MM_FLUSH_ZERO_ON (0x8000)
				2964	#define _MM_FLUSH_ZERO_OFF (0x0000)
				2965
				2966	#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
				2967	#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
				2968	#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
				2969	#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
				2970
				2971	#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) \| (x)))
				2972	#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) \| (x)))
				2973	#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) \| (x)))
				2974	#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) \| (x)))
				2975
				2976	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
				2977	do { \
				2978	__m128 tmp3, tmp2, tmp1, tmp0; \
				2979	tmp0 = _mm_unpacklo_ps((row0), (row1)); \
				2980	tmp2 = _mm_unpacklo_ps((row2), (row3)); \
				2981	tmp1 = _mm_unpackhi_ps((row0), (row1)); \
				2982	tmp3 = _mm_unpackhi_ps((row2), (row3)); \
				2983	(row0) = _mm_movelh_ps(tmp0, tmp2); \
				2984	(row1) = _mm_movehl_ps(tmp2, tmp0); \
				2985	(row2) = _mm_movelh_ps(tmp1, tmp3); \
				2986	(row3) = _mm_movehl_ps(tmp3, tmp1); \
				2987	} while (0)
				2988
				2989	/* Aliases for compatibility. */
				2990	#define _m_pextrw _mm_extract_pi16
				2991	#define _m_pinsrw _mm_insert_pi16
				2992	#define _m_pmaxsw _mm_max_pi16
				2993	#define _m_pmaxub _mm_max_pu8
				2994	#define _m_pminsw _mm_min_pi16
				2995	#define _m_pminub _mm_min_pu8
				2996	#define _m_pmovmskb _mm_movemask_pi8
				2997	#define _m_pmulhuw _mm_mulhi_pu16
				2998	#define _m_pshufw _mm_shuffle_pi16
				2999	#define _m_maskmovq _mm_maskmove_si64
				3000	#define _m_pavgb _mm_avg_pu8
				3001	#define _m_pavgw _mm_avg_pu16
				3002	#define _m_psadbw _mm_sad_pu8
				3003	#define _m_ _mm_
				3004	#define _m_ _mm_
				3005
				3006	#undef __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3007	#undef __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3008
				3009	/* Ugly hack for backwards-compatibility (compatible with gcc) */
				3010	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
				3011	#include <emmintrin.h>
				3012	#endif
				3013
				3014	#endif /* __XMMINTRIN_H */