Blame - linux-x86/lib64/clang/14.0.1/include/xmmintrin.h - platform/prebuilts/clang-tools

blob: 620453c97783cd9d7e0013e90f90759635fb7080 [file] [log] [blame]

Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
				2	*
Logan Chien	df4f766	2019-09-04 16:45:23 -0700	[diff] [blame]	3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	* See https://llvm.org/LICENSE.txt for license information.
				5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	6	*
				7	*===-----------------------------------------------------------------------===
				8	*/
				9
				10	#ifndef __XMMINTRIN_H
				11	#define __XMMINTRIN_H
				12
				13	#include <mmintrin.h>
				14
				15	typedef int __v4si __attribute__((__vector_size__(16)));
				16	typedef float __v4sf __attribute__((__vector_size__(16)));
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	17	typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
				18
				19	typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	20
				21	/* Unsigned types */
				22	typedef unsigned int __v4su __attribute__((__vector_size__(16)));
				23
				24	/* This header should only be included in a hosted environment as it depends on
				25	* a standard library to provide allocation routines. */
				26	#if __STDC_HOSTED__
				27	#include <mm_malloc.h>
				28	#endif
				29
				30	/* Define the default attributes for the functions in this file. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	31	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
				32	#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	33
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	34	/// Adds the 32-bit float values in the low-order bits of the operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	35	///
				36	/// \headerfile <x86intrin.h>
				37	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	38	/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	39	///
				40	/// \param __a
				41	/// A 128-bit vector of [4 x float] containing one of the source operands.
				42	/// The lower 32 bits of this operand are used in the calculation.
				43	/// \param __b
				44	/// A 128-bit vector of [4 x float] containing one of the source operands.
				45	/// The lower 32 bits of this operand are used in the calculation.
				46	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
				47	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
				48	/// the upper 96 bits of the first source operand.
				49	static __inline__ __m128 __DEFAULT_FN_ATTRS
				50	_mm_add_ss(__m128 __a, __m128 __b)
				51	{
				52	__a[0] += __b[0];
				53	return __a;
				54	}
				55
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	56	/// Adds two 128-bit vectors of [4 x float], and returns the results of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	57	/// the addition.
				58	///
				59	/// \headerfile <x86intrin.h>
				60	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	61	/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	62	///
				63	/// \param __a
				64	/// A 128-bit vector of [4 x float] containing one of the source operands.
				65	/// \param __b
				66	/// A 128-bit vector of [4 x float] containing one of the source operands.
				67	/// \returns A 128-bit vector of [4 x float] containing the sums of both
				68	/// operands.
				69	static __inline__ __m128 __DEFAULT_FN_ATTRS
				70	_mm_add_ps(__m128 __a, __m128 __b)
				71	{
				72	return (__m128)((__v4sf)__a + (__v4sf)__b);
				73	}
				74
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	75	/// Subtracts the 32-bit float value in the low-order bits of the second
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	76	/// operand from the corresponding value in the first operand.
				77	///
				78	/// \headerfile <x86intrin.h>
				79	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	80	/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	81	///
				82	/// \param __a
				83	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
				84	/// of this operand are used in the calculation.
				85	/// \param __b
				86	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
				87	/// bits of this operand are used in the calculation.
				88	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				89	/// difference of the lower 32 bits of both operands. The upper 96 bits are
				90	/// copied from the upper 96 bits of the first source operand.
				91	static __inline__ __m128 __DEFAULT_FN_ATTRS
				92	_mm_sub_ss(__m128 __a, __m128 __b)
				93	{
				94	__a[0] -= __b[0];
				95	return __a;
				96	}
				97
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	98	/// Subtracts each of the values of the second operand from the first
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	99	/// operand, both of which are 128-bit vectors of [4 x float] and returns
				100	/// the results of the subtraction.
				101	///
				102	/// \headerfile <x86intrin.h>
				103	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	104	/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	105	///
				106	/// \param __a
				107	/// A 128-bit vector of [4 x float] containing the minuend.
				108	/// \param __b
				109	/// A 128-bit vector of [4 x float] containing the subtrahend.
				110	/// \returns A 128-bit vector of [4 x float] containing the differences between
				111	/// both operands.
				112	static __inline__ __m128 __DEFAULT_FN_ATTRS
				113	_mm_sub_ps(__m128 __a, __m128 __b)
				114	{
				115	return (__m128)((__v4sf)__a - (__v4sf)__b);
				116	}
				117
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	118	/// Multiplies two 32-bit float values in the low-order bits of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	119	/// operands.
				120	///
				121	/// \headerfile <x86intrin.h>
				122	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	123	/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	124	///
				125	/// \param __a
				126	/// A 128-bit vector of [4 x float] containing one of the source operands.
				127	/// The lower 32 bits of this operand are used in the calculation.
				128	/// \param __b
				129	/// A 128-bit vector of [4 x float] containing one of the source operands.
				130	/// The lower 32 bits of this operand are used in the calculation.
				131	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
				132	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
				133	/// bits of the first source operand.
				134	static __inline__ __m128 __DEFAULT_FN_ATTRS
				135	_mm_mul_ss(__m128 __a, __m128 __b)
				136	{
				137	__a[0] *= __b[0];
				138	return __a;
				139	}
				140
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	141	/// Multiplies two 128-bit vectors of [4 x float] and returns the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	142	/// results of the multiplication.
				143	///
				144	/// \headerfile <x86intrin.h>
				145	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	146	/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	147	///
				148	/// \param __a
				149	/// A 128-bit vector of [4 x float] containing one of the source operands.
				150	/// \param __b
				151	/// A 128-bit vector of [4 x float] containing one of the source operands.
				152	/// \returns A 128-bit vector of [4 x float] containing the products of both
				153	/// operands.
				154	static __inline__ __m128 __DEFAULT_FN_ATTRS
				155	_mm_mul_ps(__m128 __a, __m128 __b)
				156	{
				157	return (__m128)((__v4sf)__a * (__v4sf)__b);
				158	}
				159
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	160	/// Divides the value in the low-order 32 bits of the first operand by
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	161	/// the corresponding value in the second operand.
				162	///
				163	/// \headerfile <x86intrin.h>
				164	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	165	/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	166	///
				167	/// \param __a
				168	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
				169	/// bits of this operand are used in the calculation.
				170	/// \param __b
				171	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
				172	/// of this operand are used in the calculation.
				173	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
				174	/// lower 32 bits of both operands. The upper 96 bits are copied from the
				175	/// upper 96 bits of the first source operand.
				176	static __inline__ __m128 __DEFAULT_FN_ATTRS
				177	_mm_div_ss(__m128 __a, __m128 __b)
				178	{
				179	__a[0] /= __b[0];
				180	return __a;
				181	}
				182
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	183	/// Divides two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	184	///
				185	/// \headerfile <x86intrin.h>
				186	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	187	/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	188	///
				189	/// \param __a
				190	/// A 128-bit vector of [4 x float] containing the dividend.
				191	/// \param __b
				192	/// A 128-bit vector of [4 x float] containing the divisor.
				193	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
				194	/// operands.
				195	static __inline__ __m128 __DEFAULT_FN_ATTRS
				196	_mm_div_ps(__m128 __a, __m128 __b)
				197	{
				198	return (__m128)((__v4sf)__a / (__v4sf)__b);
				199	}
				200
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	201	/// Calculates the square root of the value stored in the low-order bits
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	202	/// of a 128-bit vector of [4 x float].
				203	///
				204	/// \headerfile <x86intrin.h>
				205	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	206	/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	207	///
				208	/// \param __a
				209	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				210	/// used in the calculation.
				211	/// \returns A 128-bit vector of [4 x float] containing the square root of the
				212	/// value in the low-order bits of the operand.
				213	static __inline__ __m128 __DEFAULT_FN_ATTRS
				214	_mm_sqrt_ss(__m128 __a)
				215	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	216	return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	217	}
				218
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	219	/// Calculates the square roots of the values stored in a 128-bit vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	220	/// of [4 x float].
				221	///
				222	/// \headerfile <x86intrin.h>
				223	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	224	/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	225	///
				226	/// \param __a
				227	/// A 128-bit vector of [4 x float].
				228	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
				229	/// values in the operand.
				230	static __inline__ __m128 __DEFAULT_FN_ATTRS
				231	_mm_sqrt_ps(__m128 __a)
				232	{
				233	return __builtin_ia32_sqrtps((__v4sf)__a);
				234	}
				235
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	236	/// Calculates the approximate reciprocal of the value stored in the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	237	/// low-order bits of a 128-bit vector of [4 x float].
				238	///
				239	/// \headerfile <x86intrin.h>
				240	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	241	/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	242	///
				243	/// \param __a
				244	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				245	/// used in the calculation.
				246	/// \returns A 128-bit vector of [4 x float] containing the approximate
				247	/// reciprocal of the value in the low-order bits of the operand.
				248	static __inline__ __m128 __DEFAULT_FN_ATTRS
				249	_mm_rcp_ss(__m128 __a)
				250	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	251	return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	252	}
				253
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	254	/// Calculates the approximate reciprocals of the values stored in a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	255	/// 128-bit vector of [4 x float].
				256	///
				257	/// \headerfile <x86intrin.h>
				258	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	259	/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	260	///
				261	/// \param __a
				262	/// A 128-bit vector of [4 x float].
				263	/// \returns A 128-bit vector of [4 x float] containing the approximate
				264	/// reciprocals of the values in the operand.
				265	static __inline__ __m128 __DEFAULT_FN_ATTRS
				266	_mm_rcp_ps(__m128 __a)
				267	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	268	return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	269	}
				270
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	271	/// Calculates the approximate reciprocal of the square root of the value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	272	/// stored in the low-order bits of a 128-bit vector of [4 x float].
				273	///
				274	/// \headerfile <x86intrin.h>
				275	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	276	/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	277	///
				278	/// \param __a
				279	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				280	/// used in the calculation.
				281	/// \returns A 128-bit vector of [4 x float] containing the approximate
				282	/// reciprocal of the square root of the value in the low-order bits of the
				283	/// operand.
				284	static __inline__ __m128 __DEFAULT_FN_ATTRS
				285	_mm_rsqrt_ss(__m128 __a)
				286	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	287	return __builtin_ia32_rsqrtss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	288	}
				289
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	290	/// Calculates the approximate reciprocals of the square roots of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	291	/// values stored in a 128-bit vector of [4 x float].
				292	///
				293	/// \headerfile <x86intrin.h>
				294	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	295	/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	296	///
				297	/// \param __a
				298	/// A 128-bit vector of [4 x float].
				299	/// \returns A 128-bit vector of [4 x float] containing the approximate
				300	/// reciprocals of the square roots of the values in the operand.
				301	static __inline__ __m128 __DEFAULT_FN_ATTRS
				302	_mm_rsqrt_ps(__m128 __a)
				303	{
				304	return __builtin_ia32_rsqrtps((__v4sf)__a);
				305	}
				306
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	307	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	308	/// operands and returns the lesser value in the low-order bits of the
				309	/// vector of [4 x float].
				310	///
				311	/// \headerfile <x86intrin.h>
				312	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	313	/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	314	///
				315	/// \param __a
				316	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				317	/// 32 bits of this operand are used in the comparison.
				318	/// \param __b
				319	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				320	/// 32 bits of this operand are used in the comparison.
				321	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				322	/// minimum value between both operands. The upper 96 bits are copied from
				323	/// the upper 96 bits of the first source operand.
				324	static __inline__ __m128 __DEFAULT_FN_ATTRS
				325	_mm_min_ss(__m128 __a, __m128 __b)
				326	{
				327	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
				328	}
				329
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	330	/// Compares two 128-bit vectors of [4 x float] and returns the lesser
				331	/// of each pair of values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	332	///
				333	/// \headerfile <x86intrin.h>
				334	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	335	/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	336	///
				337	/// \param __a
				338	/// A 128-bit vector of [4 x float] containing one of the operands.
				339	/// \param __b
				340	/// A 128-bit vector of [4 x float] containing one of the operands.
				341	/// \returns A 128-bit vector of [4 x float] containing the minimum values
				342	/// between both operands.
				343	static __inline__ __m128 __DEFAULT_FN_ATTRS
				344	_mm_min_ps(__m128 __a, __m128 __b)
				345	{
				346	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
				347	}
				348
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	349	/// Compares two 32-bit float values in the low-order bits of both
				350	/// operands and returns the greater value in the low-order bits of a 128-bit
				351	/// vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	352	///
				353	/// \headerfile <x86intrin.h>
				354	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	355	/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	356	///
				357	/// \param __a
				358	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				359	/// 32 bits of this operand are used in the comparison.
				360	/// \param __b
				361	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				362	/// 32 bits of this operand are used in the comparison.
				363	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				364	/// maximum value between both operands. The upper 96 bits are copied from
				365	/// the upper 96 bits of the first source operand.
				366	static __inline__ __m128 __DEFAULT_FN_ATTRS
				367	_mm_max_ss(__m128 __a, __m128 __b)
				368	{
				369	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
				370	}
				371
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	372	/// Compares two 128-bit vectors of [4 x float] and returns the greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	373	/// of each pair of values.
				374	///
				375	/// \headerfile <x86intrin.h>
				376	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	377	/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	378	///
				379	/// \param __a
				380	/// A 128-bit vector of [4 x float] containing one of the operands.
				381	/// \param __b
				382	/// A 128-bit vector of [4 x float] containing one of the operands.
				383	/// \returns A 128-bit vector of [4 x float] containing the maximum values
				384	/// between both operands.
				385	static __inline__ __m128 __DEFAULT_FN_ATTRS
				386	_mm_max_ps(__m128 __a, __m128 __b)
				387	{
				388	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
				389	}
				390
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	391	/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	392	///
				393	/// \headerfile <x86intrin.h>
				394	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	395	/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	396	///
				397	/// \param __a
				398	/// A 128-bit vector containing one of the source operands.
				399	/// \param __b
				400	/// A 128-bit vector containing one of the source operands.
				401	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				402	/// values between both operands.
				403	static __inline__ __m128 __DEFAULT_FN_ATTRS
				404	_mm_and_ps(__m128 __a, __m128 __b)
				405	{
				406	return (__m128)((__v4su)__a & (__v4su)__b);
				407	}
				408
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	409	/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	410	/// the one's complement of the values contained in the first source
				411	/// operand.
				412	///
				413	/// \headerfile <x86intrin.h>
				414	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	415	/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	416	///
				417	/// \param __a
				418	/// A 128-bit vector of [4 x float] containing the first source operand. The
				419	/// one's complement of this value is used in the bitwise AND.
				420	/// \param __b
				421	/// A 128-bit vector of [4 x float] containing the second source operand.
				422	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				423	/// one's complement of the first operand and the values in the second
				424	/// operand.
				425	static __inline__ __m128 __DEFAULT_FN_ATTRS
				426	_mm_andnot_ps(__m128 __a, __m128 __b)
				427	{
				428	return (__m128)(~(__v4su)__a & (__v4su)__b);
				429	}
				430
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	431	/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	432	///
				433	/// \headerfile <x86intrin.h>
				434	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	435	/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	436	///
				437	/// \param __a
				438	/// A 128-bit vector of [4 x float] containing one of the source operands.
				439	/// \param __b
				440	/// A 128-bit vector of [4 x float] containing one of the source operands.
				441	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
				442	/// values between both operands.
				443	static __inline__ __m128 __DEFAULT_FN_ATTRS
				444	_mm_or_ps(__m128 __a, __m128 __b)
				445	{
				446	return (__m128)((__v4su)__a \| (__v4su)__b);
				447	}
				448
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	449	/// Performs a bitwise exclusive OR of two 128-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	450	/// [4 x float].
				451	///
				452	/// \headerfile <x86intrin.h>
				453	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	454	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	455	///
				456	/// \param __a
				457	/// A 128-bit vector of [4 x float] containing one of the source operands.
				458	/// \param __b
				459	/// A 128-bit vector of [4 x float] containing one of the source operands.
				460	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
				461	/// of the values between both operands.
				462	static __inline__ __m128 __DEFAULT_FN_ATTRS
				463	_mm_xor_ps(__m128 __a, __m128 __b)
				464	{
				465	return (__m128)((__v4su)__a ^ (__v4su)__b);
				466	}
				467
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	468	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	469	/// operands for equality and returns the result of the comparison in the
				470	/// low-order bits of a vector [4 x float].
				471	///
				472	/// \headerfile <x86intrin.h>
				473	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	474	/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	475	///
				476	/// \param __a
				477	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				478	/// 32 bits of this operand are used in the comparison.
				479	/// \param __b
				480	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				481	/// 32 bits of this operand are used in the comparison.
				482	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				483	/// in the low-order bits.
				484	static __inline__ __m128 __DEFAULT_FN_ATTRS
				485	_mm_cmpeq_ss(__m128 __a, __m128 __b)
				486	{
				487	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
				488	}
				489
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	490	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	491	/// 128-bit vectors of [4 x float] for equality.
				492	///
				493	/// \headerfile <x86intrin.h>
				494	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	495	/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	496	///
				497	/// \param __a
				498	/// A 128-bit vector of [4 x float].
				499	/// \param __b
				500	/// A 128-bit vector of [4 x float].
				501	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				502	static __inline__ __m128 __DEFAULT_FN_ATTRS
				503	_mm_cmpeq_ps(__m128 __a, __m128 __b)
				504	{
				505	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
				506	}
				507
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	508	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	509	/// operands to determine if the value in the first operand is less than the
				510	/// corresponding value in the second operand and returns the result of the
				511	/// comparison in the low-order bits of a vector of [4 x float].
				512	///
				513	/// \headerfile <x86intrin.h>
				514	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	515	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	516	///
				517	/// \param __a
				518	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				519	/// 32 bits of this operand are used in the comparison.
				520	/// \param __b
				521	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				522	/// 32 bits of this operand are used in the comparison.
				523	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				524	/// in the low-order bits.
				525	static __inline__ __m128 __DEFAULT_FN_ATTRS
				526	_mm_cmplt_ss(__m128 __a, __m128 __b)
				527	{
				528	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
				529	}
				530
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	531	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	532	/// 128-bit vectors of [4 x float] to determine if the values in the first
				533	/// operand are less than those in the second operand.
				534	///
				535	/// \headerfile <x86intrin.h>
				536	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	537	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	538	///
				539	/// \param __a
				540	/// A 128-bit vector of [4 x float].
				541	/// \param __b
				542	/// A 128-bit vector of [4 x float].
				543	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				544	static __inline__ __m128 __DEFAULT_FN_ATTRS
				545	_mm_cmplt_ps(__m128 __a, __m128 __b)
				546	{
				547	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
				548	}
				549
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	550	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	551	/// operands to determine if the value in the first operand is less than or
				552	/// equal to the corresponding value in the second operand and returns the
				553	/// result of the comparison in the low-order bits of a vector of
				554	/// [4 x float].
				555	///
				556	/// \headerfile <x86intrin.h>
				557	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	558	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	559	///
				560	/// \param __a
				561	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				562	/// 32 bits of this operand are used in the comparison.
				563	/// \param __b
				564	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				565	/// 32 bits of this operand are used in the comparison.
				566	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				567	/// in the low-order bits.
				568	static __inline__ __m128 __DEFAULT_FN_ATTRS
				569	_mm_cmple_ss(__m128 __a, __m128 __b)
				570	{
				571	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
				572	}
				573
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	574	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	575	/// 128-bit vectors of [4 x float] to determine if the values in the first
				576	/// operand are less than or equal to those in the second operand.
				577	///
				578	/// \headerfile <x86intrin.h>
				579	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	580	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	581	///
				582	/// \param __a
				583	/// A 128-bit vector of [4 x float].
				584	/// \param __b
				585	/// A 128-bit vector of [4 x float].
				586	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				587	static __inline__ __m128 __DEFAULT_FN_ATTRS
				588	_mm_cmple_ps(__m128 __a, __m128 __b)
				589	{
				590	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
				591	}
				592
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	593	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	594	/// operands to determine if the value in the first operand is greater than
				595	/// the corresponding value in the second operand and returns the result of
				596	/// the comparison in the low-order bits of a vector of [4 x float].
				597	///
				598	/// \headerfile <x86intrin.h>
				599	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	600	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	601	///
				602	/// \param __a
				603	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				604	/// 32 bits of this operand are used in the comparison.
				605	/// \param __b
				606	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				607	/// 32 bits of this operand are used in the comparison.
				608	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				609	/// in the low-order bits.
				610	static __inline__ __m128 __DEFAULT_FN_ATTRS
				611	_mm_cmpgt_ss(__m128 __a, __m128 __b)
				612	{
				613	return (__m128)__builtin_shufflevector((__v4sf)__a,
				614	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
				615	4, 1, 2, 3);
				616	}
				617
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	618	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	619	/// 128-bit vectors of [4 x float] to determine if the values in the first
				620	/// operand are greater than those in the second operand.
				621	///
				622	/// \headerfile <x86intrin.h>
				623	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	624	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	625	///
				626	/// \param __a
				627	/// A 128-bit vector of [4 x float].
				628	/// \param __b
				629	/// A 128-bit vector of [4 x float].
				630	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				631	static __inline__ __m128 __DEFAULT_FN_ATTRS
				632	_mm_cmpgt_ps(__m128 __a, __m128 __b)
				633	{
				634	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
				635	}
				636
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	637	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	638	/// operands to determine if the value in the first operand is greater than
				639	/// or equal to the corresponding value in the second operand and returns
				640	/// the result of the comparison in the low-order bits of a vector of
				641	/// [4 x float].
				642	///
				643	/// \headerfile <x86intrin.h>
				644	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	645	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	646	///
				647	/// \param __a
				648	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				649	/// 32 bits of this operand are used in the comparison.
				650	/// \param __b
				651	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				652	/// 32 bits of this operand are used in the comparison.
				653	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				654	/// in the low-order bits.
				655	static __inline__ __m128 __DEFAULT_FN_ATTRS
				656	_mm_cmpge_ss(__m128 __a, __m128 __b)
				657	{
				658	return (__m128)__builtin_shufflevector((__v4sf)__a,
				659	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
				660	4, 1, 2, 3);
				661	}
				662
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	663	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	664	/// 128-bit vectors of [4 x float] to determine if the values in the first
				665	/// operand are greater than or equal to those in the second operand.
				666	///
				667	/// \headerfile <x86intrin.h>
				668	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	669	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	670	///
				671	/// \param __a
				672	/// A 128-bit vector of [4 x float].
				673	/// \param __b
				674	/// A 128-bit vector of [4 x float].
				675	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				676	static __inline__ __m128 __DEFAULT_FN_ATTRS
				677	_mm_cmpge_ps(__m128 __a, __m128 __b)
				678	{
				679	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
				680	}
				681
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	682	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	683	/// operands for inequality and returns the result of the comparison in the
				684	/// low-order bits of a vector of [4 x float].
				685	///
				686	/// \headerfile <x86intrin.h>
				687	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	688	/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
				689	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	690	///
				691	/// \param __a
				692	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				693	/// 32 bits of this operand are used in the comparison.
				694	/// \param __b
				695	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				696	/// 32 bits of this operand are used in the comparison.
				697	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				698	/// in the low-order bits.
				699	static __inline__ __m128 __DEFAULT_FN_ATTRS
				700	_mm_cmpneq_ss(__m128 __a, __m128 __b)
				701	{
				702	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
				703	}
				704
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	705	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	706	/// 128-bit vectors of [4 x float] for inequality.
				707	///
				708	/// \headerfile <x86intrin.h>
				709	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	710	/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
				711	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	712	///
				713	/// \param __a
				714	/// A 128-bit vector of [4 x float].
				715	/// \param __b
				716	/// A 128-bit vector of [4 x float].
				717	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				718	static __inline__ __m128 __DEFAULT_FN_ATTRS
				719	_mm_cmpneq_ps(__m128 __a, __m128 __b)
				720	{
				721	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
				722	}
				723
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	724	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	725	/// operands to determine if the value in the first operand is not less than
				726	/// the corresponding value in the second operand and returns the result of
				727	/// the comparison in the low-order bits of a vector of [4 x float].
				728	///
				729	/// \headerfile <x86intrin.h>
				730	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	731	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
				732	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	733	///
				734	/// \param __a
				735	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				736	/// 32 bits of this operand are used in the comparison.
				737	/// \param __b
				738	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				739	/// 32 bits of this operand are used in the comparison.
				740	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				741	/// in the low-order bits.
				742	static __inline__ __m128 __DEFAULT_FN_ATTRS
				743	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
				744	{
				745	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
				746	}
				747
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	748	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	749	/// 128-bit vectors of [4 x float] to determine if the values in the first
				750	/// operand are not less than those in the second operand.
				751	///
				752	/// \headerfile <x86intrin.h>
				753	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	754	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
				755	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	756	///
				757	/// \param __a
				758	/// A 128-bit vector of [4 x float].
				759	/// \param __b
				760	/// A 128-bit vector of [4 x float].
				761	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				762	static __inline__ __m128 __DEFAULT_FN_ATTRS
				763	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
				764	{
				765	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
				766	}
				767
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	768	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	769	/// operands to determine if the value in the first operand is not less than
				770	/// or equal to the corresponding value in the second operand and returns
				771	/// the result of the comparison in the low-order bits of a vector of
				772	/// [4 x float].
				773	///
				774	/// \headerfile <x86intrin.h>
				775	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	776	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
				777	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	778	///
				779	/// \param __a
				780	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				781	/// 32 bits of this operand are used in the comparison.
				782	/// \param __b
				783	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				784	/// 32 bits of this operand are used in the comparison.
				785	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				786	/// in the low-order bits.
				787	static __inline__ __m128 __DEFAULT_FN_ATTRS
				788	_mm_cmpnle_ss(__m128 __a, __m128 __b)
				789	{
				790	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
				791	}
				792
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	793	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	794	/// 128-bit vectors of [4 x float] to determine if the values in the first
				795	/// operand are not less than or equal to those in the second operand.
				796	///
				797	/// \headerfile <x86intrin.h>
				798	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	799	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
				800	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	801	///
				802	/// \param __a
				803	/// A 128-bit vector of [4 x float].
				804	/// \param __b
				805	/// A 128-bit vector of [4 x float].
				806	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				807	static __inline__ __m128 __DEFAULT_FN_ATTRS
				808	_mm_cmpnle_ps(__m128 __a, __m128 __b)
				809	{
				810	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
				811	}
				812
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	813	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	814	/// operands to determine if the value in the first operand is not greater
				815	/// than the corresponding value in the second operand and returns the
				816	/// result of the comparison in the low-order bits of a vector of
				817	/// [4 x float].
				818	///
				819	/// \headerfile <x86intrin.h>
				820	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	821	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
				822	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	823	///
				824	/// \param __a
				825	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				826	/// 32 bits of this operand are used in the comparison.
				827	/// \param __b
				828	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				829	/// 32 bits of this operand are used in the comparison.
				830	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				831	/// in the low-order bits.
				832	static __inline__ __m128 __DEFAULT_FN_ATTRS
				833	_mm_cmpngt_ss(__m128 __a, __m128 __b)
				834	{
				835	return (__m128)__builtin_shufflevector((__v4sf)__a,
				836	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
				837	4, 1, 2, 3);
				838	}
				839
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	840	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	841	/// 128-bit vectors of [4 x float] to determine if the values in the first
				842	/// operand are not greater than those in the second operand.
				843	///
				844	/// \headerfile <x86intrin.h>
				845	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	846	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
				847	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	848	///
				849	/// \param __a
				850	/// A 128-bit vector of [4 x float].
				851	/// \param __b
				852	/// A 128-bit vector of [4 x float].
				853	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				854	static __inline__ __m128 __DEFAULT_FN_ATTRS
				855	_mm_cmpngt_ps(__m128 __a, __m128 __b)
				856	{
				857	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
				858	}
				859
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	860	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	861	/// operands to determine if the value in the first operand is not greater
				862	/// than or equal to the corresponding value in the second operand and
				863	/// returns the result of the comparison in the low-order bits of a vector
				864	/// of [4 x float].
				865	///
				866	/// \headerfile <x86intrin.h>
				867	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	868	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
				869	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	870	///
				871	/// \param __a
				872	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				873	/// 32 bits of this operand are used in the comparison.
				874	/// \param __b
				875	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				876	/// 32 bits of this operand are used in the comparison.
				877	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				878	/// in the low-order bits.
				879	static __inline__ __m128 __DEFAULT_FN_ATTRS
				880	_mm_cmpnge_ss(__m128 __a, __m128 __b)
				881	{
				882	return (__m128)__builtin_shufflevector((__v4sf)__a,
				883	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
				884	4, 1, 2, 3);
				885	}
				886
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	887	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	888	/// 128-bit vectors of [4 x float] to determine if the values in the first
				889	/// operand are not greater than or equal to those in the second operand.
				890	///
				891	/// \headerfile <x86intrin.h>
				892	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	893	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
				894	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	895	///
				896	/// \param __a
				897	/// A 128-bit vector of [4 x float].
				898	/// \param __b
				899	/// A 128-bit vector of [4 x float].
				900	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				901	static __inline__ __m128 __DEFAULT_FN_ATTRS
				902	_mm_cmpnge_ps(__m128 __a, __m128 __b)
				903	{
				904	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
				905	}
				906
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	907	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	908	/// operands to determine if the value in the first operand is ordered with
				909	/// respect to the corresponding value in the second operand and returns the
				910	/// result of the comparison in the low-order bits of a vector of
				911	/// [4 x float].
				912	///
				913	/// \headerfile <x86intrin.h>
				914	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	915	/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
				916	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	917	///
				918	/// \param __a
				919	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				920	/// 32 bits of this operand are used in the comparison.
				921	/// \param __b
				922	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				923	/// 32 bits of this operand are used in the comparison.
				924	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				925	/// in the low-order bits.
				926	static __inline__ __m128 __DEFAULT_FN_ATTRS
				927	_mm_cmpord_ss(__m128 __a, __m128 __b)
				928	{
				929	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
				930	}
				931
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	932	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	933	/// 128-bit vectors of [4 x float] to determine if the values in the first
				934	/// operand are ordered with respect to those in the second operand.
				935	///
				936	/// \headerfile <x86intrin.h>
				937	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	938	/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
				939	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	940	///
				941	/// \param __a
				942	/// A 128-bit vector of [4 x float].
				943	/// \param __b
				944	/// A 128-bit vector of [4 x float].
				945	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				946	static __inline__ __m128 __DEFAULT_FN_ATTRS
				947	_mm_cmpord_ps(__m128 __a, __m128 __b)
				948	{
				949	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
				950	}
				951
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	952	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	953	/// operands to determine if the value in the first operand is unordered
				954	/// with respect to the corresponding value in the second operand and
				955	/// returns the result of the comparison in the low-order bits of a vector
				956	/// of [4 x float].
				957	///
				958	/// \headerfile <x86intrin.h>
				959	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	960	/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
				961	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	962	///
				963	/// \param __a
				964	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				965	/// 32 bits of this operand are used in the comparison.
				966	/// \param __b
				967	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				968	/// 32 bits of this operand are used in the comparison.
				969	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				970	/// in the low-order bits.
				971	static __inline__ __m128 __DEFAULT_FN_ATTRS
				972	_mm_cmpunord_ss(__m128 __a, __m128 __b)
				973	{
				974	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
				975	}
				976
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	977	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	978	/// 128-bit vectors of [4 x float] to determine if the values in the first
				979	/// operand are unordered with respect to those in the second operand.
				980	///
				981	/// \headerfile <x86intrin.h>
				982	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	983	/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
				984	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	985	///
				986	/// \param __a
				987	/// A 128-bit vector of [4 x float].
				988	/// \param __b
				989	/// A 128-bit vector of [4 x float].
				990	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				991	static __inline__ __m128 __DEFAULT_FN_ATTRS
				992	_mm_cmpunord_ps(__m128 __a, __m128 __b)
				993	{
				994	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
				995	}
				996
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	997	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	998	/// operands for equality and returns the result of the comparison.
				999	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1000	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1001	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1002	/// \headerfile <x86intrin.h>
				1003	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1004	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
				1005	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1006	///
				1007	/// \param __a
				1008	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1009	/// used in the comparison.
				1010	/// \param __b
				1011	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1012	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1013	/// \returns An integer containing the comparison results. If either of the
				1014	/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1015	static __inline__ int __DEFAULT_FN_ATTRS
				1016	_mm_comieq_ss(__m128 __a, __m128 __b)
				1017	{
				1018	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
				1019	}
				1020
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1021	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1022	/// operands to determine if the first operand is less than the second
				1023	/// operand and returns the result of the comparison.
				1024	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1025	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1026	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1027	/// \headerfile <x86intrin.h>
				1028	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1029	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
				1030	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1031	///
				1032	/// \param __a
				1033	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1034	/// used in the comparison.
				1035	/// \param __b
				1036	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1037	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1038	/// \returns An integer containing the comparison results. If either of the two
				1039	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1040	static __inline__ int __DEFAULT_FN_ATTRS
				1041	_mm_comilt_ss(__m128 __a, __m128 __b)
				1042	{
				1043	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
				1044	}
				1045
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1046	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1047	/// operands to determine if the first operand is less than or equal to the
				1048	/// second operand and returns the result of the comparison.
				1049	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1050	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1051	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1052	/// \headerfile <x86intrin.h>
				1053	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1054	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1055	///
				1056	/// \param __a
				1057	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1058	/// used in the comparison.
				1059	/// \param __b
				1060	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1061	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1062	/// \returns An integer containing the comparison results. If either of the two
				1063	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1064	static __inline__ int __DEFAULT_FN_ATTRS
				1065	_mm_comile_ss(__m128 __a, __m128 __b)
				1066	{
				1067	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
				1068	}
				1069
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1070	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1071	/// operands to determine if the first operand is greater than the second
				1072	/// operand and returns the result of the comparison.
				1073	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1074	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1075	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1076	/// \headerfile <x86intrin.h>
				1077	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1078	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1079	///
				1080	/// \param __a
				1081	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1082	/// used in the comparison.
				1083	/// \param __b
				1084	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1085	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1086	/// \returns An integer containing the comparison results. If either of the
				1087	/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1088	static __inline__ int __DEFAULT_FN_ATTRS
				1089	_mm_comigt_ss(__m128 __a, __m128 __b)
				1090	{
				1091	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
				1092	}
				1093
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1094	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1095	/// operands to determine if the first operand is greater than or equal to
				1096	/// the second operand and returns the result of the comparison.
				1097	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1098	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1099	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1100	/// \headerfile <x86intrin.h>
				1101	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1102	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1103	///
				1104	/// \param __a
				1105	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1106	/// used in the comparison.
				1107	/// \param __b
				1108	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1109	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1110	/// \returns An integer containing the comparison results. If either of the two
				1111	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1112	static __inline__ int __DEFAULT_FN_ATTRS
				1113	_mm_comige_ss(__m128 __a, __m128 __b)
				1114	{
				1115	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
				1116	}
				1117
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1118	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1119	/// operands to determine if the first operand is not equal to the second
				1120	/// operand and returns the result of the comparison.
				1121	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1122	/// If either of the two lower 32-bit values is NaN, 1 is returned.
				1123	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1124	/// \headerfile <x86intrin.h>
				1125	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1126	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1127	///
				1128	/// \param __a
				1129	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1130	/// used in the comparison.
				1131	/// \param __b
				1132	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1133	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1134	/// \returns An integer containing the comparison results. If either of the
				1135	/// two lower 32-bit values is NaN, 1 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1136	static __inline__ int __DEFAULT_FN_ATTRS
				1137	_mm_comineq_ss(__m128 __a, __m128 __b)
				1138	{
				1139	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
				1140	}
				1141
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1142	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1143	/// the low-order bits of both operands to determine equality and returns
				1144	/// the result of the comparison.
				1145	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1146	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1147	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1148	/// \headerfile <x86intrin.h>
				1149	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1150	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1151	///
				1152	/// \param __a
				1153	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1154	/// used in the comparison.
				1155	/// \param __b
				1156	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1157	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1158	/// \returns An integer containing the comparison results. If either of the two
				1159	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1160	static __inline__ int __DEFAULT_FN_ATTRS
				1161	_mm_ucomieq_ss(__m128 __a, __m128 __b)
				1162	{
				1163	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
				1164	}
				1165
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1166	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1167	/// the low-order bits of both operands to determine if the first operand is
				1168	/// less than the second operand and returns the result of the comparison.
				1169	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1170	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1171	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1172	/// \headerfile <x86intrin.h>
				1173	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1174	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1175	///
				1176	/// \param __a
				1177	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1178	/// used in the comparison.
				1179	/// \param __b
				1180	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1181	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1182	/// \returns An integer containing the comparison results. If either of the two
				1183	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1184	static __inline__ int __DEFAULT_FN_ATTRS
				1185	_mm_ucomilt_ss(__m128 __a, __m128 __b)
				1186	{
				1187	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
				1188	}
				1189
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1190	/// Performs an unordered comparison of two 32-bit float values using
				1191	/// the low-order bits of both operands to determine if the first operand is
				1192	/// less than or equal to the second operand and returns the result of the
				1193	/// comparison.
				1194	///
				1195	/// If either of the two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1196	///
				1197	/// \headerfile <x86intrin.h>
				1198	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1199	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1200	///
				1201	/// \param __a
				1202	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1203	/// used in the comparison.
				1204	/// \param __b
				1205	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1206	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1207	/// \returns An integer containing the comparison results. If either of the two
				1208	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1209	static __inline__ int __DEFAULT_FN_ATTRS
				1210	_mm_ucomile_ss(__m128 __a, __m128 __b)
				1211	{
				1212	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
				1213	}
				1214
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1215	/// Performs an unordered comparison of two 32-bit float values using
				1216	/// the low-order bits of both operands to determine if the first operand is
				1217	/// greater than the second operand and returns the result of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1218	/// comparison.
				1219	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1220	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1221	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1222	/// \headerfile <x86intrin.h>
				1223	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1224	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1225	///
				1226	/// \param __a
				1227	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1228	/// used in the comparison.
				1229	/// \param __b
				1230	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1231	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1232	/// \returns An integer containing the comparison results. If either of the two
				1233	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1234	static __inline__ int __DEFAULT_FN_ATTRS
				1235	_mm_ucomigt_ss(__m128 __a, __m128 __b)
				1236	{
				1237	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
				1238	}
				1239
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1240	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1241	/// the low-order bits of both operands to determine if the first operand is
				1242	/// greater than or equal to the second operand and returns the result of
				1243	/// the comparison.
				1244	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1245	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1246	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1247	/// \headerfile <x86intrin.h>
				1248	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1249	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1250	///
				1251	/// \param __a
				1252	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1253	/// used in the comparison.
				1254	/// \param __b
				1255	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1256	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1257	/// \returns An integer containing the comparison results. If either of the two
				1258	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1259	static __inline__ int __DEFAULT_FN_ATTRS
				1260	_mm_ucomige_ss(__m128 __a, __m128 __b)
				1261	{
				1262	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
				1263	}
				1264
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1265	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1266	/// the low-order bits of both operands to determine inequality and returns
				1267	/// the result of the comparison.
				1268	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1269	/// If either of the two lower 32-bit values is NaN, 1 is returned.
				1270	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1271	/// \headerfile <x86intrin.h>
				1272	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1273	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1274	///
				1275	/// \param __a
				1276	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1277	/// used in the comparison.
				1278	/// \param __b
				1279	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1280	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1281	/// \returns An integer containing the comparison results. If either of the two
				1282	/// lower 32-bit values is NaN, 1 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1283	static __inline__ int __DEFAULT_FN_ATTRS
				1284	_mm_ucomineq_ss(__m128 __a, __m128 __b)
				1285	{
				1286	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
				1287	}
				1288
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1289	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1290	/// [4 x float] into a 32-bit integer.
				1291	///
				1292	/// \headerfile <x86intrin.h>
				1293	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1294	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1295	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1296	///
				1297	/// \param __a
				1298	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1299	/// used in the conversion.
				1300	/// \returns A 32-bit integer containing the converted value.
				1301	static __inline__ int __DEFAULT_FN_ATTRS
				1302	_mm_cvtss_si32(__m128 __a)
				1303	{
				1304	return __builtin_ia32_cvtss2si((__v4sf)__a);
				1305	}
				1306
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1307	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1308	/// [4 x float] into a 32-bit integer.
				1309	///
				1310	/// \headerfile <x86intrin.h>
				1311	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1312	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1313	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1314	///
				1315	/// \param __a
				1316	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1317	/// used in the conversion.
				1318	/// \returns A 32-bit integer containing the converted value.
				1319	static __inline__ int __DEFAULT_FN_ATTRS
				1320	_mm_cvt_ss2si(__m128 __a)
				1321	{
				1322	return _mm_cvtss_si32(__a);
				1323	}
				1324
				1325	#ifdef __x86_64__
				1326
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1327	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1328	/// [4 x float] into a 64-bit integer.
				1329	///
				1330	/// \headerfile <x86intrin.h>
				1331	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1332	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1333	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1334	///
				1335	/// \param __a
				1336	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1337	/// used in the conversion.
				1338	/// \returns A 64-bit integer containing the converted value.
				1339	static __inline__ long long __DEFAULT_FN_ATTRS
				1340	_mm_cvtss_si64(__m128 __a)
				1341	{
				1342	return __builtin_ia32_cvtss2si64((__v4sf)__a);
				1343	}
				1344
				1345	#endif
				1346
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1347	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1348	/// [4 x float] into a 64-bit vector of [2 x i32].
				1349	///
				1350	/// \headerfile <x86intrin.h>
				1351	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1352	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1353	///
				1354	/// \param __a
				1355	/// A 128-bit vector of [4 x float].
				1356	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1357	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1358	_mm_cvtps_pi32(__m128 __a)
				1359	{
				1360	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
				1361	}
				1362
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1363	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1364	/// [4 x float] into a 64-bit vector of [2 x i32].
				1365	///
				1366	/// \headerfile <x86intrin.h>
				1367	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1368	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1369	///
				1370	/// \param __a
				1371	/// A 128-bit vector of [4 x float].
				1372	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1373	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1374	_mm_cvt_ps2pi(__m128 __a)
				1375	{
				1376	return _mm_cvtps_pi32(__a);
				1377	}
				1378
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1379	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1380	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1381	/// inexact.
				1382	///
				1383	/// \headerfile <x86intrin.h>
				1384	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1385	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1386	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1387	///
				1388	/// \param __a
				1389	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1390	/// used in the conversion.
				1391	/// \returns A 32-bit integer containing the converted value.
				1392	static __inline__ int __DEFAULT_FN_ATTRS
				1393	_mm_cvttss_si32(__m128 __a)
				1394	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1395	return __builtin_ia32_cvttss2si((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1396	}
				1397
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1398	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1399	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1400	/// inexact.
				1401	///
				1402	/// \headerfile <x86intrin.h>
				1403	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1404	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1405	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1406	///
				1407	/// \param __a
				1408	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1409	/// used in the conversion.
				1410	/// \returns A 32-bit integer containing the converted value.
				1411	static __inline__ int __DEFAULT_FN_ATTRS
				1412	_mm_cvtt_ss2si(__m128 __a)
				1413	{
				1414	return _mm_cvttss_si32(__a);
				1415	}
				1416
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1417	#ifdef __x86_64__
				1418	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1419	/// [4 x float] into a 64-bit integer, truncating the result when it is
				1420	/// inexact.
				1421	///
				1422	/// \headerfile <x86intrin.h>
				1423	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1424	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1425	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1426	///
				1427	/// \param __a
				1428	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1429	/// used in the conversion.
				1430	/// \returns A 64-bit integer containing the converted value.
				1431	static __inline__ long long __DEFAULT_FN_ATTRS
				1432	_mm_cvttss_si64(__m128 __a)
				1433	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1434	return __builtin_ia32_cvttss2si64((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1435	}
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1436	#endif
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1437
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1438	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1439	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
				1440	/// when it is inexact.
				1441	///
				1442	/// \headerfile <x86intrin.h>
				1443	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1444	/// This intrinsic corresponds to the <c> CVTTPS2PI </c>
				1445	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1446	///
				1447	/// \param __a
				1448	/// A 128-bit vector of [4 x float].
				1449	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1450	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1451	_mm_cvttps_pi32(__m128 __a)
				1452	{
				1453	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
				1454	}
				1455
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1456	/// Converts two low-order float values in a 128-bit vector of [4 x
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1457	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
				1458	/// is inexact.
				1459	///
				1460	/// \headerfile <x86intrin.h>
				1461	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1462	/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1463	///
				1464	/// \param __a
				1465	/// A 128-bit vector of [4 x float].
				1466	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1467	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1468	_mm_cvtt_ps2pi(__m128 __a)
				1469	{
				1470	return _mm_cvttps_pi32(__a);
				1471	}
				1472
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1473	/// Converts a 32-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1474	/// and writes it to the lower 32 bits of the destination. The remaining
				1475	/// higher order elements of the destination vector are copied from the
				1476	/// corresponding elements in the first operand.
				1477	///
				1478	/// \headerfile <x86intrin.h>
				1479	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1480	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1481	///
				1482	/// \param __a
				1483	/// A 128-bit vector of [4 x float].
				1484	/// \param __b
				1485	/// A 32-bit signed integer operand containing the value to be converted.
				1486	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1487	/// converted value of the second operand. The upper 96 bits are copied from
				1488	/// the upper 96 bits of the first operand.
				1489	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1490	_mm_cvtsi32_ss(__m128 __a, int __b)
				1491	{
				1492	__a[0] = __b;
				1493	return __a;
				1494	}
				1495
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1496	/// Converts a 32-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1497	/// and writes it to the lower 32 bits of the destination. The remaining
				1498	/// higher order elements of the destination are copied from the
				1499	/// corresponding elements in the first operand.
				1500	///
				1501	/// \headerfile <x86intrin.h>
				1502	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1503	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1504	///
				1505	/// \param __a
				1506	/// A 128-bit vector of [4 x float].
				1507	/// \param __b
				1508	/// A 32-bit signed integer operand containing the value to be converted.
				1509	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1510	/// converted value of the second operand. The upper 96 bits are copied from
				1511	/// the upper 96 bits of the first operand.
				1512	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1513	_mm_cvt_si2ss(__m128 __a, int __b)
				1514	{
				1515	return _mm_cvtsi32_ss(__a, __b);
				1516	}
				1517
				1518	#ifdef __x86_64__
				1519
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1520	/// Converts a 64-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1521	/// and writes it to the lower 32 bits of the destination. The remaining
				1522	/// higher order elements of the destination are copied from the
				1523	/// corresponding elements in the first operand.
				1524	///
				1525	/// \headerfile <x86intrin.h>
				1526	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1527	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1528	///
				1529	/// \param __a
				1530	/// A 128-bit vector of [4 x float].
				1531	/// \param __b
				1532	/// A 64-bit signed integer operand containing the value to be converted.
				1533	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1534	/// converted value of the second operand. The upper 96 bits are copied from
				1535	/// the upper 96 bits of the first operand.
				1536	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1537	_mm_cvtsi64_ss(__m128 __a, long long __b)
				1538	{
				1539	__a[0] = __b;
				1540	return __a;
				1541	}
				1542
				1543	#endif
				1544
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1545	/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1546	/// floating point values and writes them to the lower 64 bits of the
				1547	/// destination. The remaining higher order elements of the destination are
				1548	/// copied from the corresponding elements in the first operand.
				1549	///
				1550	/// \headerfile <x86intrin.h>
				1551	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1552	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1553	///
				1554	/// \param __a
				1555	/// A 128-bit vector of [4 x float].
				1556	/// \param __b
				1557	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1558	/// and written to the corresponding low-order elements in the destination.
				1559	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1560	/// converted value of the second operand. The upper 64 bits are copied from
				1561	/// the upper 64 bits of the first operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1562	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1563	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
				1564	{
				1565	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
				1566	}
				1567
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1568	/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1569	/// floating point values and writes them to the lower 64 bits of the
				1570	/// destination. The remaining higher order elements of the destination are
				1571	/// copied from the corresponding elements in the first operand.
				1572	///
				1573	/// \headerfile <x86intrin.h>
				1574	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1575	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1576	///
				1577	/// \param __a
				1578	/// A 128-bit vector of [4 x float].
				1579	/// \param __b
				1580	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1581	/// and written to the corresponding low-order elements in the destination.
				1582	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1583	/// converted value from the second operand. The upper 64 bits are copied
				1584	/// from the upper 64 bits of the first operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1585	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1586	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
				1587	{
				1588	return _mm_cvtpi32_ps(__a, __b);
				1589	}
				1590
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1591	/// Extracts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1592	/// [4 x float].
				1593	///
				1594	/// \headerfile <x86intrin.h>
				1595	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1596	/// This intrinsic has no corresponding instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1597	///
				1598	/// \param __a
				1599	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1600	/// used in the extraction.
				1601	/// \returns A 32-bit float containing the extracted value.
				1602	static __inline__ float __DEFAULT_FN_ATTRS
				1603	_mm_cvtss_f32(__m128 __a)
				1604	{
				1605	return __a[0];
				1606	}
				1607
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1608	/// Loads two packed float values from the address \a __p into the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1609	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
				1610	/// are copied from the low-order bits of the first operand.
				1611	///
				1612	/// \headerfile <x86intrin.h>
				1613	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1614	/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1615	///
				1616	/// \param __a
				1617	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
				1618	/// of the destination.
				1619	/// \param __p
				1620	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1621	/// [127:64] of the destination.
				1622	/// \returns A 128-bit vector of [4 x float] containing the moved values.
				1623	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1624	_mm_loadh_pi(__m128 __a, const __m64 *__p)
				1625	{
				1626	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
				1627	struct __mm_loadh_pi_struct {
				1628	__mm_loadh_pi_v2f32 __u;
				1629	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1630	__mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1631	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1632	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
				1633	}
				1634
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1635	/// Loads two packed float values from the address \a __p into the
				1636	/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
				1637	/// are copied from the high-order bits of the first operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1638	///
				1639	/// \headerfile <x86intrin.h>
				1640	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1641	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1642	///
				1643	/// \param __a
				1644	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
				1645	/// [127:64] of the destination.
				1646	/// \param __p
				1647	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1648	/// [63:0] of the destination.
				1649	/// \returns A 128-bit vector of [4 x float] containing the moved values.
				1650	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1651	_mm_loadl_pi(__m128 __a, const __m64 *__p)
				1652	{
				1653	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
				1654	struct __mm_loadl_pi_struct {
				1655	__mm_loadl_pi_v2f32 __u;
				1656	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1657	__mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1658	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1659	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
				1660	}
				1661
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1662	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1663	/// 32 bits of the vector are initialized with the single-precision
				1664	/// floating-point value loaded from a specified memory location. The upper
				1665	/// 96 bits are set to zero.
				1666	///
				1667	/// \headerfile <x86intrin.h>
				1668	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1669	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1670	///
				1671	/// \param __p
				1672	/// A pointer to a 32-bit memory location containing a single-precision
				1673	/// floating-point value.
				1674	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1675	/// lower 32 bits contain the value loaded from the memory location. The
				1676	/// upper 96 bits are set to zero.
				1677	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1678	_mm_load_ss(const float *__p)
				1679	{
				1680	struct __mm_load_ss_struct {
				1681	float __u;
				1682	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1683	float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1684	return __extension__ (__m128){ __u, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1685	}
				1686
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1687	/// Loads a 32-bit float value and duplicates it to all four vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1688	/// elements of a 128-bit vector of [4 x float].
				1689	///
				1690	/// \headerfile <x86intrin.h>
				1691	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1692	/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1693	/// instruction.
				1694	///
				1695	/// \param __p
				1696	/// A pointer to a float value to be loaded and duplicated.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1697	/// \returns A 128-bit vector of [4 x float] containing the loaded and
				1698	/// duplicated values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1699	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1700	_mm_load1_ps(const float *__p)
				1701	{
				1702	struct __mm_load1_ps_struct {
				1703	float __u;
				1704	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1705	float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1706	return __extension__ (__m128){ __u, __u, __u, __u };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1707	}
				1708
				1709	#define _mm_load_ps1(p) _mm_load1_ps(p)
				1710
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1711	/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1712	/// memory location.
				1713	///
				1714	/// \headerfile <x86intrin.h>
				1715	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1716	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1717	///
				1718	/// \param __p
				1719	/// A pointer to a 128-bit memory location. The address of the memory
				1720	/// location has to be 128-bit aligned.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1721	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1722	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1723	_mm_load_ps(const float *__p)
				1724	{
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1725	return (const __m128)__p;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1726	}
				1727
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1728	/// Loads a 128-bit floating-point vector of [4 x float] from an
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1729	/// unaligned memory location.
				1730	///
				1731	/// \headerfile <x86intrin.h>
				1732	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1733	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1734	///
				1735	/// \param __p
				1736	/// A pointer to a 128-bit memory location. The address of the memory
				1737	/// location does not have to be aligned.
				1738	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
				1739	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1740	_mm_loadu_ps(const float *__p)
				1741	{
				1742	struct __loadu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	1743	__m128_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1744	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1745	return ((const struct __loadu_ps*)__p)->__v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1746	}
				1747
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1748	/// Loads four packed float values, in reverse order, from an aligned
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1749	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
				1750	///
				1751	/// \headerfile <x86intrin.h>
				1752	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1753	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1754	/// instruction.
				1755	///
				1756	/// \param __p
				1757	/// A pointer to a 128-bit memory location. The address of the memory
				1758	/// location has to be 128-bit aligned.
				1759	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
				1760	/// in reverse order.
				1761	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1762	_mm_loadr_ps(const float *__p)
				1763	{
				1764	__m128 __a = _mm_load_ps(__p);
				1765	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
				1766	}
				1767
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1768	/// Create a 128-bit vector of [4 x float] with undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1769	///
				1770	/// \headerfile <x86intrin.h>
				1771	///
				1772	/// This intrinsic has no corresponding instruction.
				1773	///
				1774	/// \returns A 128-bit vector of [4 x float] containing undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1775	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1776	_mm_undefined_ps(void)
				1777	{
				1778	return (__m128)__builtin_ia32_undef128();
				1779	}
				1780
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1781	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1782	/// 32 bits of the vector are initialized with the specified single-precision
				1783	/// floating-point value. The upper 96 bits are set to zero.
				1784	///
				1785	/// \headerfile <x86intrin.h>
				1786	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1787	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1788	///
				1789	/// \param __w
				1790	/// A single-precision floating-point value used to initialize the lower 32
				1791	/// bits of the result.
				1792	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1793	/// lower 32 bits contain the value provided in the source operand. The
				1794	/// upper 96 bits are set to zero.
				1795	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1796	_mm_set_ss(float __w)
				1797	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1798	return __extension__ (__m128){ __w, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1799	}
				1800
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1801	/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1802	/// of the four single-precision floating-point vector elements set to the
				1803	/// specified single-precision floating-point value.
				1804	///
				1805	/// \headerfile <x86intrin.h>
				1806	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1807	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1808	///
				1809	/// \param __w
				1810	/// A single-precision floating-point value used to initialize each vector
				1811	/// element of the result.
				1812	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1813	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1814	_mm_set1_ps(float __w)
				1815	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1816	return __extension__ (__m128){ __w, __w, __w, __w };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1817	}
				1818
				1819	/* Microsoft specific. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1820	/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1821	/// of the four single-precision floating-point vector elements set to the
				1822	/// specified single-precision floating-point value.
				1823	///
				1824	/// \headerfile <x86intrin.h>
				1825	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1826	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1827	///
				1828	/// \param __w
				1829	/// A single-precision floating-point value used to initialize each vector
				1830	/// element of the result.
				1831	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1832	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1833	_mm_set_ps1(float __w)
				1834	{
				1835	return _mm_set1_ps(__w);
				1836	}
				1837
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1838	/// Constructs a 128-bit floating-point vector of [4 x float]
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1839	/// initialized with the specified single-precision floating-point values.
				1840	///
				1841	/// \headerfile <x86intrin.h>
				1842	///
				1843	/// This intrinsic is a utility function and does not correspond to a specific
				1844	/// instruction.
				1845	///
				1846	/// \param __z
				1847	/// A single-precision floating-point value used to initialize bits [127:96]
				1848	/// of the result.
				1849	/// \param __y
				1850	/// A single-precision floating-point value used to initialize bits [95:64]
				1851	/// of the result.
				1852	/// \param __x
				1853	/// A single-precision floating-point value used to initialize bits [63:32]
				1854	/// of the result.
				1855	/// \param __w
				1856	/// A single-precision floating-point value used to initialize bits [31:0]
				1857	/// of the result.
				1858	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1859	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1860	_mm_set_ps(float __z, float __y, float __x, float __w)
				1861	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1862	return __extension__ (__m128){ __w, __x, __y, __z };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1863	}
				1864
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1865	/// Constructs a 128-bit floating-point vector of [4 x float],
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1866	/// initialized in reverse order with the specified 32-bit single-precision
				1867	/// floating-point values.
				1868	///
				1869	/// \headerfile <x86intrin.h>
				1870	///
				1871	/// This intrinsic is a utility function and does not correspond to a specific
				1872	/// instruction.
				1873	///
				1874	/// \param __z
				1875	/// A single-precision floating-point value used to initialize bits [31:0]
				1876	/// of the result.
				1877	/// \param __y
				1878	/// A single-precision floating-point value used to initialize bits [63:32]
				1879	/// of the result.
				1880	/// \param __x
				1881	/// A single-precision floating-point value used to initialize bits [95:64]
				1882	/// of the result.
				1883	/// \param __w
				1884	/// A single-precision floating-point value used to initialize bits [127:96]
				1885	/// of the result.
				1886	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1887	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1888	_mm_setr_ps(float __z, float __y, float __x, float __w)
				1889	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1890	return __extension__ (__m128){ __z, __y, __x, __w };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1891	}
				1892
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1893	/// Constructs a 128-bit floating-point vector of [4 x float] initialized
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1894	/// to zero.
				1895	///
				1896	/// \headerfile <x86intrin.h>
				1897	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1898	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1899	///
				1900	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
				1901	/// all elements set to zero.
				1902	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1903	_mm_setzero_ps(void)
				1904	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1905	return __extension__ (__m128){ 0, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1906	}
				1907
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1908	/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1909	/// memory location.
				1910	///
				1911	/// \headerfile <x86intrin.h>
				1912	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1913	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1914	///
				1915	/// \param __p
				1916	/// A pointer to a 64-bit memory location.
				1917	/// \param __a
				1918	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1919	static __inline__ void __DEFAULT_FN_ATTRS
				1920	_mm_storeh_pi(__m64 *__p, __m128 __a)
				1921	{
Logan Chien	bedbf4f	2020-01-06 19:35:19 -0800	[diff] [blame]	1922	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
				1923	struct __mm_storeh_pi_struct {
				1924	__mm_storeh_pi_v2f32 __u;
				1925	} __attribute__((__packed__, __may_alias__));
				1926	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1927	}
				1928
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1929	/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1930	/// memory location.
				1931	///
				1932	/// \headerfile <x86intrin.h>
				1933	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1934	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1935	///
				1936	/// \param __p
				1937	/// A pointer to a memory location that will receive the float values.
				1938	/// \param __a
				1939	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1940	static __inline__ void __DEFAULT_FN_ATTRS
				1941	_mm_storel_pi(__m64 *__p, __m128 __a)
				1942	{
Logan Chien	bedbf4f	2020-01-06 19:35:19 -0800	[diff] [blame]	1943	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
				1944	struct __mm_storeh_pi_struct {
				1945	__mm_storeh_pi_v2f32 __u;
				1946	} __attribute__((__packed__, __may_alias__));
				1947	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1948	}
				1949
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1950	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1951	/// memory location.
				1952	///
				1953	/// \headerfile <x86intrin.h>
				1954	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1955	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1956	///
				1957	/// \param __p
				1958	/// A pointer to a 32-bit memory location.
				1959	/// \param __a
				1960	/// A 128-bit vector of [4 x float] containing the value to be stored.
				1961	static __inline__ void __DEFAULT_FN_ATTRS
				1962	_mm_store_ss(float *__p, __m128 __a)
				1963	{
				1964	struct __mm_store_ss_struct {
				1965	float __u;
				1966	} __attribute__((__packed__, __may_alias__));
				1967	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
				1968	}
				1969
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1970	/// Stores a 128-bit vector of [4 x float] to an unaligned memory
				1971	/// location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1972	///
				1973	/// \headerfile <x86intrin.h>
				1974	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1975	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1976	///
				1977	/// \param __p
				1978	/// A pointer to a 128-bit memory location. The address of the memory
				1979	/// location does not have to be aligned.
				1980	/// \param __a
				1981	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1982	static __inline__ void __DEFAULT_FN_ATTRS
				1983	_mm_storeu_ps(float *__p, __m128 __a)
				1984	{
				1985	struct __storeu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	1986	__m128_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1987	} __attribute__((__packed__, __may_alias__));
				1988	((struct __storeu_ps*)__p)->__v = __a;
				1989	}
				1990
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1991	/// Stores a 128-bit vector of [4 x float] into an aligned memory
				1992	/// location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1993	///
				1994	/// \headerfile <x86intrin.h>
				1995	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1996	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1997	///
				1998	/// \param __p
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1999	/// A pointer to a 128-bit memory location. The address of the memory
				2000	/// location has to be 16-byte aligned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2001	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2002	/// A 128-bit vector of [4 x float] containing the values to be stored.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2003	static __inline__ void __DEFAULT_FN_ATTRS
				2004	_mm_store_ps(float *__p, __m128 __a)
				2005	{
				2006	(__m128)__p = __a;
				2007	}
				2008
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2009	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2010	/// four contiguous elements in an aligned memory location.
				2011	///
				2012	/// \headerfile <x86intrin.h>
				2013	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2014	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2015	/// instruction.
				2016	///
				2017	/// \param __p
				2018	/// A pointer to a 128-bit memory location.
				2019	/// \param __a
				2020	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2021	/// of the four contiguous elements pointed by \a __p.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2022	static __inline__ void __DEFAULT_FN_ATTRS
				2023	_mm_store1_ps(float *__p, __m128 __a)
				2024	{
				2025	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
				2026	_mm_store_ps(__p, __a);
				2027	}
				2028
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2029	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
				2030	/// four contiguous elements in an aligned memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2031	///
				2032	/// \headerfile <x86intrin.h>
				2033	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2034	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
				2035	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2036	///
				2037	/// \param __p
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2038	/// A pointer to a 128-bit memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2039	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2040	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
				2041	/// of the four contiguous elements pointed by \a __p.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2042	static __inline__ void __DEFAULT_FN_ATTRS
				2043	_mm_store_ps1(float *__p, __m128 __a)
				2044	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2045	_mm_store1_ps(__p, __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2046	}
				2047
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2048	/// Stores float values from a 128-bit vector of [4 x float] to an
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2049	/// aligned memory location in reverse order.
				2050	///
				2051	/// \headerfile <x86intrin.h>
				2052	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2053	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2054	/// instruction.
				2055	///
				2056	/// \param __p
				2057	/// A pointer to a 128-bit memory location. The address of the memory
				2058	/// location has to be 128-bit aligned.
				2059	/// \param __a
				2060	/// A 128-bit vector of [4 x float] containing the values to be stored.
				2061	static __inline__ void __DEFAULT_FN_ATTRS
				2062	_mm_storer_ps(float *__p, __m128 __a)
				2063	{
				2064	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
				2065	_mm_store_ps(__p, __a);
				2066	}
				2067
/* Prefetch hint selectors for _mm_prefetch, in ascending numeric order.
   _mm_prefetch passes bit 2 of the selector as the read/write argument of
   __builtin_prefetch and bits [1:0] as its locality argument. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
				2074
				2075	#ifndef _MSC_VER
				2076	/* FIXME: We have to #define this because "sel" must be a constant integer, and
				2077	Sema doesn't do any form of constant propagation yet. */
				2078
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2079	/// Loads one cache line of data from the specified address to a location
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2080	/// closer to the processor.
				2081	///
				2082	/// \headerfile <x86intrin.h>
				2083	///
				2084	/// \code
				2085	/// void _mm_prefetch(const void * a, const int sel);
				2086	/// \endcode
				2087	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2088	/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2089	///
				2090	/// \param a
				2091	/// A pointer to a memory location containing a cache line of data.
				2092	/// \param sel
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2093	/// A predefined integer constant specifying the type of prefetch
				2094	/// operation: \n
				2095	/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
				2096	/// PREFETCHNTA instruction will be generated. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2097	/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2098	/// be generated. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2099	/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2100	/// be generated. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2101	/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
				2102	/// be generated.
/* Bit 2 of sel is passed as the read/write argument of __builtin_prefetch and
   bits [1:0] as its locality argument. */
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 3))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2105	#endif
				2106
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2107	/// Stores a 64-bit integer in the specified aligned memory location. To
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2108	/// minimize caching, the data is flagged as non-temporal (unlikely to be
				2109	/// used again soon).
				2110	///
				2111	/// \headerfile <x86intrin.h>
				2112	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2113	/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2114	///
				2115	/// \param __p
				2116	/// A pointer to an aligned memory location used to store the register value.
				2117	/// \param __a
				2118	/// A 64-bit integer containing the value to be stored.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2119	static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2120	_mm_stream_pi(__m64 *__p, __m64 __a)
				2121	{
				2122	__builtin_ia32_movntq(__p, __a);
				2123	}
				2124
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2125	/// Moves packed float values from a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2126	/// 128-bit aligned memory location. To minimize caching, the data is flagged
				2127	/// as non-temporal (unlikely to be used again soon).
				2128	///
				2129	/// \headerfile <x86intrin.h>
				2130	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2131	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2132	///
				2133	/// \param __p
				2134	/// A pointer to a 128-bit aligned memory location that will receive the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2135	/// single-precision floating-point values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2136	/// \param __a
				2137	/// A 128-bit vector of [4 x float] containing the values to be moved.
				2138	static __inline__ void __DEFAULT_FN_ATTRS
				2139	_mm_stream_ps(float *__p, __m128 __a)
				2140	{
				2141	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
				2142	}
				2143
#ifdef __cplusplus
extern "C" {
#endif

/// Enforces strong ordering (serialization) of stores: all store
///    instructions that precede this instruction are completed before any
///    store instruction that follows it is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
				2162
				2163	/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2164	/// returns it, as specified by the immediate integer operand.
				2165	///
				2166	/// \headerfile <x86intrin.h>
				2167	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2168	/// \code
				2169	/// int _mm_extract_pi16(__m64 a, int n);
				2170	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2171	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2172	/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
				2173	///
				2174	/// \param a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2175	/// A 64-bit vector of [4 x i16].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2176	/// \param n
				2177	/// An immediate integer operand that determines which bits are extracted: \n
				2178	/// 0: Bits [15:0] are copied to the destination. \n
				2179	/// 1: Bits [31:16] are copied to the destination. \n
				2180	/// 2: Bits [47:32] are copied to the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2181	/// 3: Bits [63:48] are copied to the destination.
				2182	/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Each argument is parenthesized so that a compound expression passed as a or
   n is cast as a whole rather than only its first operand. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2185
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2186	/// Copies data from the 64-bit vector of [4 x i16] to the destination,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2187	/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2188	/// specified by the immediate operand \a n.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2189	///
				2190	/// \headerfile <x86intrin.h>
				2191	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2192	/// \code
				2193	/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
				2194	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2195	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2196	/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
				2197	///
				2198	/// \param a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2199	/// A 64-bit vector of [4 x i16].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2200	/// \param d
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2201	/// An integer. The lower 16-bit value from this operand is written to the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2202	/// destination at the offset specified by operand \a n.
				2203	/// \param n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2204	/// An immediate integer operand that determines which bits are to be used
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2205	/// in the destination. \n
				2206	/// 0: Bits [15:0] are copied to the destination. \n
				2207	/// 1: Bits [31:16] are copied to the destination. \n
				2208	/// 2: Bits [47:32] are copied to the destination. \n
				2209	/// 3: Bits [63:48] are copied to the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2210	/// The remaining bits in the destination are copied from the corresponding
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2211	/// bits in operand \a a.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2212	/// \returns A 64-bit integer vector containing the copied packed data from the
				2213	/// operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2214	#define _mm_insert_pi16(a, d, n) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2215	((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2216
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2217	/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2218	/// the 64-bit integer vectors, and writes the greater value to the
				2219	/// corresponding bits in the destination.
				2220	///
				2221	/// \headerfile <x86intrin.h>
				2222	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2223	/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2224	///
				2225	/// \param __a
				2226	/// A 64-bit integer vector containing one of the source operands.
				2227	/// \param __b
				2228	/// A 64-bit integer vector containing one of the source operands.
				2229	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2230	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2231	_mm_max_pi16(__m64 __a, __m64 __b)
				2232	{
				2233	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
				2234	}
				2235
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2236	/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2237	/// values of the 64-bit integer vectors, and writes the greater value to the
				2238	/// corresponding bits in the destination.
				2239	///
				2240	/// \headerfile <x86intrin.h>
				2241	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2242	/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2243	///
				2244	/// \param __a
				2245	/// A 64-bit integer vector containing one of the source operands.
				2246	/// \param __b
				2247	/// A 64-bit integer vector containing one of the source operands.
				2248	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2249	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2250	_mm_max_pu8(__m64 __a, __m64 __b)
				2251	{
				2252	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
				2253	}
				2254
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2255	/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2256	/// the 64-bit integer vectors, and writes the lesser value to the
				2257	/// corresponding bits in the destination.
				2258	///
				2259	/// \headerfile <x86intrin.h>
				2260	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2261	/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2262	///
				2263	/// \param __a
				2264	/// A 64-bit integer vector containing one of the source operands.
				2265	/// \param __b
				2266	/// A 64-bit integer vector containing one of the source operands.
				2267	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2268	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2269	_mm_min_pi16(__m64 __a, __m64 __b)
				2270	{
				2271	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
				2272	}
				2273
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2274	/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2275	/// values of the 64-bit integer vectors, and writes the lesser value to the
				2276	/// corresponding bits in the destination.
				2277	///
				2278	/// \headerfile <x86intrin.h>
				2279	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2280	/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2281	///
				2282	/// \param __a
				2283	/// A 64-bit integer vector containing one of the source operands.
				2284	/// \param __b
				2285	/// A 64-bit integer vector containing one of the source operands.
				2286	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2287	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2288	_mm_min_pu8(__m64 __a, __m64 __b)
				2289	{
				2290	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
				2291	}
				2292
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2293	/// Takes the most significant bit from each 8-bit element in a 64-bit
				2294	/// integer vector to create an 8-bit mask value. Zero-extends the value to
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2295	/// 32-bit integer and writes it to the destination.
				2296	///
				2297	/// \headerfile <x86intrin.h>
				2298	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2299	/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2300	///
				2301	/// \param __a
				2302	/// A 64-bit integer vector containing the values with bits to be extracted.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2303	/// \returns The most significant bit from each 8-bit element in \a __a,
				2304	/// written to bits [7:0].
				2305	static __inline__ int __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2306	_mm_movemask_pi8(__m64 __a)
				2307	{
				2308	return __builtin_ia32_pmovmskb((__v8qi)__a);
				2309	}
				2310
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2311	/// Multiplies packed 16-bit unsigned integer values and writes the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2312	/// high-order 16 bits of each 32-bit product to the corresponding bits in
				2313	/// the destination.
				2314	///
				2315	/// \headerfile <x86intrin.h>
				2316	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2317	/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2318	///
				2319	/// \param __a
				2320	/// A 64-bit integer vector containing one of the source operands.
				2321	/// \param __b
				2322	/// A 64-bit integer vector containing one of the source operands.
				2323	/// \returns A 64-bit integer vector containing the products of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2324	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2325	_mm_mulhi_pu16(__m64 __a, __m64 __b)
				2326	{
				2327	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
				2328	}
				2329
/// Rearranges the four 16-bit integers of a 64-bit integer vector into the
///    destination, in the order selected by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. Each 2-bit field of \a n selects the source for one
///    16-bit field of the 64-bit destination: \n
///    Bits [1:0] select the source for bits [15:0] of the destination. \n
///    Bits [3:2] select the source for bits [31:16] of the destination. \n
///    Bits [5:4] select the source for bits [47:32] of the destination. \n
///    Bits [7:6] select the source for bits [63:48] of the destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2363
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2364	/// Conditionally copies the values from each 8-bit element in the first
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2365	/// 64-bit integer vector operand to the specified memory location, as
				2366	/// specified by the most significant bit in the corresponding element in the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2367	/// second 64-bit integer vector operand.
				2368	///
				2369	/// To minimize caching, the data is flagged as non-temporal
				2370	/// (unlikely to be used again soon).
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2371	///
				2372	/// \headerfile <x86intrin.h>
				2373	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2374	/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2375	///
				2376	/// \param __d
				2377	/// A 64-bit integer vector containing the values with elements to be copied.
				2378	/// \param __n
				2379	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2380	/// element determines whether the corresponding element in operand \a __d
				2381	/// is copied. If the most significant bit of a given element is 1, the
				2382	/// corresponding element in operand \a __d is copied.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2383	/// \param __p
				2384	/// A pointer to a 64-bit memory location that will receive the conditionally
				2385	/// copied integer values. The address of the memory location does not have
				2386	/// to be aligned.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2387	static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2388	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
				2389	{
				2390	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
				2391	}
				2392
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2393	/// Computes the rounded averages of the packed unsigned 8-bit integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2394	/// values and writes the averages to the corresponding bits in the
				2395	/// destination.
				2396	///
				2397	/// \headerfile <x86intrin.h>
				2398	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2399	/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2400	///
				2401	/// \param __a
				2402	/// A 64-bit integer vector containing one of the source operands.
				2403	/// \param __b
				2404	/// A 64-bit integer vector containing one of the source operands.
				2405	/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2406	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2407	_mm_avg_pu8(__m64 __a, __m64 __b)
				2408	{
				2409	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
				2410	}
				2411
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2412	/// Computes the rounded averages of the packed unsigned 16-bit integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2413	/// values and writes the averages to the corresponding bits in the
				2414	/// destination.
				2415	///
				2416	/// \headerfile <x86intrin.h>
				2417	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2418	/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2419	///
				2420	/// \param __a
				2421	/// A 64-bit integer vector containing one of the source operands.
				2422	/// \param __b
				2423	/// A 64-bit integer vector containing one of the source operands.
				2424	/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2425	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2426	_mm_avg_pu16(__m64 __a, __m64 __b)
				2427	{
				2428	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
				2429	}
				2430
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2431	/// Subtracts the corresponding 8-bit unsigned integer values of the two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2432	/// 64-bit vector operands and computes the absolute value for each of the
				2433	/// difference. Then sum of the 8 absolute differences is written to the
				2434	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
				2435	///
				2436	/// \headerfile <x86intrin.h>
				2437	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2438	/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2439	///
				2440	/// \param __a
				2441	/// A 64-bit integer vector containing one of the source operands.
				2442	/// \param __b
				2443	/// A 64-bit integer vector containing one of the source operands.
				2444	/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
				2445	/// sets of absolute differences between both operands. The upper bits are
				2446	/// cleared.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2447	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2448	_mm_sad_pu8(__m64 __a, __m64 __b)
				2449	{
				2450	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
				2451	}
				2452
#ifdef __cplusplus
extern "C" {
#endif

/// Reads the MXCSR register and returns its contents as a 32-bit unsigned
///    integer value.
///
///    Several groups of macros are associated with this intrinsic:
///    <ul>
///    <li>
///      Exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. The convenience wrapper is
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      The convenience wrapper is _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. The convenience wrapper is
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. The
///      convenience wrapper is _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. The convenience wrapper is
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Writes a 32-bit unsigned integer value to the MXCSR register.
///
///    Several groups of macros are associated with this intrinsic:
///    <ul>
///    <li>
///      Exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. The convenience wrapper is
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      The convenience wrapper is _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. The convenience wrapper is
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. The
///      convenience wrapper is _MM_SET_FLUSH_ZERO_MODE(x) where x is one of
///      these macros.
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. The convenience wrapper is
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#ifdef __cplusplus
} // extern "C"
#endif
				2566
				2567	/// Selects 4 float values from the 128-bit operands of [4 x float], as
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2568	/// specified by the immediate value operand.
				2569	///
				2570	/// \headerfile <x86intrin.h>
				2571	///
				2572	/// \code
				2573	/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
				2574	/// \endcode
				2575	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2576	/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2577	///
				2578	/// \param a
				2579	/// A 128-bit vector of [4 x float].
				2580	/// \param b
				2581	/// A 128-bit vector of [4 x float].
				2582	/// \param mask
				2583	/// An immediate value containing an 8-bit value specifying which elements to
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2584	/// copy from \a a and \a b. \n
				2585	/// Bits [3:0] specify the values copied from operand \a a. \n
				2586	/// Bits [7:4] specify the values copied from operand \a b. \n
				2587	/// The destinations within the 128-bit destination are assigned values as
				2588	/// follows: \n
				2589	/// Bits [1:0] are used to assign values to bits [31:0] in the
				2590	/// destination. \n
				2591	/// Bits [3:2] are used to assign values to bits [63:32] in the
				2592	/// destination. \n
				2593	/// Bits [5:4] are used to assign values to bits [95:64] in the
				2594	/// destination. \n
				2595	/// Bits [7:6] are used to assign values to bits [127:96] in the
				2596	/// destination. \n
				2597	/// Bit value assignments: \n
				2598	/// 00: Bits [31:0] copied from the specified operand. \n
				2599	/// 01: Bits [63:32] copied from the specified operand. \n
				2600	/// 10: Bits [95:64] copied from the specified operand. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2601	/// 11: Bits [127:96] copied from the specified operand.
				2602	/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2603	#define _mm_shuffle_ps(a, b, mask) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2604	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
				2605	(int)(mask)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2606
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2607	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
				2608	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2609	///
				2610	/// \headerfile <x86intrin.h>
				2611	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2612	/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2613	///
				2614	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2615	/// A 128-bit vector of [4 x float]. \n
				2616	/// Bits [95:64] are written to bits [31:0] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2617	/// Bits [127:96] are written to bits [95:64] of the destination.
				2618	/// \param __b
				2619	/// A 128-bit vector of [4 x float].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2620	/// Bits [95:64] are written to bits [63:32] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2621	/// Bits [127:96] are written to bits [127:96] of the destination.
				2622	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
				2623	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2624	_mm_unpackhi_ps(__m128 __a, __m128 __b)
				2625	{
				2626	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
				2627	}
				2628
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2629	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
				2630	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2631	///
				2632	/// \headerfile <x86intrin.h>
				2633	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2634	/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2635	///
				2636	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2637	/// A 128-bit vector of [4 x float]. \n
				2638	/// Bits [31:0] are written to bits [31:0] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2639	/// Bits [63:32] are written to bits [95:64] of the destination.
				2640	/// \param __b
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2641	/// A 128-bit vector of [4 x float]. \n
				2642	/// Bits [31:0] are written to bits [63:32] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2643	/// Bits [63:32] are written to bits [127:96] of the destination.
				2644	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
				2645	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2646	_mm_unpacklo_ps(__m128 __a, __m128 __b)
				2647	{
				2648	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
				2649	}
				2650
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2651	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2652	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
				2653	/// 96 bits are set to the upper 96 bits of the first parameter.
				2654	///
				2655	/// \headerfile <x86intrin.h>
				2656	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2657	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
				2658	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2659	///
				2660	/// \param __a
				2661	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
				2662	/// written to the upper 96 bits of the result.
				2663	/// \param __b
				2664	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
				2665	/// written to the lower 32 bits of the result.
				2666	/// \returns A 128-bit floating-point vector of [4 x float].
				2667	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2668	_mm_move_ss(__m128 __a, __m128 __b)
				2669	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2670	__a[0] = __b[0];
				2671	return __a;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2672	}
				2673
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2674	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2675	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
				2676	/// 64 bits are set to the upper 64 bits of the first parameter.
				2677	///
				2678	/// \headerfile <x86intrin.h>
				2679	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2680	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2681	///
				2682	/// \param __a
				2683	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2684	/// written to the upper 64 bits of the result.
				2685	/// \param __b
				2686	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2687	/// written to the lower 64 bits of the result.
				2688	/// \returns A 128-bit floating-point vector of [4 x float].
				2689	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2690	_mm_movehl_ps(__m128 __a, __m128 __b)
				2691	{
				2692	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
				2693	}
				2694
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2695	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2696	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
				2697	/// 64 bits are set to the lower 64 bits of the second parameter.
				2698	///
				2699	/// \headerfile <x86intrin.h>
				2700	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2701	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2702	///
				2703	/// \param __a
				2704	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2705	/// written to the lower 64 bits of the result.
				2706	/// \param __b
				2707	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2708	/// written to the upper 64 bits of the result.
				2709	/// \returns A 128-bit floating-point vector of [4 x float].
				2710	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2711	_mm_movelh_ps(__m128 __a, __m128 __b)
				2712	{
				2713	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
				2714	}
				2715
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2716	/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2717	/// float].
				2718	///
				2719	/// \headerfile <x86intrin.h>
				2720	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2721	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2722	///
				2723	/// \param __a
				2724	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
				2725	/// from the corresponding elements in this operand.
				2726	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2727	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2728	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2729	_mm_cvtpi16_ps(__m64 __a)
				2730	{
				2731	__m64 __b, __c;
				2732	__m128 __r;
				2733
				2734	__b = _mm_setzero_si64();
				2735	__b = _mm_cmpgt_pi16(__b, __a);
				2736	__c = _mm_unpackhi_pi16(__a, __b);
				2737	__r = _mm_setzero_ps();
				2738	__r = _mm_cvtpi32_ps(__r, __c);
				2739	__r = _mm_movelh_ps(__r, __r);
				2740	__c = _mm_unpacklo_pi16(__a, __b);
				2741	__r = _mm_cvtpi32_ps(__r, __c);
				2742
				2743	return __r;
				2744	}
				2745
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2746	/// Converts a 64-bit vector of 16-bit unsigned integer values into a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2747	/// 128-bit vector of [4 x float].
				2748	///
				2749	/// \headerfile <x86intrin.h>
				2750	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2751	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2752	///
				2753	/// \param __a
				2754	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
				2755	/// destination are copied from the corresponding elements in this operand.
				2756	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2757	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2758	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2759	_mm_cvtpu16_ps(__m64 __a)
				2760	{
				2761	__m64 __b, __c;
				2762	__m128 __r;
				2763
				2764	__b = _mm_setzero_si64();
				2765	__c = _mm_unpackhi_pi16(__a, __b);
				2766	__r = _mm_setzero_ps();
				2767	__r = _mm_cvtpi32_ps(__r, __c);
				2768	__r = _mm_movelh_ps(__r, __r);
				2769	__c = _mm_unpacklo_pi16(__a, __b);
				2770	__r = _mm_cvtpi32_ps(__r, __c);
				2771
				2772	return __r;
				2773	}
				2774
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2775	/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2776	/// into a 128-bit vector of [4 x float].
				2777	///
				2778	/// \headerfile <x86intrin.h>
				2779	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2780	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2781	///
				2782	/// \param __a
				2783	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
				2784	/// from the corresponding lower 4 elements in this operand.
				2785	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2786	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2787	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2788	_mm_cvtpi8_ps(__m64 __a)
				2789	{
				2790	__m64 __b;
				2791
				2792	__b = _mm_setzero_si64();
				2793	__b = _mm_cmpgt_pi8(__b, __a);
				2794	__b = _mm_unpacklo_pi8(__a, __b);
				2795
				2796	return _mm_cvtpi16_ps(__b);
				2797	}
				2798
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2799	/// Converts the lower four unsigned 8-bit integer values from a 64-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2800	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
				2801	///
				2802	/// \headerfile <x86intrin.h>
				2803	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2804	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2805	///
				2806	/// \param __a
				2807	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
				2808	/// destination are copied from the corresponding lower 4 elements in this
				2809	/// operand.
				2810	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2811	/// values from the source operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2812	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2813	_mm_cvtpu8_ps(__m64 __a)
				2814	{
				2815	__m64 __b;
				2816
				2817	__b = _mm_setzero_si64();
				2818	__b = _mm_unpacklo_pi8(__a, __b);
				2819
				2820	return _mm_cvtpi16_ps(__b);
				2821	}
				2822
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2823	/// Converts the two 32-bit signed integer values from each 64-bit vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2824	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
				2825	///
				2826	/// \headerfile <x86intrin.h>
				2827	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2828	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2829	///
				2830	/// \param __a
				2831	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
				2832	/// copied from the elements in this operand.
				2833	/// \param __b
				2834	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
				2835	/// copied from the elements in this operand.
				2836	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				2837	/// copied and converted values from the first operand. The upper 64 bits
				2838	/// contain the copied and converted values from the second operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2839	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2840	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
				2841	{
				2842	__m128 __c;
				2843
				2844	__c = _mm_setzero_ps();
				2845	__c = _mm_cvtpi32_ps(__c, __b);
				2846	__c = _mm_movelh_ps(__c, __c);
				2847
				2848	return _mm_cvtpi32_ps(__c, __a);
				2849	}
				2850
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2851	/// Converts each single-precision floating-point element of a 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2852	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2853	/// packs the results into a 64-bit integer vector of [4 x i16].
				2854	///
				2855	/// If the floating-point element is NaN or infinity, or if the
				2856	/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
				2857	/// it is converted to 0x8000. Otherwise if the floating-point element is
				2858	/// greater than 0x7FFF, it is converted to 0x7FFF.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2859	///
				2860	/// \headerfile <x86intrin.h>
				2861	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2862	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2863	///
				2864	/// \param __a
				2865	/// A 128-bit floating-point vector of [4 x float].
				2866	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
				2867	/// values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2868	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2869	_mm_cvtps_pi16(__m128 __a)
				2870	{
				2871	__m64 __b, __c;
				2872
				2873	__b = _mm_cvtps_pi32(__a);
				2874	__a = _mm_movehl_ps(__a, __a);
				2875	__c = _mm_cvtps_pi32(__a);
				2876
				2877	return _mm_packs_pi32(__b, __c);
				2878	}
				2879
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2880	/// Converts each single-precision floating-point element of a 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2881	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
				2882	/// packs the results into the lower 32 bits of a 64-bit integer vector of
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2883	/// [8 x i8]. The upper 32 bits of the vector are set to 0.
				2884	///
				2885	/// If the floating-point element is NaN or infinity, or if the
				2886	/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
				2887	/// is converted to 0x80. Otherwise if the floating-point element is greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2888	/// than 0x7F, it is converted to 0x7F.
				2889	///
				2890	/// \headerfile <x86intrin.h>
				2891	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2892	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2893	///
				2894	/// \param __a
				2895	/// 128-bit floating-point vector of [4 x float].
				2896	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
				2897	/// converted values and the uppper 32 bits are set to zero.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2898	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2899	_mm_cvtps_pi8(__m128 __a)
				2900	{
				2901	__m64 __b, __c;
				2902
				2903	__b = _mm_cvtps_pi16(__a);
				2904	__c = _mm_setzero_si64();
				2905
				2906	return _mm_packs_pi16(__b, __c);
				2907	}
				2908
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2909	/// Extracts the sign bits from each single-precision floating-point
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2910	/// element of a 128-bit floating-point vector of [4 x float] and returns the
				2911	/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
				2912	/// to zero.
				2913	///
				2914	/// \headerfile <x86intrin.h>
				2915	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2916	/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2917	///
				2918	/// \param __a
				2919	/// A 128-bit floating-point vector of [4 x float].
				2920	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
				2921	/// single-precision floating-point element of the parameter. Bits [31:4] are
				2922	/// set to zero.
				2923	static __inline__ int __DEFAULT_FN_ATTRS
				2924	_mm_movemask_ps(__m128 __a)
				2925	{
				2926	return __builtin_ia32_movmskps((__v4sf)__a);
				2927	}
				2928
				2929
				2930	#define _MM_ALIGN16 __attribute__((aligned(16)))
				2931
				2932	#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) \| ((y) << 4) \| ((x) << 2) \| (w))
				2933
/* Bit fields of the control/status word read by _mm_getcsr() and written by
 * _mm_setcsr(). All values are unsigned so that complementing a mask
 * (~_MM_..._MASK) stays well defined. */

/* Exception state flags: bits [5:0]. */
#define _MM_EXCEPT_INVALID    (1U << 0)
#define _MM_EXCEPT_DENORM     (1U << 1)
#define _MM_EXCEPT_DIV_ZERO   (1U << 2)
#define _MM_EXCEPT_OVERFLOW   (1U << 3)
#define _MM_EXCEPT_UNDERFLOW  (1U << 4)
#define _MM_EXCEPT_INEXACT    (1U << 5)
#define _MM_EXCEPT_MASK       (0x3fU << 0)

/* Exception mask bits: bits [12:7], in the same order as the flags above. */
#define _MM_MASK_INVALID      (1U << 7)
#define _MM_MASK_DENORM       (1U << 8)
#define _MM_MASK_DIV_ZERO     (1U << 9)
#define _MM_MASK_OVERFLOW     (1U << 10)
#define _MM_MASK_UNDERFLOW    (1U << 11)
#define _MM_MASK_INEXACT      (1U << 12)
#define _MM_MASK_MASK         (0x3fU << 7)

/* Rounding mode: two-bit field in bits [14:13]. */
#define _MM_ROUND_NEAREST     (0U << 13)
#define _MM_ROUND_DOWN        (1U << 13)
#define _MM_ROUND_UP          (2U << 13)
#define _MM_ROUND_TOWARD_ZERO (3U << 13)
#define _MM_ROUND_MASK        (3U << 13)

/* Flush-to-zero mode: bit 15. */
#define _MM_FLUSH_ZERO_MASK   (1U << 15)
#define _MM_FLUSH_ZERO_ON     (1U << 15)
#define _MM_FLUSH_ZERO_OFF    (0U << 15)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2959
				2960	#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
				2961	#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
				2962	#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
				2963	#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
				2964
				2965	#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) \| (x)))
				2966	#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) \| (x)))
				2967	#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) \| (x)))
				2968	#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) \| (x)))
				2969
				2970	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
				2971	do { \
				2972	__m128 tmp3, tmp2, tmp1, tmp0; \
				2973	tmp0 = _mm_unpacklo_ps((row0), (row1)); \
				2974	tmp2 = _mm_unpacklo_ps((row2), (row3)); \
				2975	tmp1 = _mm_unpackhi_ps((row0), (row1)); \
				2976	tmp3 = _mm_unpackhi_ps((row2), (row3)); \
				2977	(row0) = _mm_movelh_ps(tmp0, tmp2); \
				2978	(row1) = _mm_movehl_ps(tmp2, tmp0); \
				2979	(row2) = _mm_movelh_ps(tmp1, tmp3); \
				2980	(row3) = _mm_movehl_ps(tmp3, tmp1); \
				2981	} while (0)
				2982
				2983	/* Aliases for compatibility. */
				2984	#define _m_pextrw _mm_extract_pi16
				2985	#define _m_pinsrw _mm_insert_pi16
				2986	#define _m_pmaxsw _mm_max_pi16
				2987	#define _m_pmaxub _mm_max_pu8
				2988	#define _m_pminsw _mm_min_pi16
				2989	#define _m_pminub _mm_min_pu8
				2990	#define _m_pmovmskb _mm_movemask_pi8
				2991	#define _m_pmulhuw _mm_mulhi_pu16
				2992	#define _m_pshufw _mm_shuffle_pi16
				2993	#define _m_maskmovq _mm_maskmove_si64
				2994	#define _m_pavgb _mm_avg_pu8
				2995	#define _m_pavgw _mm_avg_pu16
				2996	#define _m_psadbw _mm_sad_pu8
				2997	#define _m_ _mm_
				2998	#define _m_ _mm_
				2999
				3000	#undef __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3001	#undef __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3002
				3003	/* Ugly hack for backwards-compatibility (compatible with gcc) */
				3004	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
				3005	#include <emmintrin.h>
				3006	#endif
				3007
				3008	#endif /* __XMMINTRIN_H */