Blame - darwin-x86/lib64/clang/14.0.2/include/xmmintrin.h - platform/prebuilts/clang-tools

blob: 1612d3d2773d5e137bb092a8664e527299991c7b [file] [log] [blame]

Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
				2	*
Logan Chien	df4f766	2019-09-04 16:45:23 -0700	[diff] [blame]	3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	* See https://llvm.org/LICENSE.txt for license information.
				5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	6	*
				7	*===-----------------------------------------------------------------------===
				8	*/
				9
				10	#ifndef __XMMINTRIN_H
				11	#define __XMMINTRIN_H
				12
Pirama Arumuga Nainar	ec8c89d	2022-02-23 09:26:16 -0800	[diff] [blame]	13	#if !defined(__i386__) && !defined(__x86_64__)
				14	#error "This header is only meant to be used on x86 and x64 architecture"
				15	#endif
				16
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	17	#include <mmintrin.h>
				18
				19	typedef int __v4si __attribute__((__vector_size__(16)));
				20	typedef float __v4sf __attribute__((__vector_size__(16)));
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	21	typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
				22
				23	typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	24
				25	/* Unsigned types */
				26	typedef unsigned int __v4su __attribute__((__vector_size__(16)));
				27
				28	/* This header should only be included in a hosted environment as it depends on
				29	* a standard library to provide allocation routines. */
				30	#if __STDC_HOSTED__
				31	#include <mm_malloc.h>
				32	#endif
				33
				34	/* Define the default attributes for the functions in this file. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	35	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
				36	#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	37
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	38	/// Adds the 32-bit float values in the low-order bits of the operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	39	///
				40	/// \headerfile <x86intrin.h>
				41	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	42	/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	43	///
				44	/// \param __a
				45	/// A 128-bit vector of [4 x float] containing one of the source operands.
				46	/// The lower 32 bits of this operand are used in the calculation.
				47	/// \param __b
				48	/// A 128-bit vector of [4 x float] containing one of the source operands.
				49	/// The lower 32 bits of this operand are used in the calculation.
				50	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
				51	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
				52	/// the upper 96 bits of the first source operand.
				53	static __inline__ __m128 __DEFAULT_FN_ATTRS
				54	_mm_add_ss(__m128 __a, __m128 __b)
				55	{
				56	__a[0] += __b[0];
				57	return __a;
				58	}
				59
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	60	/// Adds two 128-bit vectors of [4 x float], and returns the results of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	61	/// the addition.
				62	///
				63	/// \headerfile <x86intrin.h>
				64	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	65	/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	66	///
				67	/// \param __a
				68	/// A 128-bit vector of [4 x float] containing one of the source operands.
				69	/// \param __b
				70	/// A 128-bit vector of [4 x float] containing one of the source operands.
				71	/// \returns A 128-bit vector of [4 x float] containing the sums of both
				72	/// operands.
				73	static __inline__ __m128 __DEFAULT_FN_ATTRS
				74	_mm_add_ps(__m128 __a, __m128 __b)
				75	{
				76	return (__m128)((__v4sf)__a + (__v4sf)__b);
				77	}
				78
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	79	/// Subtracts the 32-bit float value in the low-order bits of the second
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	80	/// operand from the corresponding value in the first operand.
				81	///
				82	/// \headerfile <x86intrin.h>
				83	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	84	/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	85	///
				86	/// \param __a
				87	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
				88	/// of this operand are used in the calculation.
				89	/// \param __b
				90	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
				91	/// bits of this operand are used in the calculation.
				92	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				93	/// difference of the lower 32 bits of both operands. The upper 96 bits are
				94	/// copied from the upper 96 bits of the first source operand.
				95	static __inline__ __m128 __DEFAULT_FN_ATTRS
				96	_mm_sub_ss(__m128 __a, __m128 __b)
				97	{
				98	__a[0] -= __b[0];
				99	return __a;
				100	}
				101
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	102	/// Subtracts each of the values of the second operand from the first
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	103	/// operand, both of which are 128-bit vectors of [4 x float] and returns
				104	/// the results of the subtraction.
				105	///
				106	/// \headerfile <x86intrin.h>
				107	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	108	/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	109	///
				110	/// \param __a
				111	/// A 128-bit vector of [4 x float] containing the minuend.
				112	/// \param __b
				113	/// A 128-bit vector of [4 x float] containing the subtrahend.
				114	/// \returns A 128-bit vector of [4 x float] containing the differences between
				115	/// both operands.
				116	static __inline__ __m128 __DEFAULT_FN_ATTRS
				117	_mm_sub_ps(__m128 __a, __m128 __b)
				118	{
				119	return (__m128)((__v4sf)__a - (__v4sf)__b);
				120	}
				121
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	122	/// Multiplies two 32-bit float values in the low-order bits of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	123	/// operands.
				124	///
				125	/// \headerfile <x86intrin.h>
				126	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	127	/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	128	///
				129	/// \param __a
				130	/// A 128-bit vector of [4 x float] containing one of the source operands.
				131	/// The lower 32 bits of this operand are used in the calculation.
				132	/// \param __b
				133	/// A 128-bit vector of [4 x float] containing one of the source operands.
				134	/// The lower 32 bits of this operand are used in the calculation.
				135	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
				136	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
				137	/// bits of the first source operand.
				138	static __inline__ __m128 __DEFAULT_FN_ATTRS
				139	_mm_mul_ss(__m128 __a, __m128 __b)
				140	{
				141	__a[0] *= __b[0];
				142	return __a;
				143	}
				144
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	145	/// Multiplies two 128-bit vectors of [4 x float] and returns the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	146	/// results of the multiplication.
				147	///
				148	/// \headerfile <x86intrin.h>
				149	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	150	/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	151	///
				152	/// \param __a
				153	/// A 128-bit vector of [4 x float] containing one of the source operands.
				154	/// \param __b
				155	/// A 128-bit vector of [4 x float] containing one of the source operands.
				156	/// \returns A 128-bit vector of [4 x float] containing the products of both
				157	/// operands.
				158	static __inline__ __m128 __DEFAULT_FN_ATTRS
				159	_mm_mul_ps(__m128 __a, __m128 __b)
				160	{
				161	return (__m128)((__v4sf)__a * (__v4sf)__b);
				162	}
				163
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	164	/// Divides the value in the low-order 32 bits of the first operand by
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	165	/// the corresponding value in the second operand.
				166	///
				167	/// \headerfile <x86intrin.h>
				168	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	169	/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	170	///
				171	/// \param __a
				172	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
				173	/// bits of this operand are used in the calculation.
				174	/// \param __b
				175	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
				176	/// of this operand are used in the calculation.
				177	/// \returns A 128-bit vector of [4 x float] containing the quotient of the
				178	/// lower 32 bits of both operands. The upper 96 bits are copied from the
				179	/// upper 96 bits of the first source operand.
				180	static __inline__ __m128 __DEFAULT_FN_ATTRS
				181	_mm_div_ss(__m128 __a, __m128 __b)
				182	{
				183	__a[0] /= __b[0];
				184	return __a;
				185	}
				186
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	187	/// Divides two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	188	///
				189	/// \headerfile <x86intrin.h>
				190	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	191	/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	192	///
				193	/// \param __a
				194	/// A 128-bit vector of [4 x float] containing the dividend.
				195	/// \param __b
				196	/// A 128-bit vector of [4 x float] containing the divisor.
				197	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
				198	/// operands.
				199	static __inline__ __m128 __DEFAULT_FN_ATTRS
				200	_mm_div_ps(__m128 __a, __m128 __b)
				201	{
				202	return (__m128)((__v4sf)__a / (__v4sf)__b);
				203	}
				204
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	205	/// Calculates the square root of the value stored in the low-order bits
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	206	/// of a 128-bit vector of [4 x float].
				207	///
				208	/// \headerfile <x86intrin.h>
				209	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	210	/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	211	///
				212	/// \param __a
				213	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				214	/// used in the calculation.
				215	/// \returns A 128-bit vector of [4 x float] containing the square root of the
				216	/// value in the low-order bits of the operand.
				217	static __inline__ __m128 __DEFAULT_FN_ATTRS
				218	_mm_sqrt_ss(__m128 __a)
				219	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	220	return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	221	}
				222
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	223	/// Calculates the square roots of the values stored in a 128-bit vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	224	/// of [4 x float].
				225	///
				226	/// \headerfile <x86intrin.h>
				227	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	228	/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	229	///
				230	/// \param __a
				231	/// A 128-bit vector of [4 x float].
				232	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
				233	/// values in the operand.
				234	static __inline__ __m128 __DEFAULT_FN_ATTRS
				235	_mm_sqrt_ps(__m128 __a)
				236	{
				237	return __builtin_ia32_sqrtps((__v4sf)__a);
				238	}
				239
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	240	/// Calculates the approximate reciprocal of the value stored in the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	241	/// low-order bits of a 128-bit vector of [4 x float].
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	245	/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	246	///
				247	/// \param __a
				248	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				249	/// used in the calculation.
				250	/// \returns A 128-bit vector of [4 x float] containing the approximate
				251	/// reciprocal of the value in the low-order bits of the operand.
				252	static __inline__ __m128 __DEFAULT_FN_ATTRS
				253	_mm_rcp_ss(__m128 __a)
				254	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	255	return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	256	}
				257
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	258	/// Calculates the approximate reciprocals of the values stored in a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	259	/// 128-bit vector of [4 x float].
				260	///
				261	/// \headerfile <x86intrin.h>
				262	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	263	/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	264	///
				265	/// \param __a
				266	/// A 128-bit vector of [4 x float].
				267	/// \returns A 128-bit vector of [4 x float] containing the approximate
				268	/// reciprocals of the values in the operand.
				269	static __inline__ __m128 __DEFAULT_FN_ATTRS
				270	_mm_rcp_ps(__m128 __a)
				271	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	272	return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	273	}
				274
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	275	/// Calculates the approximate reciprocal of the square root of the value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	276	/// stored in the low-order bits of a 128-bit vector of [4 x float].
				277	///
				278	/// \headerfile <x86intrin.h>
				279	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	280	/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	281	///
				282	/// \param __a
				283	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				284	/// used in the calculation.
				285	/// \returns A 128-bit vector of [4 x float] containing the approximate
				286	/// reciprocal of the square root of the value in the low-order bits of the
				287	/// operand.
				288	static __inline__ __m128 __DEFAULT_FN_ATTRS
				289	_mm_rsqrt_ss(__m128 __a)
				290	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	291	return __builtin_ia32_rsqrtss((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	292	}
				293
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	294	/// Calculates the approximate reciprocals of the square roots of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	295	/// values stored in a 128-bit vector of [4 x float].
				296	///
				297	/// \headerfile <x86intrin.h>
				298	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	299	/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	300	///
				301	/// \param __a
				302	/// A 128-bit vector of [4 x float].
				303	/// \returns A 128-bit vector of [4 x float] containing the approximate
				304	/// reciprocals of the square roots of the values in the operand.
				305	static __inline__ __m128 __DEFAULT_FN_ATTRS
				306	_mm_rsqrt_ps(__m128 __a)
				307	{
				308	return __builtin_ia32_rsqrtps((__v4sf)__a);
				309	}
				310
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	311	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	312	/// operands and returns the lesser value in the low-order bits of the
				313	/// vector of [4 x float].
				314	///
				315	/// \headerfile <x86intrin.h>
				316	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	317	/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	318	///
				319	/// \param __a
				320	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				321	/// 32 bits of this operand are used in the comparison.
				322	/// \param __b
				323	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				324	/// 32 bits of this operand are used in the comparison.
				325	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				326	/// minimum value between both operands. The upper 96 bits are copied from
				327	/// the upper 96 bits of the first source operand.
				328	static __inline__ __m128 __DEFAULT_FN_ATTRS
				329	_mm_min_ss(__m128 __a, __m128 __b)
				330	{
				331	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
				332	}
				333
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	334	/// Compares two 128-bit vectors of [4 x float] and returns the lesser
				335	/// of each pair of values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	336	///
				337	/// \headerfile <x86intrin.h>
				338	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	339	/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	340	///
				341	/// \param __a
				342	/// A 128-bit vector of [4 x float] containing one of the operands.
				343	/// \param __b
				344	/// A 128-bit vector of [4 x float] containing one of the operands.
				345	/// \returns A 128-bit vector of [4 x float] containing the minimum values
				346	/// between both operands.
				347	static __inline__ __m128 __DEFAULT_FN_ATTRS
				348	_mm_min_ps(__m128 __a, __m128 __b)
				349	{
				350	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
				351	}
				352
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	353	/// Compares two 32-bit float values in the low-order bits of both
				354	/// operands and returns the greater value in the low-order bits of a 128-bit
				355	/// vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	356	///
				357	/// \headerfile <x86intrin.h>
				358	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	359	/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	360	///
				361	/// \param __a
				362	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				363	/// 32 bits of this operand are used in the comparison.
				364	/// \param __b
				365	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				366	/// 32 bits of this operand are used in the comparison.
				367	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				368	/// maximum value between both operands. The upper 96 bits are copied from
				369	/// the upper 96 bits of the first source operand.
				370	static __inline__ __m128 __DEFAULT_FN_ATTRS
				371	_mm_max_ss(__m128 __a, __m128 __b)
				372	{
				373	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
				374	}
				375
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	376	/// Compares two 128-bit vectors of [4 x float] and returns the greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	377	/// of each pair of values.
				378	///
				379	/// \headerfile <x86intrin.h>
				380	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	381	/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	382	///
				383	/// \param __a
				384	/// A 128-bit vector of [4 x float] containing one of the operands.
				385	/// \param __b
				386	/// A 128-bit vector of [4 x float] containing one of the operands.
				387	/// \returns A 128-bit vector of [4 x float] containing the maximum values
				388	/// between both operands.
				389	static __inline__ __m128 __DEFAULT_FN_ATTRS
				390	_mm_max_ps(__m128 __a, __m128 __b)
				391	{
				392	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
				393	}
				394
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	395	/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	396	///
				397	/// \headerfile <x86intrin.h>
				398	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	399	/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	400	///
				401	/// \param __a
				402	/// A 128-bit vector containing one of the source operands.
				403	/// \param __b
				404	/// A 128-bit vector containing one of the source operands.
				405	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				406	/// values between both operands.
				407	static __inline__ __m128 __DEFAULT_FN_ATTRS
				408	_mm_and_ps(__m128 __a, __m128 __b)
				409	{
				410	return (__m128)((__v4su)__a & (__v4su)__b);
				411	}
				412
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	413	/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	414	/// the one's complement of the values contained in the first source
				415	/// operand.
				416	///
				417	/// \headerfile <x86intrin.h>
				418	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	419	/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	420	///
				421	/// \param __a
				422	/// A 128-bit vector of [4 x float] containing the first source operand. The
				423	/// one's complement of this value is used in the bitwise AND.
				424	/// \param __b
				425	/// A 128-bit vector of [4 x float] containing the second source operand.
				426	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				427	/// one's complement of the first operand and the values in the second
				428	/// operand.
				429	static __inline__ __m128 __DEFAULT_FN_ATTRS
				430	_mm_andnot_ps(__m128 __a, __m128 __b)
				431	{
				432	return (__m128)(~(__v4su)__a & (__v4su)__b);
				433	}
				434
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	435	/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	436	///
				437	/// \headerfile <x86intrin.h>
				438	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	439	/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	440	///
				441	/// \param __a
				442	/// A 128-bit vector of [4 x float] containing one of the source operands.
				443	/// \param __b
				444	/// A 128-bit vector of [4 x float] containing one of the source operands.
				445	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
				446	/// values between both operands.
				447	static __inline__ __m128 __DEFAULT_FN_ATTRS
				448	_mm_or_ps(__m128 __a, __m128 __b)
				449	{
				450	return (__m128)((__v4su)__a \| (__v4su)__b);
				451	}
				452
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	453	/// Performs a bitwise exclusive OR of two 128-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	454	/// [4 x float].
				455	///
				456	/// \headerfile <x86intrin.h>
				457	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	458	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	459	///
				460	/// \param __a
				461	/// A 128-bit vector of [4 x float] containing one of the source operands.
				462	/// \param __b
				463	/// A 128-bit vector of [4 x float] containing one of the source operands.
				464	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
				465	/// of the values between both operands.
				466	static __inline__ __m128 __DEFAULT_FN_ATTRS
				467	_mm_xor_ps(__m128 __a, __m128 __b)
				468	{
				469	return (__m128)((__v4su)__a ^ (__v4su)__b);
				470	}
				471
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	472	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	473	/// operands for equality and returns the result of the comparison in the
				474	/// low-order bits of a vector of [4 x float].
				475	///
				476	/// \headerfile <x86intrin.h>
				477	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	478	/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	479	///
				480	/// \param __a
				481	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				482	/// 32 bits of this operand are used in the comparison.
				483	/// \param __b
				484	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				485	/// 32 bits of this operand are used in the comparison.
				486	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				487	/// in the low-order bits.
				488	static __inline__ __m128 __DEFAULT_FN_ATTRS
				489	_mm_cmpeq_ss(__m128 __a, __m128 __b)
				490	{
				491	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
				492	}
				493
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	494	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	495	/// 128-bit vectors of [4 x float] for equality.
				496	///
				497	/// \headerfile <x86intrin.h>
				498	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	499	/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	500	///
				501	/// \param __a
				502	/// A 128-bit vector of [4 x float].
				503	/// \param __b
				504	/// A 128-bit vector of [4 x float].
				505	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				506	static __inline__ __m128 __DEFAULT_FN_ATTRS
				507	_mm_cmpeq_ps(__m128 __a, __m128 __b)
				508	{
				509	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
				510	}
				511
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	512	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	513	/// operands to determine if the value in the first operand is less than the
				514	/// corresponding value in the second operand and returns the result of the
				515	/// comparison in the low-order bits of a vector of [4 x float].
				516	///
				517	/// \headerfile <x86intrin.h>
				518	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	519	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	520	///
				521	/// \param __a
				522	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				523	/// 32 bits of this operand are used in the comparison.
				524	/// \param __b
				525	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				526	/// 32 bits of this operand are used in the comparison.
				527	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				528	/// in the low-order bits.
				529	static __inline__ __m128 __DEFAULT_FN_ATTRS
				530	_mm_cmplt_ss(__m128 __a, __m128 __b)
				531	{
				532	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
				533	}
				534
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	535	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	536	/// 128-bit vectors of [4 x float] to determine if the values in the first
				537	/// operand are less than those in the second operand.
				538	///
				539	/// \headerfile <x86intrin.h>
				540	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	541	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	542	///
				543	/// \param __a
				544	/// A 128-bit vector of [4 x float].
				545	/// \param __b
				546	/// A 128-bit vector of [4 x float].
				547	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				548	static __inline__ __m128 __DEFAULT_FN_ATTRS
				549	_mm_cmplt_ps(__m128 __a, __m128 __b)
				550	{
				551	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
				552	}
				553
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	554	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	555	/// operands to determine if the value in the first operand is less than or
				556	/// equal to the corresponding value in the second operand and returns the
				557	/// result of the comparison in the low-order bits of a vector of
				558	/// [4 x float].
				559	///
				560	/// \headerfile <x86intrin.h>
				561	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	562	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	563	///
				564	/// \param __a
				565	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				566	/// 32 bits of this operand are used in the comparison.
				567	/// \param __b
				568	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				569	/// 32 bits of this operand are used in the comparison.
				570	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				571	/// in the low-order bits.
				572	static __inline__ __m128 __DEFAULT_FN_ATTRS
				573	_mm_cmple_ss(__m128 __a, __m128 __b)
				574	{
					/* Direct mapping: operands go to the scalar builtin in source order
					 * (__a first); the builtin's result is returned without a shuffle. */
				575	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
				576	}
				577
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	578	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	579	/// 128-bit vectors of [4 x float] to determine if the values in the first
				580	/// operand are less than or equal to those in the second operand.
				581	///
				582	/// \headerfile <x86intrin.h>
				583	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	584	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	585	///
				586	/// \param __a
				587	/// A 128-bit vector of [4 x float].
				588	/// \param __b
				589	/// A 128-bit vector of [4 x float].
				590	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				591	static __inline__ __m128 __DEFAULT_FN_ATTRS
				592	_mm_cmple_ps(__m128 __a, __m128 __b)
				593	{
					/* Direct mapping: __a and __b are passed to the less-than-or-equal
					 * builtin in source order and its result is cast back to __m128. */
				594	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
				595	}
				596
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	597	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	598	/// operands to determine if the value in the first operand is greater than
				599	/// the corresponding value in the second operand and returns the result of
				600	/// the comparison in the low-order bits of a vector of [4 x float].
				601	///
				602	/// \headerfile <x86intrin.h>
				603	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	604	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	605	///
				606	/// \param __a
				607	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				608	/// 32 bits of this operand are used in the comparison.
				609	/// \param __b
				610	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				611	/// 32 bits of this operand are used in the comparison.
				612	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				613	/// in the low-order bits.
				614	static __inline__ __m128 __DEFAULT_FN_ATTRS
				615	_mm_cmpgt_ss(__m128 __a, __m128 __b)
				616	{
					/* __a > __b is evaluated as __b < __a: the less-than builtin is called
					 * with the operands swapped. The shuffle then assembles the result from
					 * element 0 of the comparison (index 4 selects the first element of the
					 * second shuffle operand) and elements 1-3 of __a. */
				617	return (__m128)__builtin_shufflevector((__v4sf)__a,
				618	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
				619	4, 1, 2, 3);
				620	}
				621
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	622	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	623	/// 128-bit vectors of [4 x float] to determine if the values in the first
				624	/// operand are greater than those in the second operand.
				625	///
				626	/// \headerfile <x86intrin.h>
				627	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	628	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	629	///
				630	/// \param __a
				631	/// A 128-bit vector of [4 x float].
				632	/// \param __b
				633	/// A 128-bit vector of [4 x float].
				634	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				635	static __inline__ __m128 __DEFAULT_FN_ATTRS
				636	_mm_cmpgt_ps(__m128 __a, __m128 __b)
				637	{
					/* __a > __b is evaluated as __b < __a: the less-than builtin is called
					 * with the operands swapped (__b first, then __a). */
				638	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
				639	}
				640
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	641	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	642	/// operands to determine if the value in the first operand is greater than
				643	/// or equal to the corresponding value in the second operand and returns
				644	/// the result of the comparison in the low-order bits of a vector of
				645	/// [4 x float].
				646	///
				647	/// \headerfile <x86intrin.h>
				648	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	649	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	650	///
				651	/// \param __a
				652	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				653	/// 32 bits of this operand are used in the comparison.
				654	/// \param __b
				655	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				656	/// 32 bits of this operand are used in the comparison.
				657	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				658	/// in the low-order bits.
				659	static __inline__ __m128 __DEFAULT_FN_ATTRS
				660	_mm_cmpge_ss(__m128 __a, __m128 __b)
				661	{
					/* __a >= __b is evaluated as __b <= __a: the less-than-or-equal builtin
					 * is called with the operands swapped. The shuffle then assembles the
					 * result from element 0 of the comparison (index 4 selects the first
					 * element of the second shuffle operand) and elements 1-3 of __a. */
				662	return (__m128)__builtin_shufflevector((__v4sf)__a,
				663	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
				664	4, 1, 2, 3);
				665	}
				666
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	667	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	668	/// 128-bit vectors of [4 x float] to determine if the values in the first
				669	/// operand are greater than or equal to those in the second operand.
				670	///
				671	/// \headerfile <x86intrin.h>
				672	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	673	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	674	///
				675	/// \param __a
				676	/// A 128-bit vector of [4 x float].
				677	/// \param __b
				678	/// A 128-bit vector of [4 x float].
				679	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				680	static __inline__ __m128 __DEFAULT_FN_ATTRS
				681	_mm_cmpge_ps(__m128 __a, __m128 __b)
				682	{
					/* __a >= __b is evaluated as __b <= __a: the less-than-or-equal builtin
					 * is called with the operands swapped (__b first, then __a). */
				683	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
				684	}
				685
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	686	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	687	/// operands for inequality and returns the result of the comparison in the
				688	/// low-order bits of a vector of [4 x float].
				689	///
				690	/// \headerfile <x86intrin.h>
				691	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	692	/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
				693	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	694	///
				695	/// \param __a
				696	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				697	/// 32 bits of this operand are used in the comparison.
				698	/// \param __b
				699	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				700	/// 32 bits of this operand are used in the comparison.
				701	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				702	/// in the low-order bits.
				703	static __inline__ __m128 __DEFAULT_FN_ATTRS
				704	_mm_cmpneq_ss(__m128 __a, __m128 __b)
				705	{
					/* Direct mapping: operands go to the scalar not-equal builtin in source
					 * order (__a first); the builtin's result is returned without a shuffle. */
				706	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
				707	}
				708
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	709	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	710	/// 128-bit vectors of [4 x float] for inequality.
				711	///
				712	/// \headerfile <x86intrin.h>
				713	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	714	/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
				715	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	716	///
				717	/// \param __a
				718	/// A 128-bit vector of [4 x float].
				719	/// \param __b
				720	/// A 128-bit vector of [4 x float].
				721	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				722	static __inline__ __m128 __DEFAULT_FN_ATTRS
				723	_mm_cmpneq_ps(__m128 __a, __m128 __b)
				724	{
					/* Direct mapping: __a and __b are passed to the not-equal builtin in
					 * source order and its result is cast back to __m128. */
				725	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
				726	}
				727
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	728	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	729	/// operands to determine if the value in the first operand is not less than
				730	/// the corresponding value in the second operand and returns the result of
				731	/// the comparison in the low-order bits of a vector of [4 x float].
				732	///
				733	/// \headerfile <x86intrin.h>
				734	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	735	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
				736	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	737	///
				738	/// \param __a
				739	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				740	/// 32 bits of this operand are used in the comparison.
				741	/// \param __b
				742	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				743	/// 32 bits of this operand are used in the comparison.
				744	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				745	/// in the low-order bits.
				746	static __inline__ __m128 __DEFAULT_FN_ATTRS
				747	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
				748	{
					/* Direct mapping: operands go to the scalar not-less-than builtin in
					 * source order (__a first); its result is returned without a shuffle. */
				749	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
				750	}
				751
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	752	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	753	/// 128-bit vectors of [4 x float] to determine if the values in the first
				754	/// operand are not less than those in the second operand.
				755	///
				756	/// \headerfile <x86intrin.h>
				757	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	758	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
				759	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	760	///
				761	/// \param __a
				762	/// A 128-bit vector of [4 x float].
				763	/// \param __b
				764	/// A 128-bit vector of [4 x float].
				765	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				766	static __inline__ __m128 __DEFAULT_FN_ATTRS
				767	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
				768	{
					/* Direct mapping: __a and __b are passed to the not-less-than builtin in
					 * source order and its result is cast back to __m128. */
				769	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
				770	}
				771
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	772	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	773	/// operands to determine if the value in the first operand is not less than
				774	/// or equal to the corresponding value in the second operand and returns
				775	/// the result of the comparison in the low-order bits of a vector of
				776	/// [4 x float].
				777	///
				778	/// \headerfile <x86intrin.h>
				779	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	780	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
				781	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	782	///
				783	/// \param __a
				784	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				785	/// 32 bits of this operand are used in the comparison.
				786	/// \param __b
				787	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				788	/// 32 bits of this operand are used in the comparison.
				789	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				790	/// in the low-order bits.
				791	static __inline__ __m128 __DEFAULT_FN_ATTRS
				792	_mm_cmpnle_ss(__m128 __a, __m128 __b)
				793	{
					/* Direct mapping: operands go to the scalar not-less-than-or-equal
					 * builtin in source order (__a first); its result is returned without a
					 * shuffle. */
				794	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
				795	}
				796
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	797	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	798	/// 128-bit vectors of [4 x float] to determine if the values in the first
				799	/// operand are not less than or equal to those in the second operand.
				800	///
				801	/// \headerfile <x86intrin.h>
				802	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	803	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
				804	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	805	///
				806	/// \param __a
				807	/// A 128-bit vector of [4 x float].
				808	/// \param __b
				809	/// A 128-bit vector of [4 x float].
				810	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				811	static __inline__ __m128 __DEFAULT_FN_ATTRS
				812	_mm_cmpnle_ps(__m128 __a, __m128 __b)
				813	{
					/* Direct mapping: __a and __b are passed to the not-less-than-or-equal
					 * builtin in source order and its result is cast back to __m128. */
				814	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
				815	}
				816
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	817	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	818	/// operands to determine if the value in the first operand is not greater
				819	/// than the corresponding value in the second operand and returns the
				820	/// result of the comparison in the low-order bits of a vector of
				821	/// [4 x float].
				822	///
				823	/// \headerfile <x86intrin.h>
				824	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	825	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
				826	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	827	///
				828	/// \param __a
				829	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				830	/// 32 bits of this operand are used in the comparison.
				831	/// \param __b
				832	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				833	/// 32 bits of this operand are used in the comparison.
				834	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				835	/// in the low-order bits.
				836	static __inline__ __m128 __DEFAULT_FN_ATTRS
				837	_mm_cmpngt_ss(__m128 __a, __m128 __b)
				838	{
					/* "__a not greater than __b" is evaluated as "__b not less than __a":
					 * the not-less-than builtin is called with the operands swapped. The
					 * shuffle then assembles the result from element 0 of the comparison
					 * (index 4 selects the first element of the second shuffle operand) and
					 * elements 1-3 of __a. */
				839	return (__m128)__builtin_shufflevector((__v4sf)__a,
				840	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
				841	4, 1, 2, 3);
				842	}
				843
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	844	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	845	/// 128-bit vectors of [4 x float] to determine if the values in the first
				846	/// operand are not greater than those in the second operand.
				847	///
				848	/// \headerfile <x86intrin.h>
				849	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	850	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
				851	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	852	///
				853	/// \param __a
				854	/// A 128-bit vector of [4 x float].
				855	/// \param __b
				856	/// A 128-bit vector of [4 x float].
				857	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				858	static __inline__ __m128 __DEFAULT_FN_ATTRS
				859	_mm_cmpngt_ps(__m128 __a, __m128 __b)
				860	{
					/* "__a not greater than __b" is evaluated as "__b not less than __a":
					 * the not-less-than builtin is called with the operands swapped. */
				861	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
				862	}
				863
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	864	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	865	/// operands to determine if the value in the first operand is not greater
				866	/// than or equal to the corresponding value in the second operand and
				867	/// returns the result of the comparison in the low-order bits of a vector
				868	/// of [4 x float].
				869	///
				870	/// \headerfile <x86intrin.h>
				871	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	872	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
				873	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	874	///
				875	/// \param __a
				876	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				877	/// 32 bits of this operand are used in the comparison.
				878	/// \param __b
				879	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				880	/// 32 bits of this operand are used in the comparison.
				881	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				882	/// in the low-order bits.
				883	static __inline__ __m128 __DEFAULT_FN_ATTRS
				884	_mm_cmpnge_ss(__m128 __a, __m128 __b)
				885	{
					/* "__a not greater than or equal to __b" is evaluated as "__b not less
					 * than or equal to __a": the not-less-than-or-equal builtin is called
					 * with the operands swapped. The shuffle then assembles the result from
					 * element 0 of the comparison (index 4 selects the first element of the
					 * second shuffle operand) and elements 1-3 of __a. */
				886	return (__m128)__builtin_shufflevector((__v4sf)__a,
				887	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
				888	4, 1, 2, 3);
				889	}
				890
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	891	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	892	/// 128-bit vectors of [4 x float] to determine if the values in the first
				893	/// operand are not greater than or equal to those in the second operand.
				894	///
				895	/// \headerfile <x86intrin.h>
				896	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	897	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
				898	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	899	///
				900	/// \param __a
				901	/// A 128-bit vector of [4 x float].
				902	/// \param __b
				903	/// A 128-bit vector of [4 x float].
				904	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				905	static __inline__ __m128 __DEFAULT_FN_ATTRS
				906	_mm_cmpnge_ps(__m128 __a, __m128 __b)
				907	{
					/* "__a not greater than or equal to __b" is evaluated as "__b not less
					 * than or equal to __a": the not-less-than-or-equal builtin is called
					 * with the operands swapped. */
				908	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
				909	}
				910
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	911	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	912	/// operands to determine if the value in the first operand is ordered with
				913	/// respect to the corresponding value in the second operand and returns the
				914	/// result of the comparison in the low-order bits of a vector of
				915	/// [4 x float].
				916	///
				917	/// \headerfile <x86intrin.h>
				918	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	919	/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
				920	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	921	///
				922	/// \param __a
				923	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				924	/// 32 bits of this operand are used in the comparison.
				925	/// \param __b
				926	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				927	/// 32 bits of this operand are used in the comparison.
				928	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				929	/// in the low-order bits.
				930	static __inline__ __m128 __DEFAULT_FN_ATTRS
				931	_mm_cmpord_ss(__m128 __a, __m128 __b)
				932	{
					/* Direct mapping: operands go to the scalar ordered-compare builtin in
					 * source order (__a first); its result is returned without a shuffle. */
				933	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
				934	}
				935
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	936	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	937	/// 128-bit vectors of [4 x float] to determine if the values in the first
				938	/// operand are ordered with respect to those in the second operand.
				939	///
				940	/// \headerfile <x86intrin.h>
				941	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	942	/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
				943	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	944	///
				945	/// \param __a
				946	/// A 128-bit vector of [4 x float].
				947	/// \param __b
				948	/// A 128-bit vector of [4 x float].
				949	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				950	static __inline__ __m128 __DEFAULT_FN_ATTRS
				951	_mm_cmpord_ps(__m128 __a, __m128 __b)
				952	{
					/* Direct mapping: __a and __b are passed to the ordered-compare builtin
					 * in source order and its result is cast back to __m128. */
				953	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
				954	}
				955
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	956	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	957	/// operands to determine if the value in the first operand is unordered
				958	/// with respect to the corresponding value in the second operand and
				959	/// returns the result of the comparison in the low-order bits of a vector
				960	/// of [4 x float].
				961	///
				962	/// \headerfile <x86intrin.h>
				963	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	964	/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
				965	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	966	///
				967	/// \param __a
				968	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				969	/// 32 bits of this operand are used in the comparison.
				970	/// \param __b
				971	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				972	/// 32 bits of this operand are used in the comparison.
				973	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				974	/// in the low-order bits.
				975	static __inline__ __m128 __DEFAULT_FN_ATTRS
				976	_mm_cmpunord_ss(__m128 __a, __m128 __b)
				977	{
					/* Direct mapping: operands go to the scalar unordered-compare builtin in
					 * source order (__a first); its result is returned without a shuffle. */
				978	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
				979	}
				980
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	981	/// Compares each of the corresponding 32-bit float values of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	982	/// 128-bit vectors of [4 x float] to determine if the values in the first
				983	/// operand are unordered with respect to those in the second operand.
				984	///
				985	/// \headerfile <x86intrin.h>
				986	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	987	/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
				988	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	989	///
				990	/// \param __a
				991	/// A 128-bit vector of [4 x float].
				992	/// \param __b
				993	/// A 128-bit vector of [4 x float].
				994	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				995	static __inline__ __m128 __DEFAULT_FN_ATTRS
				996	_mm_cmpunord_ps(__m128 __a, __m128 __b)
				997	{
					/* Direct mapping: __a and __b are passed to the unordered-compare
					 * builtin in source order and its result is cast back to __m128. */
				998	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
				999	}
				1000
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1001	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1002	/// operands for equality and returns the result of the comparison.
				1003	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1004	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1005	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1006	/// \headerfile <x86intrin.h>
				1007	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1008	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
				1009	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1010	///
				1011	/// \param __a
				1012	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1013	/// used in the comparison.
				1014	/// \param __b
				1015	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1016	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1017	/// \returns An integer containing the comparison results. If either of the
				1018	/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1019	static __inline__ int __DEFAULT_FN_ATTRS
				1020	_mm_comieq_ss(__m128 __a, __m128 __b)
				1021	{
					/* Thin wrapper: both vectors are passed in source order and the int
					 * produced by the builtin is returned unchanged. */
				1022	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
				1023	}
				1024
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1025	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1026	/// operands to determine if the first operand is less than the second
				1027	/// operand and returns the result of the comparison.
				1028	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1029	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1030	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1031	/// \headerfile <x86intrin.h>
				1032	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1033	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
				1034	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1035	///
				1036	/// \param __a
				1037	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1038	/// used in the comparison.
				1039	/// \param __b
				1040	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1041	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1042	/// \returns An integer containing the comparison results. If either of the two
				1043	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1044	static __inline__ int __DEFAULT_FN_ATTRS
				1045	_mm_comilt_ss(__m128 __a, __m128 __b)
				1046	{
					/* Thin wrapper: both vectors are passed in source order and the int
					 * produced by the builtin is returned unchanged. */
				1047	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
				1048	}
				1049
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1050	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1051	/// operands to determine if the first operand is less than or equal to the
				1052	/// second operand and returns the result of the comparison.
				1053	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1054	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1055	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1056	/// \headerfile <x86intrin.h>
				1057	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1058	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1059	///
				1060	/// \param __a
				1061	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1062	/// used in the comparison.
				1063	/// \param __b
				1064	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1065	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1066	/// \returns An integer containing the comparison results. If either of the two
				1067	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1068	static __inline__ int __DEFAULT_FN_ATTRS
				1069	_mm_comile_ss(__m128 __a, __m128 __b)
				1070	{
					/* Thin wrapper: both vectors are passed in source order and the int
					 * produced by the builtin is returned unchanged. */
				1071	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
				1072	}
				1073
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1074	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1075	/// operands to determine if the first operand is greater than the second
				1076	/// operand and returns the result of the comparison.
				1077	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1078	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1079	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1080	/// \headerfile <x86intrin.h>
				1081	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1082	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1083	///
				1084	/// \param __a
				1085	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1086	/// used in the comparison.
				1087	/// \param __b
				1088	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1089	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1090	/// \returns An integer containing the comparison results. If either of the
				1091	/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1092	static __inline__ int __DEFAULT_FN_ATTRS
				1093	_mm_comigt_ss(__m128 __a, __m128 __b)
				1094	{
					/* Thin wrapper around a dedicated greater-than builtin: unlike
					 * _mm_cmpgt_ss, the operands are NOT swapped here. The int produced by
					 * the builtin is returned unchanged. */
				1095	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
				1096	}
				1097
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1098	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1099	/// operands to determine if the first operand is greater than or equal to
				1100	/// the second operand and returns the result of the comparison.
				1101	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1102	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1103	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1104	/// \headerfile <x86intrin.h>
				1105	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1106	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1107	///
				1108	/// \param __a
				1109	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1110	/// used in the comparison.
				1111	/// \param __b
				1112	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1113	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1114	/// \returns An integer containing the comparison results. If either of the two
				1115	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1116	static __inline__ int __DEFAULT_FN_ATTRS
				1117	_mm_comige_ss(__m128 __a, __m128 __b)
				1118	{
					/* Thin wrapper around a dedicated greater-than-or-equal builtin: unlike
					 * _mm_cmpge_ss, the operands are NOT swapped here. The int produced by
					 * the builtin is returned unchanged. */
				1119	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
				1120	}
				1121
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1122	/// Compares two 32-bit float values in the low-order bits of both
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1123	/// operands to determine if the first operand is not equal to the second
				1124	/// operand and returns the result of the comparison.
				1125	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1126	/// If either of the two lower 32-bit values is NaN, 1 is returned.
				1127	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1128	/// \headerfile <x86intrin.h>
				1129	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1130	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1131	///
				1132	/// \param __a
				1133	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1134	/// used in the comparison.
				1135	/// \param __b
				1136	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1137	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1138	/// \returns An integer containing the comparison results. If either of the
				1139	/// two lower 32-bit values is NaN, 1 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1140	static __inline__ int __DEFAULT_FN_ATTRS
				1141	_mm_comineq_ss(__m128 __a, __m128 __b)
				1142	{
				1143	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
				1144	}
				1145
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1146	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1147	/// the low-order bits of both operands to determine equality and returns
				1148	/// the result of the comparison.
				1149	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1150	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1151	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1152	/// \headerfile <x86intrin.h>
				1153	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1154	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1155	///
				1156	/// \param __a
				1157	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1158	/// used in the comparison.
				1159	/// \param __b
				1160	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1161	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1162	/// \returns An integer containing the comparison results. If either of the two
				1163	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1164	static __inline__ int __DEFAULT_FN_ATTRS
				1165	_mm_ucomieq_ss(__m128 __a, __m128 __b)
				1166	{
				1167	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
				1168	}
				1169
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1170	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1171	/// the low-order bits of both operands to determine if the first operand is
				1172	/// less than the second operand and returns the result of the comparison.
				1173	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1174	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1175	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1176	/// \headerfile <x86intrin.h>
				1177	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1178	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1179	///
				1180	/// \param __a
				1181	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1182	/// used in the comparison.
				1183	/// \param __b
				1184	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1185	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1186	/// \returns An integer containing the comparison results. If either of the two
				1187	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1188	static __inline__ int __DEFAULT_FN_ATTRS
				1189	_mm_ucomilt_ss(__m128 __a, __m128 __b)
				1190	{
				1191	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
				1192	}
				1193
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1194	/// Performs an unordered comparison of two 32-bit float values using
				1195	/// the low-order bits of both operands to determine if the first operand is
				1196	/// less than or equal to the second operand and returns the result of the
				1197	/// comparison.
				1198	///
				1199	/// If either of the two lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1200	///
				1201	/// \headerfile <x86intrin.h>
				1202	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1203	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1204	///
				1205	/// \param __a
				1206	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1207	/// used in the comparison.
				1208	/// \param __b
				1209	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1210	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1211	/// \returns An integer containing the comparison results. If either of the two
				1212	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1213	static __inline__ int __DEFAULT_FN_ATTRS
				1214	_mm_ucomile_ss(__m128 __a, __m128 __b)
				1215	{
				1216	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
				1217	}
				1218
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1219	/// Performs an unordered comparison of two 32-bit float values using
				1220	/// the low-order bits of both operands to determine if the first operand is
				1221	/// greater than the second operand and returns the result of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1222	/// comparison.
				1223	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1224	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1225	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1226	/// \headerfile <x86intrin.h>
				1227	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1228	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1229	///
				1230	/// \param __a
				1231	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1232	/// used in the comparison.
				1233	/// \param __b
				1234	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1235	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1236	/// \returns An integer containing the comparison results. If either of the two
				1237	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1238	static __inline__ int __DEFAULT_FN_ATTRS
				1239	_mm_ucomigt_ss(__m128 __a, __m128 __b)
				1240	{
				1241	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
				1242	}
				1243
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1244	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1245	/// the low-order bits of both operands to determine if the first operand is
				1246	/// greater than or equal to the second operand and returns the result of
				1247	/// the comparison.
				1248	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1249	/// If either of the two lower 32-bit values is NaN, 0 is returned.
				1250	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1251	/// \headerfile <x86intrin.h>
				1252	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1253	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1254	///
				1255	/// \param __a
				1256	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1257	/// used in the comparison.
				1258	/// \param __b
				1259	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1260	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1261	/// \returns An integer containing the comparison results. If either of the two
				1262	/// lower 32-bit values is NaN, 0 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1263	static __inline__ int __DEFAULT_FN_ATTRS
				1264	_mm_ucomige_ss(__m128 __a, __m128 __b)
				1265	{
				1266	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
				1267	}
				1268
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1269	/// Performs an unordered comparison of two 32-bit float values using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1270	/// the low-order bits of both operands to determine inequality and returns
				1271	/// the result of the comparison.
				1272	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1273	/// If either of the two lower 32-bit values is NaN, 1 is returned.
				1274	///
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1275	/// \headerfile <x86intrin.h>
				1276	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1277	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1278	///
				1279	/// \param __a
				1280	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1281	/// used in the comparison.
				1282	/// \param __b
				1283	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1284	/// used in the comparison.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1285	/// \returns An integer containing the comparison results. If either of the two
				1286	/// lower 32-bit values is NaN, 1 is returned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1287	static __inline__ int __DEFAULT_FN_ATTRS
				1288	_mm_ucomineq_ss(__m128 __a, __m128 __b)
				1289	{
				1290	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
				1291	}
				1292
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1293	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1294	/// [4 x float] into a 32-bit integer.
				1295	///
				1296	/// \headerfile <x86intrin.h>
				1297	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1298	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1299	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1300	///
				1301	/// \param __a
				1302	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1303	/// used in the conversion.
				1304	/// \returns A 32-bit integer containing the converted value.
				1305	static __inline__ int __DEFAULT_FN_ATTRS
				1306	_mm_cvtss_si32(__m128 __a)
				1307	{
				1308	return __builtin_ia32_cvtss2si((__v4sf)__a);
				1309	}
				1310
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1311	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1312	/// [4 x float] into a 32-bit integer.
				1313	///
				1314	/// \headerfile <x86intrin.h>
				1315	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1316	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1317	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1318	///
				1319	/// \param __a
				1320	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1321	/// used in the conversion.
				1322	/// \returns A 32-bit integer containing the converted value.
				1323	static __inline__ int __DEFAULT_FN_ATTRS
				1324	_mm_cvt_ss2si(__m128 __a)
				1325	{
				1326	return _mm_cvtss_si32(__a);
				1327	}
				1328
				1329	#ifdef __x86_64__
				1330
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1331	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1332	/// [4 x float] into a 64-bit integer.
				1333	///
				1334	/// \headerfile <x86intrin.h>
				1335	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1336	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
				1337	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1338	///
				1339	/// \param __a
				1340	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1341	/// used in the conversion.
				1342	/// \returns A 64-bit integer containing the converted value.
				1343	static __inline__ long long __DEFAULT_FN_ATTRS
				1344	_mm_cvtss_si64(__m128 __a)
				1345	{
				1346	return __builtin_ia32_cvtss2si64((__v4sf)__a);
				1347	}
				1348
				1349	#endif
				1350
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1351	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1352	/// [4 x float] into a 64-bit vector of [2 x i32].
				1353	///
				1354	/// \headerfile <x86intrin.h>
				1355	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1356	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1357	///
				1358	/// \param __a
				1359	/// A 128-bit vector of [4 x float].
				1360	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1361	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1362	_mm_cvtps_pi32(__m128 __a)
				1363	{
				1364	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
				1365	}
				1366
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1367	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1368	/// [4 x float] into a 64-bit vector of [2 x i32].
				1369	///
				1370	/// \headerfile <x86intrin.h>
				1371	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1372	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1373	///
				1374	/// \param __a
				1375	/// A 128-bit vector of [4 x float].
				1376	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1377	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1378	_mm_cvt_ps2pi(__m128 __a)
				1379	{
				1380	return _mm_cvtps_pi32(__a);
				1381	}
				1382
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1383	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1384	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1385	/// inexact.
				1386	///
				1387	/// \headerfile <x86intrin.h>
				1388	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1389	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1390	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1391	///
				1392	/// \param __a
				1393	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1394	/// used in the conversion.
				1395	/// \returns A 32-bit integer containing the converted value.
				1396	static __inline__ int __DEFAULT_FN_ATTRS
				1397	_mm_cvttss_si32(__m128 __a)
				1398	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1399	return __builtin_ia32_cvttss2si((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1400	}
				1401
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1402	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1403	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1404	/// inexact.
				1405	///
				1406	/// \headerfile <x86intrin.h>
				1407	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1408	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1409	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1410	///
				1411	/// \param __a
				1412	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1413	/// used in the conversion.
				1414	/// \returns A 32-bit integer containing the converted value.
				1415	static __inline__ int __DEFAULT_FN_ATTRS
				1416	_mm_cvtt_ss2si(__m128 __a)
				1417	{
				1418	return _mm_cvttss_si32(__a);
				1419	}
				1420
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1421	#ifdef __x86_64__
				1422	/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1423	/// [4 x float] into a 64-bit integer, truncating the result when it is
				1424	/// inexact.
				1425	///
				1426	/// \headerfile <x86intrin.h>
				1427	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1428	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
				1429	/// instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1430	///
				1431	/// \param __a
				1432	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1433	/// used in the conversion.
				1434	/// \returns A 64-bit integer containing the converted value.
				1435	static __inline__ long long __DEFAULT_FN_ATTRS
				1436	_mm_cvttss_si64(__m128 __a)
				1437	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1438	return __builtin_ia32_cvttss2si64((__v4sf)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1439	}
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1440	#endif
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1441
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1442	/// Converts two low-order float values in a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1443	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
				1444	/// when it is inexact.
				1445	///
				1446	/// \headerfile <x86intrin.h>
				1447	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1448	/// This intrinsic corresponds to the <c> CVTTPS2PI </c>
				1449	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1450	///
				1451	/// \param __a
				1452	/// A 128-bit vector of [4 x float].
				1453	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1454	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1455	_mm_cvttps_pi32(__m128 __a)
				1456	{
				1457	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
				1458	}
				1459
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1460	/// Converts two low-order float values in a 128-bit vector of [4 x
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1461	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
				1462	/// is inexact.
				1463	///
				1464	/// \headerfile <x86intrin.h>
				1465	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1466	/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1467	///
				1468	/// \param __a
				1469	/// A 128-bit vector of [4 x float].
				1470	/// \returns A 64-bit integer vector containing the converted values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1471	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1472	_mm_cvtt_ps2pi(__m128 __a)
				1473	{
				1474	return _mm_cvttps_pi32(__a);
				1475	}
				1476
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1477	/// Converts a 32-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1478	/// and writes it to the lower 32 bits of the destination. The remaining
				1479	/// higher order elements of the destination vector are copied from the
				1480	/// corresponding elements in the first operand.
				1481	///
				1482	/// \headerfile <x86intrin.h>
				1483	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1484	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1485	///
				1486	/// \param __a
				1487	/// A 128-bit vector of [4 x float].
				1488	/// \param __b
				1489	/// A 32-bit signed integer operand containing the value to be converted.
				1490	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1491	/// converted value of the second operand. The upper 96 bits are copied from
				1492	/// the upper 96 bits of the first operand.
				1493	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1494	_mm_cvtsi32_ss(__m128 __a, int __b)
				1495	{
				1496	__a[0] = __b;
				1497	return __a;
				1498	}
				1499
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1500	/// Converts a 32-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1501	/// and writes it to the lower 32 bits of the destination. The remaining
				1502	/// higher order elements of the destination are copied from the
				1503	/// corresponding elements in the first operand.
				1504	///
				1505	/// \headerfile <x86intrin.h>
				1506	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1507	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1508	///
				1509	/// \param __a
				1510	/// A 128-bit vector of [4 x float].
				1511	/// \param __b
				1512	/// A 32-bit signed integer operand containing the value to be converted.
				1513	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1514	/// converted value of the second operand. The upper 96 bits are copied from
				1515	/// the upper 96 bits of the first operand.
				1516	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1517	_mm_cvt_si2ss(__m128 __a, int __b)
				1518	{
				1519	return _mm_cvtsi32_ss(__a, __b);
				1520	}
				1521
				1522	#ifdef __x86_64__
				1523
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1524	/// Converts a 64-bit signed integer value into a floating point value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1525	/// and writes it to the lower 32 bits of the destination. The remaining
				1526	/// higher order elements of the destination are copied from the
				1527	/// corresponding elements in the first operand.
				1528	///
				1529	/// \headerfile <x86intrin.h>
				1530	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1531	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1532	///
				1533	/// \param __a
				1534	/// A 128-bit vector of [4 x float].
				1535	/// \param __b
				1536	/// A 64-bit signed integer operand containing the value to be converted.
				1537	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1538	/// converted value of the second operand. The upper 96 bits are copied from
				1539	/// the upper 96 bits of the first operand.
				1540	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1541	_mm_cvtsi64_ss(__m128 __a, long long __b)
				1542	{
				1543	__a[0] = __b;
				1544	return __a;
				1545	}
				1546
				1547	#endif
				1548
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1549	/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1550	/// floating point values and writes them to the lower 64 bits of the
				1551	/// destination. The remaining higher order elements of the destination are
				1552	/// copied from the corresponding elements in the first operand.
				1553	///
				1554	/// \headerfile <x86intrin.h>
				1555	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1556	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1557	///
				1558	/// \param __a
				1559	/// A 128-bit vector of [4 x float].
				1560	/// \param __b
				1561	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1562	/// and written to the corresponding low-order elements in the destination.
				1563	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1564	/// converted value of the second operand. The upper 64 bits are copied from
				1565	/// the upper 64 bits of the first operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1566	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1567	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
				1568	{
				1569	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
				1570	}
				1571
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1572	/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1573	/// floating point values and writes them to the lower 64 bits of the
				1574	/// destination. The remaining higher order elements of the destination are
				1575	/// copied from the corresponding elements in the first operand.
				1576	///
				1577	/// \headerfile <x86intrin.h>
				1578	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1579	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1580	///
				1581	/// \param __a
				1582	/// A 128-bit vector of [4 x float].
				1583	/// \param __b
				1584	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1585	/// and written to the corresponding low-order elements in the destination.
				1586	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1587	/// converted value from the second operand. The upper 64 bits are copied
				1588	/// from the upper 64 bits of the first operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1589	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1590	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
				1591	{
				1592	return _mm_cvtpi32_ps(__a, __b);
				1593	}
				1594
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1595	/// Extracts a float value contained in the lower 32 bits of a vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1596	/// [4 x float].
				1597	///
				1598	/// \headerfile <x86intrin.h>
				1599	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1600	/// This intrinsic has no corresponding instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1601	///
				1602	/// \param __a
				1603	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1604	/// used in the extraction.
				1605	/// \returns A 32-bit float containing the extracted value.
				1606	static __inline__ float __DEFAULT_FN_ATTRS
				1607	_mm_cvtss_f32(__m128 __a)
				1608	{
				1609	return __a[0];
				1610	}
				1611
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1612	/// Loads two packed float values from the address \a __p into the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1613	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
				1614	/// are copied from the low-order bits of the first operand.
				1615	///
				1616	/// \headerfile <x86intrin.h>
				1617	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1618	/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1619	///
				1620	/// \param __a
				1621	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
				1622	/// of the destination.
				1623	/// \param __p
				1624	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1625	/// [127:64] of the destination.
				1626	/// \returns A 128-bit vector of [4 x float] containing the moved values.
				1627	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1628	_mm_loadh_pi(__m128 __a, const __m64 *__p)
				1629	{
				1630	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
				1631	struct __mm_loadh_pi_struct {
				1632	__mm_loadh_pi_v2f32 __u;
				1633	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1634	__mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1635	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1636	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
				1637	}
				1638
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1639	/// Loads two packed float values from the address \a __p into the
				1640	/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
				1641	/// are copied from the high-order bits of the first operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1642	///
				1643	/// \headerfile <x86intrin.h>
				1644	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1645	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instructions.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1646	///
				1647	/// \param __a
				1648	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
				1649	/// [127:64] of the destination.
				1650	/// \param __p
				1651	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1652	/// [63:0] of the destination.
				1653	/// \returns A 128-bit vector of [4 x float] containing the moved values.
				1654	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1655	_mm_loadl_pi(__m128 __a, const __m64 *__p)
				1656	{
				1657	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
				1658	struct __mm_loadl_pi_struct {
				1659	__mm_loadl_pi_v2f32 __u;
				1660	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1661	__mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1662	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1663	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
				1664	}
				1665
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1666	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1667	/// 32 bits of the vector are initialized with the single-precision
				1668	/// floating-point value loaded from a specified memory location. The upper
				1669	/// 96 bits are set to zero.
				1670	///
				1671	/// \headerfile <x86intrin.h>
				1672	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1673	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1674	///
				1675	/// \param __p
				1676	/// A pointer to a 32-bit memory location containing a single-precision
				1677	/// floating-point value.
				1678	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1679	/// lower 32 bits contain the value loaded from the memory location. The
				1680	/// upper 96 bits are set to zero.
				1681	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1682	_mm_load_ss(const float *__p)
				1683	{
				1684	struct __mm_load_ss_struct {
				1685	float __u;
				1686	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1687	float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1688	return __extension__ (__m128){ __u, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1689	}
				1690
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1691	/// Loads a 32-bit float value and duplicates it to all four vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1692	/// elements of a 128-bit vector of [4 x float].
				1693	///
				1694	/// \headerfile <x86intrin.h>
				1695	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1696	/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1697	/// instruction.
				1698	///
				1699	/// \param __p
				1700	/// A pointer to a float value to be loaded and duplicated.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1701	/// \returns A 128-bit vector of [4 x float] containing the loaded and
				1702	/// duplicated values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1703	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1704	_mm_load1_ps(const float *__p)
				1705	{
				1706	struct __mm_load1_ps_struct {
				1707	float __u;
				1708	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1709	float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1710	return __extension__ (__m128){ __u, __u, __u, __u };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1711	}
				1712
				1713	#define _mm_load_ps1(p) _mm_load1_ps(p)
				1714
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1715	/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1716	/// memory location.
				1717	///
				1718	/// \headerfile <x86intrin.h>
				1719	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1720	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1721	///
				1722	/// \param __p
				1723	/// A pointer to a 128-bit memory location. The address of the memory
				1724	/// location has to be 128-bit aligned.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1725	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1726	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1727	_mm_load_ps(const float *__p)
				1728	{
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1729	return (const __m128)__p;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1730	}
				1731
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1732	/// Loads a 128-bit floating-point vector of [4 x float] from an
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1733	/// unaligned memory location.
				1734	///
				1735	/// \headerfile <x86intrin.h>
				1736	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1737	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1738	///
				1739	/// \param __p
				1740	/// A pointer to a 128-bit memory location. The address of the memory
				1741	/// location does not have to be aligned.
				1742	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
				1743	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1744	_mm_loadu_ps(const float *__p)
				1745	{
				1746	struct __loadu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	1747	__m128_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1748	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	1749	return ((const struct __loadu_ps*)__p)->__v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1750	}
				1751
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1752	/// Loads four packed float values, in reverse order, from an aligned
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1753	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
				1754	///
				1755	/// \headerfile <x86intrin.h>
				1756	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1757	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1758	/// instruction.
				1759	///
				1760	/// \param __p
				1761	/// A pointer to a 128-bit memory location. The address of the memory
				1762	/// location has to be 128-bit aligned.
				1763	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
				1764	/// in reverse order.
				1765	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1766	_mm_loadr_ps(const float *__p)
				1767	{
				1768	__m128 __a = _mm_load_ps(__p);
				1769	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
				1770	}
				1771
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1772	/// Create a 128-bit vector of [4 x float] with undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1773	///
				1774	/// \headerfile <x86intrin.h>
				1775	///
				1776	/// This intrinsic has no corresponding instruction.
				1777	///
				1778	/// \returns A 128-bit vector of [4 x float] containing undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1779	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1780	_mm_undefined_ps(void)
				1781	{
				1782	return (__m128)__builtin_ia32_undef128();
				1783	}
				1784
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1785	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1786	/// 32 bits of the vector are initialized with the specified single-precision
				1787	/// floating-point value. The upper 96 bits are set to zero.
				1788	///
				1789	/// \headerfile <x86intrin.h>
				1790	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1791	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1792	///
				1793	/// \param __w
				1794	/// A single-precision floating-point value used to initialize the lower 32
				1795	/// bits of the result.
				1796	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1797	/// lower 32 bits contain the value provided in the source operand. The
				1798	/// upper 96 bits are set to zero.
				1799	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1800	_mm_set_ss(float __w)
				1801	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1802	return __extension__ (__m128){ __w, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1803	}
				1804
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1805	/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1806	/// of the four single-precision floating-point vector elements set to the
				1807	/// specified single-precision floating-point value.
				1808	///
				1809	/// \headerfile <x86intrin.h>
				1810	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1811	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1812	///
				1813	/// \param __w
				1814	/// A single-precision floating-point value used to initialize each vector
				1815	/// element of the result.
				1816	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1817	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1818	_mm_set1_ps(float __w)
				1819	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1820	return __extension__ (__m128){ __w, __w, __w, __w };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1821	}
				1822
				1823	/* Microsoft specific. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1824	/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1825	/// of the four single-precision floating-point vector elements set to the
				1826	/// specified single-precision floating-point value.
				1827	///
				1828	/// \headerfile <x86intrin.h>
				1829	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1830	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1831	///
				1832	/// \param __w
				1833	/// A single-precision floating-point value used to initialize each vector
				1834	/// element of the result.
				1835	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1836	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1837	_mm_set_ps1(float __w)
				1838	{
				1839	return _mm_set1_ps(__w);
				1840	}
				1841
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1842	/// Constructs a 128-bit floating-point vector of [4 x float]
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1843	/// initialized with the specified single-precision floating-point values.
				1844	///
				1845	/// \headerfile <x86intrin.h>
				1846	///
				1847	/// This intrinsic is a utility function and does not correspond to a specific
				1848	/// instruction.
				1849	///
				1850	/// \param __z
				1851	/// A single-precision floating-point value used to initialize bits [127:96]
				1852	/// of the result.
				1853	/// \param __y
				1854	/// A single-precision floating-point value used to initialize bits [95:64]
				1855	/// of the result.
				1856	/// \param __x
				1857	/// A single-precision floating-point value used to initialize bits [63:32]
				1858	/// of the result.
				1859	/// \param __w
				1860	/// A single-precision floating-point value used to initialize bits [31:0]
				1861	/// of the result.
				1862	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1863	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1864	_mm_set_ps(float __z, float __y, float __x, float __w)
				1865	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1866	return __extension__ (__m128){ __w, __x, __y, __z };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1867	}
				1868
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1869	/// Constructs a 128-bit floating-point vector of [4 x float],
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1870	/// initialized in reverse order with the specified 32-bit single-precision
				1871	/// float-point values.
				1872	///
				1873	/// \headerfile <x86intrin.h>
				1874	///
				1875	/// This intrinsic is a utility function and does not correspond to a specific
				1876	/// instruction.
				1877	///
				1878	/// \param __z
				1879	/// A single-precision floating-point value used to initialize bits [31:0]
				1880	/// of the result.
				1881	/// \param __y
				1882	/// A single-precision floating-point value used to initialize bits [63:32]
				1883	/// of the result.
				1884	/// \param __x
				1885	/// A single-precision floating-point value used to initialize bits [95:64]
				1886	/// of the result.
				1887	/// \param __w
				1888	/// A single-precision floating-point value used to initialize bits [127:96]
				1889	/// of the result.
				1890	/// \returns An initialized 128-bit floating-point vector of [4 x float].
				1891	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1892	_mm_setr_ps(float __z, float __y, float __x, float __w)
				1893	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1894	return __extension__ (__m128){ __z, __y, __x, __w };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1895	}
				1896
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1897	/// Constructs a 128-bit floating-point vector of [4 x float] initialized
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1898	/// to zero.
				1899	///
				1900	/// \headerfile <x86intrin.h>
				1901	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1902	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1903	///
				1904	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
				1905	/// all elements set to zero.
				1906	static __inline__ __m128 __DEFAULT_FN_ATTRS
				1907	_mm_setzero_ps(void)
				1908	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1909	return __extension__ (__m128){ 0, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1910	}
				1911
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1912	/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1913	/// memory location.
				1914	///
				1915	/// \headerfile <x86intrin.h>
				1916	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1917	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1918	///
				1919	/// \param __p
				1920	/// A pointer to a 64-bit memory location.
				1921	/// \param __a
				1922	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1923	static __inline__ void __DEFAULT_FN_ATTRS
				1924	_mm_storeh_pi(__m64 *__p, __m128 __a)
				1925	{
Logan Chien	bedbf4f	2020-01-06 19:35:19 -0800	[diff] [blame]	1926	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
				1927	struct __mm_storeh_pi_struct {
				1928	__mm_storeh_pi_v2f32 __u;
				1929	} __attribute__((__packed__, __may_alias__));
				1930	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1931	}
				1932
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1933	/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1934	/// memory location.
				1935	///
				1936	/// \headerfile <x86intrin.h>
				1937	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1938	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1939	///
				1940	/// \param __p
				1941	/// A pointer to a memory location that will receive the float values.
				1942	/// \param __a
				1943	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1944	static __inline__ void __DEFAULT_FN_ATTRS
				1945	_mm_storel_pi(__m64 *__p, __m128 __a)
				1946	{
Logan Chien	bedbf4f	2020-01-06 19:35:19 -0800	[diff] [blame]	1947	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
				1948	struct __mm_storeh_pi_struct {
				1949	__mm_storeh_pi_v2f32 __u;
				1950	} __attribute__((__packed__, __may_alias__));
				1951	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1952	}
				1953
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1954	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1955	/// memory location.
				1956	///
				1957	/// \headerfile <x86intrin.h>
				1958	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1959	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1960	///
				1961	/// \param __p
				1962	/// A pointer to a 32-bit memory location.
				1963	/// \param __a
				1964	/// A 128-bit vector of [4 x float] containing the value to be stored.
				1965	static __inline__ void __DEFAULT_FN_ATTRS
				1966	_mm_store_ss(float *__p, __m128 __a)
				1967	{
				1968	struct __mm_store_ss_struct {
				1969	float __u;
				1970	} __attribute__((__packed__, __may_alias__));
				1971	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
				1972	}
				1973
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1974	/// Stores a 128-bit vector of [4 x float] to an unaligned memory
				1975	/// location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1976	///
				1977	/// \headerfile <x86intrin.h>
				1978	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1979	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1980	///
				1981	/// \param __p
				1982	/// A pointer to a 128-bit memory location. The address of the memory
				1983	/// location does not have to be aligned.
				1984	/// \param __a
				1985	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1986	static __inline__ void __DEFAULT_FN_ATTRS
				1987	_mm_storeu_ps(float *__p, __m128 __a)
				1988	{
				1989	struct __storeu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	1990	__m128_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1991	} __attribute__((__packed__, __may_alias__));
				1992	((struct __storeu_ps*)__p)->__v = __a;
				1993	}
				1994
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1995	/// Stores a 128-bit vector of [4 x float] into an aligned memory
				1996	/// location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1997	///
				1998	/// \headerfile <x86intrin.h>
				1999	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2000	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2001	///
				2002	/// \param __p
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2003	/// A pointer to a 128-bit memory location. The address of the memory
				2004	/// location has to be 16-byte aligned.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2005	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2006	/// A 128-bit vector of [4 x float] containing the values to be stored.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2007	static __inline__ void __DEFAULT_FN_ATTRS
				2008	_mm_store_ps(float *__p, __m128 __a)
				2009	{
				2010	(__m128)__p = __a;
				2011	}
				2012
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2013	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2014	/// four contiguous elements in an aligned memory location.
				2015	///
				2016	/// \headerfile <x86intrin.h>
				2017	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2018	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2019	/// instruction.
				2020	///
				2021	/// \param __p
				2022	/// A pointer to a 128-bit memory location.
				2023	/// \param __a
				2024	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2025	/// of the four contiguous elements pointed by \a __p.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2026	static __inline__ void __DEFAULT_FN_ATTRS
				2027	_mm_store1_ps(float *__p, __m128 __a)
				2028	{
				2029	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
				2030	_mm_store_ps(__p, __a);
				2031	}
				2032
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2033	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
				2034	/// four contiguous elements in an aligned memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2035	///
				2036	/// \headerfile <x86intrin.h>
				2037	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2038	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
				2039	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2040	///
				2041	/// \param __p
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2042	/// A pointer to a 128-bit memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2043	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2044	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
				2045	/// of the four contiguous elements pointed by \a __p.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2046	static __inline__ void __DEFAULT_FN_ATTRS
				2047	_mm_store_ps1(float *__p, __m128 __a)
				2048	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2049	_mm_store1_ps(__p, __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2050	}
				2051
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2052	/// Stores float values from a 128-bit vector of [4 x float] to an
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2053	/// aligned memory location in reverse order.
				2054	///
				2055	/// \headerfile <x86intrin.h>
				2056	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2057	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2058	/// instruction.
				2059	///
				2060	/// \param __p
				2061	/// A pointer to a 128-bit memory location. The address of the memory
				2062	/// location has to be 128-bit aligned.
				2063	/// \param __a
				2064	/// A 128-bit vector of [4 x float] containing the values to be stored.
				2065	static __inline__ void __DEFAULT_FN_ATTRS
				2066	_mm_storer_ps(float *__p, __m128 __a)
				2067	{
				2068	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
				2069	_mm_store_ps(__p, __a);
				2070	}
				2071
/* Selector values accepted by _mm_prefetch(), listed in ascending order.
   _mm_prefetch() forwards bit 2 of the selector as the second argument of
   __builtin_prefetch and bits [1:0] as the third. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
				2078
				2079	#ifndef _MSC_VER
				2080	/* FIXME: We have to #define this because "sel" must be a constant integer, and
				2081	Sema doesn't do any form of constant propagation yet. */
				2082
/// Requests that one cache line of data at the specified address be brought
///    to a location closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel)                                      \
  (__builtin_prefetch((const void *)(a), /* address            */ \
                      ((sel) >> 2) & 1,  /* bit 2 of sel       */ \
                      (sel) & 0x3))      /* bits [1:0] of sel  */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2109	#endif
				2110
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2111	/// Stores a 64-bit integer in the specified aligned memory location. To
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2112	/// minimize caching, the data is flagged as non-temporal (unlikely to be
				2113	/// used again soon).
				2114	///
				2115	/// \headerfile <x86intrin.h>
				2116	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2117	/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2118	///
				2119	/// \param __p
				2120	/// A pointer to an aligned memory location used to store the register value.
				2121	/// \param __a
				2122	/// A 64-bit integer containing the value to be stored.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2123	static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2124	_mm_stream_pi(__m64 *__p, __m64 __a)
				2125	{
				2126	__builtin_ia32_movntq(__p, __a);
				2127	}
				2128
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2129	/// Moves packed float values from a 128-bit vector of [4 x float] to a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2130	/// 128-bit aligned memory location. To minimize caching, the data is flagged
				2131	/// as non-temporal (unlikely to be used again soon).
				2132	///
				2133	/// \headerfile <x86intrin.h>
				2134	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2135	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2136	///
				2137	/// \param __p
				2138	/// A pointer to a 128-bit aligned memory location that will receive the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2139	/// single-precision floating-point values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2140	/// \param __a
				2141	/// A 128-bit vector of [4 x float] containing the values to be moved.
				2142	static __inline__ void __DEFAULT_FN_ATTRS
				2143	_mm_stream_ps(float *__p, __m128 __a)
				2144	{
				2145	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
				2146	}
				2147
#if defined(__cplusplus)
extern "C" {
#endif

/// Enforces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and the store instructions
///    that follow it: the system completes all earlier stores before it
///    executes any later store.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
				2166
				2167	/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2168	/// returns it, as specified by the immediate integer operand.
				2169	///
				2170	/// \headerfile <x86intrin.h>
				2171	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2172	/// \code
				2173	/// int _mm_extract_pi16(__m64 a, int n);
				2174	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2175	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2176	/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
				2177	///
				2178	/// \param a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2179	/// A 64-bit vector of [4 x i16].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2180	/// \param n
				2181	/// An immediate integer operand that determines which bits are extracted: \n
				2182	/// 0: Bits [15:0] are copied to the destination. \n
				2183	/// 1: Bits [31:16] are copied to the destination. \n
				2184	/// 2: Bits [47:32] are copied to the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2185	/// 3: Bits [63:48] are copied to the destination.
				2186	/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2189
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2190	/// Copies data from the 64-bit vector of [4 x i16] to the destination,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2191	/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2192	/// specified by the immediate operand \a n.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2193	///
				2194	/// \headerfile <x86intrin.h>
				2195	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2196	/// \code
				2197	/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
				2198	/// \endcode
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2199	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2200	/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
				2201	///
				2202	/// \param a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2203	/// A 64-bit vector of [4 x i16].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2204	/// \param d
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2205	/// An integer. The lower 16-bit value from this operand is written to the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2206	/// destination at the offset specified by operand \a n.
				2207	/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination. \n
				2210	/// 0: Bits [15:0] are copied to the destination. \n
				2211	/// 1: Bits [31:16] are copied to the destination. \n
				2212	/// 2: Bits [47:32] are copied to the destination. \n
				2213	/// 3: Bits [63:48] are copied to the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2214	/// The remaining bits in the destination are copied from the corresponding
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2215	/// bits in operand \a a.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2216	/// \returns A 64-bit integer vector containing the copied packed data from the
				2217	/// operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2218	#define _mm_insert_pi16(a, d, n) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2219	((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2220
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2221	/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2222	/// the 64-bit integer vectors, and writes the greater value to the
				2223	/// corresponding bits in the destination.
				2224	///
				2225	/// \headerfile <x86intrin.h>
				2226	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2227	/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2228	///
				2229	/// \param __a
				2230	/// A 64-bit integer vector containing one of the source operands.
				2231	/// \param __b
				2232	/// A 64-bit integer vector containing one of the source operands.
				2233	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2234	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2235	_mm_max_pi16(__m64 __a, __m64 __b)
				2236	{
				2237	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
				2238	}
				2239
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2240	/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2241	/// values of the 64-bit integer vectors, and writes the greater value to the
				2242	/// corresponding bits in the destination.
				2243	///
				2244	/// \headerfile <x86intrin.h>
				2245	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2246	/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2247	///
				2248	/// \param __a
				2249	/// A 64-bit integer vector containing one of the source operands.
				2250	/// \param __b
				2251	/// A 64-bit integer vector containing one of the source operands.
				2252	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2253	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2254	_mm_max_pu8(__m64 __a, __m64 __b)
				2255	{
				2256	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
				2257	}
				2258
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2259	/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2260	/// the 64-bit integer vectors, and writes the lesser value to the
				2261	/// corresponding bits in the destination.
				2262	///
				2263	/// \headerfile <x86intrin.h>
				2264	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2265	/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2266	///
				2267	/// \param __a
				2268	/// A 64-bit integer vector containing one of the source operands.
				2269	/// \param __b
				2270	/// A 64-bit integer vector containing one of the source operands.
				2271	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2272	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2273	_mm_min_pi16(__m64 __a, __m64 __b)
				2274	{
				2275	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
				2276	}
				2277
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2278	/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2279	/// values of the 64-bit integer vectors, and writes the lesser value to the
				2280	/// corresponding bits in the destination.
				2281	///
				2282	/// \headerfile <x86intrin.h>
				2283	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2284	/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2285	///
				2286	/// \param __a
				2287	/// A 64-bit integer vector containing one of the source operands.
				2288	/// \param __b
				2289	/// A 64-bit integer vector containing one of the source operands.
				2290	/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2291	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2292	_mm_min_pu8(__m64 __a, __m64 __b)
				2293	{
				2294	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
				2295	}
				2296
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2297	/// Takes the most significant bit from each 8-bit element in a 64-bit
				2298	/// integer vector to create an 8-bit mask value. Zero-extends the value to
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2299	/// 32-bit integer and writes it to the destination.
				2300	///
				2301	/// \headerfile <x86intrin.h>
				2302	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2303	/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2304	///
				2305	/// \param __a
				2306	/// A 64-bit integer vector containing the values with bits to be extracted.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2307	/// \returns The most significant bit from each 8-bit element in \a __a,
				2308	/// written to bits [7:0].
				2309	static __inline__ int __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2310	_mm_movemask_pi8(__m64 __a)
				2311	{
				2312	return __builtin_ia32_pmovmskb((__v8qi)__a);
				2313	}
				2314
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2315	/// Multiplies packed 16-bit unsigned integer values and writes the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2316	/// high-order 16 bits of each 32-bit product to the corresponding bits in
				2317	/// the destination.
				2318	///
				2319	/// \headerfile <x86intrin.h>
				2320	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2321	/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2322	///
				2323	/// \param __a
				2324	/// A 64-bit integer vector containing one of the source operands.
				2325	/// \param __b
				2326	/// A 64-bit integer vector containing one of the source operands.
				2327	/// \returns A 64-bit integer vector containing the products of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2328	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2329	_mm_mulhi_pu16(__m64 __a, __m64 __b)
				2330	{
				2331	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
				2332	}
				2333
/// Rearranges the four 16-bit integers of a 64-bit integer vector into
///    the destination, in the order selected by the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value made of four 2-bit selectors, one per
///    destination element: \n
///    Bits [1:0] select the source for bits [15:0] of the destination. \n
///    Bits [3:2] select the source for bits [31:16] of the destination. \n
///    Bits [5:4] select the source for bits [47:32] of the destination. \n
///    Bits [7:6] select the source for bits [63:48] of the destination. \n
///    Selector values: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)          \
  ((__m64)__builtin_ia32_pshufw(        \
      (__v4hi)(__m64)(a),               \
      (n)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2367
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2368	/// Conditionally copies the values from each 8-bit element in the first
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2369	/// 64-bit integer vector operand to the specified memory location, as
				2370	/// specified by the most significant bit in the corresponding element in the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2371	/// second 64-bit integer vector operand.
				2372	///
				2373	/// To minimize caching, the data is flagged as non-temporal
				2374	/// (unlikely to be used again soon).
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2375	///
				2376	/// \headerfile <x86intrin.h>
				2377	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2378	/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2379	///
				2380	/// \param __d
				2381	/// A 64-bit integer vector containing the values with elements to be copied.
				2382	/// \param __n
				2383	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2384	/// element determines whether the corresponding element in operand \a __d
				2385	/// is copied. If the most significant bit of a given element is 1, the
				2386	/// corresponding element in operand \a __d is copied.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2387	/// \param __p
				2388	/// A pointer to a 64-bit memory location that will receive the conditionally
				2389	/// copied integer values. The address of the memory location does not have
				2390	/// to be aligned.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2391	static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2392	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
				2393	{
				2394	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
				2395	}
				2396
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2397	/// Computes the rounded averages of the packed unsigned 8-bit integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2398	/// values and writes the averages to the corresponding bits in the
				2399	/// destination.
				2400	///
				2401	/// \headerfile <x86intrin.h>
				2402	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2403	/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2404	///
				2405	/// \param __a
				2406	/// A 64-bit integer vector containing one of the source operands.
				2407	/// \param __b
				2408	/// A 64-bit integer vector containing one of the source operands.
				2409	/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2410	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2411	_mm_avg_pu8(__m64 __a, __m64 __b)
				2412	{
				2413	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
				2414	}
				2415
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2416	/// Computes the rounded averages of the packed unsigned 16-bit integer
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2417	/// values and writes the averages to the corresponding bits in the
				2418	/// destination.
				2419	///
				2420	/// \headerfile <x86intrin.h>
				2421	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2422	/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2423	///
				2424	/// \param __a
				2425	/// A 64-bit integer vector containing one of the source operands.
				2426	/// \param __b
				2427	/// A 64-bit integer vector containing one of the source operands.
				2428	/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2429	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2430	_mm_avg_pu16(__m64 __a, __m64 __b)
				2431	{
				2432	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
				2433	}
				2434
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2435	/// Subtracts the corresponding 8-bit unsigned integer values of the two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2436	/// 64-bit vector operands and computes the absolute value for each of the
				2437	/// difference. Then sum of the 8 absolute differences is written to the
				2438	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
				2439	///
				2440	/// \headerfile <x86intrin.h>
				2441	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2442	/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2443	///
				2444	/// \param __a
				2445	/// A 64-bit integer vector containing one of the source operands.
				2446	/// \param __b
				2447	/// A 64-bit integer vector containing one of the source operands.
				2448	/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
				2449	/// sets of absolute differences between both operands. The upper bits are
				2450	/// cleared.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2451	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2452	_mm_sad_pu8(__m64 __a, __m64 __b)
				2453	{
				2454	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
				2455	}
				2456
#if defined(__cplusplus)
extern "C" {
#endif

/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///    NOTE(review): this OR only sets bits, so it presumably selects round-up
///    only when the rounding-control field is currently clear; confirm
///    against the MXCSR layout, and prefer _MM_SET_ROUNDING_MODE(x) if it
///    clears the field first.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif
				2570
				2571	/// Selects 4 float values from the 128-bit operands of [4 x float], as
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2572	/// specified by the immediate value operand.
				2573	///
				2574	/// \headerfile <x86intrin.h>
				2575	///
				2576	/// \code
				2577	/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
				2578	/// \endcode
				2579	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2580	/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2581	///
				2582	/// \param a
				2583	/// A 128-bit vector of [4 x float].
				2584	/// \param b
				2585	/// A 128-bit vector of [4 x float].
				2586	/// \param mask
				2587	/// An immediate value containing an 8-bit value specifying which elements to
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2588	/// copy from \a a and \a b. \n
				2589	/// Bits [3:0] specify the values copied from operand \a a. \n
				2590	/// Bits [7:4] specify the values copied from operand \a b. \n
				2591	/// The destinations within the 128-bit destination are assigned values as
				2592	/// follows: \n
				2593	/// Bits [1:0] are used to assign values to bits [31:0] in the
				2594	/// destination. \n
				2595	/// Bits [3:2] are used to assign values to bits [63:32] in the
				2596	/// destination. \n
				2597	/// Bits [5:4] are used to assign values to bits [95:64] in the
				2598	/// destination. \n
				2599	/// Bits [7:6] are used to assign values to bits [127:96] in the
				2600	/// destination. \n
				2601	/// Bit value assignments: \n
				2602	/// 00: Bits [31:0] copied from the specified operand. \n
				2603	/// 01: Bits [63:32] copied from the specified operand. \n
				2604	/// 10: Bits [95:64] copied from the specified operand. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2605	/// 11: Bits [127:96] copied from the specified operand.
				2606	/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2607	#define _mm_shuffle_ps(a, b, mask) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2608	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
				2609	(int)(mask)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2610
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2611	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
				2612	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2613	///
				2614	/// \headerfile <x86intrin.h>
				2615	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2616	/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2617	///
				2618	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2619	/// A 128-bit vector of [4 x float]. \n
				2620	/// Bits [95:64] are written to bits [31:0] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2621	/// Bits [127:96] are written to bits [95:64] of the destination.
				2622	/// \param __b
				2623	/// A 128-bit vector of [4 x float].
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2624	/// Bits [95:64] are written to bits [63:32] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2625	/// Bits [127:96] are written to bits [127:96] of the destination.
				2626	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
				2627	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2628	_mm_unpackhi_ps(__m128 __a, __m128 __b)
				2629	{
				2630	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
				2631	}
				2632
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2633	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
				2634	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2635	///
				2636	/// \headerfile <x86intrin.h>
				2637	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2638	/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2639	///
				2640	/// \param __a
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2641	/// A 128-bit vector of [4 x float]. \n
				2642	/// Bits [31:0] are written to bits [31:0] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2643	/// Bits [63:32] are written to bits [95:64] of the destination.
				2644	/// \param __b
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2645	/// A 128-bit vector of [4 x float]. \n
				2646	/// Bits [31:0] are written to bits [63:32] of the destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2647	/// Bits [63:32] are written to bits [127:96] of the destination.
				2648	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
				2649	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2650	_mm_unpacklo_ps(__m128 __a, __m128 __b)
				2651	{
				2652	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
				2653	}
				2654
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2655	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2656	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
				2657	/// 96 bits are set to the upper 96 bits of the first parameter.
				2658	///
				2659	/// \headerfile <x86intrin.h>
				2660	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2661	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
				2662	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2663	///
				2664	/// \param __a
				2665	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
				2666	/// written to the upper 96 bits of the result.
				2667	/// \param __b
				2668	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
				2669	/// written to the lower 32 bits of the result.
				2670	/// \returns A 128-bit floating-point vector of [4 x float].
				2671	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2672	_mm_move_ss(__m128 __a, __m128 __b)
				2673	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2674	__a[0] = __b[0];
				2675	return __a;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2676	}
				2677
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2678	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2679	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
				2680	/// 64 bits are set to the upper 64 bits of the first parameter.
				2681	///
				2682	/// \headerfile <x86intrin.h>
				2683	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2684	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2685	///
				2686	/// \param __a
				2687	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2688	/// written to the upper 64 bits of the result.
				2689	/// \param __b
				2690	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2691	/// written to the lower 64 bits of the result.
				2692	/// \returns A 128-bit floating-point vector of [4 x float].
				2693	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2694	_mm_movehl_ps(__m128 __a, __m128 __b)
				2695	{
				2696	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
				2697	}
				2698
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2699	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2700	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
				2701	/// 64 bits are set to the lower 64 bits of the second parameter.
				2702	///
				2703	/// \headerfile <x86intrin.h>
				2704	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2705	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2706	///
				2707	/// \param __a
				2708	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2709	/// written to the lower 64 bits of the result.
				2710	/// \param __b
				2711	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2712	/// written to the upper 64 bits of the result.
				2713	/// \returns A 128-bit floating-point vector of [4 x float].
				2714	static __inline__ __m128 __DEFAULT_FN_ATTRS
				2715	_mm_movelh_ps(__m128 __a, __m128 __b)
				2716	{
				2717	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
				2718	}
				2719
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2720	/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2721	/// float].
				2722	///
				2723	/// \headerfile <x86intrin.h>
				2724	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2725	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2726	///
				2727	/// \param __a
				2728	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
				2729	/// from the corresponding elements in this operand.
				2730	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2731	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2732	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2733	_mm_cvtpi16_ps(__m64 __a)
				2734	{
				2735	__m64 __b, __c;
				2736	__m128 __r;
				2737
				2738	__b = _mm_setzero_si64();
				2739	__b = _mm_cmpgt_pi16(__b, __a);
				2740	__c = _mm_unpackhi_pi16(__a, __b);
				2741	__r = _mm_setzero_ps();
				2742	__r = _mm_cvtpi32_ps(__r, __c);
				2743	__r = _mm_movelh_ps(__r, __r);
				2744	__c = _mm_unpacklo_pi16(__a, __b);
				2745	__r = _mm_cvtpi32_ps(__r, __c);
				2746
				2747	return __r;
				2748	}
				2749
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2750	/// Converts a 64-bit vector of 16-bit unsigned integer values into a
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2751	/// 128-bit vector of [4 x float].
				2752	///
				2753	/// \headerfile <x86intrin.h>
				2754	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2755	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2756	///
				2757	/// \param __a
				2758	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
				2759	/// destination are copied from the corresponding elements in this operand.
				2760	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2761	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2762	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2763	_mm_cvtpu16_ps(__m64 __a)
				2764	{
				2765	__m64 __b, __c;
				2766	__m128 __r;
				2767
				2768	__b = _mm_setzero_si64();
				2769	__c = _mm_unpackhi_pi16(__a, __b);
				2770	__r = _mm_setzero_ps();
				2771	__r = _mm_cvtpi32_ps(__r, __c);
				2772	__r = _mm_movelh_ps(__r, __r);
				2773	__c = _mm_unpacklo_pi16(__a, __b);
				2774	__r = _mm_cvtpi32_ps(__r, __c);
				2775
				2776	return __r;
				2777	}
				2778
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2779	/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2780	/// into a 128-bit vector of [4 x float].
				2781	///
				2782	/// \headerfile <x86intrin.h>
				2783	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2784	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2785	///
				2786	/// \param __a
				2787	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
				2788	/// from the corresponding lower 4 elements in this operand.
				2789	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2790	/// values from the operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2791	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2792	_mm_cvtpi8_ps(__m64 __a)
				2793	{
				2794	__m64 __b;
				2795
				2796	__b = _mm_setzero_si64();
				2797	__b = _mm_cmpgt_pi8(__b, __a);
				2798	__b = _mm_unpacklo_pi8(__a, __b);
				2799
				2800	return _mm_cvtpi16_ps(__b);
				2801	}
				2802
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2803	/// Converts the lower four unsigned 8-bit integer values from a 64-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2804	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
				2805	///
				2806	/// \headerfile <x86intrin.h>
				2807	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2808	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2809	///
				2810	/// \param __a
				2811	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
				2812	/// destination are copied from the corresponding lower 4 elements in this
				2813	/// operand.
				2814	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2815	/// values from the source operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2816	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2817	_mm_cvtpu8_ps(__m64 __a)
				2818	{
				2819	__m64 __b;
				2820
				2821	__b = _mm_setzero_si64();
				2822	__b = _mm_unpacklo_pi8(__a, __b);
				2823
				2824	return _mm_cvtpi16_ps(__b);
				2825	}
				2826
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2827	/// Converts the two 32-bit signed integer values from each 64-bit vector
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2828	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
				2829	///
				2830	/// \headerfile <x86intrin.h>
				2831	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2832	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2833	///
				2834	/// \param __a
				2835	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
				2836	/// copied from the elements in this operand.
				2837	/// \param __b
				2838	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
				2839	/// copied from the elements in this operand.
				2840	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				2841	/// copied and converted values from the first operand. The upper 64 bits
				2842	/// contain the copied and converted values from the second operand.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2843	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2844	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
				2845	{
				2846	__m128 __c;
				2847
				2848	__c = _mm_setzero_ps();
				2849	__c = _mm_cvtpi32_ps(__c, __b);
				2850	__c = _mm_movelh_ps(__c, __c);
				2851
				2852	return _mm_cvtpi32_ps(__c, __a);
				2853	}
				2854
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2855	/// Converts each single-precision floating-point element of a 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2856	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2857	/// packs the results into a 64-bit integer vector of [4 x i16].
				2858	///
				2859	/// If the floating-point element is NaN or infinity, or if the
				2860	/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
				2861	/// it is converted to 0x8000. Otherwise if the floating-point element is
				2862	/// greater than 0x7FFF, it is converted to 0x7FFF.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2863	///
				2864	/// \headerfile <x86intrin.h>
				2865	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2866	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2867	///
				2868	/// \param __a
				2869	/// A 128-bit floating-point vector of [4 x float].
				2870	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
				2871	/// values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2872	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2873	_mm_cvtps_pi16(__m128 __a)
				2874	{
				2875	__m64 __b, __c;
				2876
				2877	__b = _mm_cvtps_pi32(__a);
				2878	__a = _mm_movehl_ps(__a, __a);
				2879	__c = _mm_cvtps_pi32(__a);
				2880
				2881	return _mm_packs_pi32(__b, __c);
				2882	}
				2883
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2884	/// Converts each single-precision floating-point element of a 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2885	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
				2886	/// packs the results into the lower 32 bits of a 64-bit integer vector of
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2887	/// [8 x i8]. The upper 32 bits of the vector are set to 0.
				2888	///
				2889	/// If the floating-point element is NaN or infinity, or if the
				2890	/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
				2891	/// is converted to 0x80. Otherwise if the floating-point element is greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2892	/// than 0x7F, it is converted to 0x7F.
				2893	///
				2894	/// \headerfile <x86intrin.h>
				2895	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2896	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2897	///
				2898	/// \param __a
				2899	/// 128-bit floating-point vector of [4 x float].
				2900	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
				2901	/// converted values and the uppper 32 bits are set to zero.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2902	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2903	_mm_cvtps_pi8(__m128 __a)
				2904	{
				2905	__m64 __b, __c;
				2906
				2907	__b = _mm_cvtps_pi16(__a);
				2908	__c = _mm_setzero_si64();
				2909
				2910	return _mm_packs_pi16(__b, __c);
				2911	}
				2912
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2913	/// Extracts the sign bits from each single-precision floating-point
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2914	/// element of a 128-bit floating-point vector of [4 x float] and returns the
				2915	/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
				2916	/// to zero.
				2917	///
				2918	/// \headerfile <x86intrin.h>
				2919	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2920	/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2921	///
				2922	/// \param __a
				2923	/// A 128-bit floating-point vector of [4 x float].
				2924	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
				2925	/// single-precision floating-point element of the parameter. Bits [31:4] are
				2926	/// set to zero.
				2927	static __inline__ int __DEFAULT_FN_ATTRS
				2928	_mm_movemask_ps(__m128 __a)
				2929	{
				2930	return __builtin_ia32_movmskps((__v4sf)__a);
				2931	}
				2932
				2933
				2934	#define _MM_ALIGN16 __attribute__((aligned(16)))
				2935
				2936	#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) \| ((y) << 4) \| ((x) << 2) \| (w))
				2937
/* Exception state flags; _MM_EXCEPT_MASK is the union of all six. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* Exception mask bits; _MM_MASK_MASK is the union of all six. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* Rounding mode values; _MM_ROUND_MASK covers the two-bit field. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* Flush-to-zero mode values and the bit they occupy. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2963
				2964	#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
				2965	#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
				2966	#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
				2967	#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
				2968
				2969	#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) \| (x)))
				2970	#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) \| (x)))
				2971	#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) \| (x)))
				2972	#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) \| (x)))
				2973
				2974	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
				2975	do { \
				2976	__m128 tmp3, tmp2, tmp1, tmp0; \
				2977	tmp0 = _mm_unpacklo_ps((row0), (row1)); \
				2978	tmp2 = _mm_unpacklo_ps((row2), (row3)); \
				2979	tmp1 = _mm_unpackhi_ps((row0), (row1)); \
				2980	tmp3 = _mm_unpackhi_ps((row2), (row3)); \
				2981	(row0) = _mm_movelh_ps(tmp0, tmp2); \
				2982	(row1) = _mm_movehl_ps(tmp2, tmp0); \
				2983	(row2) = _mm_movelh_ps(tmp1, tmp3); \
				2984	(row3) = _mm_movehl_ps(tmp3, tmp1); \
				2985	} while (0)
				2986
				2987	/* Aliases for compatibility. */
				2988	#define _m_pextrw _mm_extract_pi16
				2989	#define _m_pinsrw _mm_insert_pi16
				2990	#define _m_pmaxsw _mm_max_pi16
				2991	#define _m_pmaxub _mm_max_pu8
				2992	#define _m_pminsw _mm_min_pi16
				2993	#define _m_pminub _mm_min_pu8
				2994	#define _m_pmovmskb _mm_movemask_pi8
				2995	#define _m_pmulhuw _mm_mulhi_pu16
				2996	#define _m_pshufw _mm_shuffle_pi16
				2997	#define _m_maskmovq _mm_maskmove_si64
				2998	#define _m_pavgb _mm_avg_pu8
				2999	#define _m_pavgw _mm_avg_pu16
				3000	#define _m_psadbw _mm_sad_pu8
				3001	#define _m_ _mm_
				3002	#define _m_ _mm_
				3003
				3004	#undef __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3005	#undef __DEFAULT_FN_ATTRS_MMX
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3006
				3007	/* Ugly hack for backwards-compatibility (compatible with gcc) */
				3008	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
				3009	#include <emmintrin.h>
				3010	#endif
				3011
				3012	#endif /* __XMMINTRIN_H */